Cope 2.5.0
My personal "standard library" of all the generally useful code I've written for various projects over the years
Loading...
Searching...
No Matches
Functions | Variables
Cope.experimental.data Namespace Reference

Functions

def _cast2dataframe (func)
 
def installLibs (libs=['pandas', 'numpy', 'imblearn', 'ipywidgets', 'seaborn', 'scipy', 'matplotlib'])
 
def addVerbose (func)
 
def _cleaning_func (**decorator_kwargs)
 
def insertSample (df, sample, index=-1)
 
def ensureIterable (obj, useList=False)
 
def ensureNotIterable (obj, emptyBecomes=None)
 
def getOutliers (data, zscore=None)
 
def normalizePercentage (p, error='Percentage is of the wrong type (int or float expected)')
 
def isiterable (obj, includeStr=False)
 
def sort_dict_by_value_length (d)
 
pd.DataFrame timeFeatures (df)
 
pd.DataFrame catagorical (df, time=False)
 
pd.DataFrame quantitative (df, time=True)
 
def isTimeFeature (pd.Series s)
 
def isCatagorical (pd.Series s, time=False)
 
def isQuantatative (pd.Series s, time=True)
 
def missingSummary (df, thresh=.6)
 
def significantCorrelations (df, thresh=.5)
 
def getNiceTypesTable (df, types=None)
 
def percentCountPlot (data, feature, target=None, ax=None, title='Percentage of values used in {}')
 
def column_entropy (pd.Series column, base=e)
 
def pretty_2_column_array (a, limit=30, paren=None)
 
def pretty_counts (pd.Series s, paren=False)
 
def meanConfInterval (data, confidence=0.95, mean=False)
 
def showOutliers (data, column, zscore, **snsArgs)
 
def interactWithOutliers (df, feature=None, step=.2)
 
def handle_outliers (col, Union['remove', 'constrain'] method='remove', zscore=3, log=...)
 
def handle_missing (col, Union[pd.Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any] method, missing_value=np.nan, log=...)
 
def query (pd.DataFrame df, str column, str query, Union[pd.Series, 'remove', 'new', 'mean', 'median', 'mode', 'random', 'balanced_random', Any] method, true=1, false=0, verbose=False)
 
def remove (col, val, log=...)
 
def bin (col, Union['frequency', 'width', Tuple, List] method, amt=5, log=...)
 
def rescale (df, return_scaler=False, log=...)
 
def convert_time (df_or_col, str col=None, Union['timestamp'] method='timestamp', verbose=False)
 
def convert_numeric (df, str col=None, Union['assign', 'one_hot_encode'] method='one_hot_encode', returnAssignments=False, skip=[], verbose=False)
 
def split (*data, amt=.2, Union['random', 'chunk', 'head', 'tail'] method='random', target=[], splitTargets=False, seed=42)
 
def explore (data, target=None, stats=None, additionalStats=[], missing=True, corr=.55, entropy=None, start='Description', startFeature=None, startx=None, starty=None, startHue=None, alpha=None)
 
def suggestedCleaning (df, target)
 
def _cleanColumn (df, args, column, verbose, ignoreWarnings=False)
 
pd.DataFrame clean (pd.DataFrame df, Dict[str, Dict[str, Any]] config, bool verbose=False, str split=None)
 
def resample (X, y, Union['oversample', 'undersample', 'mixed'] method='oversample', seed=None)
 
def evaluateQuantitative (test, testPredictions, train=None, trainPredictions=None, accuracy=3, explain=False, compact=False, line=False, log=...)
 
def evaluateCatagorical (test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False)
 
def evaluate (catagorical, test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False, line=False)
 
def importances (tree, names=None, rtn=False, graph=True, best=.01)
 
def saveStats (file, name, model, testY, predY, trainY=None, trainPredY=None, notes='', new=False, show=True, save=True)
 
def plot_history (history)
 

Variables

 try :
 
pass except
 
 else :
 
str MODE_SELECTION = 'random'
 
int SMALL_DATASET = 1000
 
int HIGH_CARDINALITY = 50
 
float ALERT_MISSING = .55
 
float OUTLIER_THRESHOLD = .5
 
bool CONTINUOUS_UPDATE_SLIDER = False
 
list _catagoricalTypes = ['bool', 'bool_', 'object', 'object_', 'Interval', 'bool8', 'category']
 
list _quantitativeTypes = ['number']
 
list _timeTypes = ['datetimetz', 'timedelta', 'datetime']
 
todo = lambda *a: print('TODO: ', *a)
 
 transform
 
def quickSummary = explore
 
def evaluateQ = evaluateQuantitative
 
def evaluateC = evaluateCatagorical
 

Function Documentation

◆ _cast2dataframe()

def Cope.experimental.data._cast2dataframe (   func)
protected

◆ _cleanColumn()

def Cope.experimental.data._cleanColumn (   df,
  args,
  column,
  verbose,
  ignoreWarnings = False 
)
protected

◆ _cleaning_func()

def Cope.experimental.data._cleaning_func ( **  decorator_kwargs)
protected
 Auto-converts the given named parameter to the given type
    Supports inputs of pd.DataFrame, pd.Series, np.ndarray, and tuple/list pd.Series or np.ndarray
    Supports outputs of pd.DataFrame, pd.Series, and a tuple of pd.Series
    Does NOT support input types of tuple/list of pd.DataFrames

◆ addVerbose()

def Cope.experimental.data.addVerbose (   func)

◆ bin()

def Cope.experimental.data.bin (   col,
Union['frequency', 'width', Tuple, List]  method,
  amt = 5,
  log = ... 
)

◆ catagorical()

pd.DataFrame Cope.experimental.data.catagorical (   df,
  time = False 
)

◆ clean()

pd.DataFrame Cope.experimental.data.clean ( pd.DataFrame  df,
Dict[str, Dict[str, Any]]  config,
bool  verbose = False,
str  split = None 
)
 Returns a cleaned copy of the DataFrame passed to it
    NOTE: The order of the entries in the config dict determine the order they are performed

    Arguments:
        config is a dict of this signature:
        NOTE: This is the suggested order
            {
                # Do these to all the columns, or a specified column
                'column/all': {
                    # Drop duplicate samples
                    ## Only applies to all
                    'drop_duplicates': bool,
                    # Removes samples which have a Z-score magnitude of greater than this value
                    'handle_outliers': Union[bool, Tuple[float, Union['remove', 'constrain']]],
                    # Maps feature values to a dictionary
                    'replace': Union[bool, Dict],
                    # Applies a function to the column
                    'apply': Union[bool, Callable],
                    # A list of (query, replacements).
                    ## If a Series is given, it will replace those values with the values at its corresponding index
                    ## 'random' replaces values with either a random catagory, or a random number between min and max
                    ## 'balanced_random' replaces values with either a randomly sampled catagory (sampled from the column
                    ## itself, so it's properly biased), or a normally distributed sample
                    'queries': Union[bool, List[Tuple[str, Union[Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any]]]],
                    # A ndarray of shape (1, n) of values to create a new column with the given name
                    ## Calling from a specific column has no effect, behaves the same under all
                    'add_column': Union[Tuple[str, np.ndarray], List[Tuple[str, np.ndarray]]],
                    # Specifies a value that is equivalent to the feature being missing
                    'missing_value': Any,
                    # Specifies a method by which to transform samples with missing features. Acts just like queries, but with missing values specifically
                    'handle_missing': Union[bool, Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any],
                    # Removes all samples with the given value
                    'remove': Union[bool, Any],
                    # Specifies a method by which to bin the quantative value, or specify custom ranges
                    'bin': Union[bool, Tuple['frequency', int], Tuple['width', int], Iterable],
                    # Specifies a method by which to normalize the quantative values
                    'normalize': Union[bool, 'min-max', 'range'],
                    # Specifies a method by which to convert a catagorical feature to a quantative one
                    'convert_numeric': Union[bool, 'assign', 'one_hot_encode'],
                    # Drop the column
                    'drop': bool,
                },
            }

◆ column_entropy()

def Cope.experimental.data.column_entropy ( pd.Series  column,
  base = e 
)
 This works, but it's slow for some reason? 

◆ convert_numeric()

def Cope.experimental.data.convert_numeric (   df,
str  col = None,
Union['assign', 'one_hot_encode']  method = 'one_hot_encode',
  returnAssignments = False,
  skip = [],
  verbose = False 
)

◆ convert_time()

def Cope.experimental.data.convert_time (   df_or_col,
str  col = None,
Union['timestamp']  method = 'timestamp',
  verbose = False 
)

◆ ensureIterable()

def Cope.experimental.data.ensureIterable (   obj,
  useList = False 
)

◆ ensureNotIterable()

def Cope.experimental.data.ensureNotIterable (   obj,
  emptyBecomes = None 
)

◆ evaluate()

def Cope.experimental.data.evaluate (   catagorical,
  test,
  testPredictions,
  train = None,
  trainPredictions = None,
  accuracy = 3,
  curve = False,
  confusion = False,
  explain = False,
  compact = False,
  line = False 
)
 Evaluate your predictions of an ML model.
    NOTE: compact overrides explain.

◆ evaluateCatagorical()

def Cope.experimental.data.evaluateCatagorical (   test,
  testPredictions,
  train = None,
  trainPredictions = None,
  accuracy = 3,
  curve = False,
  confusion = False,
  explain = False,
  compact = False 
)
 Evaluate your predictions of an ML model.
    NOTE: compact overrides explain.

◆ evaluateQuantitative()

def Cope.experimental.data.evaluateQuantitative (   test,
  testPredictions,
  train = None,
  trainPredictions = None,
  accuracy = 3,
  explain = False,
  compact = False,
  line = False,
  log = ... 
)
 Evaluate your predictions of an ML model.
    NOTE: compact overrides explain.

◆ explore()

def Cope.experimental.data.explore (   data,
  target = None,
  stats = None,
  additionalStats = [],
  missing = True,
  corr = .55,
  entropy = None,
  start = 'Description',
  startFeature = None,
  startx = None,
  starty = None,
  startHue = None,
  alpha = None 
)

◆ getNiceTypesTable()

def Cope.experimental.data.getNiceTypesTable (   df,
  types = None 
)

◆ getOutliers()

def Cope.experimental.data.getOutliers (   data,
  zscore = None 
)

◆ handle_missing()

def Cope.experimental.data.handle_missing (   col,
Union[pd.Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any]  method,
  missing_value = np.nan,
  log = ... 
)

◆ handle_outliers()

def Cope.experimental.data.handle_outliers (   col,
Union['remove', 'constrain']  method = 'remove',
  zscore = 3,
  log = ... 
)

◆ importances()

def Cope.experimental.data.importances (   tree,
  names = None,
  rtn = False,
  graph = True,
  best = .01 
)

◆ insertSample()

def Cope.experimental.data.insertSample (   df,
  sample,
  index = -1 
)
 Because there's not a function for this? 

◆ installLibs()

def Cope.experimental.data.installLibs (   libs = ['pandas', 'numpy', 'imblearn', 'ipywidgets', 'seaborn', 'scipy', 'matplotlib'])

◆ interactWithOutliers()

def Cope.experimental.data.interactWithOutliers (   df,
  feature = None,
  step = .2 
)

◆ isCatagorical()

def Cope.experimental.data.isCatagorical ( pd.Series  s,
  time = False 
)

◆ isiterable()

def Cope.experimental.data.isiterable (   obj,
  includeStr = False 
)

◆ isQuantatative()

def Cope.experimental.data.isQuantatative ( pd.Series  s,
  time = True 
)

◆ isTimeFeature()

def Cope.experimental.data.isTimeFeature ( pd.Series  s)

◆ meanConfInterval()

def Cope.experimental.data.meanConfInterval (   data,
  confidence = 0.95,
  mean = False 
)

◆ missingSummary()

def Cope.experimental.data.missingSummary (   df,
  thresh = .6 
)

◆ normalizePercentage()

def Cope.experimental.data.normalizePercentage (   p,
  error = 'Percentage is of the wrong type (int or float expected)' 
)

◆ percentCountPlot()

def Cope.experimental.data.percentCountPlot (   data,
  feature,
  target = None,
  ax = None,
  title = 'Percentage of values used in {}' 
)

◆ plot_history()

def Cope.experimental.data.plot_history (   history)

◆ pretty_2_column_array()

def Cope.experimental.data.pretty_2_column_array (   a,
  limit = 30,
  paren = None 
)

◆ pretty_counts()

def Cope.experimental.data.pretty_counts ( pd.Series  s,
  paren = False 
)

◆ quantitative()

pd.DataFrame Cope.experimental.data.quantitative (   df,
  time = True 
)

◆ query()

def Cope.experimental.data.query ( pd.DataFrame  df,
str  column,
str  query,
Union[pd.Series, 'remove', 'new', 'mean', 'median', 'mode', 'random', 'balanced_random', Any]  method,
  true = 1,
  false = 0,
  verbose = False 
)

◆ remove()

def Cope.experimental.data.remove (   col,
  val,
  log = ... 
)

◆ resample()

def Cope.experimental.data.resample (   X,
  y,
Union['oversample', 'undersample', 'mixed']  method = 'oversample',
  seed = None 
)

◆ rescale()

def Cope.experimental.data.rescale (   df,
  return_scaler = False,
  log = ... 
)

◆ saveStats()

def Cope.experimental.data.saveStats (   file,
  name,
  model,
  testY,
  predY,
  trainY = None,
  trainPredY = None,
  notes = '',
  new = False,
  show = True,
  save = True 
)

◆ showOutliers()

def Cope.experimental.data.showOutliers (   data,
  column,
  zscore,
**  snsArgs 
)

◆ significantCorrelations()

def Cope.experimental.data.significantCorrelations (   df,
  thresh = .5 
)

◆ sort_dict_by_value_length()

def Cope.experimental.data.sort_dict_by_value_length (   d)

◆ split()

def Cope.experimental.data.split ( data,
  amt = .2,
Union['random', 'chunk', 'head', 'tail']  method = 'random',
  target = [],
  splitTargets = False,
  seed = 42 
)
 Splits the given data, both into train/test sets, and by taking out targets at the same time
    `target` can be a string or an iterable
    If `splitTargets` is set to False, the targets will always return DataFrames, even if
        they only have 1 column
    If you pass in multiple items for data, AND specify a target feature[s], then all the items
        must have the target columns
    The order goes:
        train_X, test_X, train_X1, test_X1, ..., train_y, test_y, train_y1, test_y1
        where it continues adding data and target splits in the order they are given.
        Simply put, it outputs in the same order you input the parameters as much as possible.
        Don't give multiple data AND split targets at the same time. While it can do it,
            it's simply too confusing to think through the order of the returned parameters.
    Setting the `method` to 'chunk' is the same as setting it to 'tail'.

◆ suggestedCleaning()

def Cope.experimental.data.suggestedCleaning (   df,
  target 
)

◆ timeFeatures()

pd.DataFrame Cope.experimental.data.timeFeatures (   df)

Variable Documentation

◆ _catagoricalTypes

list Cope.experimental.data._catagoricalTypes = ['bool', 'bool_', 'object', 'object_', 'Interval', 'bool8', 'category']
protected

◆ _quantitativeTypes

list Cope.experimental.data._quantitativeTypes = ['number']
protected

◆ _timeTypes

list Cope.experimental.data._timeTypes = ['datetimetz', 'timedelta', 'datetime']
protected

◆ ALERT_MISSING

float Cope.experimental.data.ALERT_MISSING = .55

◆ CONTINUOUS_UPDATE_SLIDER

bool Cope.experimental.data.CONTINUOUS_UPDATE_SLIDER = False

◆ else

Cope.experimental.data.else :

◆ evaluateC

def Cope.experimental.data.evaluateC = evaluateCatagorical

◆ evaluateQ

def Cope.experimental.data.evaluateQ = evaluateQuantitative

◆ except

pass Cope.experimental.data.except

◆ HIGH_CARDINALITY

int Cope.experimental.data.HIGH_CARDINALITY = 50

◆ MODE_SELECTION

str Cope.experimental.data.MODE_SELECTION = 'random'

◆ OUTLIER_THRESHOLD

float Cope.experimental.data.OUTLIER_THRESHOLD = .5

◆ quickSummary

def Cope.experimental.data.quickSummary = explore

◆ SMALL_DATASET

int Cope.experimental.data.SMALL_DATASET = 1000

◆ todo

Cope.experimental.data.todo = lambda *a: print('TODO: ', *a)

◆ transform

Cope.experimental.data.transform

◆ try

Cope.experimental.data.try :