Cope 2.5.0
My personal "standard library" of all the generally useful code I've written for various projects over the years
Functions
    def _cast2dataframe(func)
    def installLibs(libs=['pandas', 'numpy', 'imblearn', 'ipywidgets', 'seaborn', 'scipy', 'matplotlib'])
    def addVerbose(func)
    def _cleaning_func(**decorator_kwargs)
    def insertSample(df, sample, index=-1)
    def ensureIterable(obj, useList=False)
    def ensureNotIterable(obj, emptyBecomes=None)
    def getOutliers(data, zscore=None)
    def normalizePercentage(p, error='Percentage is of the wrong type (int or float expected)')
    def isiterable(obj, includeStr=False)
    def sort_dict_by_value_length(d)
    pd.DataFrame timeFeatures(df)
    pd.DataFrame catagorical(df, time=False)
    pd.DataFrame quantitative(df, time=True)
    def isTimeFeature(pd.Series s)
    def isCatagorical(pd.Series s, time=False)
    def isQuantatative(pd.Series s, time=True)
    def missingSummary(df, thresh=.6)
    def significantCorrelations(df, thresh=.5)
    def getNiceTypesTable(df, types=None)
    def percentCountPlot(data, feature, target=None, ax=None, title='Percentage of values used in {}')
    def column_entropy(pd.Series column, base=e)
    def pretty_2_column_array(a, limit=30, paren=None)
    def pretty_counts(pd.Series s, paren=False)
    def meanConfInterval(data, confidence=0.95, mean=False)
    def showOutliers(data, column, zscore, **snsArgs)
    def interactWithOutliers(df, feature=None, step=.2)
    def handle_outliers(col, Union['remove', 'constrain'] method='remove', zscore=3, log=...)
    def handle_missing(col, Union[pd.Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any] method, missing_value=np.nan, log=...)
    def query(pd.DataFrame df, str column, str query, Union[pd.Series, 'remove', 'new', 'mean', 'median', 'mode', 'random', 'balanced_random', Any] method, true=1, false=0, verbose=False)
    def remove(col, val, log=...)
    def bin(col, Union['frequency', 'width', Tuple, List] method, amt=5, log=...)
    def rescale(df, return_scaler=False, log=...)
    def convert_time(df_or_col, str col=None, Union['timestamp'] method='timestamp', verbose=False)
    def convert_numeric(df, str col=None, Union['assign', 'one_hot_encode'] method='one_hot_encode', returnAssignments=False, skip=[], verbose=False)
    def split(*data, amt=.2, Union['random', 'chunk', 'head', 'tail'] method='random', target=[], splitTargets=False, seed=42)
    def explore(data, target=None, stats=None, additionalStats=[], missing=True, corr=.55, entropy=None, start='Description', startFeature=None, startx=None, starty=None, startHue=None, alpha=None)
    def suggestedCleaning(df, target)
    def _cleanColumn(df, args, column, verbose, ignoreWarnings=False)
    pd.DataFrame clean(pd.DataFrame df, Dict[str, Dict[str, Any]] config, bool verbose=False, str split=None)
    def resample(X, y, Union['oversample', 'undersample', 'mixed'] method='oversample', seed=None)
    def evaluateQuantitative(test, testPredictions, train=None, trainPredictions=None, accuracy=3, explain=False, compact=False, line=False, log=...)
    def evaluateCatagorical(test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False)
    def evaluate(catagorical, test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False, line=False)
    def importances(tree, names=None, rtn=False, graph=True, best=.01)
    def saveStats(file, name, model, testY, predY, trainY=None, trainPredY=None, notes='', new=False, show=True, save=True)
    def plot_history(history)
Variables
    str MODE_SELECTION = 'random'
    int SMALL_DATASET = 1000
    int HIGH_CARDINALITY = 50
    float ALERT_MISSING = .55
    float OUTLIER_THRESHOLD = .5
    bool CONTINUOUS_UPDATE_SLIDER = False
    list _catagoricalTypes = ['bool', 'bool_', 'object', 'object_', 'Interval', 'bool8', 'category']
    list _quantitativeTypes = ['number']
    list _timeTypes = ['datetimetz', 'timedelta', 'datetime']
    todo = lambda *a: print('TODO: ', *a)
    transform
    def quickSummary = explore
    def evaluateQ = evaluateQuantitative
    def evaluateC = evaluateCatagorical
def Cope.experimental.data._cast2dataframe(func)
Auto-converts the given named parameter to the given type.
Supports inputs of pd.DataFrame, pd.Series, np.ndarray, and tuple/list of pd.Series or np.ndarray.
Supports outputs of pd.DataFrame, pd.Series, and a tuple of pd.Series.
Does NOT support input types of tuple/list of pd.DataFrames.
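As a rough illustration of the input-coercion half of this behavior, here is a hedged sketch (not the library's actual implementation; the name _cast2dataframe_sketch and the exact coercion rules are assumptions based on the docstring above, and the output conversion is omitted):

    import functools
    import numpy as np
    import pandas as pd

    def _cast2dataframe_sketch(func):
        # Sketch: coerce the decorated function's first argument to a pd.DataFrame.
        @functools.wraps(func)
        def wrapper(data, *args, **kwargs):
            if isinstance(data, pd.DataFrame):
                df = data
            elif isinstance(data, pd.Series):
                df = data.to_frame()
            elif isinstance(data, np.ndarray):
                df = pd.DataFrame(data)
            elif isinstance(data, (tuple, list)):
                # A tuple/list of pd.Series or np.ndarray becomes the columns of
                # one DataFrame; tuple/list of pd.DataFrames is unsupported.
                df = pd.concat([pd.Series(i) for i in data], axis=1)
            else:
                raise TypeError(f'Unsupported input type: {type(data)}')
            return func(df, *args, **kwargs)
        return wrapper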
def Cope.experimental.data.addVerbose(func)
def Cope.experimental.data.bin(col, Union['frequency', 'width', Tuple, List] method, amt=5, log=...)
pd.DataFrame Cope.experimental.data.catagorical(df, time=False)
pd.DataFrame Cope.experimental.data.clean(pd.DataFrame df, Dict[str, Dict[str, Any]] config, bool verbose=False, str split=None)
Returns a cleaned copy of the DataFrame passed to it.
NOTE: The order of the entries in the config dict determines the order they are performed in.
Arguments: config is a dict of this signature (NOTE: this is the suggested order):

    {
        # Do these to all the columns, or a specified column
        'column/all': {
            # Drop duplicate samples ## Only applies to all
            'drop_duplicates': bool,
            # Removes samples which have a Z-score magnitude of greater than this value
            'handle_outliers': Union[bool, Tuple[float, Union['remove', 'constrain']]],
            # Maps feature values to a dictionary
            'replace': Union[bool, Dict],
            # Applies a function to the column
            'apply': Union[bool, Callable],
            # A list of (query, replacement) pairs.
            ## If a Series is given, it will replace those values with the values at its corresponding index
            ## 'random' replaces values with either a random category, or a random number between min and max
            ## 'balanced_random' replaces values with either a randomly sampled category (sampled from the
            ## column itself, so it's properly biased), or a normally distributed sample
            'queries': Union[bool, List[Tuple[str, Union[Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any]]]],
            # An ndarray of shape (1, n) of values to create a new column with the given name
            ## Calling from a specific column has no effect; behaves the same under 'all'
            'add_column': Union[Tuple[str, np.ndarray], List[Tuple[str, np.ndarray]]],
            # Specifies a value that is equivalent to the feature being missing
            'missing_value': Any,
            # Specifies a method by which to transform samples with missing features.
            # Acts just like queries, but with missing values specifically
            'handle_missing': Union[bool, Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any],
            # Removes all samples with the given value
            'remove': Union[bool, Any],
            # Specifies a method by which to bin the quantitative value, or specify custom ranges
            'bin': Union[bool, Tuple['frequency', int], Tuple['width', int], Iterable],
            # Specifies a method by which to normalize the quantitative values
            'normalize': Union[bool, 'min-max', 'range'],
            # Specifies a method by which to convert a categorical feature to a quantitative one
            'convert_numeric': Union[bool, 'assign', 'one_hot_encode'],
            # Drop the column
            'drop': bool,
        },
    }
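For example, a config following the signature above might look like the following sketch (the DataFrame and the 'age'/'city' columns are purely illustrative, and the literal 'all' key is an assumption based on the 'column/all' notation above):

    import numpy as np
    import pandas as pd
    from Cope.experimental.data import clean  # assumes the package is installed

    df = pd.DataFrame({
        'age':  [25, 32, np.nan, 200, 41],
        'city': ['York', 'Leeds', 'York', None, 'Hull'],
    })

    cleaned = clean(df, {
        'all': {'drop_duplicates': True},           # applies to every column
        'age': {
            'handle_outliers': (3.0, 'constrain'),  # constrain samples with |z-score| > 3
            'handle_missing': 'median',             # fill NaN with the median
            'normalize': 'min-max',                 # rescale to [0, 1]
        },
        'city': {
            'handle_missing': 'mode',               # fill None with the most common value
            'convert_numeric': 'one_hot_encode',    # categorical -> indicator columns
        },
    }, verbose=True)

Per the note above, the operations run in the order the config entries are written.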
def Cope.experimental.data.column_entropy(pd.Series column, base=e)
This works, but it's slow for some reason?
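For reference, a vectorized sketch of the same computation (an independent re-implementation, not the library's code):

    import numpy as np
    import pandas as pd

    def column_entropy_sketch(column: pd.Series, base=np.e) -> float:
        # Shannon entropy of the column's empirical value distribution
        p = column.value_counts(normalize=True)
        return float(-(p * np.log(p) / np.log(base)).sum())

    print(column_entropy_sketch(pd.Series(['a', 'a', 'b', 'c']), base=2))  # 1.5 bits

value_counts aggregates in vectorized code, so if the library's version is slow, a per-element Python loop is one plausible culprit.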
def Cope.experimental.data.convert_numeric(df, str col=None, Union['assign', 'one_hot_encode'] method='one_hot_encode', returnAssignments=False, skip=[], verbose=False)
def Cope.experimental.data.convert_time(df_or_col, str col=None, Union['timestamp'] method='timestamp', verbose=False)
def Cope.experimental.data.ensureIterable(obj, useList=False)
def Cope.experimental.data.ensureNotIterable(obj, emptyBecomes=None)
def Cope.experimental.data.evaluate(catagorical, test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False, line=False)
Evaluate your predictions of an ML model. NOTE: compact overrides explain.
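A minimal usage sketch for a classification model (assumes the Cope package and scikit-learn are installed; the dataset and model are illustrative):

    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from Cope.experimental.data import evaluate

    X, y = load_iris(return_X_y=True)
    train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)
    model = DecisionTreeClassifier().fit(train_X, train_y)

    # catagorical=True selects the classification metrics; passing the train
    # split as well helps surface over/underfitting.
    evaluate(True, test_y, model.predict(test_X),
             train=train_y, trainPredictions=model.predict(train_X),
             confusion=True)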
def Cope.experimental.data.evaluateCatagorical(test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False)
Evaluate your predictions of an ML model. NOTE: compact overrides explain.
def Cope.experimental.data.evaluateQuantitative(test, testPredictions, train=None, trainPredictions=None, accuracy=3, explain=False, compact=False, line=False, log=...)
Evaluate your predictions of an ML model. NOTE: compact overrides explain.
def Cope.experimental.data.explore(data, target=None, stats=None, additionalStats=[], missing=True, corr=.55, entropy=None, start='Description', startFeature=None, startx=None, starty=None, startHue=None, alpha=None)
def Cope.experimental.data.getNiceTypesTable(df, types=None)
def Cope.experimental.data.getOutliers(data, zscore=None)
def Cope.experimental.data.handle_missing(col, Union[pd.Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any] method, missing_value=np.nan, log=...)
def Cope.experimental.data.handle_outliers(col, Union['remove', 'constrain'] method='remove', zscore=3, log=...)
def Cope.experimental.data.importances(tree, names=None, rtn=False, graph=True, best=.01)
def Cope.experimental.data.insertSample(df, sample, index=-1)
Because there's not a function for this?
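A plain-pandas sketch of the same idea (assumptions: sample is row-like and index=-1 means append; the real function may behave differently):

    import pandas as pd

    def insert_sample_sketch(df: pd.DataFrame, sample, index=-1) -> pd.DataFrame:
        # Insert one row at a positional index by slicing and re-concatenating.
        if index < 0:
            index = len(df) + index + 1   # -1 appends to the end
        row = pd.DataFrame([sample], columns=df.columns)
        return pd.concat([df.iloc[:index], row, df.iloc[index:]], ignore_index=True)

    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    print(insert_sample_sketch(df, [9, 'q'], index=1))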
def Cope.experimental.data.installLibs(libs=['pandas', 'numpy', 'imblearn', 'ipywidgets', 'seaborn', 'scipy', 'matplotlib'])
def Cope.experimental.data.interactWithOutliers(df, feature=None, step=.2)
def Cope.experimental.data.isCatagorical(pd.Series s, time=False)
def Cope.experimental.data.isiterable(obj, includeStr=False)
def Cope.experimental.data.isQuantatative(pd.Series s, time=True)
def Cope.experimental.data.isTimeFeature(pd.Series s)
def Cope.experimental.data.meanConfInterval(data, confidence=0.95, mean=False)
def Cope.experimental.data.missingSummary(df, thresh=.6)
def Cope.experimental.data.normalizePercentage(p, error='Percentage is of the wrong type (int or float expected)')
def Cope.experimental.data.percentCountPlot(data, feature, target=None, ax=None, title='Percentage of values used in {}')
def Cope.experimental.data.plot_history(history)
def Cope.experimental.data.pretty_2_column_array(a, limit=30, paren=None)
def Cope.experimental.data.pretty_counts(pd.Series s, paren=False)
pd.DataFrame Cope.experimental.data.quantitative(df, time=True)
def Cope.experimental.data.query(pd.DataFrame df, str column, str query, Union[pd.Series, 'remove', 'new', 'mean', 'median', 'mode', 'random', 'balanced_random', Any] method, true=1, false=0, verbose=False)
def Cope.experimental.data.remove(col, val, log=...)
def Cope.experimental.data.resample(X, y, Union['oversample', 'undersample', 'mixed'] method='oversample', seed=None)
def Cope.experimental.data.rescale(df, return_scaler=False, log=...)
def Cope.experimental.data.saveStats(file, name, model, testY, predY, trainY=None, trainPredY=None, notes='', new=False, show=True, save=True)
def Cope.experimental.data.showOutliers(data, column, zscore, **snsArgs)
def Cope.experimental.data.significantCorrelations(df, thresh=.5)
def Cope.experimental.data.sort_dict_by_value_length(d)
def Cope.experimental.data.split(*data, amt=.2, Union['random', 'chunk', 'head', 'tail'] method='random', target=[], splitTargets=False, seed=42)
Splits the given data, both into train/test sets and by taking out targets, at the same time.
`target` can be a string or an iterable.
If `splitTargets` is set to False, the targets will always be returned as DataFrames, even if they only have 1 column.
If you pass in multiple items for data AND specify target feature[s], then all the items must have the target columns.
The order goes: train_X, test_X, train_X1, test_X1, ..., train_y, test_y, train_y1, test_y1, where it continues adding data and target splits in the order they are given. Simply put, it outputs in the same order you input the parameters, as much as possible.
Don't give multiple data AND split targets at the same time. While it can do it, it's simply too confusing to think through the order of the returned parameters.
Setting the `method` to 'chunk' is the same as setting it to 'tail'.
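A usage sketch for the common single-DataFrame, single-target case (the column names are illustrative; assumes the Cope package is importable):

    import pandas as pd
    from Cope.experimental.data import split

    df = pd.DataFrame({'a': range(10), 'b': range(10, 20), 'label': [0, 1] * 5})

    # One data item + one target -> train_X, test_X, train_y, test_y, in that order
    train_X, test_X, train_y, test_y = split(df, target='label', amt=.2, method='random')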
def Cope.experimental.data.suggestedCleaning(df, target)
pd.DataFrame Cope.experimental.data.timeFeatures(df)
float Cope.experimental.data.ALERT_MISSING = .55
bool Cope.experimental.data.CONTINUOUS_UPDATE_SLIDER = False
def Cope.experimental.data.evaluateC = evaluateCatagorical
def Cope.experimental.data.evaluateQ = evaluateQuantitative
int Cope.experimental.data.HIGH_CARDINALITY = 50
str Cope.experimental.data.MODE_SELECTION = 'random'
float Cope.experimental.data.OUTLIER_THRESHOLD = .5
def Cope.experimental.data.quickSummary = explore
int Cope.experimental.data.SMALL_DATASET = 1000
Cope.experimental.data.todo = lambda *a: print('TODO: ', *a)
Cope.experimental.data.transform