from imblearn.over_sampling import RandomOverSampler
from functools import wraps
from contextlib import redirect_stdout
from imblearn.under_sampling import RandomUnderSampler
from warnings import warn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from typing import Optional, Any, Tuple, List, Iterable, Dict, Union, Callable, Literal
from sklearn.metrics import PrecisionRecallDisplay, ConfusionMatrixDisplay
import ipywidgets as widgets
from collections import OrderedDict
from IPython.display import clear_output, display
from math import log, e
import sklearn.model_selection as skms

# These imports sit on lines elided from this fragment, but the code below
# depends on them:
import random
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics

MODE_SELECTION = 'random'
OUTLIER_THRESHOLD = .5
CONTINUOUS_UPDATE_SLIDER = False

_catagoricalTypes = ['bool', 'bool_', 'object', 'object_', 'Interval', 'bool8', 'category']
_quantitativeTypes = ['number']
_timeTypes = ['datetimetz', 'timedelta', 'datetime']

todo = lambda *a: print('TODO: ', *a)

def log(s, verbose=True):
    # Reconstructed helper (its original definition is elided): several
    # functions below call log(msg, verbose). NOTE: this intentionally shadows
    # math.log imported above.
    if verbose:
        print(s)

def _cast2dataframe(func):
    def wrapper(self, *args, **kwargs):
        return pd.DataFrame(func(self, *args, **kwargs), columns=self.feature_names_in_)
    return wrapper

# Make MinMaxScaler.transform hand back DataFrames instead of bare ndarrays
MinMaxScaler.transform = _cast2dataframe(MinMaxScaler.transform)

def installLibs(libs=['pandas', 'numpy', 'imblearn', 'ipywidgets', 'seaborn', 'scipy', 'matplotlib']):
    # (the try/except around the IPython import is reconstructed from the
    #  surviving error message)
    try:
        import IPython
    except ImportError:
        print(f"IPython doesn't seem to be installed. Simply run `pip install {' '.join(libs)}` in a terminal")
        return
    if (ipython := IPython.get_ipython()) is not None:
        ipython.run_line_magic("pip", f"install {' '.join(libs)}")
    else:
        print("You don't seem to be calling from IPython. Simply run `pip install pandas altair numpy imblearn ipywidgets seaborn scipy matplotlib IPython` in a terminal")

def _addVerbose(func):
    # (reconstructed: the decorator's def line is elided in the source, so the
    #  name `_addVerbose` is a guess; the surviving body absorbs a `verbose`
    #  keyword and hands the wrapped function a `log` callable instead)
    def inner(*args, verbose=False, **kwargs):
        return func(
            *args,
            log=lambda s: print(f'\t{s}') if verbose else None,
            **kwargs,
        )
    return inner

def _cleaning_func(**decorator_kwargs):
    """ Auto-converts the given named parameter to the given type.
        Supports inputs of pd.DataFrame, pd.Series, np.ndarray, and tuple/list of pd.Series or np.ndarray.
        Supports outputs of pd.DataFrame, pd.Series, and a tuple of pd.Series.
        Does NOT support input types of tuple/list of pd.DataFrames.
    """
    trivial = lambda x: x  # (reconstructed: defined on an elided line)

    def error(toType):
        def _raise(x):
            raise TypeError(f"Can't cast {toType} to {type(x)}")
        return _raise

    # How to convert a tuple/list input into each supported output type
    iterableInput = {
        pd.DataFrame: lambda t: pd.DataFrame(t).T,
        pd.Series: lambda t: (pd.Series(t[0]) if len(t) == 1 else error(pd.Series)(t)),
        tuple: lambda t: tuple([pd.Series(i) for i in t]),
    }
    # input type -> output type -> conversion
    input2output = {
        pd.DataFrame: {
            pd.DataFrame: trivial,
            # NOTE: the source returned the error closure here without calling it
            pd.Series: lambda d: pd.Series(d.iloc[:, 0]) if len(d.columns) == 1 else error(pd.Series)(d),
            tuple: lambda d: tuple([d[i] for i in d]),
        },
        pd.Series: {
            pd.DataFrame: lambda s: pd.DataFrame(s),
            pd.Series: trivial,  # (reconstructed: on an elided line)
            tuple: lambda s: (s,),
        },
        np.ndarray: {
            pd.DataFrame: lambda n: pd.DataFrame(n),
            pd.Series: lambda n: pd.Series(n),
            tuple: lambda s: (pd.Series(s),),
        },
        tuple: iterableInput,
    }

    def outer(decorator_func):
        @wraps(decorator_func)
        def inner(dat, *args, **kwargs):
            # (reconstructed plumbing: absorb `verbose` and inject `log`, the
            #  same way the logging helper fragment above does)
            verbose = kwargs.pop('verbose', False)
            kwargs.setdefault('log', lambda s: print(f'\t{s}') if verbose else None)
            if isinstance(dat, (list, tuple)):
                if not len(dat):
                    raise TypeError("Please don't pass in an empty list")
                elif isinstance(dat[0], pd.DataFrame):
                    # Call the wrapped function once per DataFrame given
                    rtn = []
                    for d in dat:
                        _kwargs = kwargs.copy()
                        for paramName, outputType in decorator_kwargs.items():
                            _kwargs[paramName] = input2output[pd.DataFrame][outputType](d)
                        # NOTE: the source appended with **kwargs here; **_kwargs
                        # is what the loop above actually builds
                        rtn.append(decorator_func(*args, **_kwargs))
                    return rtn
                dat = tuple(dat)
            for paramName, outputType in decorator_kwargs.items():
                kwargs[paramName] = input2output[type(dat)][outputType](dat)
            return decorator_func(*args, **kwargs)
        return inner
    return outer

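
# A minimal usage sketch of `_cleaning_func`, assuming the reconstruction above:
# `demo_upper` is a hypothetical function (not part of the library) whose `col`
# argument gets auto-converted to a Series, so plain ndarrays/lists work too.
if __name__ == '__main__':
    @_cleaning_func(col=pd.Series)
    def demo_upper(col, log=...):
        return col.astype(str).str.upper()

    print(demo_upper(np.array(['a', 'b'])))  # the ndarray arrives as a pd.Series
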
def insertSample(df, sample, index=-1):
    """ Because there's not a function for this? """
    df.loc[index - .5] = sample
    return df.sort_index().reset_index(drop=True)

def ensureIterable(obj, useList=False):
    if not isiterable(obj):
        return [obj, ] if useList else (obj, )
    return obj

_None = object()  # sentinel so None can be a legal `emptyBecomes` (reconstructed)

def ensureNotIterable(obj, emptyBecomes=_None):
    # The body is largely elided in the source; this reconstruction unwraps
    # single-element iterables and maps empty ones to `emptyBecomes`
    if isiterable(obj):
        obj = list(obj)
        if len(obj) == 1:
            return obj[0]
        if not len(obj):
            return obj if emptyBecomes is _None else emptyBecomes
    return obj

def getOutliers(data, zscore=None):
    if isinstance(data, pd.Series):
        if zscore is not None:
            return data[np.abs(scipy.stats.zscore(data)) > zscore]
        # (the zscore-less branch is elided in the source)
    elif isinstance(data, pd.DataFrame):
        if zscore is not None:
            rtn = {}
            for f in data.columns:
                rtn[f] = data[f][np.abs(scipy.stats.zscore(data[f])) > zscore]
            return pd.DataFrame(rtn)
        # (the zscore-less branch is elided in the source)
    else:
        raise TypeError(f"Invalid type {type(data)} given")

def normalizePercentage(p, error='Percentage is of the wrong type (int or float expected)'):
    # NOTE: the branch bodies are elided in the source; returning a 0-1 fraction
    # is the reading consistent with how callers format the result with `:.0%`.
    # Also note isinstance(True, int) is True, so the bool branch below is
    # unreachable as ordered.
    if isinstance(p, int):
        return p / 100
    elif isinstance(p, float):
        return p
    elif isinstance(p, bool):
        ...
    if error is not None:
        raise TypeError(error)

def isiterable(obj, includeStr=False):
    return isinstance(obj, Iterable) and (type(obj) is not str if not includeStr else True)

def sort_dict_by_value_length(d):
    return dict(sorted(d.items(), key=lambda item: len(item[1])))

def timeFeatures(df) -> pd.DataFrame:
    return df.select_dtypes(include=_timeTypes)

def catagorical(df, time=False) -> pd.DataFrame:
    return df.select_dtypes(include=_catagoricalTypes + (_timeTypes if time else []))

def quantitative(df, time=True) -> pd.DataFrame:
    return df.select_dtypes(include=_quantitativeTypes + (_timeTypes if time else []))

def isTimeFeature(s: pd.Series):
    s = pd.Series(s, name='__dummy')
    return s.name in timeFeatures(pd.DataFrame(s))

def isCatagorical(s: pd.Series, time=False):
    s = pd.Series(s, name='__dummy')
    return s.name in catagorical(pd.DataFrame(s), time)

def isQuantatative(s: pd.Series, time=True):
    s = pd.Series(s, name='__dummy')
    return s.name in quantitative(pd.DataFrame(s), time)

def missingSummary(df, thresh=.6):
    table = df.isnull().sum()/len(df)
    return table[table >= thresh]

def significantCorrelations(df, thresh=.5):
    cor = df.corr()
    pos = cor[cor >= thresh]
    neg = cor[cor <= -thresh]
    # (reconstructed: the merge of `pos` and `neg` is elided in the source;
    #  zero-filling and adding keeps exactly the significant entries)
    arr = pos.fillna(0) + neg.fillna(0)
    l, w = arr.shape
    assert l == w, "Somehow the correlation matrix isn't square?"
    # Ignore self-correlations along the diagonal
    arr[np.eye(w) == 1] = 0
    # Drop all-zero rows and columns
    arr = arr.loc[:, (arr != 0).any(axis=0)]
    arr = arr[(arr != 0).any(axis=1)]
    names = arr.columns  # (reconstructed: defined on an elided line)
    # Keep the upper triangle so each pair is reported once
    arr = np.triu(arr.to_numpy())
    nonzero_indices = list(zip(*np.where(arr != 0)))
    rtn = []
    for r, c in nonzero_indices:
        rtn.append((names[r], names[c], arr[r, c]))
    return rtn

def getNiceTypesTable(df, types=None):
    def _getLabels(col):
        if isCatagorical(col, time=False):
            return [col.dtype, 'C']
        if isQuantatative(col, time=False):
            return [col.dtype, 'Q']
        if isTimeFeature(col):
            return [col.dtype, 'T']
    return pd.DataFrame(dict(zip(
        df.columns,
        [_getLabels(df[f]) for f in df.columns]
    )))

def percentCountPlot(data, feature, target=None, ax=None, title='Percentage of values used in {}'):
    total = float(len(data))  # (the source reads len(Y); `data` is the only frame in scope)
    ax = sns.countplot(x=feature, data=data, hue=target, ax=ax)
    ax.set_title(title.format(feature))
    for p in ax.patches:
        ax.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x()+0.1, p.get_height()+5))
    ax.yaxis.set_ticks(np.linspace(0, total, 11))
    ax.set_yticklabels(map('{:.1f}%'.format, 100*ax.yaxis.get_majorticklocs()/total))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")

def column_entropy(column: pd.Series, base=e):
    """ This works, but it's slow for some reason? """
    vc = pd.Series(column).value_counts(normalize=True, sort=False)
    return -(vc * np.log(vc)/np.log(base)).sum()

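
# Sketch: a uniform two-category column has entropy log(2) ~ 0.693 nats, or
# exactly 1 bit with base=2.
if __name__ == '__main__':
    print(column_entropy(pd.Series(['a', 'b', 'a', 'b'])))          # ~0.693
    print(column_entropy(pd.Series(['a', 'b', 'a', 'b']), base=2))  # ~1.0
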
def pretty_2_column_array(a, limit=30, paren=None):
    # (reconstructed control flow; the `limit` handling is elided in the source)
    offset = max(list(a.index), key=len)
    rtn = ''
    for i in range(len(a)):
        if paren is None:
            rtn += f'\t{a.index[i]:>{len(offset)}}: {a[i]:.1%}\n'
        else:
            rtn += f'\t{a.index[i]:>{len(offset)}}: {a[i]:.1%} ({paren[i]})\n'
    return rtn

def pretty_counts(s: pd.Series, paren=False):
    if paren:
        rtn = pretty_2_column_array(s.value_counts(normalize=True, sort=True), paren=s.value_counts(sort=True))
    else:
        rtn = pretty_2_column_array(s.value_counts(normalize=True, sort=True))
    return rtn

def meanConfInterval(data, confidence=0.95, mean=False):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    # (reconstructed returns: include the mean itself when mean=True)
    if mean:
        return m, m - h, m + h
    return m - h, m + h

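
# Sketch: a 95% confidence interval for the mean of toy data (two-value return,
# per the reconstruction above; pass mean=True to also get the point estimate).
if __name__ == '__main__':
    low, high = meanConfInterval([1, 2, 3, 4, 5])
    print(f'95% CI: ({low:.2f}, {high:.2f})')
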
def showOutliers(data, column, zscore, **snsArgs):
    if isCatagorical(data[column]):
        raise TypeError('Outliers only apply to quantitative values')
    samples = getOutliers(data[column], zscore=zscore)
    print(len(samples), len(data[column]), sep='/')
    sns.scatterplot(data=data[column], **snsArgs)
    sns.scatterplot(data=samples, **snsArgs)

def interactWithOutliers(df, feature=None, step=.2):
    return widgets.interactive(showOutliers,
        data=widgets.fixed(df),
        column=list(df.columns) if feature is None else widgets.fixed(feature),
        zscore=(0., df[feature].max() / df[feature].std(), step) if feature is not None else (0., 10., step),
    )

@_cleaning_func(col=pd.Series)
def handle_outliers(col, method: Literal['remove', 'constrain']='remove', zscore=3, log=...):
    samples = getOutliers(col, zscore=zscore)
    if method == 'remove':
        log(f'Removing outliers with zscore magnitudes >{zscore} from {col.name}')
        return col.drop(samples.index)
    elif method == 'constrain':
        todo('This breaks on negative values')
        max = col.std() * zscore
        log(f'Constraining outliers with zscore magnitudes >{zscore} from {col.name}')
        return col.apply(lambda s: np.clip(s, -max, max))
    else:
        raise TypeError(f"Invalid method argument '{method}' given")

@_cleaning_func(col=pd.Series)
def handle_missing(col,
                   method: Union[pd.Series, Literal['remove', 'mean', 'median', 'mode', 'random', 'balanced_random'], Any],
                   missing_value=np.nan,
                   log=...):
    without = col.loc[col != missing_value]
    if isinstance(method, (pd.Series, np.ndarray)):
        assert len(method) == len(col), 'Both arrays are not of the same length'
        log(f'Replacing all samples with a "{col.name}" value of "{missing_value}" with their indexes in "{method.name}"')
        return pd.Series([(method[i] if col[i] == missing_value else col[i]) for i in range(len(col))])
    elif method == 'remove':
        log(f'Removing all samples with "{col.name}" values of "{missing_value}"')
        return without
    elif method == 'mean':
        if isCatagorical(col):
            raise TypeError("Cannot get mean of a catagorical feature")
        mean = without.mean()
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to the mean ({mean:.2f})')
        return col.copy().mask(col == missing_value, mean)
    elif method == 'median':
        if isCatagorical(col):
            raise TypeError("Cannot get median of a catagorical feature")
        median = without.median()
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to the median ({median})')
        return col.copy().mask(col == missing_value, median)
    elif method == 'mode':
        if MODE_SELECTION == 'random':
            mode = random.choice(without.mode())
        elif MODE_SELECTION == 'first':
            mode = without.mode()[0]
        elif MODE_SELECTION == 'last':
            mode = without.mode()[-1]
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to a mode ({mode})')
        return col.copy().mask(col == missing_value, mode)
    elif method == 'random':
        if isCatagorical(col):
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to random catagories')
            fill = lambda sample: random.choice(without.unique()) if sample == missing_value else sample
        else:
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to random values along a uniform distribution')
            fill = lambda sample: type(sample)(random.uniform(without.min(), without.max())) if sample == missing_value else sample
        return col.apply(fill)
    elif method == 'balanced_random':
        if isCatagorical(col):
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to evenly distributed random catagories')
            fill = lambda sample: random.choice(without) if sample == missing_value else sample
        else:
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to random values along a normal distribution')
            fill = lambda sample: type(sample)(random.gauss(without.mean(), without.std())) if sample == missing_value else sample
        return col.apply(fill)
    else:
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to {method}')
        return col.copy().mask(col == missing_value, method)

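
# Usage sketch with a concrete sentinel: note that the default
# missing_value=np.nan is awkward here, since `col != np.nan` is always True.
if __name__ == '__main__':
    filled = handle_missing(pd.Series([1, -1, 3, -1, 5], name='demo'),
                            method='mean', missing_value=-1)
    print(filled)  # the -1 entries become 3.0, the mean of the remaining values
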
def query(df: pd.DataFrame, column: str, query: str,
          method: Union[pd.Series, Literal['remove', 'new', 'mean', 'median', 'mode', 'random', 'balanced_random'], Any],
          true=1, false=0, verbose=False):
    log = lambda s: print(s) if verbose else None  # (reconstructed, matching clean())
    if isinstance(method, pd.Series):
        log(f'Changing all samples where "{query}" is true to have the {column} values of their indices in "{method.name}"')
        q = df.query(query)
        df.loc[q.index, column] = q.apply(lambda s: method[s.name], axis=1)
    elif method == 'remove':
        log(f'Removing all samples where "{query}" is true')
        df = df.drop(df.query(query).index)
    elif method == 'mean':
        if isCatagorical(df[column]):
            raise TypeError("Cannot get mean of a catagorical feature")
        mean = df[column].mean()
        log(f'Setting all samples where {query} is true to the mean of "{column}" ({mean:.2})')
        df.loc[df.query(query).index, column] = mean
    elif method == 'median':
        if isCatagorical(df[column]):
            raise TypeError("Cannot get median of a catagorical feature")
        median = df[column].median()
        log(f'Setting all samples where "{query}" is true to the median of "{column}" ({median})')
        df.loc[df.query(query).index, column] = median
    elif method == 'mode':
        if MODE_SELECTION == 'random':
            mode = random.choice(df[column].mode())
        elif MODE_SELECTION == 'first':
            mode = df[column].mode()[0]
        elif MODE_SELECTION == 'last':
            mode = df[column].mode()[-1]
        log(f'Setting all samples where "{query}" is true to a mode of "{column}" ({mode})')
        df.loc[df.query(query).index, column] = mode
    elif method == 'random':
        if isCatagorical(df[column]):
            log(f'Setting all samples where "{query}" is true to have random catagories')
            fill = lambda s: random.choice(df[column].unique())
        else:
            log(f'Setting all samples where "{query}" is true to have random values along a uniform distribution')
            fill = lambda s: type(s)(random.uniform(df[column].min(), df[column].max()))
        q = df.query(query)
        df.loc[q.index, column] = q[column].apply(fill)
    elif method == 'new':
        # (reconstructed: mark matches with `true` and everything else with `false`)
        q = df.query(query)
        df[column] = false
        df.loc[q.index, column] = true
    elif method == 'balanced_random':
        if isCatagorical(df[column]):
            log(f'Setting all samples where "{query}" is true to have evenly distributed random catagories')
            fill = lambda s: random.choice(df[column])
        else:
            log(f'Setting all samples where "{query}" is true to have random values along a normal distribution')
            fill = lambda s: type(s)(random.gauss(df[column].mean(), df[column].std()))
        q = df.query(query)
        df.loc[q.index, column] = q[column].apply(fill)
    else:
        log(f'Setting all samples where "{query}" is true to have a "{column}" value of {method}')
        df.loc[df.query(query).index, column] = method
    return df

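
# Usage sketch: clamp a hypothetical negative 'age' to 0 via a pandas query
# string (any non-keyword `method` value is written into the matching rows).
if __name__ == '__main__':
    frame = pd.DataFrame({'age': [25, -1, 30]})
    print(query(frame, 'age', 'age < 0', method=0))
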
@_cleaning_func(col=pd.Series)
def remove(col, val, log=...):
    log(f'Removing all samples with a "{col.name}" value of {val}')
    return col.drop(index=col[col == val].index)

@_cleaning_func(col=pd.Series)
def bin(col, method: Union[Literal['frequency', 'width'], Tuple, List], amt=5, log=...):
    if isCatagorical(col):
        raise TypeError(f"Can't bin catagorical feature '{col.name}'")
    if method == 'frequency':
        log(f'Binning "{col.name}" by frequency into {amt} bins')
        return pd.qcut(col, amt, duplicates='drop')
    elif method == 'width':
        log(f'Binning "{col.name}" by width into {amt} bins')
        raise NotImplementedError('Width binning')
    elif isinstance(method, (tuple, list)):
        log(f'Custom binning "{col.name}" into {len(method)} bins')
        return pd.cut(col, method)
    else:
        raise TypeError(f"Bin method parameter given invalid option {method}")

@_cleaning_func(df=pd.DataFrame)
def rescale(df, return_scaler=False, log=...):
    log('Rescaling the DataFrame with a min-max scaler')  # (reconstructed message)
    scaler = MinMaxScaler().fit(df)
    ans = pd.DataFrame(scaler.transform(df), columns=df.columns)
    return (ans, scaler) if return_scaler else ans

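
# Usage sketch: min-max scale a toy frame into [0, 1]; with return_scaler=True
# the fitted MinMaxScaler comes back too, so it can be reused on test data.
if __name__ == '__main__':
    scaled, scaler = rescale(pd.DataFrame({'x': [0, 5, 10]}), return_scaler=True)
    print(scaled)  # x becomes 0.0, 0.5, 1.0
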
def convert_time(df_or_col, col: str=None, method: Literal['timestamp']='timestamp', verbose=False):
    assert not (isinstance(df_or_col, pd.Series) and col is not None), \
        "Please don't provide a col parameter if passing a Series"
    if isinstance(df_or_col, pd.DataFrame) and col is None:
        df = df_or_col.copy()
        df[timeFeatures(df).columns] = timeFeatures(df).applymap(lambda date: date.timestamp())
        return df
    else:
        if isinstance(df_or_col, pd.DataFrame):
            df_or_col = df_or_col[col]
        return df_or_col.apply(lambda date: date.timestamp())

def convert_numeric(df, col: str=None,
                    method: Literal['assign', 'one_hot_encode']='one_hot_encode',
                    returnAssignments=False, skip=[], verbose=False):
    if (col is not None and isQuantatative(df[col])) or (col is None and isinstance(df, pd.Series) and isQuantatative(df)):
        raise TypeError("Series given is already quantitative")
    if method == 'assign':
        log(f'Converting "{col}" to quantatative by assigning to arbitrary values', verbose)
        if isinstance(df, pd.Series):
            column, assings = pd.factorize(df)
            return (column, assings) if returnAssignments else column
        else:
            assert col is not None, 'Please provide column to assign'
            column, assings = pd.factorize(df[col])
            df[col] = column  # (reconstructed from an elided line)
            return (df, assings) if returnAssignments else df
    elif method == 'one_hot_encode':
        skip = ensureIterable(skip)
        if col is not None:
            col = set(ensureIterable(col))
        elif isinstance(df, pd.DataFrame):
            col = set(catagorical(df).columns)
        else:
            return pd.get_dummies(df)
        col -= set(skip)  # (reconstructed: honor the `skip` parameter)
        if isinstance(col, pd.Series):
            log(f'Converting "{df.name}" to quantatative by one hot encoding', verbose)
        else:
            log('Converting DataFrame to quantatative by one hot encoding', verbose)
        return pd.get_dummies(df, columns=list(col))
    else:
        raise TypeError(f"Bad method argument '{method}' given to convert_numeric")

def split(*data, amt=.2, method: Literal['random', 'chunk', 'head', 'tail']='random',
          target=[], splitTargets=False, seed=42):
    """ Splits the given data, both into train/test sets, and by taking out targets at the same time.
        `target` can be a string or an iterable.
        If `splitTargets` is set to False, the targets will always return DataFrames,
        even if they only have 1 column.
        If you pass in multiple items for data, AND specify a target feature[s],
        then all the items must have the target columns.
        Returns:
            train_X, test_X, train_X1, test_X1, ..., train_y, test_y, train_y1, test_y1
        where it continues adding data and target splits in the order they are given.
        Simply put, it outputs in the same order you input the parameters, as much as possible.
        Don't give multiple data AND split targets at the same time. While it can do it,
        it's simply too confusing to think through the order of the returned parameters.
        Setting the `method` to 'chunk' is the same as setting it to 'tail'.
    """
    if len(ensureIterable(data)) > 1 and len(target):
        warn("Please don't give multiple data AND split targets at the same time. While it can do it, "
             "it's simply too confusing to think through the order of the returned parameters.")
    splitMe = []
    for d in ensureIterable(data):
        d = d.copy()  # (reconstructed: pop() below would otherwise mutate the caller's frame)
        targets = [d.pop(t) for t in ensureIterable(target)]
        if splitTargets:
            splitMe += targets  # (reconstructed: hand each target back separately)
        elif len(targets):
            splitMe.append(pd.DataFrame(dict(zip(ensureIterable(target), targets))))
        splitMe.insert(0 if len(targets) else len(splitMe), d)
    if method == 'random':
        return skms.train_test_split(*splitMe, test_size=amt, random_state=seed)
    elif method in ('head', 'tail', 'chunk'):
        rtn = []
        for d in splitMe:
            split = round(len(d) * (amt if method == 'head' else (1-amt)))
            rtn += [d.iloc[:split], d.iloc[split:]]
        return rtn
    else:
        raise TypeError("Invalid method parameter given")

# NOTE: the original `def` line and defaults are elided in the source; this
# signature is reconstructed from how the parameters are used below
def explore(data, target=None, startFeature=None, additionalStats=None, alpha=None,
            corr=.5, missing=True, entropy=2, startx=None, starty=None, startHue=None,
            start='Description'):
    assert not isinstance(target, (list, tuple)), 'There can only be 1 target feature'
    assert target is None or target in data.columns, f'Target {target} is not one of the features'
    assert startFeature is None or startFeature in data.columns, 'startFeature must be a valid column name'
    assert len(data), 'DataFrame cannot be empty'

    stats = ['mean', 'median', 'std', 'min', 'max']
    if additionalStats is not None:
        stats += ensureIterable(additionalStats, True)
    if startFeature is None:
        if target is not None:
            startFeature = target
        else:
            startFeature = data.columns[0]

    whatTheHeck = (corr, missing)
    max_name_len = len(max(data.columns, key=len))
    ALPHA = min(1, 1000/len(data)) if alpha is None else alpha

    combobox = widgets.Dropdown(
        # (the option list is elided in the source; these match the pages
        #  handled in output() below)
        options=['Description', 'Stats', 'Entropy', 'Duplicates', 'Counts',
                 'Correlations', 'Missing', 'Features', 'General Plots',
                 'Custom Plots', 'Matrix', 'Alerts'],
        value=start,
        description='Select Summary',
        style={'description_width': 'initial'},
    )
    featureBox = widgets.Dropdown(
        options=list(data.columns),
        description='Feature',
    )
    featureBox.layout.visibility = 'hidden'
    featureABox = widgets.Dropdown(
        options=list(data.columns),
        value=startx if startx is not None else startFeature,
    )
    featureABox.layout.visibility = 'hidden'
    featureBBox = widgets.Dropdown(
        options=list(data.columns),
        value=starty if starty is not None else startFeature,
    )
    featureBBox.layout.visibility = 'hidden'
    featureHueBox = widgets.Dropdown(
        options=list(data.columns) + ['None'],
        value=startHue if startHue is not None else 'None',
    )
    featureHueBox.layout.visibility = 'hidden'
    outlierSlider = widgets.FloatSlider(
        description='Z-Score:',
        continuous_update=CONTINUOUS_UPDATE_SLIDER,
    )
    outlierSlider.layout.visibility = 'hidden'

    def output(page, feature, a, b, hue, zscore):
        corr, missing = whatTheHeck
        featureBox.layout.visibility = 'hidden'
        featureABox.layout.visibility = 'hidden'
        featureBBox.layout.visibility = 'hidden'
        featureHueBox.layout.visibility = 'hidden'
        outlierSlider.layout.visibility = 'hidden'
        clear_output(wait=True)
        plt.xticks(rotation=45)

        if page == 'Description':
            print(f'There are {len(data):,} samples, with {len(data.columns)} columns:')
            print(', '.join(data.columns))
            print('which have types:')
            display(getNiceTypesTable(data))
            # NOTE: the source gates this on quantitative(); catagorical() looks intended
            if len(quantitative(data)):
                print('\nThe possible values for the Catagorical values:')
                for key, value in sort_dict_by_value_length(dict([(c, data[c].unique()) for c in catagorical(data).columns])).items():
                    print(f'{key}:')  # (reconstructed: surrounding lines are elided)
                    card = len(value)
                    joined_list = ", ".join(value)
                    if len(joined_list) <= 80:
                        print(' ' + joined_list)
                    else:
                        # (elided: long lists get truncated to 29 entries)
                        print(f'... ({card - 29} more catagories)')
        elif page == 'Stats':
            if len(quantitative(data)):
                display(data.agg(dict(zip(quantitative(data), [stats]*len(data.columns)))))
        elif page == 'Entropy':
            todo('Calculate entropy relative to the target feature')
            for c in data.columns:
                print(f'The entropy of {c:>{max_name_len}} is: {round(scipy.stats.entropy(data[c].value_counts(normalize=True), base=entropy), 3)}')
        elif page == 'Duplicates':
            with pd.option_context('display.max_columns', None):
                # (reconstructed: the display call itself is elided in the source)
                display(data[data.duplicated()])
        elif page == 'Counts':
            for i in sorted(catagorical(data), key=lambda c: len(data[c].unique())):
                print(f'{i} value counts:')
                if len(data[i].unique()) == len(data[i]):
                    print('\tEvery sample has a unique catagory')
                else:
                    print(pretty_counts(data[i]))
        elif page == 'Correlations':
            if len(quantitative(data)):
                print('Correlations Between Quantatative Values:')
                if type(corr) is bool:
                    display(quantitative(data).corr())
                elif isinstance(corr, (int, float)):
                    corr = normalizePercentage(corr)
                    _corr = significantCorrelations(quantitative(data), corr)
                    if len(_corr):
                        a_len = max([len(i[0]) for i in _corr])
                        b_len = max([len(i[1]) for i in _corr])
                        for a, b, c in _corr:
                            print(f'\t{a:<{a_len}} <-> {b:<{b_len}}: {round(c, 2):+}')
                    else:
                        print(f'\tThere are no correlations greater than {corr:.0%}')
        elif page == 'Missing':
            print('Missing Percentages:')
            if type(missing) is bool:
                percent = data.isnull().sum()/len(data)*100
                print(pretty_2_column_array(percent))
            elif isinstance(missing, (int, float)):
                missing = normalizePercentage(missing)
                _missing = missingSummary(data, missing/100)
                if len(_missing):
                    display(_missing)  # (reconstructed)
                else:
                    print(f'\tAll values are missing less than {missing:.0%} of their entries')
            else:
                raise TypeError('Missing is a bad type')
        elif page == 'Features':
            featureBox.layout.visibility = 'visible'
            group = 'catagorical' if isCatagorical(data[feature]) else 'quantative'
            missing = data[feature].isnull().sum()/len(data[feature])
            shared = f'"{feature}" is {"the target" if feature == target else "a"} {group} feature of type {data[feature].dtype}.\n' \
                     f'{missing:.1%} of it is missing.'
            print(shared)  # (reconstructed: the print itself is elided in the source)
            if isCatagorical(data[feature]):
                print(f'It has an entropy of {scipy.stats.entropy(data[feature].value_counts(normalize=True), base=entropy):.3f}', end=', ')
                print(f'and a cardinality of {len(data[feature].unique())}')
                print('Value counts:')
                print(pretty_counts(data[feature], paren=True))
                sns.histplot(data[feature])
            else:
                outlierSlider.layout.visibility = 'visible'
                if data[feature].std() > 1:
                    outlierSlider.max = abs(data[feature].max()) / data[feature].std()
                else:
                    outlierSlider.max = abs(data[feature].max()) * data[feature].std()
                correlations = []
                for a, b, c in significantCorrelations(quantitative(data), corr):
                    # (reconstructed: pick whichever end of the pair isn't `feature`)
                    other = b if a == feature else (a if b == feature else None)
                    if other is not None:
                        correlations.append(f'{other}({c:.1%})')
                if len(correlations):
                    correlations = 'It correlates with ' + ', '.join(correlations)
                else:
                    correlations = f'It has no significant (>{corr:.1%}) correlations with any features'
                print(correlations)  # (reconstructed)
                print(f'It has an average value of {data[feature].mean():,.2f}, and a median of {data[feature].median():,.2f}.')
                try:
                    print(f'It has a kurtosis value of {scipy.stats.kurtosis(data[feature]):,.2f}.')
                    print('\tNegative values mean fewer outliers than a normal distribution, positive values mean more.')
                except np.core._exceptions.UFuncTypeError:
                    pass
                print(f'It has a minimum value of {data[feature].min():,.2f}, and a maximum value of {data[feature].max():,.2f}.')
                showOutliers(data, feature, zscore, alpha=ALPHA)
        elif page == 'General Plots':
            if len(quantitative(data)):
                print('Plot of Quantatative Values:')
                sns.catplot(data=quantitative(data))
            if len(catagorical(data)):
                print('Plot of Catagorical Value Counts:')
                todo('catagorical (count?) plots')
        elif page == 'Custom Plots':
            featureABox.layout.visibility = 'visible'
            featureBBox.layout.visibility = 'visible'
            featureHueBox.layout.visibility = 'visible'
            graph = sns.scatterplot(x=data[a], y=data[b], hue=None if hue == 'None' else data[hue], alpha=ALPHA)
            if isQuantatative(data[a]) and isQuantatative(data[b]):
                try:
                    graph.set(title=f'Correlation: {data.corr()[a][b]:0.1%}')
                except Exception:
                    print("Can't calculate the correlations of dates for some reason")
            else:
                graph.set(title='Most common together: Todo')
        elif page == 'Matrix':
            if len(quantitative(data)):
                print('Something Something Matrix:')
                if target in quantitative(data):
                    sns.pairplot(data=quantitative(data), hue=target)
                else:
                    sns.pairplot(data=quantitative(data))
        elif page == 'Alerts':
            # SMALL_DATASET, HIGH_CARDINALITY, and ALERT_MISSING are module
            # constants defined on lines elided from this fragment
            if data[feature].count() < SMALL_DATASET:
                print(f"Your dataset isn't very large ({data[feature].count()}<{SMALL_DATASET})")
            for c in catagorical(data):
                card = len(data[c].unique())
                if card == 1:
                    print(f'All values in feature "{c}" are the same')
                elif card >= data[feature].count():
                    print(f'Every value in feature "{c}" is unique, are you sure its not quantatative?')
                elif card > HIGH_CARDINALITY:
                    print(f'Feature "{c}" has a very high cardinality ({card}>{HIGH_CARDINALITY})')
            for i in data.columns:
                miss = data[i].isnull().sum()/len(data[i])
                if miss >= ALERT_MISSING:
                    print(f'Feature {i} is missing a significant portion ({miss}>={ALERT_MISSING})')
            for q in quantitative(data):
                upper = data[q].max() - data[q].quantile(.75)
                upperMid = data[q].quantile(.75) - data[q].median()
                if upper - upperMid > OUTLIER_THRESHOLD * data[q].median():
                    print(f'Feature {q:>{max_name_len}} may have some upper outliers', end=' | ')
                    print(f'upper: {upper:>6.1f} | upperMid: {upperMid:>6.1f} | median: {data[q].median():>6.1f} | diff: {upper-upperMid:>6.1f}')
                lower = data[q].quantile(.25) - data[q].min()
                lowerMid = data[q].median() - data[q].quantile(.25)
                if lower - lowerMid > OUTLIER_THRESHOLD * data[q].median():
                    print(f'Feature {q:>{max_name_len}} may have some lower outliers', end=' | ')
                    print(f'lower: {lower:>6.1f} | lowerMid: {lowerMid:>6.1f} | median: {data[q].median():>6.1f} | diff: {lower-lowerMid:>6.1f}')
            todo("checking dates for outliers isn't implemented")
        else:
            print('Invalid start option')

    ui = widgets.GridBox([combobox, featureABox, featureBox, featureBBox, outlierSlider, featureHueBox], layout=widgets.Layout(
        grid_template_columns='auto auto',
        grid_row_gap='10px',
        grid_column_gap='100px',
    ))
    out = widgets.interactive_output(output, {
        'page': combobox,
        'feature': featureBox,
        'a': featureABox,
        'b': featureBBox,
        'hue': featureHueBox,
        'zscore': outlierSlider,
    })
    display(ui, out)  # (reconstructed: the display call is elided in the source)

quickSummary = explore

def suggestedCleaning(df, target):
    todo('suggestedCleaning')

def _cleanColumn(df, args, column, verbose, ignoreWarnings=False):
    global MODE_SELECTION
    missing = np.nan  # (reconstructed default; the 'missing_value' op overrides it)
    if column in df.columns or column is None:
        for op, options in args.items():
            if options is False and op not in ('missing_value', 'remove'):
                continue
            if options is True and not ignoreWarnings and op not in ('drop_duplicates', 'missing_value', 'remove', 'drop'):
                raise TypeError(f"'True' is an invalid option for {op} (for column {column})")
            if op == 'drop_duplicates':
                warn("drop_duplicates hasn't been implemented yet for individual columns. What are you trying to do?")
            elif op == 'handle_outliers':
                zscore, method = options
                df[column] = handle_outliers(df[column], method, zscore=zscore, verbose=verbose)
            elif op == 'replace':
                if not isinstance(options, dict):
                    raise TypeError(f"Please specify a dict for the replace option (under column {column})")
                log(f'Replacing specified entries in {column}', verbose)
                df[column] = df[column].replace(options)
            elif op == 'apply':
                if callable(options):
                    log(f'Applying function to {column}')
                    # (the source passed axis=1 here, which Series.apply doesn't accept)
                    df[column] = df[column].apply(options)
                elif not ignoreWarnings:
                    raise TypeError(f"Please specify a function to apply (under column {column})")
            elif op == 'missing_value':
                missing = options
            elif op == 'handle_missing':
                if options in ('mean', 'median') and isCatagorical(df[column]) and ignoreWarnings:
                    continue
                df[column] = handle_missing(df[column], method=options, missing_value=missing, verbose=verbose)
            elif op == 'queries':
                if options in ('mean', 'median') and isCatagorical(df[column]) and ignoreWarnings:
                    continue
                if len(options) == 2 and type(options[0]) is str:
                    options = [options]  # (reconstructed: wrap a lone (query, method) pair)
                for q, method in options:
                    df = query(df, column, q, method, verbose=verbose)
            elif op == 'remove':
                df[column] = remove(df[column], options, verbose=verbose)
            elif op == 'bin':
                if isCatagorical(df[column]) and not ignoreWarnings:
                    warn(f'The bin option was set on "{column}", which is not quantatative, skipping.')
                else:
                    method, amt = options
                    df[column] = bin(df[column], method, amt, verbose=verbose)
            elif op == 'normalize':
                if isCatagorical(df[column]) and not ignoreWarnings:
                    warn(f'The normalize option was set on {column}, which is not quantatative, skipping.')
                else:
                    # normalize() is defined on lines elided from this fragment
                    df[column] = normalize(df[column], options, verbose=verbose)
            elif op == 'convert_numeric':
                if isQuantatative(df[column], time=False) and not ignoreWarnings:
                    warn(f'The convert_numeric option was set on {column}, which is not catagorical, skipping.')
                else:
                    df = convert_numeric(df, column, options, verbose=verbose)
            elif op == 'add_column':
                if isinstance(options, (tuple, list)):
                    if not isinstance(options[0], (tuple, list)):
                        options = [options]
                    for name, selection in options:
                        log(f'Adding new column "{name}"')
                        df[name] = selection
                else:
                    raise TypeError(f"add_column argument must be a tuple, or a list of tuples, not {type(options)}")
            elif op == 'drop':
                log(f'Dropping column "{column}"')
                df = df.drop(columns=[column])
            else:
                raise TypeError(f'Invalid argument {op} given')
        return df
    else:
        raise TypeError(f'Column "{column}" provided is not in the given DataFrame')

# (the verbose/split parameters below sit on elided lines; they're reconstructed
#  from how they're used in the body)
def clean(df: pd.DataFrame,
          config: Dict[str, Dict[str, Any]],
          verbose=False,
          split=None):
    """ Returns a cleaned copy of the DataFrame passed to it.
        NOTE: The order of the entries in the config dict determines the order
        they are performed in.
        `config` is a dict of this signature (NOTE: this is the suggested order):
        {
            'column_name': {
                'drop_duplicates': bool,
                'handle_outliers': Union[bool, Tuple[float, Literal['remove', 'constrain']]],
                'replace': Union[bool, Dict],
                'apply': Union[bool, Callable],
                'queries': Union[bool, List[Tuple[str, Union[Series, Literal['remove', 'mean', 'median', 'mode', 'random', 'balanced_random'], Any]]]],
                'add_column': Union[Tuple[str, np.ndarray], List[Tuple[str, np.ndarray]]],
                'missing_value': Any,
                'handle_missing': Union[bool, Series, Literal['remove', 'mean', 'median', 'mode', 'random', 'balanced_random'], Any],
                'remove': Union[bool, Any],
                'bin': Union[bool, Tuple[Literal['frequency'], int], Tuple[Literal['width'], int], Iterable],
                'normalize': Union[bool, Literal['min-max', 'range']],
                'convert_numeric': Union[bool, Literal['assign', 'one_hot_encode']],
            },
        }
    """
    raise DeprecationWarning('This function is no longer supported and is likely to break')
    log = lambda s: print(s) if verbose else None
    df = df.copy()  # (reconstructed: the docstring promises a copy)
    config = OrderedDict(config)
    if 'all' in config.keys():
        config.move_to_end('all')
    for column, args in config.items():
        log(f'Working on "{column}"')
        if column.lower() == 'all':
            if 'add_column' in args:
                df = _cleanColumn(df, {'add_column': args['add_column']}, None, verbose)
                del args['add_column']
            if 'drop_duplicates' in args:
                log('\tDropping duplicate samples')
                df = df.drop_duplicates()
                del args['drop_duplicates']
            for c in df.columns:
                adjusted = args.copy()
                if c in config.keys():
                    for op, params in config[c].items():
                        log(f'\tExcluding column {c} from {op}')
                        if op in adjusted.keys():
                            del adjusted[op]
                df = _cleanColumn(df, adjusted, c, verbose, True)
        else:
            df = _cleanColumn(df, args, column, verbose)
    if split is not None:
        if split in df.columns:
            return df.drop(columns=split), df[split]
        else:
            raise TypeError('Provided feature not in the resulting data (did you drop it in the cleaning process by accident?)')
    return df

def resample(X, y, method: Literal['oversample', 'undersample', 'mixed']='oversample', seed=None):
    if method == 'oversample':
        sampler = RandomOverSampler(random_state=seed)
        return sampler.fit_resample(X, y)
    elif method == 'undersample':
        sampler = RandomUnderSampler(random_state=seed)
        return sampler.fit_resample(X, y)
    elif method == 'mixed':
        todo('figure out how to mix under and over sampling')
    else:
        raise TypeError("Invalid method argument given")

@_cleaning_func(testPredictions=pd.Series)
def evaluateQuantitative(test, testPredictions, train=None, trainPredictions=None,
                         accuracy=3, explain=False, compact=False, line=False, log=...):
    """ Evaluate your predictions of an ML model.
        NOTE: compact overrides explain.
    """
    assert (train is None) == (trainPredictions is None), 'You have to pass both train & trainPredictions'
    test = pd.Series(test)
    testPredictions = pd.Series(testPredictions)

    def _score(name, func, explaination, _test=True, **kwargs):
        # (the compact/explain control flow here is reconstructed)
        if compact:
            print(f'{name} {func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs):,.{accuracy}f}', end=' ')
        else:
            print(f'\t{name:<23} {ensureNotIterable(func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs)):,.{accuracy}f}')
            if explain:
                print('\t\t' + explaination)

    def _quantatative(_test=True):
        _score('Root Mean Square Error', mean_squared_error, 'An average of how far off we are from the target, in the same units as the target. Smaller is better.', _test, squared=False)
        _score('My own measure', lambda a, b, **k: mean_squared_error(a, b, **k) / a.mean(), 'Root mean square / average value. Eliminates the domain a bit. Smaller is better.', _test, squared=False)
        _score('Mean Absolute Error', mean_absolute_error, 'Similar to Root Mean Square Error, but better at weeding out outliers. Smaller is better.', _test)
        _score('Median Absolute Error', median_absolute_error, '', _test)
        _score('R^2 Score', r2_score, 'An average of how far off we are from just using the mean as a prediction. Larger is better.', _test)

        def amtInPercent(truth, pred, percent):
            return ((truth - pred).abs() / truth <= (percent / 100)).values.sum() / len(truth) * 100

        for percent in (5, 10, 20, 50):
            _score(f'Within {percent}%', lambda a, b, **k: amtInPercent(a, b, percent), f'How many of the samples are within {percent}% of their actual values', _test)

    _quantatative()  # (reconstructed: the call itself sits on an elided line)
    if train is not None and trainPredictions is not None:
        _quantatative(False)

    if line:
        sns.set(rc={'figure.figsize': (11.7, 8.27)})
        delta = test - testPredictions
        testfinal = pd.DataFrame({
            'Predictions': testPredictions,
            'Ground Truth': test,
            'difference': delta,
            'percent_difference': abs(delta/test),
        })
        testfinal['percent_difference'] = bin(testfinal['percent_difference'], method=(0, .05, .10, .20, .50, 1))
        testfinal['percent_difference'] = testfinal['percent_difference'].replace({
            pd.Interval(0, .05, closed='right'): 'Within 5%',
            pd.Interval(.05, .1, closed='right'): 'Within 10%',
            pd.Interval(.1, .2, closed='right'): 'Within 20%',
            pd.Interval(.2, .5, closed='right'): 'Within 50%',
            pd.Interval(0.5, 1.0, closed='right'): 'Within 100%',
        })
        color_dict = {
            'Within 5%': 'tab:green',
            'Within 10%': 'tab:green',
            'Within 20%': 'tab:blue',
            'Within 50%': 'tab:orange',
            'Within 100%': 'tab:red',
        }
        ax = sns.scatterplot(data=testfinal, x='Ground Truth', y='Predictions', hue="percent_difference", palette=color_dict)
        xlims = ax.get_xlim()  # (reconstructed: `xlims` is defined on an elided line)
        ax.plot(xlims, xlims, color='r')

evaluateQ = evaluateQuantitative

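
# Usage sketch: the `_cleaning_func(testPredictions=pd.Series)` wrapper consumes
# the FIRST positional argument as `testPredictions`, so predictions go first.
# (Also note the internal squared=False kwarg was removed in scikit-learn 1.6;
# this sketch assumes an older sklearn.)
if __name__ == '__main__':
    truth = pd.Series([1., 2., 3., 4.])
    preds = pd.Series([1.1, 1.9, 3.2, 3.8])
    evaluateQ(preds, truth)  # prints RMSE, MAE, R^2, etc. for the test split
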
def evaluateCatagorical(test, testPredictions, train=None, trainPredictions=None,
                        accuracy=3, curve=False, confusion=False, explain=False, compact=False):
    """ Evaluate your predictions of an ML model.
        NOTE: compact overrides explain.
    """
    assert (train is None) == (trainPredictions is None), 'You have to pass both train & trainPredictions'

    def _score(name, func, explaination, _test=True, **kwargs):
        # (the compact/explain control flow here is reconstructed)
        if compact:
            print(f'{name} {func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs):,.{accuracy}f}', end=' ')
        else:
            print(f'\t{name:<23} {func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs):,.{accuracy}f}')
            if explain:
                print('\t\t' + explaination)

    def _catagorical(_test=True):
        _score('F1', sklearn.metrics.f1_score, 'F1 is essentially an averaged score combining precision and recall', _test)
        _score('Accuracy', sklearn.metrics.accuracy_score, 'Accuracy is a measure of how well the model did on average', _test)
        _score('Precision', sklearn.metrics.precision_score, 'Precision is a measure of how many of the things we called true actually were', _test)
        _score('Recall', sklearn.metrics.recall_score, 'Recall is a measure of how many of the true things we missed out on', _test)

    _catagorical()  # (reconstructed: the surrounding calls sit on elided lines)
    if confusion:
        ConfusionMatrixDisplay.from_predictions(test, testPredictions, cmap='Blues')
    if curve:
        PrecisionRecallDisplay.from_predictions(test, testPredictions)
    if train is not None and trainPredictions is not None:
        _catagorical(False)
        if confusion:
            ConfusionMatrixDisplay.from_predictions(train, trainPredictions, cmap='Blues')
        if curve:
            PrecisionRecallDisplay.from_predictions(train, trainPredictions)

evaluateC = evaluateCatagorical

def evaluate(catagorical, test, testPredictions, train=None, trainPredictions=None,
             accuracy=3, curve=False, confusion=False, explain=False, compact=False, line=False):
    """ Evaluate your predictions of an ML model.
        NOTE: compact overrides explain.
    """
    assert (train is None) == (trainPredictions is None), 'You have to pass both train & trainPredictions'
    raise DeprecationWarning('Please use evaluateQ or evaluateC instead')
    # NOTE: everything below is unreachable; it is kept as it appeared (lightly
    # cleaned), and still contains stale references (e.g. `student_ds`, `targets`)

    def _score(name, func, explaination, _test=True, **kwargs):
        if compact:
            print(f'{name} {func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs):,.{accuracy}f}', end=' ')
        else:
            print(f'\t{name:<23} {func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs):,.{accuracy}f}')
            if explain:
                print('\t\t' + explaination)

    def _catagorical(_test=True):
        _score('F1', sklearn.metrics.f1_score, 'F1 is essentially an averaged score combining precision and recall', _test)
        _score('Accuracy', sklearn.metrics.accuracy_score, 'Accuracy is a measure of how well the model did on average', _test)
        _score('Precision', sklearn.metrics.precision_score, 'Precision is a measure of how many of the things we called true actually were', _test)
        _score('Recall', sklearn.metrics.recall_score, 'Recall is a measure of how many of the true things we missed out on', _test)

    def _quantatative(_test=True):
        _score('Root Mean Square Error', mean_squared_error, 'An average of how far off we are from the target, in the same units as the target. Smaller is better.', _test, squared=False)
        _score('My own measure', lambda a, b, **k: mean_squared_error(a, b, **k) / a.mean(), 'Root mean square / average value. Eliminates the domain a bit. Smaller is better.', _test, squared=False)
        _score('Mean Absolute Error', mean_absolute_error, 'Similar to Root Mean Square Error, but better at weeding out outliers. Smaller is better.', _test)
        _score('Median Absolute Error', median_absolute_error, '', _test)
        _score('R^2 Score', r2_score, 'An average of how far off we are from just using the mean as a prediction. Larger is better.', _test)
        for percent in (5, 10, 20, 50):
            _score(f'Within {percent}%', lambda a, b, **k: amtInPercent(a, b, percent), f'How many of the samples are within {percent}% of their actual values', _test)

    def amtInPercent(truth, pred, percent):
        combined = pd.concat([truth, pred], axis=1)
        combined.columns = ["truth", "pred"]
        combined["absdiff"] = (combined["truth"] - combined["pred"]).abs()
        combined["absdiff_pct"] = combined["absdiff"] / combined["truth"]
        return len(combined[combined["absdiff_pct"] <= (percent / 100)]) / len(combined) * 100

    # (the catagorical/quantitative dispatch below is reconstructed from the
    #  surviving fragments)
    if catagorical:
        _catagorical()
        if confusion:
            ConfusionMatrixDisplay.from_predictions(test, testPredictions, cmap='Blues')
        if curve:
            PrecisionRecallDisplay.from_predictions(test, testPredictions)
        if train is not None and trainPredictions is not None:
            _catagorical(False)
            if confusion:
                ConfusionMatrixDisplay.from_predictions(train, trainPredictions, cmap='Blues')
            if curve:
                PrecisionRecallDisplay.from_predictions(train, trainPredictions)
    else:
        _quantatative()
        if train is not None and trainPredictions is not None:
            _quantatative(False)
        if line:
            sns.set(rc={'figure.figsize': (11.7, 8.27)})
            color_dict = dict({'below 20%': 'tab:blue', 'above 20%': 'tab:orange'})
            shower = pd.DataFrame(student_ds, columns=['predictions'])  # stale reference
            shower.columns = ['predictions']
            testfinal = pd.concat([shower, targets['actual']], axis=1)  # stale reference
            testfinal['difference'] = testfinal['actual'] - testfinal['predictions']
            testfinal['percent_difference'] = abs(testfinal['difference']/testfinal['actual'])
            testfinal['percent_bucket'] = ["above 20%" if i >= 0.2 else "below 20%" for i in testfinal.percent_difference]
            ax = sns.scatterplot(data=testfinal, x='actual', y='predictions', hue="percent_bucket", palette=color_dict)

# TODO: Add cross-validation to evaluate
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score

def importances(tree, names=None, rtn=False, graph=True, best=.01):
    # (the guards here are reconstructed; only the core lines survive in the source)
    if names is None:
        names = tree.feature_names_in_
    df = pd.DataFrame({
        'feature': names,
        'importance': tree.feature_importances_,
    })
    if best is not None:
        df = df.loc[df.importance >= best]
    df = df.sort_values(by='importance', ascending=False, axis=0)
    if graph:
        sns.catplot(data=df, x='importance', y='feature', kind='bar', height=10, aspect=2)
    if rtn:
        return df

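
# Usage sketch with a tiny fitted tree: feature 'b' is constant, so all the
# importance lands on 'a' and 'b' is filtered out by the `best` cutoff.
if __name__ == '__main__':
    from sklearn.tree import DecisionTreeRegressor
    X = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [0, 0, 0, 0]})
    print(importances(DecisionTreeRegressor().fit(X, [1, 2, 3, 4]), rtn=True, graph=False))
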
def saveStats(file, name, model, testY, predY, trainY=None, trainPredY=None, notes='', new=False, show=True, save=True):
    # (reconstructed wrapper: the original nests these prints so they can be
    #  both shown and redirected into the file)
    def _stats():
        print('Model type:', type(model))
        print('Parameters:')
        for key, val in model.get_params().items():
            print(f'\t{key}: {val}')
        print('\nImportances:')
        print(importances(model, rtn=True, graph=False))
        # NOTE: stale call; evaluate() is deprecated and expects `catagorical` first
        evaluate(testY, predY, trainY, trainPredY, compact=False)
        print('\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n')
    if show:
        _stats()
    if save:
        with open(file, 'w' if new else 'a') as f:
            with redirect_stdout(f):
                _stats()

def plot_history(history):
    plt.plot(history['index'], history['loss'], label='Train Loss')
    plt.plot(history['index'], history['val_loss'], label='Validation Loss')
    # (reconstructed: the rest of the function is elided in the source)
    plt.legend()
    plt.show()