Cope 2.5.0
My personal "standard library" of all the generally useful code I've written for various projects over the years
Loading...
Searching...
No Matches
data.py
1try:
2 import pandas as pd
3 from imblearn.over_sampling import RandomOverSampler
4 from functools import wraps
5 from contextlib import redirect_stdout
6 from imblearn.under_sampling import RandomUnderSampler
7 from warnings import warn
8 import random
9 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
10 import sklearn
11 from sklearn.preprocessing import MinMaxScaler
12 import seaborn as sns
13 # from scipy.stats import entropy as _entropy
14 # from scipy.stats import kurtosis
15 import scipy.stats
16 import matplotlib.pyplot as plt
17 from typing import Optional, Any, Tuple, List, Iterable, Dict, Union, Callable, Iterable, Literal
18 import numpy as np
19 from sklearn.metrics import PrecisionRecallDisplay, ConfusionMatrixDisplay
20 import ipywidgets as widgets
21 from collections import OrderedDict
22 from IPython.display import clear_output, display
23 from math import log, e
24 import sklearn.model_selection as skms
25except: pass
26else:
# If there are multiple modes, how do we want to choose one? Used in _cleanColumn
# options: 'random', 'first', 'last'
MODE_SELECTION = 'random'

# How small is a "small" dataset
SMALL_DATASET = 1000
# NOTE(review): presumably the unique-value count above which a feature counts as
# high cardinality -- confirm against the code that reads it
HIGH_CARDINALITY = 50
# At what percentage does it become worrisome if that many samples are missing the feature?
ALERT_MISSING = .55
# If the difference between the extreme and the mid extreme > median * this, then it indicates outliers
OUTLIER_THRESHOLD = .5
# This is an option for the zscore slider in explore(). Turning it on is fancier, but it's really slow
CONTINUOUS_UPDATE_SLIDER = False

# For use with DataFrame.select_dtypes(include=/exclude=)
# _catagoricalTypes = (str, Enum, np.object_, pd.CategoricalDtype, pd.Interval, pd.IntervalDtype, type(np.dtype('O')), bool, np.bool_, np.bool8)
# NOTE(review): the 'bool8' alias was removed in NumPy 2.0 -- confirm select_dtypes
# still accepts this list on the NumPy version in use
_catagoricalTypes = ['bool', 'bool_', 'object', 'object_', 'Interval', 'bool8', 'category']
_quantitativeTypes = ['number']
_timeTypes = ['datetimetz', 'timedelta', 'datetime']
46
47
# The `from Cope import todo` import is disabled, so the previous try/except could
# never reach its `except ImportError` fallback and `todo` was left undefined.
# Define the fallback unconditionally instead.
# from Cope import todo
def todo(*a):
    """ Print a 'TODO: ' reminder followed by the given arguments """
    print('TODO: ', *a)
53
54
55 # I got tired of MinMaxScaler returning numpy arrays
56 def _cast2dataframe(func):
57 def wrapper(self, *args, **kwargs):
58 return pd.DataFrame(func(self, *args, **kwargs), columns=self.feature_names_in_)
59 return wrapper
60
# Replaces the method on the class itself, so every MinMaxScaler.transform call made
# after this line goes through _cast2dataframe
MinMaxScaler.transform = _cast2dataframe(MinMaxScaler.transform)
62
def installLibs(libs=('pandas', 'numpy', 'imblearn', 'ipywidgets', 'seaborn', 'scipy', 'matplotlib')):
    """ Install the given packages with the IPython `%pip` magic

        If IPython is not importable, or the code is not running inside an IPython
        session, nothing is installed and the equivalent `pip install` command is
        printed instead.

        libs: a package name, or an iterable of package names
    """
    libs = ' '.join([libs] if isinstance(libs, str) else libs)
    try:
        import IPython
    except ImportError:
        print(f'IPython doesnt seem to be installed. Simply run `pip install {libs}` in a terminal')
        # Without IPython there is nothing more to do (continuing would hit a NameError)
        return

    if (ipython := IPython.get_ipython()) is not None:
        ipython.run_line_magic("pip", f"install {libs}")
    else:
        print(f'You dont seem to be calling from IPython. Simply run `pip install {libs}` in a terminal')
74
75
76 # I AM THE COMPUTER GOBLIN
77 # FEAR ME
def addVerbose(func):
    """ Decorator that gives `func` a keyword-only `verbose` flag

        The wrapped function is always called with an extra `log` keyword argument:
        a one-argument callable that prints its argument behind a tab when
        `verbose` is truthy, and does nothing otherwise.
    """
    def _speak(s):
        print(f'\t{s}')

    def _stay_quiet(s):
        return None

    # Runs when the decorated function gets called
    @wraps(func)
    def inner(*args, verbose=False, **kwargs):
        logger = _speak if verbose else _stay_quiet
        return func(*args, log=logger, **kwargs)
    return inner
88
def _cleaning_func(**decorator_kwargs):
    """ Auto-converts the given named parameter to the given type
        Supports inputs of pd.DataFrame, pd.Series, np.ndarray, and tuple/list pd.Series or np.ndarray
        Supports outputs of pd.DataFrame, pd.Series, and a tuple of pd.Series
        Does NOT support converting a tuple/list of pd.DataFrames in one go: when given
        one, the decorated function is called once per DataFrame and a list of the
        results is returned

        Usage: `@_cleaning_func(col=pd.Series)` converts the first positional argument
        to a pd.Series and passes it to the decorated function as the keyword `col`.
        The decorated function also gains the `verbose` flag / `log` callable provided
        by addVerbose.
    """
    trivial = lambda x: x

    def error(toType):
        # Builds a converter which always fails; the result must be *called* to raise
        def _error(x):
            raise TypeError(f"Cant cast {type(x)} to {toType}")
        return _error

    iterableInput = {
        pd.DataFrame: lambda t: pd.DataFrame(t).T,
        pd.Series: lambda t: (pd.Series(t[0]) if len(t) == 1 else error(pd.Series)(t)),
        tuple: lambda t: tuple([pd.Series(i) for i in t]),
    }

    # input2output[type of the input][requested type] -> converter
    input2output = {
        pd.DataFrame: {
            pd.DataFrame: trivial,
            # A DataFrame only collapses to a Series if it has exactly 1 column
            pd.Series: lambda d: pd.Series(d.iloc[:, 0]) if len(d.columns) == 1 else error(pd.Series)(d),
            tuple: lambda d: tuple([d[i] for i in d]),
        },
        pd.Series: {
            pd.DataFrame: lambda s: pd.DataFrame(s),
            pd.Series: trivial,
            tuple: lambda s: (s,),
        },
        np.ndarray: {
            pd.DataFrame: lambda n: pd.DataFrame(n),
            pd.Series: lambda n: pd.Series(n),
            tuple: lambda s: (pd.Series(s),),
        },
        tuple: iterableInput,
        list: iterableInput,
    }

    def outer(decorator_func):
        # Also runs when the decorator is added
        @wraps(decorator_func)
        @addVerbose
        def inner(dat, *args, **kwargs):
            # Runs when the decorated function gets called
            if isinstance(dat, (list, tuple)):
                if len(dat) == 0:
                    raise TypeError('Please dont pass in an empty list')
                elif len(dat) == 1:
                    dat = dat[0]
                # If we're given a collection of pd.DataFrames, then iterate through the function and
                # apply it to all of them
                elif isinstance(dat[0], pd.DataFrame):
                    rtn = []
                    for d in dat:
                        _kwargs = kwargs.copy()
                        for paramName, outputType in decorator_kwargs.items():
                            _kwargs[paramName] = input2output[pd.DataFrame][outputType](d)
                        # Pass the converted arguments (not the untouched kwargs)
                        rtn.append(decorator_func(*args, **_kwargs))
                    return rtn

            for paramName, outputType in decorator_kwargs.items():
                kwargs[paramName] = input2output[type(dat)][outputType](dat)
            return decorator_func(*args, **kwargs)
        return inner
    return outer
155
156
def insertSample(df, sample, index=-1):
    """ Return a copy of `df` with `sample` added as a new row, renumbered from 0

        The row is given the label `index - .5` and the frame is then sorted by label,
        so on a frame labelled 0..n-1 the sample ends up directly before position
        `index`. A negative `index` (including the default) therefore sorts before
        every non-negative label and puts the sample first.
        The frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame doesn't silently gain a row
    df = df.copy()
    df.loc[index - .5] = sample
    return df.sort_index().reset_index(drop=True)
161
def ensureIterable(obj, useList=False):
    """ Return `obj` unchanged if isiterable() accepts it, otherwise wrap it in a
        1-element list (if `useList`) or a 1-element tuple
    """
    if isiterable(obj):
        return obj
    if useList:
        return [obj]
    return (obj,)
167
def ensureNotIterable(obj, emptyBecomes=None):
    """ Unwrap single-element iterables

        - anything isiterable() rejects is returned unchanged
        - an iterable with exactly 1 element returns that element
        - an empty iterable returns `emptyBecomes`, or the (empty) iterable itself
          when `emptyBecomes` is None
        - anything longer is returned unchanged
        Iterables without a length (like generators) are converted to a list first.
    """
    if not isiterable(obj):
        return obj

    # Generators are iterable, but don't inherently have a length
    try:
        len(obj)
    except TypeError:
        obj = list(obj)

    if len(obj) == 1:
        try:
            return obj[0]
        except (TypeError, KeyError, IndexError):
            # Not indexable by position 0 (sets, dicts, ...): take the only element
            return next(iter(obj))
    elif len(obj) == 0:
        # This previously compared against an undefined `_None` name
        return obj if emptyBecomes is None else emptyBecomes
    else:
        return obj
187
def getOutliers(data, zscore=None):
    """ Select the values whose absolute z-score is greater than `zscore`

        data: a pd.Series (returns the matching part of it) or a pd.DataFrame
            (returns a DataFrame built from the matching part of each column)
        Returns None if `zscore` is None.
        Raises TypeError for any other type of `data`.
    """
    # TODO: add more options here (like getting outliers via kurtosis & IQR)
    # IQR (inner quartile range) = Q3-Q1
    # +/- 1.5*IQR == possible outlier
    # +/- 3*IQR == outlier
    # kurtosis
    def _beyond(series):
        return series[np.abs(scipy.stats.zscore(series)) > zscore]

    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError(f"Invalid type {type(data)} given")
    if zscore is None:
        return None
    if isinstance(data, pd.Series):
        return _beyond(data)
    return pd.DataFrame({f: _beyond(data[f]) for f in data.columns})
207
def normalizePercentage(p, error='Percentage is of the wrong type (int or float expected)'):
    """ Convert a percentage given in one of several forms to a fraction

        - bool: True -> 1., False -> 0.
        - int: treated as a whole percentage, so 55 -> .55
        - float: returned as is
        Anything else raises TypeError(error), or returns None if `error` is None.
    """
    # bool has to be checked before int, because bool is a subclass of int
    if isinstance(p, bool):
        return 1. if p else 0.
    elif isinstance(p, int):
        return p / 100
    elif isinstance(p, float):
        return p
    elif error is not None:
        raise TypeError(error)
221
def isiterable(obj, includeStr=False):
    """ True if `obj` is an Iterable. Objects whose type is exactly str only count
        when `includeStr` is set
    """
    if not isinstance(obj, Iterable):
        return False
    if includeStr:
        return True
    return type(obj) is not str
224
def sort_dict_by_value_length(d):
    """ Return a new dict with the items of `d` ordered by len(value), shortest first """
    ordered_keys = sorted(d, key=lambda k: len(d[k]))
    return {k: d[k] for k in ordered_keys}
227
def timeFeatures(df) -> pd.DataFrame:
    """ The columns of `df` whose dtype is one of _timeTypes """
    time_columns = df.select_dtypes(include=_timeTypes)
    return time_columns
230
def catagorical(df, time=False) -> pd.DataFrame:
    """ The columns of `df` whose dtype is one of _catagoricalTypes
        (plus the _timeTypes columns if `time` is set)
    """
    wanted = list(_catagoricalTypes)
    if time:
        wanted += _timeTypes
    return df.select_dtypes(include=wanted)
233
def quantitative(df, time=True) -> pd.DataFrame:
    """ The columns of `df` whose dtype is one of _quantitativeTypes
        (plus the _timeTypes columns unless `time` is turned off)
    """
    wanted = list(_quantitativeTypes)
    if time:
        wanted += _timeTypes
    return df.select_dtypes(include=wanted)
236
def isTimeFeature(s: pd.Series):
    """ Whether timeFeatures() would keep the given column """
    # Put the values in a 1-column frame under a known name, then see if it survives
    probe = pd.DataFrame(pd.Series(s, name='__dummy'))
    return '__dummy' in timeFeatures(probe)
240
def isCatagorical(s: pd.Series, time=False):
    """ Whether catagorical() would keep the given column (`time` is passed along) """
    # Put the values in a 1-column frame under a known name, then see if it survives
    probe = pd.DataFrame(pd.Series(s, name='__dummy'))
    return '__dummy' in catagorical(probe, time)
244
def isQuantatative(s: pd.Series, time=True):
    """ Whether quantitative() would keep the given column (`time` is passed along) """
    # Put the values in a 1-column frame under a known name, then see if it survives
    probe = pd.DataFrame(pd.Series(s, name='__dummy'))
    return '__dummy' in quantitative(probe, time)
248
def missingSummary(df, thresh=.6):
    """ The fraction of null entries per column, for only the columns where that
        fraction is at least `thresh`
    """
    fraction_missing = df.isnull().sum() / len(df)
    worrying = fraction_missing >= thresh
    return fraction_missing[worrying]
252
def significantCorrelations(df, thresh=.5):
    """ List the pairs of features whose correlation magnitude is at least `thresh`

        Returns a list of (featureA, featureB, correlation) tuples, one per pair
        (each pair only once, and never a feature with itself), in the order they
        appear in the upper triangle of `df.corr()`.
        The sign of `thresh` is ignored. Correlations which are NaN or exactly 0 are
        never reported.
    """
    cor = df.corr()
    # Use the labels of the correlation matrix itself, so that the positions always
    # line up with the names (looking names up in a separate, unfiltered list is what
    # used to mislabel pairs)
    names = list(cor.columns)
    values = cor.to_numpy()
    thresh = abs(thresh)

    rtn = []
    for r in range(len(names)):
        # Because the correlations repeat, only look above the diagonal
        for c in range(r + 1, len(names)):
            v = values[r, c]
            # NaN fails this comparison, so missing correlations are skipped
            if v != 0 and abs(v) >= thresh:
                rtn.append((names[r], names[c], v))
    return rtn
280
def getNiceTypesTable(df, types=None):
    """ Build a 2-row table with one column per feature of `df`: the feature's dtype,
        and a letter for its kind ('C' catagorical, 'Q' quantitative, 'T' time,
        '?' if none of the helpers recognise the dtype)

        `types` is accepted but not used.
    """
    def _getLabels(col):
        if isCatagorical(col, time=False):
            return [col.dtype, 'C']
        if isQuantatative(col, time=False):
            return [col.dtype, 'Q']
        if isTimeFeature(col):
            return [col.dtype, 'T']
        # Falling off the end would hand None to the DataFrame constructor for this column
        return [col.dtype, '?']

    return pd.DataFrame({f: _getLabels(df[f]) for f in df.columns})
294
def percentCountPlot(data, feature, target=None, ax=None, title='Percentage of values used in {}'):
    """ Draw a seaborn countplot of `feature` (split by `target` as the hue), with
        every bar annotated with its share of the rows and the y axis relabelled as
        percentages. `title` is formatted with the feature name. Returns the axes.
    """
    total = float(len(data[feature]))
    ax = sns.countplot(x=feature, data=data, hue=target, ax=ax)
    ax.set_title(title.format(feature))

    # Write the percentage slightly above each bar
    for bar in ax.patches:
        height = bar.get_height()
        label = '{:.1f}%'.format(100*height/total)
        ax.annotate(label, (bar.get_x()+0.1, height+5))

    # put 11 ticks (therefore 10 steps), from 0 to the total number of rows in the dataframe
    ax.yaxis.set_ticks(np.linspace(0, total, 11))
    # adjust the ticklabel to the desired format, without changing the position of the ticks.
    tick_percentages = 100*ax.yaxis.get_majorticklocs()/total
    ax.set_yticklabels(map('{:.1f}%'.format, tick_percentages))
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    return ax
313
def column_entropy(column:pd.Series, base=e):
    """ The entropy of the value frequencies in `column`, in the given log base.
        This works, but it's slow for some reason?
    """
    probabilities = pd.Series(column).value_counts(normalize=True, sort=False)
    weighted_logs = probabilities * np.log(probabilities)
    return -(weighted_logs / np.log(base)).sum()
318
def pretty_2_column_array(a, limit=30, paren=None):
    """ Format a Series of fractions as tab-indented "label: 12.3%" lines

        a: Series whose index holds the labels and whose values are fractions
        limit: if `a` has more than this many entries, only the first `limit - 1`
            are printed
        paren: optional Series/sequence of the same order as `a`; its i-th entry is
            added in parentheses after the i-th percentage
        Labels are right-aligned to the longest one. Labels of any type are supported
        (they're converted with str()), and an empty `a` gives an empty string.
    """
    if len(a) > limit:
        a = a.iloc[:limit-1]
        # a.append(f'... ({card - limit - 1} more)')

    labels = [str(label) for label in a.index]
    width = max(map(len, labels), default=0)

    lines = []
    for i, label in enumerate(labels):
        # Always go by position: the index holds the labels, which may be ints themselves
        line = f'\t{label:>{width}}: {a.iloc[i]:.1%}'
        if paren is not None:
            extra = paren.iloc[i] if hasattr(paren, 'iloc') else paren[i]
            line += f' ({extra})'
        lines.append(line + '\n')
    return ''.join(lines)
333
def pretty_counts(s:pd.Series, paren=False):
    """ The value frequencies of `s` (most common first) formatted by
        pretty_2_column_array(). With `paren` set, the raw counts are added in
        parentheses.
    """
    frequencies = s.value_counts(normalize=True, sort=True)
    if not paren:
        return pretty_2_column_array(frequencies)
    return pretty_2_column_array(frequencies, paren=s.value_counts(sort=True))
344
345
def meanConfInterval(data, confidence=0.95, mean=False):
    """ The confidence interval of the mean of `data`, computed from the standard
        error and the t distribution with len(data) - 1 degrees of freedom

        Returns (low, high), or (mean, low, high) if `mean` is set.
    """
    values = 1.0 * np.array(data)
    center = np.mean(values)
    std_err = scipy.stats.sem(values)
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., len(values) - 1)

    bounds = (center - half_width, center + half_width)
    if mean:
        return (center, *bounds)
    return bounds
355
def showOutliers(data, column, zscore, **snsArgs):
    """ Print "<outlier count>/<sample count>" for `data[column]`, then scatterplot the
        column with its outliers (per getOutliers) drawn on top. `snsArgs` are passed
        to both sns.scatterplot calls.

        Raises TypeError if the column is catagorical.
    """
    feature = data[column]
    if isCatagorical(feature):
        raise TypeError('Outliers only apply to quantitative values')

    samples = getOutliers(feature, zscore=zscore)
    print(len(samples), len(feature), sep='/')
    for layer in (feature, samples):
        sns.scatterplot(data=layer, **snsArgs)
    plt.show()
364
def interactWithOutliers(df, feature=None, step=.2):
    """ Build an interactive widget around showOutliers()

        With a `feature`, the column is fixed and the zscore slider runs from 0 to
        max/std of that feature. Without one, the column is selectable from all the
        columns of `df` and the slider runs from 0 to 10.
    """
    if feature is None:
        column = list(df.columns)
        zscore = (0., 10, step)
    else:
        column = widgets.fixed(feature)
        zscore = (0., df[feature].max() / df[feature].std(), step)

    return widgets.interactive(
        showOutliers,
        data=widgets.fixed(df),
        column=column,
        zscore=zscore,
    )
372
373 # Clean Functions
@_cleaning_func(col=pd.Series)
def handle_outliers(col, method:Literal['remove', 'constrain']='remove', zscore=3, log=...):
    """ Deal with the samples of `col` whose z-score magnitude is greater than `zscore`

        method:
            'remove': drop those samples
            'constrain': clip every value into the range mean +/- zscore standard
                deviations, so the outliers end up exactly on the boundary
        log: provided by the decorator (pass verbose=True to get messages)
        Raises TypeError for any other method.
    """
    # TODO: add more options here (like getting outliers via kurtosis & IQR)
    if method == 'remove':
        samples = getOutliers(col, zscore=zscore)
        log(f'Removing outliers with zscore magnitudes >{zscore} from {col.name}')
        return col.drop(samples.index)
    elif method == 'constrain':
        # A z-score measures the distance from the *mean*, so the allowed range has to
        # be centered on it (clipping around 0 is only right for zero-mean data).
        # ddof=0 matches the default of scipy.stats.zscore, which getOutliers uses.
        center = col.mean()
        reach = col.std(ddof=0) * zscore
        log(f'Constraining outliers with zscore magnitudes >{zscore} from {col.name}')
        return col.clip(center - reach, center + reach)
    else:
        raise TypeError(f"Invalid method arguement '{method}' given")
393
@_cleaning_func(col=pd.Series)
def handle_missing(col, method:Union[pd.Series, Literal['remove', 'mean', 'median', 'mode', 'random', 'balanced_random'], Any], missing_value=np.nan, log=...):
    """ Deal with the samples of `col` which are equal to `missing_value`

        method:
            a pd.Series / np.ndarray of the same length: take the value at the same
                position from it
            'remove': drop the samples
            'mean' / 'median': use the mean/median of the non-missing values
                (TypeError for catagorical columns)
            'mode': use a mode of the non-missing values, picked per MODE_SELECTION
            'random': a random existing catagory, or a uniformly random value between
                the min and max of the non-missing values
            'balanced_random': a random non-missing sample (so common catagories are
                picked more often), or a value from a normal distribution with the
                mean and std of the non-missing values
            anything else: used directly as the replacement value
        missing_value: the value which marks a sample as missing. NaN-like values
            (the default) are detected with isna().
        log: provided by the decorator (pass verbose=True to get messages)
        The returned Series keeps the index of `col`.
    """
    # NaN never compares equal to anything (itself included), so a NaN-like
    # missing_value has to be detected with isna() instead of ==
    try:
        nan_like = bool(pd.isna(missing_value))
    except (TypeError, ValueError):
        nan_like = False
    is_missing = col.isna() if nan_like else (col == missing_value)
    without = col.loc[~is_missing]

    def _fill(draw):
        # Replace only the missing entries, calling draw(sample) once per entry
        filled = col.copy()
        if is_missing.any():
            filled.loc[is_missing] = [draw(sample) for sample in col.loc[is_missing]]
        return filled

    if isinstance(method, (pd.Series, np.ndarray)):
        assert len(method) == len(col), 'Both arrays are not of the same length'
        source = getattr(method, 'name', None)
        log(f'Replacing all samples with a "{col.name}" value of "{missing_value}" with their indexes in "{source}"')
        # Pair the two up by position, but keep col's own index
        replacement = pd.Series(np.asarray(method), index=col.index)
        return col.mask(is_missing, replacement)
    elif method == 'remove':
        log(f'Removing all samples with "{col.name}" values of "{missing_value}"')
        return without
    elif method == 'mean':
        if isCatagorical(col):
            raise TypeError("Cannot get mean of a catagorical feature")
        mean = without.mean()
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to the mean ({mean:.2f})')
        return col.mask(is_missing, mean)
    elif method == 'median':
        if isCatagorical(col):
            raise TypeError("Cannot get median of a catagorical feature")
        median = without.median()
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to the median ({median})')
        return col.mask(is_missing, median)
    elif method == 'mode':
        modes = without.mode()
        # I'm not sure how else to pick a mode, so just pick one at random
        if MODE_SELECTION == 'random':
            mode = random.choice(modes.tolist())
        elif MODE_SELECTION == 'first':
            mode = modes.iloc[0]
        elif MODE_SELECTION == 'last':
            mode = modes.iloc[-1]
        else:
            raise ValueError(f"Invalid MODE_SELECTION '{MODE_SELECTION}'")
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to a mode ({mode})')
        return col.mask(is_missing, mode)
    elif method == 'random':
        if isCatagorical(col):
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to random catagories')
            catagories = list(without.unique())
            return _fill(lambda sample: random.choice(catagories))
        else:
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to random values along a uniform distrobution')
            low, high = without.min(), without.max()
            return _fill(lambda sample: type(sample)(random.uniform(low, high)))
    elif method == 'balanced_random':
        if isCatagorical(col):
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to evenly distributed random catagories')
            # A plain list, because random.choice indexes by position
            present = without.tolist()
            return _fill(lambda sample: random.choice(present))
        else:
            log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to random values along a normal distrobution')
            mean, std = without.mean(), without.std()
            return _fill(lambda sample: type(sample)(random.gauss(mean, std)))
    else:
        log(f'Setting all samples with a "{col.name}" value of "{missing_value}" to {method}')
        return col.mask(is_missing, method)
454
def query(df:pd.DataFrame, column:str, query:str, method:Union[pd.Series, Literal['remove', 'new', 'mean', 'median', 'mode', 'random', 'balanced_random'], Any], true=1, false=0, verbose=False):
    """ Change the `column` value of every sample matching the DataFrame.query()
        string `query`. Works on (and returns) a copy of `df`.

        method:
            a pd.Series: take the value with the same index label from it
            'remove': drop the matching samples
            'new': (re)create `column`, set to `true` for the matching samples and
                `false` for the rest
            'mean' / 'median': the mean/median of the whole column
                (TypeError for catagorical columns)
            'mode': a mode of the whole column, picked per MODE_SELECTION
            'random': a random existing catagory, or a uniformly random value between
                the min and max of the column
            'balanced_random': a random existing sample of the column, or a value from
                a normal distribution with the mean and std of the column
            anything else: used directly as the new value
        verbose: print a message describing what is being done
    """
    # The module-level `log` name is math.log, so messages need their own printer here
    if verbose:
        log = lambda s: print(f'\t{s}')
    else:
        log = lambda s: None

    df = df.copy()
    if isinstance(method, pd.Series):
        log(f'Changing all samples where "{query}" is true to have the {column} values of their indecies in "{method.name}"')
        q = df.query(query)
        df.loc[q.index, column] = method.loc[q.index]
    elif method == 'remove':
        log(f'Removing all samples where "{query}" is true')
        df = df.drop(df.query(query).index)
    elif method == 'mean':
        if isCatagorical(df[column]):
            raise TypeError("Cannot get mean of a catagorical feature")
        mean = df[column].mean()
        log(f'Setting all samples where {query} is true to the mean of "{column}" ({mean:.2f})')
        df.loc[df.query(query).index, column] = mean
    elif method == 'median':
        if isCatagorical(df[column]):
            raise TypeError("Cannot get median of a catagorical feature")
        median = df[column].median()
        log(f'Setting all samples where "{query}" is true to the median of "{column}" ({median})')
        df.loc[df.query(query).index, column] = median
    elif method == 'mode':
        modes = df[column].mode()
        # I'm not sure how else to pick a mode, so just pick one at random
        if MODE_SELECTION == 'random':
            mode = random.choice(modes.tolist())
        elif MODE_SELECTION == 'first':
            mode = modes.iloc[0]
        elif MODE_SELECTION == 'last':
            mode = modes.iloc[-1]
        else:
            raise ValueError(f"Invalid MODE_SELECTION '{MODE_SELECTION}'")
        log(f'Setting all samples where "{query}" is true to a mode of "{column}" ({mode})')
        df.loc[df.query(query).index, column] = mode
    elif method == 'random':
        if isCatagorical(df[column]):
            log(f'Setting all samples where "{query}" is true to have random catagories')
            catagories = list(df[column].unique())
            fill = lambda s: random.choice(catagories)
        else:
            log(f'Setting all samples where "{query}" is true to have random values along a uniform distrobution')
            low, high = df[column].min(), df[column].max()
            fill = lambda s: type(s)(random.uniform(low, high))

        q = df.query(query)
        df.loc[q.index, column] = q[column].apply(fill)
    elif method == 'new':
        # Run the query first, in case it refers to the column about to be overwritten
        q = df.query(query)
        df[column] = false
        df.loc[q.index, column] = true
    elif method == 'balanced_random':
        if isCatagorical(df[column]):
            log(f'Setting all samples where "{query}" is true to have evenly distributed random catagories')
            # A plain list, because random.choice indexes by position
            present = df[column].tolist()
            fill = lambda s: random.choice(present)
        else:
            log(f'Setting all samples where "{query}" is true to have random values along a normal distrobution')
            mean, std = df[column].mean(), df[column].std()
            fill = lambda s: type(s)(random.gauss(mean, std))

        q = df.query(query)
        df.loc[q.index, column] = q[column].apply(fill)
    else:
        log(f'Setting all samples where "{query}" is true to have a "{column}" value of {method}')
        df.loc[df.query(query).index, column] = method
    return df
514
@_cleaning_func(col=pd.Series)
def remove(col, val, log=...):
    """ Drop every sample of `col` which equals `val`
        log: provided by the decorator (pass verbose=True to get messages)
    """
    log(f'Removing all samples with a "{col.name}" value of {val}')
    doomed = col[col == val]
    return col.drop(index=doomed.index)
520
@_cleaning_func(col=pd.Series)
def bin(col, method:Union[Literal['frequency', 'width'], Tuple, List], amt=5, log=...):
    """ Bin a quantitative column into intervals

        method:
            'frequency': `amt` bins holding (roughly) the same number of samples each
                (pd.qcut; duplicate bin edges are dropped)
            'width': `amt` bins of equal width (pd.cut)
            a tuple/list: use it as the bin edges (pd.cut)
        log: provided by the decorator (pass verbose=True to get messages)
        Raises TypeError for catagorical columns and for any other method.
    """
    if isCatagorical(col):
        raise TypeError(f"Can't bin catagorical feature '{col.name}'")

    # Check for custom edges first, so a sequence is never compared against the names
    if isinstance(method, (tuple, list)):
        log(f'Custom binning "{col.name}" into {len(method)} bins')
        return pd.cut(col, method)
    elif method == 'frequency':
        log(f'Binning "{col.name}" by frequency into {amt} bins')
        return pd.qcut(col, amt, duplicates='drop')
    elif method == 'width':
        log(f'Binning "{col.name}" by width into {amt} bins')
        # An integer `bins` makes pd.cut split the range of the data into equal-width bins
        return pd.cut(col, amt)
    else:
        raise TypeError(f"Bin method parameter given invalid option {method}")
537
@_cleaning_func(df=pd.DataFrame)
def rescale(df, return_scaler=False, log=...):
    """ Fit a MinMaxScaler on `df` and return the transformed data as a DataFrame with
        the same column names. With `return_scaler` set, returns (data, scaler).
        log: provided by the decorator (pass verbose=True to get messages)
    """
    log('Rescaling')
    scaler = MinMaxScaler()
    scaler.fit(df)
    scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)
    if return_scaler:
        return scaled, scaler
    return scaled
545
def convert_time(df_or_col, col:str=None, method:Literal['timestamp']='timestamp', verbose=False):
    """ Convert time values to numbers of seconds

        - a DataFrame without `col`: returns a copy with every time column (per
          timeFeatures()) converted
        - a DataFrame with `col`: returns just that column, converted
        - a Series: returns it converted (giving `col` as well is an error)
        Datetimes become their POSIX timestamp, timedeltas their total seconds, and
        missing values (NaT) become NaN.
        `method` and `verbose` are accepted but not used.
    """
    assert not (isinstance(df_or_col, pd.Series) and col is not None), 'Please dont provide a col parameter if passing a Series'

    def _seconds(value):
        # NaT has a timestamp() method too, but it raises, so check for it first
        if pd.isna(value):
            return np.nan
        if hasattr(value, 'timestamp'):
            return value.timestamp()
        # Timedeltas (which timeFeatures() also selects) have no timestamp()
        return value.total_seconds()

    if isinstance(df_or_col, pd.DataFrame) and col is None:
        df = df_or_col.copy()
        for name in timeFeatures(df).columns:
            df[name] = df[name].apply(_seconds)
        return df
    else:
        if isinstance(df_or_col, pd.DataFrame):
            df_or_col = df_or_col[col]
        return df_or_col.apply(_seconds)
556
def convert_numeric(df, col:str=None, method:Literal['assign', 'one_hot_encode']='one_hot_encode', returnAssignments=False, skip=(), verbose=False):
    """ Convert catagorical data to numbers. Works on (and returns) a copy.

        df: a DataFrame or a Series
        col: the column[s] to convert. Required for a DataFrame with method 'assign';
            with 'one_hot_encode' it defaults to all the catagorical columns
        method:
            'assign': replace every catagory with an arbitrary integer code
                (pd.factorize). With `returnAssignments` set, returns
                (result, the catagories in code order)
            'one_hot_encode': pd.get_dummies
        skip: column[s] to leave alone when one hot encoding
        verbose: print a message describing what is being done
        Raises TypeError if the data is already quantitative, or for an unknown method.
    """
    # The module-level `log` name is math.log, so messages need their own printer here
    if verbose:
        log = lambda s: print(f'\t{s}')
    else:
        log = lambda s: None

    df = df.copy()
    if (col is not None and isQuantatative(df[col])) or (col is None and isinstance(df, pd.Series) and isQuantatative(df)):
        raise TypeError("Series given is already quantatitive")

    if method == 'assign':
        log(f'Converting "{col}" to quantatative by assinging to arbitrary values')
        if isinstance(df, pd.Series):
            column, assings = pd.factorize(df)
            return (column, assings) if returnAssignments else column
        else:
            assert col is not None, 'Please provide column to assign'
            column, assings = pd.factorize(df[col])
            df[col] = column
            return (df, assings) if returnAssignments else df
    elif method == 'one_hot_encode':
        if col is None and not isinstance(df, pd.DataFrame):
            log(f'Converting "{getattr(df, "name", None)}" to quantatative by one hot encoding')
            return pd.get_dummies(df)

        if col is not None:
            candidates = ensureIterable(col)
        else:
            candidates = catagorical(df).columns
        skipped = set(ensureIterable(skip))
        # Deduplicate while keeping the order, so the resulting columns come out in the
        # same order every run. Skipping a column which isn't a candidate is harmless.
        columns = [c for c in dict.fromkeys(candidates) if c not in skipped]

        log('Converting DataFrame to quantatative by one hot encoding')
        return pd.get_dummies(df, columns=columns)
    else:
        raise TypeError(f"Bad method arguement '{method}' given to convert_numeric")
600
def split(*data, amt=.2, method:Literal['random', 'chunk', 'head', 'tail']='random', target=(), splitTargets=False, seed=42):
    """ Splits the given data, both into train/test sets, and by taking out targets at the same time
        `target` can be a string or an iterable
        If `splitTargets` is set to False, the targets will always return DataFrames, even if
            they only have 1 column
        If you pass in multiple items for data, AND specify a target feature[s], then all the items
            must have the target columns
        The order goes:
            train_X, test_X, train_X1, test_X1, ..., train_y, test_y, train_y1, test_y1
            where it continues adding data and target splits in the order they are given.
            Simply put, it outputs in the same order you input the parameters as much as possible.
            Don't give multiple data AND split targets at the same time. While it can do it,
            it's simply too confusing to think through the order of the returned parameters.
        If no target is given, only the data splits are returned.
        `amt` is the fraction of samples which goes in the second part of each pair
            (for 'head', it's the fraction which goes in the first part instead).
        Setting the `method` to 'chunk' is the same as setting it to 'tail'.
        'random' uses sklearn's train_test_split with `seed` as the random_state.
        Raises TypeError for an unknown method. The data passed in is not modified.
    """
    if isinstance(target, str) or not isinstance(target, Iterable):
        targetNames = [target]
    else:
        targetNames = list(target)

    if len(data) > 1 and targetNames:
        warn("Please don't give multiple data AND split targets at the same time. While it can do it, "
             "it's simply too confusing to think through the order of the returned parameters.")

    # Pop the targets, and keep the data and the targets apart so the final order is
    # all the data (in the order given), then all the targets (in the order given)
    features = []
    labels = []
    for d in data:
        d = d.copy()
        popped = [d.pop(t) for t in targetNames]
        features.append(d)
        if not popped:
            # No targets: don't add an (empty) target frame to the things to split
            continue
        if splitTargets:
            labels += popped
        else:
            labels.append(pd.DataFrame(dict(zip(targetNames, popped))))
    splitMe = features + labels

    # Now split everything in the list (order is important!)
    if method == 'random':
        return skms.train_test_split(*splitMe, test_size=amt, random_state=seed)
    elif method in ('head', 'tail', 'chunk'):
        rtn = []
        for d in splitMe:
            # Head and tail splitting are the same, just with opposite amts
            cut = round(len(d) * (amt if method == 'head' else (1-amt)))
            rtn += [d.iloc[:cut], d.iloc[cut:]]
        return rtn
    else:
        raise TypeError("Invalid method parameter given")
644
645 # The main functions
646 def explore(data,
647 target=None,
648 stats=None,
649 additionalStats=[],
650 missing=True,
651 corr=.55,
652 entropy=None,
653 start='Description',
654 startFeature=None,
655 startx=None,
656 starty=None,
657 startHue=None,
658 alpha=None,
659 ):
660 # Parse params and make sure all the params are valid
661 assert not isinstance(target, (list, tuple)), 'There can only be 1 target feature'
662 assert target is None or target in data.columns, f'Target {target} is not one of the features'
663 assert startFeature is None or startFeature in data.columns, 'startFeature must be a valid column name'
664 assert len(data), 'DataFrame cannot be empty'
665
666 if stats is None:
667 stats = ['mean', 'median', 'std', 'min', 'max']
668 if stats:
669 stats += ensureIterable(additionalStats, True)
670 if startFeature is None:
671 if target is not None:
672 startFeature = target
673 else:
674 startFeature = data.columns[0]
675
676 # Define variables
677 whatTheHeck = (corr, missing)
678 max_name_len = len(max(data.columns, key=len))
679 ALPHA = min(1, 1000/len(data)) if alpha is None else alpha
680
681 # Define widget[s]
682 combobox = widgets.Dropdown(
683 options=[
684 'Description',
685 'Features',
686 'Head',
687 'Stats',
688 'Missing',
689 'Duplicates',
690 'Entropy',
691 'Counts',
692 'Correlations',
693 'General Plots',
694 'Custom Plots',
695 'Matrix',
696 'Alerts',
697 ],
698 value=start,
699 description='Select Summary',
700 style={'description_width': 'initial'},
701
702 # title='hello there'
703 )
704 # This doesn't work
705 # combobox.box_style = 'primary'
706
707 # To make this work, this is always there, we just set it to hidden when not
708 # under the features page
709 featureBox = widgets.Dropdown(
710 options=list(data.columns),
711 value=startFeature,
712 description='Feature',
713 )
714 featureBox.layout.visibility = 'hidden'
715 featureABox = widgets.Dropdown(
716 options=list(data.columns),
717 value=startx if startx is not None else startFeature,
718 description='x',
719 )
720 featureABox.layout.visibility = 'hidden'
721 featureBBox = widgets.Dropdown(
722 options=list(data.columns),
723 value=starty if starty is not None else startFeature,
724 description='y',
725 )
726 featureBBox.layout.visibility = 'hidden'
727 featureHueBox = widgets.Dropdown(
728 options=list(data.columns) + ['None'],
729 value=startHue if startHue is not None else 'None',
730 description='hue',
731 )
732 featureHueBox.layout.visibility = 'hidden'
733 outlierSlider = widgets.FloatSlider(
734 value=3,
735 min=0.,
736 max=10.,
737 step=0.1,
738 description='Z-Score:',
739 # disabled=False,
740 continuous_update=CONTINUOUS_UPDATE_SLIDER,
741 # orientation='horizontal',
742 # readout=True,
743 # readout_format='.1f',
744 )
745 outlierSlider.layout.visibility = 'hidden'
746
747 # All the actual logic
748 def output(page, feature, a, b, hue, zscore):
749 # See baffled comment above
750 corr, missing = whatTheHeck
751 featureBox.layout.visibility = 'hidden'
752 featureABox.layout.visibility = 'hidden'
753 featureBBox.layout.visibility = 'hidden'
754 featureHueBox.layout.visibility = 'hidden'
755 outlierSlider.layout.visibility = 'hidden'
756 # Clear the output (because colab doesn't automatically or something?)
757 clear_output(wait=True)
758
759 plt.xticks(rotation=45)
760
761 # match page:
762 if page == 'Description':
763 print(f'There are {len(data):,} samples, with {len(data.columns)} columns:')
764 print()
765 print(', '.join(data.columns))
766 print()
767 print('which have types:')
768 display(getNiceTypesTable(data))
769
770 if len(quantitative(data)):
771 print('\nThe possible values for the Catagorical values:')
772 # This is just an overly complicated way to print them all nicely
773 for key, value in sort_dict_by_value_length(dict([(c, data[c].unique()) for c in catagorical(data).columns])).items():
774 # If it has too high of a cardinality, just print the first few
775 card = len(value)
776 shortened = False
777 if card > 30:
778 shortened = True
779 value = value[:29]
780
781 print(key + ":")
782 joined_list = ", ".join(value)
783 if len(joined_list) <= 80: # adjust this number as needed
784 print(' ' + joined_list)
785 else:
786 for item in value:
787 print(' ' + item)
788 if shortened:
789 print(f'... ({card - 29} more catagories)')
790 elif page == 'Stats':
791 if len(quantitative(data)):
792 # print('Summary of Quantatative Values:')
793 display(data.agg(dict(zip(quantitative(data), [stats]*len(data.columns)))))
794 elif page == 'Entropy':
795 todo('Calculate entropy relative to the target feature')
796 # if target is not None:
797 # base = e if entropy is not None else entropy
798 for c in data.columns:
799 print(f'The entropy of {c:>{max_name_len}} is: {round(scipy.stats.entropy(data[c].value_counts(normalize=True), base=entropy), 3)}')
800 # print(f'The entropy of {c} is: {entropy(data[c], data[target])}')
801 # else:
802 # print('Target feature must be provided in order to calculate the entropy')
803 elif page == 'Duplicates':
804 todo()
805 elif page == 'Head':
806 with pd.option_context('display.max_columns', None):
807 display(data.head())
808 elif page == 'Counts':
809 # This is sorted just so the features with less unique options go first
810 for i in sorted(catagorical(data), key=lambda c: len(data[c].unique())):
811 print(f'{i} value counts:')
812
813 if len(data[i].unique()) == len(data[i]):
814 print('\tEvery sample has a unique catagory')
815 else:
816 print(pretty_counts(data[i]))
817 elif page == 'Correlations':
818 if len(quantitative(data)):
819 print('Correlations Between Quantatative Values:')
820 if type(corr) is bool:
821 display(quantitative(data).corr())
822 elif isinstance(corr, (int, float)):
823 corr = normalizePercentage(corr)
824 # Ignore if they're looking for a negative correlation, just get both
825 corr = abs(corr)
826 _corr = significantCorrelations(quantitative(data), corr)
827 if len(_corr):
828 a_len = max([len(i[0]) for i in _corr])
829 b_len = max([len(i[1]) for i in _corr])
830 for a,b,c in _corr:
831 print(f'\t{a:<{a_len}} <-> {b:<{b_len}}: {round(c, 2):+}')
832 else:
833 print(f'\tThere are no correlations greater than {corr:.0%}')
834 elif page == 'Missing':
835 # if len(_relevant):
836 print('Missing Percentages:')
837 if type(missing) is bool:
838 percent = data.isnull().sum()/len(data)*100
839 # This works, but instead I decided to overcomplicate it just so I can indent it
840 # print(percent)
841 print(pretty_2_column_array(percent))
842 elif isinstance(missing, (int, float)):
843 missing = normalizePercentage(missing)
844 _missing = missingSummary(data, missing/100)
845 if len(_missing):
846 display(_missing)
847 else:
848 print(f'\tAll values are missing less than {missing:.0%} of their entries')
849 else:
850 raise TypeError('Missing is a bad type')
851 elif page == 'Features':
852 # TODO: mode[s], std, quantative entropy, catagorical correlations, data.groupby(feature)[target].value_counts(),
853 featureBox.layout.visibility = 'visible'
854
855 # Quantative and Catagorical attributes
856 group = 'catagorical' if isCatagorical(data[feature]) else 'quantative'
857 missing = data[feature].isnull().sum()/len(data[feature])
858 shared = f'"{feature}" is {"the target" if feature == target else "a"} {group} feature of type {data[feature].dtype}.\n' \
859 f'{missing:.1%} of it is missing.'
860
861 # Catagorical description
862 if isCatagorical(data[feature]):
863 print(shared)
864 print(f'It has an entropy of {scipy.stats.entropy(data[feature].value_counts(normalize=True), base=entropy):.3f}', end=', ')
865 print(f'and a cardinaltiy of {len(data[feature].unique())}')
866 print('Value counts:')
867 print(pretty_counts(data[feature], paren=True))
868
869 sns.histplot(data[feature])
870
871 # Quantative description
872 else:
873 # Set the slider variables
874 outlierSlider.layout.visibility = 'visible'
875 # Todo This breaks on time data (I think)
876 # Todo This is usable, but can definitely be improved
877 if data[feature].std() > 1:
878 outlierSlider.max = abs(data[feature].max()) / data[feature].std()
879 else:
880 outlierSlider.max = abs(data[feature].max()) * data[feature].std()
881
882 correlations = []
883 for a, b, c in significantCorrelations(quantitative(data), corr):
884 other = None
885 if a == feature:
886 other = b
887 elif b == feature:
888 other = a
889 if other is not None:
890 correlations.append(f'{other}({c:.1%})')
891
892 if len(correlations):
893 correlations = 'It correlates with ' + ', '.join(correlations)
894 else:
895 correlations = f'It has no significant (>{corr:.1%}) correlations with any features'
896
897 print(shared)
898 print(f'It has an average value of {data[feature].mean():,.2f}, and a median of {data[feature].median():,.2f}.')
899 # Because dates are weird
900 try:
901 print(f'It has a kurtosis value of {scipy.stats.kurtosis(data[feature]):,.2f}.')
902 print('\tNegative values mean less outliers than a normal distrobution, positive values mean more.')
903 except np.core._exceptions.UFuncTypeError: pass
904 print(f'It has a minimum value of {data[feature].min():,.2f}, and a maximum value of {data[feature].max():,.2f}.')
905 print(correlations)
906
907 # sns.scatterplot(data=data[feature])
908 # plt.show()
909 # display(interactWithOutliers(data, feature))
910
911 # def interactWithOutliers(df, feature=None, step=.2):
912 # widgets.interactive(
913 print()
914 showOutliers(data, feature, zscore, alpha=ALPHA)
915 # data=widgets.fixed(df),
916 # column=list(df.columns) if feature is None else widgets.fixed(feature),
917 # zscore=(0., df[feature].max() / df[feature].std(), step) if feature is not None else (0., 10, step)
918 # zscore=(0., 20, step)
919 # )
920
921 print()
922 # todo('Add nice plots here: scatterplots, histograms, and relating to the target feature')
923 elif page == 'General Plots':
924 if len(quantitative(data)):
925 print('Plot of Quantatative Values:')
926 sns.catplot(data=quantitative(data))
927 plt.show()
928 if len(catagorical(data)):
929 print('Plot of Catagorical Value Counts:')
930 todo('catagorical (count?) plots')
931 # plt.show()
932 elif page == 'Custom Plots':
933 featureABox.layout.visibility = 'visible'
934 featureBBox.layout.visibility = 'visible'
935 featureHueBox.layout.visibility = 'visible'
936
937 graph = sns.scatterplot(x=data[a], y=data[b], hue=None if hue == 'None' else data[hue], alpha=ALPHA)
938 if isQuantatative(data[a]) and isQuantatative(data[b]):
939 try:
940 graph.set(title=f'Correlation: {data.corr()[a][b]:0.1%}')
941 except KeyError:
942 print('Cant calculate the correlations of dates for some reason')
943 else:
944 # counts = data.groupby(a)[b].value_counts()
945 # print(counts.index.max())
946 # print(counts)
947 graph.set(title='Most common together: Todo')
948
949 plt.show()
950 elif page == 'Matrix':
951 if len(quantitative(data)):
952 print('Something Something Matrix:')
953 if target in quantitative(data):
954 sns.pairplot(data=quantitative(data), hue=target)
955 else:
956 sns.pairplot(data=quantitative(data))
957 plt.show()
958 elif page == 'Alerts':
959 # TODO:
960 # Check that entropy isn't too low
961 # check that relative entropy isn't too low
962 # check for spikes and plummets
963 # high correlations between features
964 # Print the kurtosis score with the outlier stuff
965
966 # Check if our dataset is small
967 if data[feature].count() < SMALL_DATASET:
968 print(f"Your dataset isn't very large ({data[feature].count()}<{SMALL_DATASET})")
969
970 # Check the cardinality
971 for c in catagorical(data):
972 card = len(data[c].unique())
973 if card == 1:
974 print(f'All values in feature "{c}" are the same')
975 elif card >= data[feature].count():
976 print(f'Every value in feature "{c}" is unique, are you sure its not quantatative?')
977 elif card > HIGH_CARDINALITY:
978 print(f'Feature "{c}" has a very high cardinality ({card}>{HIGH_CARDINALITY})')
979
980 # Check we're not missing too many
981 for i in data.columns:
982 miss = data[i].isnull().sum()/len(data[i])
983 if miss >= ALERT_MISSING:
984 print(f'Feature {i} is missing a significant portion ({miss}>={ALERT_MISSING})')
985
986 # Check for outliers
987 for q in quantitative(data):
988 try:
989 upper = data[q].max() - data[q].quantile(.75)
990 upperMid = data[q].quantile(.75) - data[q].median()
991 if upper - upperMid > OUTLIER_THRESHOLD * data[q].median():
992 print(f'Feature {q:>{max_name_len}} may have some upper outliers', end=' | ')
993 print(f'upper: {upper:>6.1f} | upperMid: {upperMid:>6.1f} | median: {data[q].median():>6.1f} | diff: {upper-upperMid:>6.1f}')
994
995 lower = data[q].quantile(.25) - data[q].min()
996 lowerMid = data[q].median() - data[q].quantile(.25)
997 if lower - lowerMid > OUTLIER_THRESHOLD * data[q].median():
998 print(f'Feature {q:>{max_name_len}} may have some lower outliers', end=' | ')
999 print(f'lower: {lower:>6.1f} | lowerMid: {lowerMid:>6.1f} | median: {data[q].median():>6.1f} | diff: {lower-lowerMid:>6.1f}')
1000 except TypeError:
1001 todo('checking dates for outliers isnt implemented')
1002 else:
1003 print('Invalid start option')
1004
1005 # widgets.interact(output, page=combobox, feature=featureBox)
1006 ui = widgets.GridBox([combobox, featureABox, featureBox, featureBBox, outlierSlider, featureHueBox], layout=widgets.Layout(
1007 grid_template_columns='auto auto',
1008 grid_row_gap='10px',
1009 grid_column_gap='100px',
1010 )
1011 )
1012 out = widgets.interactive_output(output, {'page': combobox, 'feature': featureBox, 'a': featureABox, 'b': featureBBox, 'hue': featureHueBox, 'zscore': outlierSlider})
1013 display(ui, out)
1014 quickSummary = explore
1015
def suggestedCleaning(df, target):
    """ Placeholder: not implemented yet.
        Neither `df` nor `target` is used; the function only hands the reminder
        string 'suggestedCleaning' to `todo()`.
    """
    todo('suggestedCleaning')
1018
def _cleanColumn(df, args, column, verbose, ignoreWarnings=False):
    """ Run the cleaning operations listed in `args` against one column of `df`.

        Arguments:
            df: the DataFrame to work on. Most operations assign back into it.
            args: dict of operation name -> options, processed in dict order. The
                recognized operations are drop_duplicates, handle_outliers, replace,
                apply, missing_value, handle_missing, queries, remove, bin, normalize,
                convert_numeric, add_column and drop.
            column: the column to operate on. May be None, which is only meaningful
                for add_column (it doesn't need an existing column).
            verbose: forwarded to the helper functions.
            ignoreWarnings: when True, `True` is tolerated as an option value and a
                few type mismatches are skipped instead of raised.

        Returns:
            The resulting DataFrame. Some operations (queries, convert_numeric, drop)
            rebind `df`, so always use the return value.

        Raises:
            TypeError: if `column` isn't in `df`, an operation name is unknown, or an
                option has the wrong type.
    """
    # NOTE(review): `log` is not defined in this function. The top of the file imports
    # `log` from `math`, so unless a logging helper with that name is defined later in
    # the module these calls will fail -- confirm before relying on them.
    missing = np.nan

    # We're allowing column to be None for the specific case of add_column (which doesn't require a column)
    if column is not None and column not in df.columns:
        raise TypeError(f'Column "{column}" provided is not in the given DataFrame')

    for op, options in args.items():
        # Quick parameter type checking for bools
        if options is False and op not in ('missing_value', 'remove'):
            continue
        if options is True and not ignoreWarnings and op not in ('drop_duplicates', 'missing_value', 'remove', 'drop'):
            raise TypeError(f"'True' is an invalid option for {op} (for column {column})")

        if op == 'drop_duplicates':
            warn('drop_duplicates hasnt been implemented yet for induvidual columns. What are you trying to do?')
        elif op == 'handle_outliers':
            zscore, method = options
            # This used to read df[colulmn], which raised a NameError
            df[column] = handle_outliers(df[column], method, zscore=zscore, verbose=verbose)
        elif op == 'replace':
            if not isinstance(options, dict):
                raise TypeError(f"Please specify a dict for the replace option (under column {column})")
            log(f'Replacing specified entries in {column}', verbose)
            df[column] = df[column].replace(options)
        elif op == 'apply':
            if callable(options):
                log(f'Applying function to {column}')
                # Series.apply has no axis parameter; passing axis=1 would forward it
                # to the user's function as an unexpected keyword argument
                df[column] = df[column].apply(options)
            elif not ignoreWarnings:
                raise TypeError(f"Please specify a function to apply (under column {column})")
        elif op == 'missing_value':
            # Remembered for any handle_missing operation that comes after it
            missing = options
        elif op == 'handle_missing':
            if options in ('mean', 'median') and isCatagorical(df[column]) and ignoreWarnings:
                continue
            # This will throw the appropriate errors otherwise
            df[column] = handle_missing(df[column], method=options, missing_value=missing, verbose=verbose)
        elif op == 'queries':
            # NOTE(review): this guard looks copied from handle_missing; `options` is a
            # (query, method) pair or a list of them here, so it is unlikely to ever match
            if options in ('mean', 'median') and isCatagorical(df[column]) and ignoreWarnings:
                continue
            # If there's just one query, just accept it
            if len(options) == 2 and type(options[0]) is str:
                options = [options]
            for q, method in options:
                df = query(df, column, q, method, verbose=verbose)
        elif op == 'remove':
            df[column] = remove(df[column], options, verbose=verbose)
        elif op == 'bin':
            if isCatagorical(df[column]) and not ignoreWarnings:
                warn(f'The bin option was set on "{column}", which is not quantatative, skipping.')
            elif isinstance(options, (tuple, list)) and len(options) == 2 and isinstance(options[0], str):
                # ('frequency', n) or ('width', n). `method` and `amt` used to be
                # referenced here without ever being defined.
                method, amt = options
                df[column] = bin(df[column], method, amt, verbose=verbose)
            else:
                # Anything else is handed over as the method (i.e. custom ranges)
                df[column] = bin(df[column], options, verbose=verbose)
        elif op == 'normalize':
            if isCatagorical(df[column]) and not ignoreWarnings:
                warn(f'The normalize option was set on {column}, which is not quantatative, skipping.')
            else:
                df[column] = normalize(df[column], options, verbose=verbose)
        elif op == 'convert_numeric':
            if isQuantatative(df[column], time=False) and not ignoreWarnings:
                warn(f'The conver_numeric option was set on {column}, which is not catagorical, skipping.')
            else:
                df = convert_numeric(df, column, options, verbose=verbose)
        elif op == 'add_column':
            if not isinstance(options, (tuple, list)):
                raise TypeError(f"add_column argument must be a tuple, or a list of tuples, not {type(options)}")
            # A single (name, values) pair gets wrapped so we can always loop
            if not isinstance(options[0], (tuple, list)):
                options = [options]
            for name, selection in options:
                log(f'Adding new column "{name}"')
                df[name] = selection
        elif op == 'drop':
            if options:
                log(f'Dropping column "{column}"')
                df = df.drop(columns=[column])
        else:
            raise TypeError(f'Invalid arguement {op} given')

    return df
1099
def clean(df:pd.DataFrame,
          config: Dict[str, Dict[str, Any]],
          verbose:bool=False,
          split:str=None,
    ) -> pd.DataFrame:
    """ DEPRECATED: this function raises a DeprecationWarning as soon as it's called.
        What it was written to do: return a cleaned copy of the DataFrame passed to it.
        NOTE: The order of the entries in the config dict determine the order they are performed

        Arguments:
            config is a dict of this signature:
            NOTE: This is the suggested order
            {
                # Do these to all the columns, or a specified column
                'column/all': {
                    # Drop duplicate samples
                    'drop_duplicates': bool,
                    # Removes samples which have a Z-score magnitude of greater than this value
                    'handle_outliers': Union[bool, Tuple[float, Union['remove', 'constrain']]],
                    # Maps feature values to a dictionary
                    'replace': Union[bool, Dict],
                    # Applies a function to the column
                    'apply': Union[bool, Callable],
                    # A list of (query, replacements).
                    'queries': Union[bool, List[Tuple[str, Union[Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any]]]],
                    # A ndarray of shape (1, n) of values to create a new column with the given name
                    'add_column': Union[Tuple[str, np.ndarray], List[Tuple[str, np.ndarray]]],
                    # Specifies a value that is equivalent to the feature being missing
                    'missing_value': Any,
                    # Specifies a method by which to transform samples with missing features. Acts just like queries, but with missing values specifically
                    'handle_missing': Union[bool, Series, 'remove', 'mean', 'median', 'mode', 'random', 'balanced_random', Any],
                    # Removes all samples with the given value
                    'remove': Union[bool, Any],
                    # Specifies a method by which to bin the quantative value, or specify custom ranges
                    'bin': Union[bool, Tuple['frequency', int], Tuple['width', int], Iterable],
                    # Specifies a method by which to normalize the quantative values
                    'normalize': Union[bool, 'min-max', 'range'],
                    # Specifies a method by which to convert a catagorical feature to a quantative one
                    'convert_numeric': Union[bool, 'assign', 'one_hot_encode'],
                    # Drop the column
                    'drop': bool,
                },
            }
            split: name of a column to split off; the old implementation returned
                (everything else, that column) instead of a single DataFrame.
    """
    raise DeprecationWarning('This function is no longer supported and is likely to break')

    # Nothing below this line runs; it's the old implementation, kept for reference
    df = df.copy()

    def log(s):
        if verbose:
            print(s)

    # Make sure the all section is done last (so if we're doing one hot encoding it doesn't throw errors)
    config = OrderedDict(config)
    if 'all' in config:
        config.move_to_end('all')

    for column, args in config.items():
        log(f'Working on "{column}"')

        if column.lower() != 'all':
            df = _cleanColumn(df, args, column, verbose)
            continue

        # We only want to add new columns once (not once per column), so call it manually
        if 'add_column' in args:
            df = _cleanColumn(df, {'add_column': args['add_column']}, None, verbose)
            del args['add_column']

        # Dropping duplicates means something different on the scale of a single column
        # than it does applied to the whole table
        if 'drop_duplicates' in args:
            log('\tDropping duplicate samples')
            df = df.drop_duplicates()
            del args['drop_duplicates']

        for c in df.columns:
            # Column specific options override 'all', and we don't want to redo them,
            # so strip anything this column already specifies for itself
            adjusted = args.copy()
            if c in config:
                for op in config[c]:
                    log(f'\tExcluding column {c} from {op}')
                    adjusted.pop(op, None)
            df = _cleanColumn(df, adjusted, c, verbose, True)

    if split is None:
        return df
    if split not in df.columns:
        raise TypeError('Provided feature not in the resulting data (did you drop it in the cleaning process by accident?)')
    return df.drop(columns=split), df[split]
1194
def resample(X, y, method:Literal['oversample', 'undersample', 'mixed']='oversample', seed=None):
    """ Rebalance a dataset by randomly over- or under-sampling it.

        Arguments:
            X: the features, handed to the sampler's fit_resample() as is
            y: the targets, handed to the sampler's fit_resample() as is
            method: 'oversample' uses RandomOverSampler, 'undersample' uses
                RandomUnderSampler. 'mixed' isn't implemented yet.
            seed: passed to the sampler as random_state

        Returns:
            Whatever the sampler's fit_resample(X, y) returns. Nothing is returned
            for 'mixed'.

        Raises:
            TypeError: if method isn't one of the options above
    """
    # The annotation used to be Union['oversample', ...], which typing reads as forward
    # references to types named oversample/undersample/mixed. Literal is what was meant.
    if method == 'oversample':
        sampler = RandomOverSampler(random_state=seed)
    elif method == 'undersample':
        sampler = RandomUnderSampler(random_state=seed)
    elif method == 'mixed':
        todo('figure out how to mix under and over sampling')
        return
    else:
        raise TypeError("Invalid method arguement given")

    return sampler.fit_resample(X, y)
1207
1208 # @_cleaning_func(test=pd.Series)
@_cleaning_func(testPredictions=pd.Series)
def evaluateQuantitative(test, testPredictions, train=None, trainPredictions=None, accuracy=3, explain=False, compact=False, line=False, log=...):
    """ Evaluate your predictions of an ML model which predicts a quantitative value.
        NOTE: compact overrides explain.

        Arguments:
            test: the ground truth for the test set
            testPredictions: the model's predictions for the test set
            train, trainPredictions: the same for the training set. Optional, but you
                have to give both or neither.
            accuracy: how many decimal places to print
            explain: print a short explanation under each score
            compact: print all the scores on one line
            line: also show a scatterplot of the predictions against the ground truth,
                colored by how far off each prediction is
            log: not used in the body (presumably consumed by the decorator -- confirm)

        Prints the scores, returns nothing.
    """
    assert (train is None) == (trainPredictions is None), 'You have to pass both train & trainPredictions'
    # @_cleaning_func() SHOULD handle this
    test = pd.Series(test)
    testPredictions = pd.Series(testPredictions)

    def _rmse(truth, pred):
        # Take the root ourselves instead of passing squared=False, which newer
        # versions of scikit-learn no longer accept
        return mean_squared_error(truth, pred) ** 0.5

    def _amtInPercent(truth, pred, percent):
        # The percentage of samples which are off by no more than `percent`% of the truth.
        # The parameter used to be misspelled ("precent"), so the body was silently
        # reading the loop variable of the caller instead of its own argument.
        return ((truth - pred).abs() / truth <= (percent / 100)).values.sum() / len(truth) * 100

    def _score(name, func, explaination, _test=True, **kwargs):
        name += ':'
        value = func(test, testPredictions, **kwargs) if _test else func(train, trainPredictions, **kwargs)
        if compact:
            print(f'{name} {value:,.{accuracy}f}', end=' ')
        else:
            print(f'\t{name:<23} {ensureNotIterable(value):,.{accuracy}f}')
            if explain:
                print('\t\t' + explaination)

    def _quantatative(_test=True):
        _score('Root Mean Square Error', _rmse, 'An average of how far off we are from the target, in the same units as the target. Smaller is better.', _test)
        _score('My own measure', lambda a, b: _rmse(a, b) / a.mean(), 'Root mean square / average value. Eliminates the domain a bit. Smaller is better.', _test)
        _score('Mean Absolute Error', mean_absolute_error, 'Similar to Root Mean Square Error, but better at weeding out outliers. Smaller is better.', _test)
        _score('Median Absolute Error', median_absolute_error, '', _test)
        _score('R^2 Score', r2_score, 'An average of how far off we are from just using the mean as a prediction. Larger is better.', _test)

        for percent in (5, 10, 20, 50):
            # percent=percent pins the current value to this lambda
            _score(f'Within {percent}%', lambda a, b, percent=percent: _amtInPercent(a, b, percent), f'How many of the samples are within {percent}% of their actual values', _test)

    print('Test:')
    _quantatative()
    if train is not None and trainPredictions is not None:
        print('\nTrain:')
        _quantatative(False)

    if line:
        sns.set(rc={'figure.figsize':(11.7,8.27)})

        delta = test - testPredictions
        testfinal = pd.DataFrame({
            'Predictions': testPredictions,
            'Ground Truth': test,
            'difference': delta,
            'percent_difference': abs(delta/test),
        })
        # Sort the relative errors into buckets, then give the buckets readable names
        testfinal['percent_difference'] = bin(testfinal['percent_difference'], method=(0, .05, .10, .20, .50, 1))
        testfinal['percent_difference'] = testfinal['percent_difference'].replace({
            pd.Interval(0,   .05, closed='right'): 'Within 5%',
            pd.Interval(.05, .1,  closed='right'): 'Within 10%',
            pd.Interval(.1,  .2,  closed='right'): 'Within 20%',
            pd.Interval(.2,  .5,  closed='right'): 'Within 50%',
            pd.Interval(0.5, 1.0, closed='right'): 'Within 100%',
        })
        color_dict = {
            'Within 5%':   'tab:green',
            'Within 10%':  'tab:green',
            'Within 20%':  'tab:blue',
            'Within 50%':  'tab:orange',
            'Within 100%': 'tab:red',
            # np.NaN was removed in NumPy 2.0, np.nan works everywhere
            np.nan:        'tab:red',
        }

        xlims = (0,1e3)
        ax = sns.scatterplot(data=testfinal,x='Ground Truth',y='Predictions',hue="percent_difference",palette=color_dict)
        # The diagonal, i.e. where a perfect prediction would land
        ax.plot(xlims,xlims, color='r')
        plt.show()
evaluateQ = evaluateQuantitative
1296
def evaluateCatagorical(test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False):
    """ Evaluate your predictions of an ML model which predicts a catagory.
        NOTE: compact overrides explain.

        Prints F1 (skipped if it raises a ValueError), accuracy, precision and recall
        for the test set, and then for the training set if it's given. `confusion` and
        `curve` additionally show a confusion matrix and a precision-recall curve for
        each set. train and trainPredictions have to be given together.
    """
    assert (train is None) == (trainPredictions is None), 'You have to pass both train & trainPredictions'

    def _score(name, func, explaination, _test=True, **kwargs):
        truth, predicted = (test, testPredictions) if _test else (train, trainPredictions)
        value = func(truth, predicted, **kwargs)
        label = name + ':'
        if compact:
            print(f'{label} {value:,.{accuracy}f}', end=' ')
            return
        print(f'\t{label:<23} {value:,.{accuracy}f}')
        if explain:
            print('\t\t' + explaination)

    def _catagorical(_test=True):
        # Can't do an F1 score with more than 2 classes
        try:
            _score('F1', sklearn.metrics.f1_score, 'F1 is essentially an averaged score combining precision and recall', _test)
        except ValueError:
            pass
        _score('Accuracy', sklearn.metrics.accuracy_score, 'Accuracy is a measure of how well the model did on average', _test)
        _score('Precision', sklearn.metrics.precision_score, 'Precision is a measure of how many things we said were true and we were wrong', _test)
        _score('Recall', sklearn.metrics.recall_score, 'Recall is a measure of how many things we missed out on', _test)

    def _plots(truth, predicted):
        # The optional graphs, shared between the test and train sections
        if confusion:
            ConfusionMatrixDisplay.from_predictions(truth, predicted, cmap='Blues')
            plt.show()
        if curve:
            PrecisionRecallDisplay.from_predictions(truth, predicted)
            plt.show()

    print('Test:')
    _catagorical()
    _plots(test, testPredictions)

    if train is None or trainPredictions is None:
        return

    print('\nTrain:')
    _catagorical(False)
    _plots(train, trainPredictions)
evaluateC = evaluateCatagorical
1343
def evaluate(catagorical, test, testPredictions, train=None, trainPredictions=None, accuracy=3, curve=False, confusion=False, explain=False, compact=False, line=False):
    """ DEPRECATED: use evaluateQ (evaluateQuantitative) or evaluateC (evaluateCatagorical) instead.

        The signature is kept so old calls fail with a clear message, but calling this
        always raises. The old body sat after the raise where it could never run, and
        its plotting section referenced names that aren't defined in this function
        (student_ds, targets), so it has been removed.

        Raises:
            AssertionError: if only one of train / trainPredictions is given
            DeprecationWarning: otherwise, always
    """
    assert (train is None) == (trainPredictions is None), 'You have to pass both train & trainPredictions'
    raise DeprecationWarning('Please use evaluateQ or evaluateC instead')
1440
1441
1442 """
1443 TODO: Add cross-validation to evaluate
1444 from sklearn.linear_model import LogisticRegression
1445 from sklearn.model_selection import cross_val_score
1446 import numpy as np
1447
1448 # X = ... # training data
1449 # y = ... # target variable
1450
1451 # model = LogisticRegression()
1452 # scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
1453
1454 # print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))
1455
1456 """
1457
1458
def importances(tree, names=None, rtn=False, graph=True, best=.01):
    """ Show which features a fitted model found most important.

        Arguments:
            tree: a fitted model with a feature_importances_ attribute
            names: the feature names. Defaults to tree.feature_names_in_
            rtn: return the table of importances (otherwise nothing is returned)
            graph: show a bar plot of the importances
            best: only keep features with an importance of at least this much.
                Anything falsey (0, None, False) keeps all of them.

        Returns:
            If rtn is set, a DataFrame with the columns 'feature' and 'importance',
            sorted with the most important feature first.
    """
    feature_names = tree.feature_names_in_ if names is None else names
    table = pd.DataFrame({'feature': feature_names, 'importance': tree.feature_importances_})

    if best:
        table = table[table['importance'] >= best]

    table = table.sort_values('importance', ascending=False)

    if graph:
        sns.catplot(data=table, x='importance', y='feature', kind='bar', height=10, aspect=2)
        plt.show()

    return table if rtn else None
1478
def saveStats(file, name, model, testY, predY, trainY=None, trainPredY=None, notes='', new=False, show=True, save=True):
    """ Write a report on a fitted model to a file, and/or print it.

        The report has the name, the notes, the model's type and parameters, its
        feature importances, and its evaluation scores.

        Arguments:
            file: path of the file to write the report to
            name: heading for the report
            model: the fitted model. It needs get_params() and whatever importances()
                needs from it.
            testY, predY: ground truth and predictions for the test set
            trainY, trainPredY: the same for the training set (optional)
            notes: free text printed under the heading
            new: overwrite the file instead of appending to it
            show: also print the report to stdout
            save: write the report to the file. This used to be ignored, and the file
                was always written.
    """
    # sklearn is already a dependency of this file, this just makes sure the helper is loaded
    from sklearn.base import is_classifier

    # This used to call evaluate(), which is deprecated and always raises (and the call
    # was missing its first argument anyway). Pick the replacement based on the model.
    scorer = evaluateCatagorical if is_classifier(model) else evaluateQuantitative

    def doit():
        print(name + ':')
        print(notes)
        print()
        print('Model type:', type(model))
        print('Parameters:')
        for key, val in model.get_params().items():
            print(f'\t{key}: {val}')
        print('\nImportances:')
        print(importances(model, rtn=True, graph=False))
        print('\nStats:')
        scorer(testY, predY, trainY, trainPredY, compact=False)
        print('\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n')

    if save:
        with open(file, 'w' if new else 'a') as f, redirect_stdout(f):
            doit()
    if show:
        doit()
1499
def plot_history(history):
    """ Plot the loss curves of a training run.
        `history` is indexed by 'index' (used for the x axis), 'loss' and 'val_loss'.
    """
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    for key, label in (('loss', 'Train Loss'), ('val_loss', 'Value Loss')):
        plt.plot(history['index'], history[key], label=label)
    plt.legend()
    plt.show()