Source code for deeptables.eda.utils

# -*- coding:utf-8 -*-
__author__ = 'yangjian'
"""

"""

import itertools

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt


def columns_info(dataframe, topN=10):
    """Build a per-column summary table for a DataFrame.

    Prints the shape of ``dataframe`` and returns a new frame indexed by the
    column names of ``dataframe`` with the following columns:

    * ``DataType`` - dtype of the column.
    * ``#Nulls`` / ``#Uniques`` - null count and number of distinct values.
    * ``Min`` / ``Mean`` / ``Max`` / ``Std`` - statistics computed with
      ``numeric_only=True``; columns excluded by that option end up as NaN
      through index alignment.
    * ``top{topN} val`` / ``top{topN} cnt`` / ``top{topN} raito`` - the most
      frequent values, their counts and their share of all rows (rounded to
      two decimals), each stored as the string form of a list.

    :param dataframe: the pandas DataFrame to describe.
    :param topN: how many of the most frequent values to report per column.
    :return: the summary DataFrame described above.
    """
    max_row = dataframe.shape[0]
    print(f'Shape: {dataframe.shape}')

    info = dataframe.dtypes.to_frame()
    info.columns = ['DataType']
    info['#Nulls'] = dataframe.isnull().sum()
    info['#Uniques'] = dataframe.nunique()

    # stats
    info['Min'] = dataframe.min(numeric_only=True)
    info['Mean'] = dataframe.mean(numeric_only=True)
    info['Max'] = dataframe.max(numeric_only=True)
    info['Std'] = dataframe.std(numeric_only=True)

    # Collect the top-N strings in plain lists and attach them afterwards.
    # Seeding the columns with integer 0 and overwriting cells with strings
    # relies on implicit dtype upcasting, which pandas has deprecated.
    top_values = []
    top_counts = []
    top_ratios = []
    for c in info.index:
        vc = dataframe[c].value_counts().head(topN)
        # .tolist() yields built-in Python scalars, so the string form does
        # not depend on how the installed NumPy version prints its scalars.
        top_values.append(str(vc.index.tolist()))
        top_counts.append(str(vc.tolist()))
        top_ratios.append(str((vc / max_row).round(2).tolist()))

    info[f'top{topN} val'] = top_values
    info[f'top{topN} cnt'] = top_counts
    info[f'top{topN} raito'] = top_ratios
    return info
def top_categories(df, category_feature, topN=30):
    """Return the ``topN`` most frequent values of one column.

    :param df: DataFrame holding the column.
    :param category_feature: name of the column to inspect.
    :param topN: maximum number of values to return.
    :return: index of the leading ``value_counts()`` entries, most frequent first.
    """
    frequencies = df[category_feature].value_counts()
    leading = frequencies.head(topN)
    return leading.index
def count_categories(df, category_features, topN=30, sort='freq', df2=None):
    """Show a count plot of the most frequent values of each categorical column.

    For every column in ``category_features`` the ``topN`` most frequent
    values of ``df`` are selected and plotted with ``seaborn.countplot``.
    When ``df2`` is given, the same values are plotted for ``df2`` in a
    second panel next to the first one, using the same bar order.

    :param df: DataFrame that determines which values are shown.
    :param category_features: iterable of column names to plot.
    :param topN: number of most frequent values to keep per column.
    :param sort: ``'freq'`` orders bars by descending frequency,
        ``'alphabetic'`` orders the same values by their sorted index.
    :param df2: optional second DataFrame plotted alongside ``df``.
    :raises ValueError: if ``sort`` is not one of the supported modes.
    :return: None; one figure is shown per column.
    """
    # Reject an unsupported mode up front; otherwise ``order`` would never be
    # bound and the loop would fail with an UnboundLocalError.
    if sort not in ('freq', 'alphabetic'):
        raise ValueError(f"sort must be 'freq' or 'alphabetic', got {sort!r}")

    for c in category_features:
        top_counts = df[c].value_counts().head(topN)
        if sort == 'freq':
            order = top_counts.index
        else:
            order = top_counts.sort_index().index

        if df2 is not None:
            plt.subplot(1, 2, 1)
        sns.countplot(x=c, data=df[df[c].isin(order)], order=order)
        plt.xticks(rotation=90)

        if df2 is not None:
            plt.subplot(1, 2, 2)
            sns.countplot(x=c, data=df2[df2[c].isin(order)], order=order)
            plt.xticks(rotation=90)

        # With two panels the heading belongs to the figure, not to one axes.
        if df2 is not None:
            plt.suptitle(f'{c} TOP{topN}', size=25)
        else:
            plt.title(f'{c} TOP{topN}', size=25)
        plt.tight_layout()
        plt.show()
    return
def hist_continuous(df, continuous_features, bins=30, df2=None):
    """Show a histogram for each continuous column.

    With only ``df`` a single histogram is drawn per column. When ``df2`` is
    supplied, the column is drawn for ``df`` in the left panel and for ``df2``
    in the right panel of the same figure.

    :param df: DataFrame providing the (left) histogram data.
    :param continuous_features: iterable of column names to plot.
    :param bins: number of histogram bins.
    :param df2: optional second DataFrame for a side-by-side comparison.
    :return: None; one figure is shown per column.
    """
    side_by_side = df2 is not None
    for feature in continuous_features:
        if side_by_side:
            plt.subplot(1, 2, 1)
            df[feature].hist(bins=bins)
            plt.subplot(1, 2, 2)
            df2[feature].hist(bins=bins)
            plt.suptitle(f'{feature}', size=25)
        else:
            df[feature].hist(bins=bins)
            plt.title(f'{feature}', size=25)
        plt.tight_layout()
        plt.show()
    return
def venn_diagram(train, test, category_features, names=('train', 'test'), figsize=(18, 13)):
    """Draw a two-set Venn diagram of distinct values for each column.

    For every column in ``category_features`` the set of unique values in
    ``train`` is compared with the set of unique values in ``test``. The
    diagrams are laid out on a grid with two columns and as many rows as
    needed, all in one figure.

    :param train: first DataFrame.
    :param test: second DataFrame.
    :param category_features: sized collection of column names to compare.
    :param names: labels for the two sets, in (train, test) order.
    :param figsize: size of the figure passed to ``plt.figure``.
    :return: None; the figure is shown.
    """
    # Imported at call time; only this function needs matplotlib_venn.
    from matplotlib_venn import venn2

    n = (len(category_features) + 1) // 2  # rows of a two-column grid
    plt.figure(figsize=figsize)

    for i, c in enumerate(category_features):
        # Use the three-argument form: the packed 3-digit shorthand cannot
        # express a subplot index of 10 or more.
        plt.subplot(n, 2, i + 1)
        venn2([set(train[c].unique()), set(test[c].unique())], set_labels=names)
        plt.title(f'{c}', fontsize=18)
    plt.show()
    return
def split_seq(iterable, size):
    """Yield consecutive chunks of ``iterable`` as lists of up to ``size`` items.

    The last chunk holds whatever is left over and may be shorter.

    Example:
        list(split_seq(range(9), 4)) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8]]
    """
    source = iter(iterable)

    def next_chunk():
        return list(itertools.islice(source, size))

    # iter(callable, sentinel) keeps calling until an empty chunk comes back.
    yield from iter(next_chunk, [])
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected. Integer columns are cast to the first of int8, int16,
    int32, int64 whose limits strictly contain the column's min and max; if
    none does, the column is left as it is. Other numeric columns are cast
    to float16 or float32 under the same strict test, and to float64
    otherwise.

    NOTE: casting to float16/float32 keeps values within range but can lose
    precision.

    :param df: DataFrame to shrink; it is modified in place.
    :param verbose: when true, print the resulting memory usage in Mb and
        the percentage saved.
    :return: the same ``df`` object, so ``df = reduce_mem_usage(df)`` works.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Comparisons are strict; a NaN min/max (e.g. an empty column)
            # matches nothing and the column keeps its dtype.
            target = next(
                (t for t in int_candidates
                 if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None,
            )
        else:
            target = next(
                (t for t in float_candidates
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64,
            )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a division by zero when the frame occupied no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df