# -*- coding:utf-8 -*-
__author__ = 'yangjian'
"""
"""
import itertools
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
[docs]def columns_info(dataframe, topN=10):
#if not isinstance(self, pd.DataFrame):
# raise TypeError('object must be an instance of pandas DataFrame')
#dataframe = self
max_row = dataframe.shape[0]
print(f'Shape: {dataframe.shape}')
info = dataframe.dtypes.to_frame()
info.columns = ['DataType']
info['#Nulls'] = dataframe.isnull().sum()
info['#Uniques'] = dataframe.nunique()
# stats
info['Min'] = dataframe.min(numeric_only=True)
info['Mean'] = dataframe.mean(numeric_only=True)
info['Max'] = dataframe.max(numeric_only=True)
info['Std'] = dataframe.std(numeric_only=True)
# top 10 values
info[f'top{topN} val'] = 0
info[f'top{topN} cnt'] = 0
info[f'top{topN} raito'] = 0
for c in info.index:
vc = dataframe[c].value_counts().head(topN)
val = list(vc.index)
cnt = list(vc.values)
raito = list((vc.values / max_row).round(2))
info.loc[c, f'top{topN} val'] = str(val)
info.loc[c, f'top{topN} cnt'] = str(cnt)
info.loc[c, f'top{topN} raito'] = str(raito)
return info
[docs]def top_categories(df, category_feature, topN=30):
return df[category_feature].value_counts().head(topN).index
[docs]def count_categories(df, category_features, topN=30, sort='freq', df2=None):
for c in category_features:
target_value = df[c].value_counts().head(topN).index
if sort == 'freq':
order = target_value
elif sort == 'alphabetic':
order = df[c].value_counts().head(topN).sort_index().index
if df2 is not None:
plt.subplot(1, 2, 1)
sns.countplot(x=c, data=df[df[c].isin(order)], order=order)
plt.xticks(rotation=90)
if df2 is not None:
plt.subplot(1, 2, 2)
sns.countplot(x=c, data=df2[df2[c].isin(order)], order=order)
plt.xticks(rotation=90)
if df2 is not None:
plt.suptitle(f'{c} TOP{topN}', size=25)
else:
plt.title(f'{c} TOP{topN}', size=25)
plt.tight_layout()
plt.show()
return
[docs]def hist_continuous(df, continuous_features, bins=30, df2=None):
for c in continuous_features:
if df2 is not None:
plt.subplot(1, 2, 1)
df[c].hist(bins=bins)
if df2 is not None:
plt.subplot(1, 2, 2)
df2[c].hist(bins=bins)
if df2 is not None:
plt.suptitle(f'{c}', size=25)
else:
plt.title(f'{c}', size=25)
plt.tight_layout()
plt.show()
return
[docs]def venn_diagram(train, test, category_features, names=('train', 'test'), figsize=(18, 13)):
from matplotlib_venn import venn2
"""
category_features: max==6
"""
n = int(np.ceil(len(category_features) / 2))
plt.figure(figsize=figsize)
for i, c in enumerate(category_features):
plt.subplot(int(f'{n}2{i + 1}'))
venn2([set(train[c].unique()), set(test[c].unique())],
set_labels=names)
plt.title(f'{c}', fontsize=18)
plt.show()
return
[docs]def split_seq(iterable, size):
"""
In: list(split_seq(range(9), 4))
Out: [[0, 1, 2, 3], [4, 5, 6, 7], [8]]
"""
it = iter(iterable)
item = list(itertools.islice(it, size))
while item:
yield item
item = list(itertools.islice(it, size))
[docs]def reduce_mem_usage(df, verbose=True):
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
start_mem = df.memory_usage().sum() / 1024**2
for col in df.columns:
col_type = df[col].dtypes
if col_type in numerics:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
end_mem = df.memory_usage().sum() / 1024**2
if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))