Source code for deeptables.preprocessing.utils

# -*- coding:utf-8 -*-

import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import StratifiedKFold

from ..utils import dt_logging

# Module-level logger, obtained from the project's dt_logging helper and keyed by this module's name.
logger = dt_logging.get_logger(__name__)


def target_encoding(train, target, test=None, feat_to_encode=None, smooth=0.2, random_state=9527):
    """Target-encode features, using out-of-fold estimates for the training rows.

    The training frame is split with a shuffled 5-fold ``StratifiedKFold``.
    Each fold is transformed by a ``TargetEncoder`` fitted on the remaining
    folds, so no training row is encoded by an encoder that was fitted on it.
    ``test`` (when given) is transformed by an encoder fitted on the whole
    training frame.

    The caller's ``train`` DataFrame is not modified: the function works on
    an index-sorted copy and removes the target column from that copy only.

    Args:
        train: DataFrame holding the feature columns and the target column.
        target: Name of the target column inside ``train``.
        test: Optional DataFrame to transform with the full-data encoder.
        feat_to_encode: Columns to encode. Defaults to every column of
            ``train`` except the target.
        smooth: Value forwarded as ``smoothing`` to ``TargetEncoder``.
        random_state: Seed used for shuffling the folds.

    Returns:
        A tuple ``(train, test, features, target)`` where ``train`` is the
        out-of-fold encoded training frame sorted by index, ``test`` is the
        encoded test frame (or ``None``), ``features`` is the list of column
        names of the encoded training frame and ``target`` is the target
        column as a Series.
    """
    print('Target encoding...')
    # sort_index() returns a new frame, so popping the target below neither
    # reorders the caller's DataFrame nor strips its target column.
    train = train.sort_index()
    target = train.pop(target)
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()

    folds = StratifiedKFold(n_splits=5, random_state=random_state, shuffle=True)
    oof_parts = []
    for tr_idx, oof_idx in folds.split(train, target):
        fold_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smooth)
        fold_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        oof_parts.append(fold_encoder.transform(train.iloc[oof_idx, :]))

    # The encoder applied to ``test`` is fitted on the complete training data.
    full_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smooth)
    full_encoder.fit(train, target)

    # DataFrame.append was removed in pandas 2.0; concatenate the folds once
    # and restore the original row order through the preserved index.
    train = pd.concat(oof_parts).sort_index()
    if test is not None:
        test = full_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target


def target_rate_encodeing(feat_to_encode, target, df, mode='order'):
    """Encode categorical columns by the positive rate of a binary target.

    For every column in ``feat_to_encode`` the column is converted to
    strings in place (missing values become ``'-1'``), the share of rows with
    ``target == 1`` among the rows whose target is 0 or 1 is computed per
    category, and the encoding is stored in a new column ``'<col>_tre'``.

    Args:
        feat_to_encode: Iterable of column names of ``df`` to encode.
        target: Name of the target column; values 0 and 1 are counted.
        df: DataFrame to encode. It is modified in place.
        mode: ``'order'`` stores the 1-based rank of each category when the
            categories are sorted by ascending rate (as ``int32``); any other
            value (e.g. ``'rate'``) stores the rate itself.

    Returns:
        The same ``df`` object, with the encoded columns stringified and one
        additional ``'<col>_tre'`` column per encoded column.
    """
    for col in feat_to_encode:
        raw = df[col]
        # Casting to str turns missing values into text (e.g. 'nan'), so a
        # fillna after the cast never fires; use the original null mask.
        df[col] = raw.astype('str').where(raw.notna(), '-1')

        # fill_value=0 keeps categories that only contain one target class
        # from producing NaN counts (and therefore NaN rates).
        counts = df.groupby(col)[target].value_counts().unstack(fill_value=0)
        for label in (0, 1):
            if label not in counts.columns:
                counts[label] = 0

        rate = counts[1] / (counts[0] + counts[1])
        # A stable sort makes the ranking of tied rates deterministic.
        rate = rate.sort_values(kind='mergesort')

        if mode == 'order':
            rank_of = {category: rank for rank, category in enumerate(rate.index, start=1)}
            encoded = df[col].map(rank_of).astype('int32')
        else:
            encoded = df[col].map(rate.to_dict())

        # Store the result on the returned frame; previously it was written
        # to a temporary frame that was discarded.
        df[f'{col}_tre'] = encoded
    return df