# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d
from ..utils import dt_logging, consts
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import FeatureUnion
logger = dt_logging.get_logger()
class LgbmLeavesEncoder:
    """Fit a LightGBM model whose leaf indices can later serve as encoded features.

    Only the fitting step is visible in this chunk; the transform that extracts
    leaf indices presumably lives elsewhere — TODO confirm.
    """

    def __init__(self, cat_vars, cont_vars, task, **params):
        # cat_vars / cont_vars: lists of categorical / continuous column names.
        self.lgbm = None
        self.cat_vars = cat_vars
        self.cont_vars = cont_vars
        self.new_columns = []
        self.task = task
        self.lgbm_params = params  # always a dict (captured via **params)

    def fit(self, X, y):
        """Fit the underlying LightGBM estimator on X/y and return self.

        NOTE: the dtype casts below mutate the caller's DataFrame in place.
        """
        from lightgbm import LGBMClassifier, LGBMRegressor
        X[self.cont_vars] = X[self.cont_vars].astype('float')
        X[self.cat_vars] = X[self.cat_vars].astype('int')
        logger.info(f'LightGBM task:{self.task}')
        if self.task == consts.TASK_MULTICLASS:  # multiclass label (one-hot encoded y)
            num_class = y.shape[-1]
            if self.lgbm_params is None:  # defensive; __init__ always sets a dict
                self.lgbm_params = {}
            self.lgbm_params['num_class'] = num_class
            # Spread the estimator budget across classes.
            self.lgbm_params['n_estimators'] = int(100 / num_class) + 1
            y = y.argmax(axis=-1)  # one-hot -> class index
        if self.task == consts.TASK_REGRESSION:
            self.lgbm = LGBMRegressor(**self.lgbm_params)
        else:
            self.lgbm = LGBMClassifier(**self.lgbm_params)
        self.lgbm.fit(X, y)
        # Return self for sklearn-style chaining, consistent with the other
        # encoders in this module.
        return self
class CategorizeEncoder:
    """Record which columns should be treated as categorical.

    Only the fitting step is visible in this chunk: it resolves the target
    column list, defaulting to every column of the fitted frame.
    """

    def __init__(self, columns=None, remain_numeric=True):
        self.columns = columns
        self.remain_numeric = remain_numeric
        self.new_columns = []
        self.encoders = {}

    def fit(self, X):
        """Resolve ``columns`` (all of X's columns when None) and return self."""
        if self.columns is None:
            self.columns = list(X.columns)
        return self
class MultiLabelEncoder:
    """Fit one SafeLabelEncoder per selected column of a DataFrame."""

    def __init__(self, columns=None):
        self.columns = columns
        self.encoders = {}  # column name -> fitted SafeLabelEncoder

    def fit(self, X):
        """Fit an encoder per column and return self.

        NOTE: object-dtype columns are cast to str in place on the caller's frame.
        """
        if self.columns is None:
            self.columns = X.columns.tolist()
        for column in self.columns:
            logger.debug(f'LabelEncoder fitting [{column}]')
            if X.loc[:, column].dtype == 'object':
                X.loc[:, column] = X.loc[:, column].astype('str')
            encoder = SafeLabelEncoder()
            encoder.fit(X.loc[:, column])
            self.encoders[column] = encoder
        return self
class MultiKBinsDiscretizer:
    """Discretize several numeric columns, one ordinal KBinsDiscretizer per column."""

    def __init__(self, columns=None, bins=None, strategy='quantile'):
        # `columns` may legitimately be None (resolved in fit), so guard the
        # log line instead of calling len() on it — len(None) raises TypeError.
        logger.info(f'{len(columns) if columns is not None else "all"} variables to discrete.')
        self.columns = columns
        self.bins = bins
        # NOTE: attribute keeps the original misspelling ('stragegy') so any
        # external readers / pickled instances stay compatible.
        self.stragegy = strategy
        self.new_columns = []  # list of (source_col, new_col_name, n_bins)
        self.encoders = {}     # column name -> fitted KBinsDiscretizer

    def fit(self, X):
        """Fit one discretizer per column and return self.

        Bin count per column: explicit ``self.bins`` when positive, otherwise
        the heuristic ``round(n_unique ** 0.25) + 1`` (grows slowly with
        cardinality).
        """
        self.new_columns = []
        if self.columns is None:
            self.columns = X.columns.tolist()
        for col in self.columns:
            new_name = col + consts.COLUMNNAME_POSTFIX_DISCRETE
            n_unique = X.loc[:, col].nunique()
            c_bins = self.bins
            if c_bins is None or c_bins <= 0:
                c_bins = round(n_unique ** 0.25) + 1
            encoder = KBinsDiscretizer(n_bins=c_bins, encode='ordinal', strategy=self.stragegy)
            self.new_columns.append((col, new_name, encoder.n_bins))
            encoder.fit(X[[col]])
            self.encoders[col] = encoder
        return self
class DataFrameWrapper:
    """Wrap a plain (sklearn-like) transformer and remember the frame's columns."""

    def __init__(self, transform, columns=None):
        # `transform` is the wrapped transformer instance.
        self.transformer = transform
        self.columns = columns

    def fit(self, X):
        """Fit the wrapped transformer; capture X's columns if none were given."""
        if self.columns is None:
            self.columns = list(X.columns)
        self.transformer.fit(X)
        return self
class SafeLabelEncoder(LabelEncoder):
    """LabelEncoder that tolerates unseen values at transform time.

    NOTE(review): the class body was stripped from this chunk (the scraped
    source shows only the class line, which is invalid Python). This transform
    is reconstructed from the module's otherwise-unused imports
    (``check_is_fitted``, ``column_or_1d``) and the sibling comment
    "fix unseen values" — confirm against the upstream source.
    Values not present in ``classes_`` are mapped to ``len(classes_)``
    instead of raising, so downstream embeddings can reserve one extra slot.
    """

    def transform(self, y):
        """Encode ``y``; unseen values map to ``len(self.classes_)``."""
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)
        unseen = len(self.classes_)
        y = np.array([np.searchsorted(self.classes_, v) if v in self.classes_ else unseen
                      for v in y])
        return y
class GaussRankScaler:
    """Hold the clipping bounds used by Gauss-rank scaling.

    Only initialization is visible in this chunk; the actual scaling logic
    is not shown here.
    """

    def __init__(self):
        eps = 0.001
        self.epsilon = eps
        # Ranks are squeezed into (-1, 1) with an epsilon margin on both ends.
        self.lower = eps - 1
        self.upper = 1 - eps
        self.range = self.upper - self.lower
        self.divider = None  # set during fitting (not visible in this chunk)
class PassThroughEstimator(object):
    """No-op estimator: fitting performs no work and simply enables chaining."""

    def fit(self, X):
        """Return self without touching X."""
        return self
class VarLenFeatureEncoder:
    """Encode a column of separator-joined multi-value strings.

    Fitting collects the vocabulary of all individual elements and records
    the longest element sequence observed.
    """

    def __init__(self, sep='|'):
        self.sep = sep
        self.encoder: SafeLabelEncoder = None
        self._max_element_length = 0

    def fit(self, X: pd.Series):
        """Build the element vocabulary from ``X`` and return self."""
        self._max_element_length = 0  # reset between fits
        if not isinstance(X, pd.Series):
            X = pd.Series(X)
        vocabulary = set()
        # Flat-map: split every row into its elements, tracking the max count.
        for elements in X.map(lambda value: value.split(self.sep)):
            if len(elements) > self._max_element_length:
                self._max_element_length = len(elements)
            vocabulary.update(elements)
        encoder = SafeLabelEncoder()  # tolerates unseen values at transform time
        encoder.fit(np.array(list(vocabulary)))
        self.encoder = encoder
        return self

    @property
    def n_classes(self):
        """Size of the fitted vocabulary."""
        return len(self.encoder.classes_)

    @property
    def max_element_length(self):
        """Longest element sequence seen during fitting."""
        return self._max_element_length
class MultiVarLenFeatureEncoder:
    """Hold one VarLenFeatureEncoder per feature.

    ``features`` is an iterable whose items expose the feature name at
    index 0 and the separator at index 1.
    """

    def __init__(self, features):
        self._encoders = {}
        for feature in features:
            self._encoders[feature[0]] = VarLenFeatureEncoder(feature[1])

    def fit(self, X):
        """Fit each per-feature encoder on its column of ``X``; return self."""
        for name, encoder in self._encoders.items():
            encoder.fit(X[name])
        return self