Source code for deeptables.models.deeptable

# -*- coding:utf-8 -*-
"""Training and inference for tabular datasets using neural nets."""

import datetime
import os
import pickle
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization, Concatenate
from tensorflow.keras.utils import to_categorical

from . import modelset, deepnets
from .config import ModelConfig
from .deepmodel import DeepModel
from .preprocessor import DefaultPreprocessor
from ..utils import dt_logging, consts

logger = dt_logging.get_logger()


class DeepTable:
    """`DeepTables` can be used to solve classification and regression prediction problems
    on tabular datasets. It is easy to use and provides good performance out of the box;
    no dataset preprocessing is required.

    Arguments
    ---------
    config : ModelConfig

        Options of ModelConfig
        ----------------------
        name: str, (default='conf-1')

        nets: list of str or callable object, (default=['dnn_nets'])

            Preset Nets
            -----------
            - DeepFM  -> ['linear', 'dnn_nets', 'fm_nets']
            - xDeepFM
            - DCN
            - PNN
            - WideDeep
            - AutoInt
            - AFM
            - FGCNN
            - FibiNet

            Available Build Blocks
            ----------------------
            - 'dnn_nets'
            - 'linear'
            - 'cin_nets'
            - 'fm_nets'
            - 'afm_nets'
            - 'opnn_nets'
            - 'ipnn_nets'
            - 'pnn_nets'
            - 'cross_nets'
            - 'cross_dnn_nets'
            - 'dcn_nets'
            - 'autoint_nets'
            - 'fg_nets'
            - 'fgcnn_cin_nets'
            - 'fgcnn_fm_nets'
            - 'fgcnn_ipnn_nets'
            - 'fgcnn_dnn_nets'
            - 'fibi_nets'
            - 'fibi_dnn_nets'

            Examples
            --------
            >>> from deeptables.models import deepnets
            >>> # preset nets
            >>> conf = ModelConfig(nets=deepnets.DeepFM)
            >>> # list of names of nets
            >>> conf = ModelConfig(nets=['linear', 'dnn_nets', 'cin_nets', 'cross_nets'])
            >>> # mixed preset nets and names
            >>> conf = ModelConfig(nets=deepnets.WideDeep + ['cin_nets'])
            >>> # mixed names and custom nets
            >>> def custom_net(embeddings, flatten_emb_layer, dense_layer, concat_emb_dense,
            ...                config, model_desc):
            ...     out = layers.Dense(10)(flatten_emb_layer)
            ...     return out
            >>> conf = ModelConfig(nets=['linear', custom_net])

        categorical_columns: list of strings, (default='auto')
            - 'auto': get the columns of categorical type automatically. By default, the
              object, bool and category columns will be selected. If 'auto', `auto_categorize`
              no longer takes effect.
            - list of strings, e.g. ['x1', 'x2', 'x3', '..']

        exclude_columns: list of strings, (default=[])

        pos_label: str or int, (default=None)
            The label of the positive class, used only when task is binary.

        metrics: list of strings or callable objects, (default=['accuracy'])
            List of metrics to be evaluated by the model during training and testing.
            Typically you will use `metrics=['accuracy']` or `metrics=['AUC']`. Every metric
            should be a built-in evaluation metric in tf.keras.metrics or a callable object
            like `r2(y_true, y_pred):...`.
            See also: https://tensorflow.google.cn/versions/r2.0/api_docs/python/tf/keras/metrics

        auto_categorize: bool, (default=False)

        cat_exponent: float, (default=0.5)

        cat_remain_numeric: bool, (default=True)

        auto_encode_label: bool, (default=True)

        auto_imputation: bool, (default=True)

        auto_discrete: bool, (default=False)

        apply_gbm_features: bool, (default=False)

        gbm_params: dict, (default={})

        gbm_feature_type: str, (default='embedding')
            - embedding
            - dense

        fixed_embedding_dim: bool, (default=True)

        embeddings_output_dim: int, (default=4)

        embeddings_initializer: str or object, (default='uniform')
            Initializer for the `embeddings` matrix.

        embeddings_regularizer: str or object, (default=None)
            Regularizer function applied to the `embeddings` matrix.

        dense_dropout: float, (default=0), between 0 and 1
            Fraction of the dense input units to drop.

        embedding_dropout: float, (default=0.3), between 0 and 1
            Fraction of the embedding input units to drop.

        stacking_op: str, (default='add')
            - add
            - concat

        output_use_bias: bool, (default=True)

        apply_class_weight: bool, (default=False)

        optimizer: str or object, (default='auto')
            - auto
            - str
            - object

        loss: str or object, (default='auto')

        dnn_params: dict, (default={'hidden_units': ((128, 0, False), (64, 0, False)),
            'dnn_activation': 'relu'})

        autoint_params: dict, (default={'num_attention': 3, 'num_heads': 1,
            'dropout_rate': 0, 'use_residual': True})

        fgcnn_params: dict, (default={'fg_filters': (14, 16), 'fg_widths': (7, 7),
            'fg_pool_widths': (2, 2), 'fg_new_feat_filters': (2, 2)})

        fibinet_params: dict, (default={'senet_pooling_op': 'mean',
            'senet_reduction_ratio': 3, 'bilinear_type': 'field_interaction'})

        cross_params: dict, (default={'num_cross_layer': 4})

        pnn_params: dict, (default={'outer_product_kernel_type': 'mat'})

        afm_params: dict, (default={'attention_factor': 4, 'dropout_rate': 0})

        cin_params: dict, (default={'cross_layer_size': (128, 128), 'activation': 'relu',
            'use_residual': False, 'use_bias': False, 'direct': False, 'reduce_D': False})

        home_dir: str, (default=None)
            The home directory for saving model-related files. Each time `fit(...)` or
            `fit_cross_validation(...)` is run, a subdirectory with a timestamp will be
            created in this directory.

        monitor_metric: str, (default=None)

        earlystopping_patience: int, (default=1)

        gpu_usage_strategy: str, (default='memory_growth')
            - memory_growth
            - None

        distribute_strategy: tensorflow.python.distribute.distribute_lib.Strategy, (default=None)

    Attributes
    ----------
    task: str
        Type of prediction problem. If `config.task` is None (the default), it will be
        inferred from the values of `y` when calling `fit(...)` or `fit_cross_validation(...)`.
        - 'binary': binary classification task
        - 'multiclass': multiclass classification task
        - 'regression': regression task

    num_classes: int
        The number of classes, used only when task is multiclass.

    pos_label: str or int
        The label of the positive class, used only when task is binary.

    output_path: str
        Path to the directory used to save models. In addition, if a valid `X_test` is
        passed into `fit_cross_validation(...)`, the prediction results of the test set
        will be saved in this path as well. The path is a timestamped subdirectory of the
        home directory. The home directory is specified through `config.home_dir`; if
        `config.home_dir` is None, `output_path` will be created in the working directory.

    preprocessor: AbstractPreprocessor, (default=DefaultPreprocessor)
        The preprocessor performs dataset preprocessing, such as categorization, label
        encoding, imputation, discretization, etc., before feeding data into the neural nets.

    nets: list(str)
        List of the network cells used to build the DeepModel.

    monitor: str
        The metric used to monitor model quality for early stopping. If not specified, the
        first metric in `config.metrics` will be used (e.g. log_loss/auc_val/accuracy_val...).

    modelset: ModelSet
        The models produced by `fit(...)` or `fit_cross_validation(...)`.

    best_model: Model
        A set of models is produced by `fit_cross_validation(...)`, instead of the single
        model produced by `fit(...)`. The best model is the one with the best performance
        on a specific metric; the first metric in `config.metrics` is used by default.

    leaderboard: pandas.DataFrame
        List sorted by a specific metric, with some meta information and scores. The first
        metric in `config.metrics` is used by default.

    Examples
    --------
    >>> X_train = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
    >>> X_eval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
    >>> y_train = X_train.pop('survived')
    >>> y_eval = X_eval.pop('survived')
    >>>
    >>> config = ModelConfig(nets=deepnets.DeepFM, fixed_embedding_dim=True,
    ...                      embeddings_output_dim=4, auto_discrete=True)
    >>> dt = DeepTable(config=config)
    >>>
    >>> model, history = dt.fit(X_train, y_train, epochs=100)
    >>> preds = dt.predict(X_eval)
    """

    def __init__(self, config=None, preprocessor=None):
        if config is None:
            config = ModelConfig()
        self.config = config
        self.nets = config.nets
        self.output_path = self._prepare_output_dir(config.home_dir, self.nets)
        self.preprocessor = preprocessor if preprocessor is not None else DefaultPreprocessor(config)
        self.__current_model = None
        self.__modelset = modelset.ModelSet(metric=self.config.first_metric_name,
                                            best_mode=consts.MODEL_SELECT_MODE_AUTO)

    @property
    def task(self):
        return self.preprocessor.task

    @property
    def num_classes(self):
        return len(self.preprocessor.labels)

    @property
    def classes_(self):
        return self.preprocessor.labels

    @property
    def pos_label(self):
        if self.config.pos_label is not None:
            return self.config.pos_label
        else:
            return self.preprocessor.pos_label

    @property
    def monitor(self):
        monitor = self.config.monitor_metric
        if monitor is None:
            if self.config.metrics is not None and len(self.config.metrics) > 0:
                monitor = 'val_' + self.config.first_metric_name
        return monitor

    @property
    def modelset(self):
        return self.__modelset

    @property
    def best_model(self):
        return self.__modelset.best_model().model

    @property
    def leaderboard(self):
        return self.__modelset.leaderboard()

    def fit(self, X=None, y=None, batch_size=128, epochs=1, verbose=1, callbacks=None,
            validation_split=0.2, validation_data=None, shuffle=True, class_weight=None,
            sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None,
            validation_freq=1, max_queue_size=10, workers=1, use_multiprocessing=False):
        logger.info(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, '
                    f'batch_size={batch_size}, config={self.config}')
        logger.info(f'metrics:{self.config.metrics}')
        self.__modelset.clear()

        X, y = self.preprocessor.fit_transform(X, y)
        if validation_data is not None:
            validation_data = self.preprocessor.transform(*validation_data)

        logger.info('Training...')
        if class_weight is None and self.config.apply_class_weight \
                and self.task != consts.TASK_REGRESSION:
            class_weight = self.get_class_weight(y)

        callbacks = self.__inject_callbacks(callbacks)
        model = DeepModel(self.task, self.num_classes, self.config,
                          self.preprocessor.categorical_columns,
                          self.preprocessor.continuous_columns)
        history = model.fit(X, y, batch_size=batch_size, epochs=epochs, verbose=verbose,
                            shuffle=shuffle, validation_split=validation_split,
                            validation_data=validation_data, validation_steps=validation_steps,
                            validation_freq=validation_freq, callbacks=callbacks,
                            class_weight=class_weight, sample_weight=sample_weight,
                            initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch,
                            max_queue_size=max_queue_size, workers=workers,
                            use_multiprocessing=use_multiprocessing)
        name = f'{"+".join(self.nets)}'
        logger.info('Training finished.')
        self.__set_model('val', name, model, history.history)
        return model, history
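
    # Usage sketch for `fit` (illustrative, not part of the library): `X_train`/`y_train`
    # are hypothetical pandas objects.
    #
    #   dt = DeepTable(ModelConfig(nets=['linear', 'dnn_nets'], metrics=['AUC']))
    #   model, history = dt.fit(X_train, y_train, batch_size=256, epochs=10,
    #                           validation_split=0.2)
    #   print(history.history.keys())  # per-epoch training/validation metrics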

    def fit_cross_validation(self, X, y, X_eval=None, X_test=None, num_folds=5,
                             stratified=False, iterators=None, batch_size=None, epochs=1,
                             verbose=1, callbacks=None, n_jobs=1, random_state=9527,
                             shuffle=True, class_weight=None, sample_weight=None,
                             initial_epoch=0, steps_per_epoch=None, validation_steps=None,
                             validation_freq=1, max_queue_size=10, workers=1,
                             use_multiprocessing=False):
        print('Start cross validation')
        start = time.time()
        logger.info(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, '
                    f'batch_size={batch_size}, config={self.config}')
        logger.info(f'metrics:{self.config.metrics}')
        self.__modelset.clear()

        X, y = self.preprocessor.fit_transform(X, y)
        if X_eval is not None:
            print('Transform X_eval')
            X_eval = self.preprocessor.transform_X(X_eval)
        if X_test is not None:
            print('Transform X_test')
            X_test = self.preprocessor.transform_X(X_test)

        if iterators is None:
            if stratified and self.task != consts.TASK_REGRESSION:
                iterators = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                            random_state=random_state)
            else:
                iterators = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
        print(f'Iterators:{iterators}')

        test_proba_mean = None
        eval_proba_mean = None
        if self.task == consts.TASK_MULTICLASS:
            oof_proba = np.zeros((y.shape[0], self.num_classes))
        else:
            oof_proba = np.zeros((y.shape[0], 1))

        y = np.array(y)
        if class_weight is None and self.config.apply_class_weight \
                and self.task == consts.TASK_BINARY:
            class_weight = self.get_class_weight(y)

        callbacks = self.__inject_callbacks(callbacks)

        parallel = Parallel(n_jobs=n_jobs, verbose=verbose)
        fit_and_score_kwargs = dict(
            batch_size=batch_size, epochs=epochs, verbose=verbose,
            callbacks=callbacks, class_weight=class_weight, shuffle=shuffle,
            sample_weight=sample_weight,
            validation_steps=validation_steps, validation_freq=validation_freq,
            initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch,
            max_queue_size=max_queue_size, workers=workers,
            use_multiprocessing=use_multiprocessing
        )
        with parallel:
            out = parallel(delayed(_fit_and_score)(
                self.task, self.num_classes, self.config,
                self.preprocessor.categorical_columns, self.preprocessor.continuous_columns,
                n_fold, valid_idx, X.iloc[train_idx], y[train_idx], X.iloc[valid_idx],
                y[valid_idx], X_eval, X_test,
                f'{self.output_path}{"_".join(self.nets)}-kfold-{n_fold + 1}.h5',
                **fit_and_score_kwargs)
                for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)))

        for n_fold, idx, history, fold_oof_proba, fold_eval_proba, fold_test_proba in out:
            oof_proba[idx] = fold_oof_proba
            if X_eval is not None:
                if eval_proba_mean is None:
                    eval_proba_mean = fold_eval_proba / num_folds
                else:
                    eval_proba_mean += fold_eval_proba / num_folds
            if X_test is not None:
                if test_proba_mean is None:
                    test_proba_mean = fold_test_proba / num_folds
                else:
                    test_proba_mean += fold_test_proba / num_folds
            self.__push_model('val', f'{"+".join(self.nets)}-kfold-{n_fold + 1}',
                              f'{self.output_path}{"_".join(self.nets)}-kfold-{n_fold + 1}.h5',
                              history)

        if oof_proba.shape[-1] == 1:
            oof_proba = oof_proba.reshape(-1)
        if eval_proba_mean is not None and eval_proba_mean.shape[-1] == 1:
            eval_proba_mean = eval_proba_mean.reshape(-1)
        if test_proba_mean is not None:
            if test_proba_mean.shape[-1] == 1:
                test_proba_mean = test_proba_mean.reshape(-1)
            file = f'{self.output_path}{"_".join(self.nets)}-cv-{num_folds}.csv'
            pd.DataFrame(test_proba_mean).to_csv(file, index=False)

        print(f'fit_cross_validation cost:{time.time() - start}')
        return oof_proba, eval_proba_mean, test_proba_mean
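
    # Usage sketch (illustrative): `oof` holds the out-of-fold predictions for X, while
    # the eval/test probabilities are averaged over the `num_folds` fold models.
    #
    #   oof, eval_proba, test_proba = dt.fit_cross_validation(
    #       X_train, y_train, X_eval=X_eval, X_test=X_test,
    #       num_folds=5, stratified=True, epochs=10)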

    def evaluate(self, X_test, y_test, batch_size=256, verbose=0,
                 model_selector=consts.MODEL_SELECTOR_CURRENT):
        X_t, y_t = self.preprocessor.transform(X_test, y_test)
        y_t = np.array(y_t)
        model = self.get_model(model_selector)
        if not isinstance(model, DeepModel):
            raise ValueError(f'Wrong model_selector:{model_selector}')
        result = model.evaluate(X_t, y_t, batch_size=batch_size, verbose=verbose)
        return result

    def predict_proba(self, X, batch_size=128, verbose=0,
                      model_selector=consts.MODEL_SELECTOR_CURRENT, auto_transform_data=True):
        start = time.time()
        if model_selector == consts.MODEL_SELECTOR_ALL:
            models = self.get_model(model_selector)
            proba_avg = None
            if auto_transform_data:
                X = self.preprocessor.transform_X(X)
            for model in models:
                proba = self.__predict(model, X, batch_size=batch_size, verbose=verbose,
                                       auto_transform_data=False)
                if proba_avg is None:
                    proba_avg = np.zeros(proba.shape)
                proba_avg += proba
            proba_avg /= len(models)
            print(f'predict_proba cost:{time.time() - start}')
            return proba_avg
        else:
            proba = self.__predict(self.get_model(model_selector), X,
                                   batch_size=batch_size, verbose=verbose,
                                   auto_transform_data=auto_transform_data)
            print(f'predict_proba cost:{time.time() - start}')
            return proba
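
    # Sketch (illustrative): with `model_selector=consts.MODEL_SELECTOR_ALL`, e.g. after
    # `fit_cross_validation(...)`, the returned probabilities are the mean over all models
    # in the modelset.
    #
    #   proba_best = dt.predict_proba(X_eval, model_selector=consts.MODEL_SELECTOR_BEST)
    #   proba_avg = dt.predict_proba(X_eval, model_selector=consts.MODEL_SELECTOR_ALL)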

    def predict_proba_all(self, X, batch_size=128, verbose=0, auto_transform_data=True):
        mis = self.__modelset.get_modelinfos()
        proba_all = {}
        if auto_transform_data:
            X = self.preprocessor.transform_X(X)
        for mi in mis:
            model = self.get_model(mi.name)
            proba = self.__predict(model, X, batch_size=batch_size, verbose=verbose,
                                   auto_transform_data=False)
            proba_all[mi.name] = proba
        return proba_all

    def predict(self, X, encode_to_label=True, batch_size=128, verbose=0,
                model_selector=consts.MODEL_SELECTOR_CURRENT, auto_transform_data=True):
        proba = self.predict_proba(X, batch_size, verbose, model_selector=model_selector,
                                   auto_transform_data=auto_transform_data)
        return self.proba2predict(proba, encode_to_label)

    def proba2predict(self, proba, encode_to_label=True):
        if proba is None:
            raise ValueError('[proba] can not be None.')
        if self.task == consts.TASK_REGRESSION:
            return proba
        if len(proba.shape) == 1:
            proba = proba.reshape((-1, 1))
        if proba.shape[-1] > 1:
            predict = proba.argmax(axis=-1)
        else:
            predict = (proba > 0.5).astype(consts.DATATYPE_PREDICT_CLASS)
        if encode_to_label:
            logger.info('Reverse indicators to labels.')
            predict = self.preprocessor.inverse_transform_y(predict)
        return predict
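
    # Sketch on a binary task (illustrative): probabilities above 0.5 map to the positive
    # class before being decoded back to the original labels.
    #
    #   proba = dt.predict_proba(X_eval)   # e.g. array([0.2, 0.8, ...])
    #   labels = dt.proba2predict(proba)   # original class labels, e.g. array([0, 1, ...])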

    def apply(self, X, output_layers, concat_outputs=False, batch_size=128, verbose=0,
              model_selector=consts.MODEL_SELECTOR_CURRENT, auto_transform_data=True,
              transformer=None):
        start = time.time()
        model = self.get_model(model_selector)
        if not isinstance(model, DeepModel):
            raise ValueError(f'Wrong model_selector:{model_selector}')
        if auto_transform_data:
            X = self.preprocessor.transform_X(X)
        output = model.apply(X, output_layers, concat_outputs, batch_size, verbose, transformer)
        print(f'apply cost:{time.time() - start}')
        return output
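
    # Sketch (illustrative): extract intermediate activations as features.
    # 'flatten_embeddings' is a hypothetical layer name; pass any layer names that exist
    # in the underlying Keras model.
    #
    #   features = dt.apply(X_eval, output_layers=['flatten_embeddings'])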

    def concat_emb_dense(self, flatten_emb_layer, dense_layer):
        if flatten_emb_layer is not None and dense_layer is not None:
            x = Concatenate(name='concat_embedding_dense')([flatten_emb_layer, dense_layer])
        elif flatten_emb_layer is not None:
            x = flatten_emb_layer
        elif dense_layer is not None:
            x = dense_layer
        else:
            raise ValueError('No input layer exists.')
        x = BatchNormalization(name='bn_concat_emb_dense')(x)
        print(f'Concat embedding and dense layer shape:{x.shape}')
        return x

    def get_model(self, model_selector=consts.MODEL_SELECTOR_CURRENT):
        if model_selector == consts.MODEL_SELECTOR_CURRENT:
            # get the current model by name
            mi = self.__modelset.get_modelinfo(self.__current_model)
        elif model_selector == consts.MODEL_SELECTOR_BEST:
            mi = self.__modelset.best_model()
        elif model_selector == consts.MODEL_SELECTOR_ALL:
            ms = []
            for mi in self.__modelset.get_modelinfos():
                if isinstance(mi.model, str):
                    dm = self.load_deepmodel(mi.model)
                    mi.model = dm
                ms.append(mi.model)
            return ms
        else:
            # get model by name
            mi = self.__modelset.get_modelinfo(model_selector)
        if mi is None:
            raise ValueError(f'{model_selector} does not exist.')
        if isinstance(mi.model, str):
            dm = self.load_deepmodel(mi.model)
            mi.model = dm
        return mi.model

    def get_class_weight(self, y):
        print('Calc classes weight.')
        if len(y.shape) == 1:
            y = to_categorical(y)
        y_sum = y.sum(axis=0)
        class_weight = {}
        total = y.shape[0]
        classes = len(y_sum)
        print(f'Examples:\nTotal:{total}')
        for i in range(classes):
            # weight is inversely proportional to the class frequency
            weight = total / y_sum[i] / classes
            class_weight[i] = weight
            print(f'class {i}:{weight}')
        return class_weight
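
    # Worked example (illustrative): with 100 samples split 90/10 over two classes, the
    # weight for class i is total / count_i / n_classes:
    #
    #   class 0: 100 / 90 / 2 ≈ 0.56
    #   class 1: 100 / 10 / 2 = 5.0   # the minority class is up-weighted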

    def _prepare_output_dir(self, home_dir, nets):
        if home_dir is None:
            home_dir = 'dt_output'
        if home_dir[-1] == '/':
            home_dir = home_dir[:-1]
        running_dir = f'dt_{datetime.datetime.now().strftime("%Y%m%d %H%M%S")}_{"_".join(nets)}'
        output_path = os.path.expanduser(f'{home_dir}/{running_dir}/')
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        return output_path

    def __predict(self, model, X, batch_size=128, verbose=0, auto_transform_data=True):
        logger.info('Perform prediction...')
        if auto_transform_data:
            X = self.preprocessor.transform_X(X)
        return model.predict(X, batch_size=batch_size, verbose=verbose)

    def __set_model(self, type, name, model, history):
        self.__modelset.clear()
        self.__push_model(type, name, model, history)

    def __push_model(self, type, name, model, history, save_model=True):
        modelfile = ''
        if save_model and isinstance(model, DeepModel):
            modelfile = f'{self.output_path}{name}.h5'
            model.save(modelfile)
            print(f'Model has been saved to:{modelfile}')
        mi = modelset.ModelInfo(type, name, model, {}, history=history, modelfile=modelfile)
        self.__modelset.push(mi)
        self.__current_model = mi.name

    def __inject_callbacks(self, callbacks):
        es = None
        if callbacks is not None:
            for callback in callbacks:
                if isinstance(callback, EarlyStopping):
                    es = callback
        else:
            callbacks = []

        if 'auc' in self.monitor.lower() or 'acc' in self.monitor.lower():
            mode = 'max'
        else:
            mode = 'min'

        if es is None:
            es = EarlyStopping(monitor=self.monitor,
                               restore_best_weights=True,
                               patience=self.config.earlystopping_patience,
                               verbose=1,
                               mode=mode,
                               baseline=None)
            callbacks.append(es)
            print(f'Injected a callback [EarlyStopping]. '
                  f'monitor:{es.monitor}, patience:{es.patience}, mode:{mode}')
        return callbacks

    def save(self, filepath, deepmodel_basename=None):
        if filepath[-1] != '/':
            filepath = filepath + '/'
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        num_model = len(self.__modelset.get_modelinfos())
        for mi in self.__modelset.get_modelinfos():
            if isinstance(mi.model, str):
                dm = self.load_deepmodel(mi.model)
                mi.model = dm
            if not isinstance(mi.model, DeepModel):
                raise ValueError('Currently does not support saving non-DeepModel models.')
            if num_model == 1 and deepmodel_basename is not None:
                mi.name = deepmodel_basename
                self.__current_model = deepmodel_basename
            modelfile = f'{filepath}{mi.name}.h5'
            mi.model.save(modelfile)
            mi.model = modelfile
        with open(f'{filepath}dt.pkl', 'wb') as output:
            pickle.dump(self, output, protocol=2)

    @staticmethod
    def load(filepath):
        if filepath[-1] != '/':
            filepath = filepath + '/'
        with open(f'{filepath}dt.pkl', 'rb') as f:
            dt = pickle.load(f)
        dt.restore_modelset(filepath)
        return dt
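
    # Persistence round trip (illustrative path):
    #
    #   dt.save('./dt_model/')
    #   dt2 = DeepTable.load('./dt_model/')
    #   preds = dt2.predict(X_eval)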

    def restore_modelset(self, filepath):
        for mi in self.__modelset.get_modelinfos():
            if isinstance(mi.model, str):
                modelfile = mi.model
                modelfile = os.path.split(modelfile)[-1]
                dm = self.load_deepmodel(f'{filepath}{modelfile}')
                mi.model = dm

    def load_deepmodel(self, filepath):
        if os.path.exists(filepath):
            print(f'Load model from disk:{filepath}.')
            dm = DeepModel(self.task, self.num_classes, self.config,
                           self.preprocessor.categorical_columns,
                           self.preprocessor.continuous_columns,
                           filepath)
            return dm
        else:
            raise ValueError(f'Invalid model filename:{filepath}.')


def _fit_and_score(task, num_classes, config, categorical_columns, continuous_columns,
                   n_fold, valid_idx, X_train, y_train, X_val, y_val,
                   X_eval=None, X_test=None, model_file=None,
                   batch_size=128, epochs=1, verbose=0, callbacks=None, shuffle=True,
                   class_weight=None, sample_weight=None, initial_epoch=0,
                   steps_per_epoch=None, validation_steps=None, validation_freq=1,
                   max_queue_size=10, workers=1, use_multiprocessing=False):
    print(f'\nFold:{n_fold + 1}\n')
    model = DeepModel(task, num_classes, config, categorical_columns, continuous_columns)
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                        verbose=verbose, callbacks=callbacks,
                        validation_data=(X_val, y_val), shuffle=shuffle,
                        class_weight=class_weight, sample_weight=sample_weight,
                        initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps, validation_freq=validation_freq,
                        max_queue_size=max_queue_size, workers=workers,
                        use_multiprocessing=use_multiprocessing)
    print(f'Fold {n_fold + 1} fitting over.')
    oof_proba = model.predict(X_val)
    eval_proba = None
    test_proba = None
    if X_eval is not None:
        eval_proba = model.predict(X_eval)
    if X_test is not None:
        test_proba = model.predict(X_test)
        if model_file is not None:
            file = f'{model_file}.test_proba.csv'
            pd.DataFrame(test_proba).to_csv(file, index=False)
    print(f'Fold {n_fold + 1} scoring over.')
    if model_file is not None:
        model.save(model_file)
        print(f'Save model to:{model_file}.')
    model.release()
    return n_fold, valid_idx, history.history, oof_proba, eval_proba, test_proba


def infer_task_type(y):
    if len(y.shape) > 1 and y.shape[-1] > 1:
        labels = list(range(y.shape[-1]))
        task = consts.TASK_MULTILABEL
        return task, labels

    uniques = set(y)
    n_unique = len(uniques)
    labels = []
    if n_unique == 2:
        print(f'2 classes detected, {uniques}, so inferred as a [binary classification] task.')
        task = consts.TASK_BINARY
        labels = sorted(uniques)
    else:
        if y.dtype == 'float':
            print('Target column type is float, so inferred as a [regression] task.')
            task = consts.TASK_REGRESSION
        else:
            if n_unique > 1000:
                if 'int' in str(y.dtype):
                    print('The number of classes exceeds 1000 and the column type is int, '
                          'so inferred as a [regression] task.')
                    task = consts.TASK_REGRESSION
                else:
                    raise ValueError('The number of classes exceeds 1000; please confirm '
                                     'whether your prediction target is correct.')
            else:
                print(f'{n_unique} classes detected, inferred as a [multiclass classification] task.')
                task = consts.TASK_MULTICLASS
                labels = sorted(uniques)
    return task, labels
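
# Sketch of `infer_task_type` (illustrative; assumes the consts values are the plain
# strings 'binary'/'regression'):
#
#   infer_task_type(np.array([0, 1, 1, 0]))       # -> ('binary', [0, 1])
#   infer_task_type(np.array([1.2, 3.4, 5.6]))    # -> ('regression', [])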


def probe_evaluate(dt, X, y, X_test, y_test, layers, score_fn={}):
    from sklearn.linear_model import LogisticRegression

    print('Extracting features of train set...')
    features_train = dt.apply(X, output_layers=layers)
    print('Extracting features of test set...')
    features_test = dt.apply(X_test, output_layers=layers)
    y = dt.preprocessor.transform_y(y)
    y_test = dt.preprocessor.transform_y(y_test)

    if not isinstance(features_train, list):
        features_train = [features_train]
        features_test = [features_test]

    result = {}
    for i, x_train in enumerate(features_train):
        print(f'Fit model for layer[{layers[i]}]...')
        clf = LogisticRegression(random_state=0).fit(x_train, y)
        y_proba = clf.predict_proba(features_test[i])[:, 1]
        y_score = clf.predict(features_test[i])
        print('Scoring...')
        if len(score_fn) == 0:
            print('Evaluating accuracy score...')
            score = clf.score(features_test[i], y_test)
            result[layers[i]] = {'accuracy': score}
        else:
            result[layers[i]] = {}
            for metric, fn in score_fn.items():
                print(f'Evaluating {metric} score...')
                if fn == roc_auc_score:
                    score = fn(y_test, y_proba)
                else:
                    score = fn(y_test, y_score)
                result[layers[i]][metric] = score
                print(f'{metric}:{score}')
    return result
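
# Usage sketch for `probe_evaluate` (illustrative): fit a LogisticRegression probe on the
# activations of the named layers and score it on the test set. 'flatten_embeddings' is a
# hypothetical layer name.
#
#   from sklearn.metrics import accuracy_score, roc_auc_score
#   scores = probe_evaluate(dt, X_train, y_train, X_test, y_test,
#                           layers=['flatten_embeddings'],
#                           score_fn={'auc': roc_auc_score, 'accuracy': accuracy_score})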