Source code for deeptables.models.deeptable

# -*- coding:utf-8 -*-
"""Training and inference for tabular datasets using neural nets."""

import datetime
import os
import pickle
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization, Concatenate
from tensorflow.keras.utils import to_categorical

from . import modelset, deepnets
from .config import ModelConfig
from .deepmodel import DeepModel
from .preprocessor import DefaultPreprocessor
from ..utils import dt_logging, consts

logger = dt_logging.get_logger()


class DeepTable:
    """`DeepTables` can be used to solve classification and regression prediction problems
    on tabular datasets. It is easy to use and provides good performance out of the box;
    no dataset preprocessing is required.

    Arguments
    ---------
    config : ModelConfig

        Options of ModelConfig
        ----------------------
        name: str, (default='conf-1')

        nets: list of str or callable object, (default=['dnn_nets'])

            Preset Nets
            -----------
            - DeepFM  -> ['linear', 'dnn_nets', 'fm_nets']
            - xDeepFM
            - DCN
            - PNN
            - WideDeep
            - AutoInt
            - AFM
            - FGCNN
            - FibiNet

            Available Build Blocks
            ----------------------
            - 'dnn_nets'
            - 'linear'
            - 'cin_nets'
            - 'fm_nets'
            - 'afm_nets'
            - 'opnn_nets'
            - 'ipnn_nets'
            - 'pnn_nets'
            - 'cross_nets'
            - 'cross_dnn_nets'
            - 'dcn_nets'
            - 'autoint_nets'
            - 'fg_nets'
            - 'fgcnn_cin_nets'
            - 'fgcnn_fm_nets'
            - 'fgcnn_ipnn_nets'
            - 'fgcnn_dnn_nets'
            - 'fibi_nets'
            - 'fibi_dnn_nets'

            Examples
            --------
            >>> from deeptables.models import deepnets
            >>> # preset nets
            >>> conf = ModelConfig(nets=deepnets.DeepFM)
            >>> # list of names of nets
            >>> conf = ModelConfig(nets=['linear', 'dnn_nets', 'cin_nets', 'cross_nets'])
            >>> # mixed preset nets and names
            >>> conf = ModelConfig(nets=deepnets.WideDeep + ['cin_nets'])
            >>> # mixed names and custom nets
            >>> def custom_net(embeddings, flatten_emb_layer, dense_layer, concat_emb_dense,
            ...                config, model_desc):
            ...     out = layers.Dense(10)(flatten_emb_layer)
            ...     return out
            >>> conf = ModelConfig(nets=['linear', custom_net])

        categorical_columns: list of strings, (default='auto')
            - 'auto': get the columns of categorical type automatically. By default, the
              object, bool and category columns will be selected. If 'auto', `auto_categorize`
              no longer takes effect.
            - list of strings, e.g. ['x1', 'x2', 'x3', '..']

        exclude_columns: list of strings, (default=[])

        pos_label: str or int, (default=None)
            The label of the positive class, used only when task is binary.

        metrics: list of strings or callable objects, (default=['accuracy'])
            List of metrics to be evaluated by the model during training and testing.
            Typically you will use `metrics=['accuracy']` or `metrics=['AUC']`. Every metric
            should be a built-in evaluation metric in tf.keras.metrics or a callable object
            like `r2(y_true, y_pred):...`.
            See also: https://tensorflow.google.cn/versions/r2.0/api_docs/python/tf/keras/metrics

        auto_categorize: bool, (default=False)

        cat_exponent: float, (default=0.5)

        cat_remain_numeric: bool, (default=True)

        auto_encode_label: bool, (default=True)

        auto_imputation: bool, (default=True)

        auto_discrete: bool, (default=False)

        apply_gbm_features: bool, (default=False)

        gbm_params: dict, (default={})

        gbm_feature_type: str, (default='embedding')
            - embedding
            - dense

        fixed_embedding_dim: bool, (default=True)

        embeddings_output_dim: int, (default=4)

        embeddings_initializer: str or object, (default='uniform')
            Initializer for the `embeddings` matrix.

        embeddings_regularizer: str or object, (default=None)
            Regularizer function applied to the `embeddings` matrix.

        dense_dropout: float, (default=0), between 0 and 1
            Fraction of the dense input units to drop.

        embedding_dropout: float, (default=0.3), between 0 and 1
            Fraction of the embedding input units to drop.

        stacking_op: str, (default='add')
            - add
            - concat

        output_use_bias: bool, (default=True)

        apply_class_weight: bool, (default=False)

        optimizer: str or object, (default='auto')
            - auto
            - str
            - object

        loss: str or object, (default='auto')

        dnn_params: dict, (default={'hidden_units': ((128, 0, False), (64, 0, False)),
            'dnn_activation': 'relu'})

        autoint_params: dict, (default={'num_attention': 3, 'num_heads': 1,
            'dropout_rate': 0, 'use_residual': True})

        fgcnn_params: dict, (default={'fg_filters': (14, 16), 'fg_widths': (7, 7),
            'fg_pool_widths': (2, 2), 'fg_new_feat_filters': (2, 2)})

        fibinet_params: dict, (default={'senet_pooling_op': 'mean',
            'senet_reduction_ratio': 3, 'bilinear_type': 'field_interaction'})

        cross_params: dict, (default={'num_cross_layer': 4})

        pnn_params: dict, (default={'outer_product_kernel_type': 'mat'})

        afm_params: dict, (default={'attention_factor': 4, 'dropout_rate': 0})

        cin_params: dict, (default={'cross_layer_size': (128, 128), 'activation': 'relu',
            'use_residual': False, 'use_bias': False, 'direct': False, 'reduce_D': False})

        home_dir: str, (default=None)
            The home directory for saving model-related files. Each time `fit(...)` or
            `fit_cross_validation(...)` is run, a subdirectory with a timestamp will be
            created in this directory.

        monitor_metric: str, (default=None)

        earlystopping_patience: int, (default=1)

        gpu_usage_strategy: str, (default='memory_growth')
            - memory_growth
            - None

        distribute_strategy: tensorflow.python.distribute.distribute_lib.Strategy, (default=None)

    Attributes
    ----------
    task: str
        Type of prediction problem. If `config.task` is None (the default), it will be
        inferred from the values of `y` when calling `fit(...)` or `fit_cross_validation(...)`.
        - 'binary': binary classification task
        - 'multiclass': multiclass classification task
        - 'regression': regression task

    num_classes: int
        The number of classes, used only when task is multiclass.

    pos_label: str or int
        The label of the positive class, used only when task is binary.

    output_path: str
        Path to the directory used to save models. In addition, if a valid `X_test` is
        passed into `fit_cross_validation(...)`, the prediction results of the test set
        will be saved in this path as well. The path is a timestamped subdirectory of the
        home directory. The home directory is specified through `config.home_dir`; if
        `config.home_dir` is None, `output_path` will be created in the working directory.

    preprocessor: AbstractPreprocessor, (default=DefaultPreprocessor)
        The preprocessor performs dataset preprocessing, such as categorization, label
        encoding, imputation, discretization, etc., before feeding data into the neural nets.

    nets: list(str)
        List of the network cells used to build the DeepModel.

    monitor: str
        The metric used to monitor model quality for early stopping. If not specified, the
        first metric in `config.metrics` will be used (e.g. log_loss/auc_val/accuracy_val...).

    modelset: ModelSet
        The models produced by `fit(...)` or `fit_cross_validation(...)`.

    best_model: Model
        A set of models is produced by `fit_cross_validation(...)`, instead of the single
        model produced by `fit(...)`. The best model is the one with the best performance
        on a specific metric; the first metric in `config.metrics` is used by default.

    leaderboard: pandas.DataFrame
        List sorted by a specific metric, with some meta information and scores. The first
        metric in `config.metrics` is used by default.

    Examples
    --------
    >>> X_train = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
    >>> X_eval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
    >>> y_train = X_train.pop('survived')
    >>> y_eval = X_eval.pop('survived')
    >>>
    >>> config = ModelConfig(nets=deepnets.DeepFM, fixed_embedding_dim=True,
    ...                      embeddings_output_dim=4, auto_discrete=True)
    >>> dt = DeepTable(config=config)
    >>>
    >>> model, history = dt.fit(X_train, y_train, epochs=100)
    >>> preds = dt.predict(X_eval)
    """

    def __init__(self, config=None, preprocessor=None):
        if config is None:
            config = ModelConfig()
        self.config = config
        self.nets = config.nets
        self.output_path = self._prepare_output_dir(config.home_dir, self.nets)
        self.preprocessor = preprocessor if preprocessor is not None else DefaultPreprocessor(config)
        self.__current_model = None
        self.__modelset = modelset.ModelSet(metric=self.config.first_metric_name,
                                            best_mode=consts.MODEL_SELECT_MODE_AUTO)

    @property
    def task(self):
        return self.preprocessor.task

    @property
    def num_classes(self):
        return len(self.preprocessor.labels)

    @property
    def classes_(self):
        return self.preprocessor.labels

    @property
    def pos_label(self):
        if self.config.pos_label is not None:
            return self.config.pos_label
        else:
            return self.preprocessor.pos_label

    @property
    def monitor(self):
        monitor = self.config.monitor_metric
        if monitor is None:
            if self.config.metrics is not None and len(self.config.metrics) > 0:
                monitor = 'val_' + self.config.first_metric_name
        return monitor

    @property
    def modelset(self):
        return self.__modelset

    @property
    def best_model(self):
        return self.__modelset.best_model().model

    @property
    def leaderboard(self):
        return self.__modelset.leaderboard()

    def fit(self, X=None, y=None, batch_size=128, epochs=1, verbose=1, callbacks=None,
            validation_split=0.2, validation_data=None, shuffle=True, class_weight=None,
            sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None,
            validation_freq=1, max_queue_size=10, workers=1, use_multiprocessing=False):
        logger.info(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, '
                    f'batch_size={batch_size}, config={self.config}')
        logger.info(f'metrics:{self.config.metrics}')
        self.__modelset.clear()

        X, y = self.preprocessor.fit_transform(X, y)
        if validation_data is not None:
            validation_data = self.preprocessor.transform(*validation_data)

        logger.info('Training...')
        if class_weight is None and self.config.apply_class_weight \
                and self.task != consts.TASK_REGRESSION:
            class_weight = self.get_class_weight(y)

        callbacks = self.__inject_callbacks(callbacks)
        model = DeepModel(self.task, self.num_classes, self.config,
                          self.preprocessor.categorical_columns,
                          self.preprocessor.continuous_columns)
        history = model.fit(X, y, batch_size=batch_size, epochs=epochs, verbose=verbose,
                            shuffle=shuffle, validation_split=validation_split,
                            validation_data=validation_data, validation_steps=validation_steps,
                            validation_freq=validation_freq, callbacks=callbacks,
                            class_weight=class_weight, sample_weight=sample_weight,
                            initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch,
                            max_queue_size=max_queue_size, workers=workers,
                            use_multiprocessing=use_multiprocessing)
        name = f'{"+".join(self.nets)}'
        logger.info('Training finished.')
        self.__set_model('val', name, model, history.history)
        return model, history
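
    # Usage sketch for `fit` (illustrative, not part of the library): `X_train`/`y_train`
    # are hypothetical pandas objects.
    #
    #   dt = DeepTable(ModelConfig(nets=['linear', 'dnn_nets'], metrics=['AUC']))
    #   model, history = dt.fit(X_train, y_train, batch_size=256, epochs=10,
    #                           validation_split=0.2)
    #   print(history.history.keys())  # per-epoch training/validation metrics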

    def fit_cross_validation(self, X, y, X_eval=None, X_test=None, num_folds=5,
                             stratified=False, iterators=None, batch_size=None, epochs=1,
                             verbose=1, callbacks=None, n_jobs=1, random_state=9527,
                             shuffle=True, class_weight=None, sample_weight=None,
                             initial_epoch=0, steps_per_epoch=None, validation_steps=None,
                             validation_freq=1, max_queue_size=10, workers=1,
                             use_multiprocessing=False):
        print('Start cross validation')
        start = time.time()
        logger.info(f'X.Shape={np.shape(X)}, y.Shape={np.shape(y)}, '
                    f'batch_size={batch_size}, config={self.config}')
        logger.info(f'metrics:{self.config.metrics}')
        self.__modelset.clear()

        X, y = self.preprocessor.fit_transform(X, y)
        if X_eval is not None:
            print('Transform X_eval')
            X_eval = self.preprocessor.transform_X(X_eval)
        if X_test is not None:
            print('Transform X_test')
            X_test = self.preprocessor.transform_X(X_test)

        if iterators is None:
            if stratified and self.task != consts.TASK_REGRESSION:
                iterators = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                            random_state=random_state)
            else:
                iterators = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
        print(f'Iterators:{iterators}')

        test_proba_mean = None
        eval_proba_mean = None
        if self.task == consts.TASK_MULTICLASS:
            oof_proba = np.zeros((y.shape[0], self.num_classes))
        else:
            oof_proba = np.zeros((y.shape[0], 1))

        y = np.array(y)
        if class_weight is None and self.config.apply_class_weight \
                and self.task == consts.TASK_BINARY:
            class_weight = self.get_class_weight(y)

        callbacks = self.__inject_callbacks(callbacks)

        parallel = Parallel(n_jobs=n_jobs, verbose=verbose)
        fit_and_score_kwargs = dict(
            batch_size=batch_size, epochs=epochs, verbose=verbose,
            callbacks=callbacks, class_weight=class_weight, shuffle=shuffle,
            sample_weight=sample_weight,
            validation_steps=validation_steps, validation_freq=validation_freq,
            initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch,
            max_queue_size=max_queue_size, workers=workers,
            use_multiprocessing=use_multiprocessing
        )
        with parallel:
            out = parallel(delayed(_fit_and_score)(
                self.task, self.num_classes, self.config,
                self.preprocessor.categorical_columns, self.preprocessor.continuous_columns,
                n_fold, valid_idx, X.iloc[train_idx], y[train_idx], X.iloc[valid_idx],
                y[valid_idx], X_eval, X_test,
                f'{self.output_path}{"_".join(self.nets)}-kfold-{n_fold + 1}.h5',
                **fit_and_score_kwargs)
                for n_fold, (train_idx, valid_idx) in enumerate(iterators.split(X, y)))

        for n_fold, idx, history, fold_oof_proba, fold_eval_proba, fold_test_proba in out:
            oof_proba[idx] = fold_oof_proba
            if X_eval is not None:
                if eval_proba_mean is None:
                    eval_proba_mean = fold_eval_proba / num_folds
                else:
                    eval_proba_mean += fold_eval_proba / num_folds
            if X_test is not None:
                if test_proba_mean is None:
                    test_proba_mean = fold_test_proba / num_folds
                else:
                    test_proba_mean += fold_test_proba / num_folds
            self.__push_model('val', f'{"+".join(self.nets)}-kfold-{n_fold + 1}',
                              f'{self.output_path}{"_".join(self.nets)}-kfold-{n_fold + 1}.h5',
                              history)

        if oof_proba.shape[-1] == 1:
            oof_proba = oof_proba.reshape(-1)
        if eval_proba_mean is not None and eval_proba_mean.shape[-1] == 1:
            eval_proba_mean = eval_proba_mean.reshape(-1)
        if test_proba_mean is not None:
            if test_proba_mean.shape[-1] == 1:
                test_proba_mean = test_proba_mean.reshape(-1)
            file = f'{self.output_path}{"_".join(self.nets)}-cv-{num_folds}.csv'
            pd.DataFrame(test_proba_mean).to_csv(file, index=False)

        print(f'fit_cross_validation cost:{time.time() - start}')
        return oof_proba, eval_proba_mean, test_proba_mean
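
    # Usage sketch (illustrative): `oof` holds the out-of-fold predictions for X, while
    # the eval/test probabilities are averaged over the `num_folds` fold models.
    #
    #   oof, eval_proba, test_proba = dt.fit_cross_validation(
    #       X_train, y_train, X_eval=X_eval, X_test=X_test,
    #       num_folds=5, stratified=True, epochs=10)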

    def evaluate(self, X_test, y_test, batch_size=256, verbose=0,
                 model_selector=consts.MODEL_SELECTOR_CURRENT):
        X_t, y_t = self.preprocessor.transform(X_test, y_test)
        y_t = np.array(y_t)
        model = self.get_model(model_selector)
        if not isinstance(model, DeepModel):
            raise ValueError(f'Wrong model_selector:{model_selector}')
        result = model.evaluate(X_t, y_t, batch_size=batch_size, verbose=verbose)
        return result

    def predict_proba(self, X, batch_size=128, verbose=0,
                      model_selector=consts.MODEL_SELECTOR_CURRENT, auto_transform_data=True):
        start = time.time()
        if model_selector == consts.MODEL_SELECTOR_ALL:
            models = self.get_model(model_selector)
            proba_avg = None
            if auto_transform_data:
                X = self.preprocessor.transform_X(X)
            for model in models:
                proba = self.__predict(model, X, batch_size=batch_size, verbose=verbose,
                                       auto_transform_data=False)
                if proba_avg is None:
                    proba_avg = np.zeros(proba.shape)
                proba_avg += proba
            proba_avg /= len(models)
            print(f'predict_proba cost:{time.time() - start}')
            return proba_avg
        else:
            proba = self.__predict(self.get_model(model_selector), X,
                                   batch_size=batch_size, verbose=verbose,
                                   auto_transform_data=auto_transform_data)
            print(f'predict_proba cost:{time.time() - start}')
            return proba
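
    # Sketch (illustrative): with `model_selector=consts.MODEL_SELECTOR_ALL`, e.g. after
    # `fit_cross_validation(...)`, the returned probabilities are the mean over all models
    # in the modelset.
    #
    #   proba_best = dt.predict_proba(X_eval, model_selector=consts.MODEL_SELECTOR_BEST)
    #   proba_avg = dt.predict_proba(X_eval, model_selector=consts.MODEL_SELECTOR_ALL)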

    def predict_proba_all(self, X, batch_size=128, verbose=0, auto_transform_data=True):
        mis = self.__modelset.get_modelinfos()
        proba_all = {}
        if auto_transform_data:
            X = self.preprocessor.transform_X(X)
        for mi in mis:
            model = self.get_model(mi.name)
            proba = self.__predict(model, X, batch_size=batch_size, verbose=verbose,
                                   auto_transform_data=False)
            proba_all[mi.name] = proba
        return proba_all

    def predict(self, X, encode_to_label=True, batch_size=128, verbose=0,
                model_selector=consts.MODEL_SELECTOR_CURRENT, auto_transform_data=True):
        proba = self.predict_proba(X, batch_size, verbose, model_selector=model_selector,
                                   auto_transform_data=auto_transform_data)
        return self.proba2predict(proba, encode_to_label)

    def proba2predict(self, proba, encode_to_label=True):
        if proba is None:
            raise ValueError('[proba] can not be None.')
        if self.task == consts.TASK_REGRESSION:
            return proba
        if len(proba.shape) == 1:
            proba = proba.reshape((-1, 1))
        if proba.shape[-1] > 1:
            predict = proba.argmax(axis=-1)
        else:
            predict = (proba > 0.5).astype(consts.DATATYPE_PREDICT_CLASS)
        if encode_to_label:
            logger.info('Reverse indicators to labels.')
            predict = self.preprocessor.inverse_transform_y(predict)
        return predict
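
    # Sketch on a binary task (illustrative): probabilities above 0.5 map to the positive
    # class before being decoded back to the original labels.
    #
    #   proba = dt.predict_proba(X_eval)   # e.g. array([0.2, 0.8, ...])
    #   labels = dt.proba2predict(proba)   # original class labels, e.g. array([0, 1, ...])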

    def apply(self, X, output_layers, concat_outputs=False, batch_size=128, verbose=0,
              model_selector=consts.MODEL_SELECTOR_CURRENT, auto_transform_data=True,
              transformer=None):
        start = time.time()
        model = self.get_model(model_selector)
        if not isinstance(model, DeepModel):
            raise ValueError(f'Wrong model_selector:{model_selector}')
        if auto_transform_data:
            X = self.preprocessor.transform_X(X)
        output = model.apply(X, output_layers, concat_outputs, batch_size, verbose, transformer)
        print(f'apply cost:{time.time() - start}')
        return output
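
    # Sketch (illustrative): extract intermediate activations as features.
    # 'flatten_embeddings' is a hypothetical layer name; pass any layer names that exist
    # in the underlying Keras model.
    #
    #   features = dt.apply(X_eval, output_layers=['flatten_embeddings'])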

    def concat_emb_dense(self, flatten_emb_layer, dense_layer):
        if flatten_emb_layer is not None and dense_layer is not None:
            x = Concatenate(name='concat_embedding_dense')([flatten_emb_layer, dense_layer])
        elif flatten_emb_layer is not None:
            x = flatten_emb_layer
        elif dense_layer is not None:
            x = dense_layer
        else:
            raise ValueError('No input layer exists.')
        x = BatchNormalization(name='bn_concat_emb_dense')(x)
        print(f'Concat embedding and dense layer shape:{x.shape}')
        return x

    def get_model(self, model_selector=consts.MODEL_SELECTOR_CURRENT):
        if model_selector == consts.MODEL_SELECTOR_CURRENT:
            # get the current model by name
            mi = self.__modelset.get_modelinfo(self.__current_model)
        elif model_selector == consts.MODEL_SELECTOR_BEST:
            mi = self.__modelset.best_model()
        elif model_selector == consts.MODEL_SELECTOR_ALL:
            ms = []
            for mi in self.__modelset.get_modelinfos():
                if isinstance(mi.model, str):
                    dm = self.load_deepmodel(mi.model)
                    mi.model = dm
                ms.append(mi.model)
            return ms
        else:
            # get model by name
            mi = self.__modelset.get_modelinfo(model_selector)
        if mi is None:
            raise ValueError(f'{model_selector} does not exist.')
        if isinstance(mi.model, str):
            dm = self.load_deepmodel(mi.model)
            mi.model = dm
        return mi.model

    def get_class_weight(self, y):
        print('Calc classes weight.')
        if len(y.shape) == 1:
            y = to_categorical(y)
        y_sum = y.sum(axis=0)
        class_weight = {}
        total = y.shape[0]
        classes = len(y_sum)
        print(f'Examples:\nTotal:{total}')
        for i in range(classes):
            # weight is inversely proportional to the class frequency
            weight = total / y_sum[i] / classes
            class_weight[i] = weight
            print(f'class {i}:{weight}')
        return class_weight
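
    # Worked example (illustrative): with 100 samples split 90/10 over two classes, the
    # weight for class i is total / count_i / n_classes:
    #
    #   class 0: 100 / 90 / 2 ≈ 0.56
    #   class 1: 100 / 10 / 2 = 5.0   # the minority class is up-weighted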

    def _prepare_output_dir(self, home_dir, nets):
        if home_dir is None:
            home_dir = 'dt_output'
        if home_dir[-1] == '/':
            home_dir = home_dir[:-1]
        running_dir = f'dt_{datetime.datetime.now().strftime("%Y%m%d %H%M%S")}_{"_".join(nets)}'
        output_path = os.path.expanduser(f'{home_dir}/{running_dir}/')
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        return output_path

    def __predict(self, model, X, batch_size=128, verbose=0, auto_transform_data=True):
        logger.info('Perform prediction...')
        if auto_transform_data:
            X = self.preprocessor.transform_X(X)
        return model.predict(X, batch_size=batch_size, verbose=verbose)

    def __set_model(self, type, name, model, history):
        self.__modelset.clear()
        self.__push_model(type, name, model, history)

    def __push_model(self, type, name, model, history, save_model=True):
        modelfile = ''
        if save_model and isinstance(model, DeepModel):
            modelfile = f'{self.output_path}{name}.h5'
            model.save(modelfile)
            print(f'Model has been saved to:{modelfile}')
        mi = modelset.ModelInfo(type, name, model, {}, history=history, modelfile=modelfile)
        self.__modelset.push(mi)
        self.__current_model = mi.name

    def __inject_callbacks(self, callbacks):
        es = None
        if callbacks is not None:
            for callback in callbacks:
                if isinstance(callback, EarlyStopping):
                    es = callback
        else:
            callbacks = []

        if 'auc' in self.monitor.lower() or 'acc' in self.monitor.lower():
            mode = 'max'
        else:
            mode = 'min'

        if es is None:
            es = EarlyStopping(monitor=self.monitor,
                               restore_best_weights=True,
                               patience=self.config.earlystopping_patience,
                               verbose=1,
                               mode=mode,
                               baseline=None)
            callbacks.append(es)
            print(f'Injected a callback [EarlyStopping]. '
                  f'monitor:{es.monitor}, patience:{es.patience}, mode:{mode}')
        return callbacks

    def save(self, filepath, deepmodel_basename=None):
        if filepath[-1] != '/':
            filepath = filepath + '/'
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        num_model = len(self.__modelset.get_modelinfos())
        for mi in self.__modelset.get_modelinfos():
            if isinstance(mi.model, str):
                dm = self.load_deepmodel(mi.model)
                mi.model = dm
            if not isinstance(mi.model, DeepModel):
                raise ValueError('Currently does not support saving non-DeepModel models.')
            if num_model == 1 and deepmodel_basename is not None:
                mi.name = deepmodel_basename
                self.__current_model = deepmodel_basename
            modelfile = f'{filepath}{mi.name}.h5'
            mi.model.save(modelfile)
            mi.model = modelfile
        with open(f'{filepath}dt.pkl', 'wb') as output:
            pickle.dump(self, output, protocol=2)

    @staticmethod
    def load(filepath):
        if filepath[-1] != '/':
            filepath = filepath + '/'
        with open(f'{filepath}dt.pkl', 'rb') as f:
            dt = pickle.load(f)
        dt.restore_modelset(filepath)
        return dt
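
    # Persistence round trip (illustrative path):
    #
    #   dt.save('./dt_model/')
    #   dt2 = DeepTable.load('./dt_model/')
    #   preds = dt2.predict(X_eval)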

    def restore_modelset(self, filepath):
        for mi in self.__modelset.get_modelinfos():
            if isinstance(mi.model, str):
                modelfile = mi.model
                modelfile = os.path.split(modelfile)[-1]
                dm = self.load_deepmodel(f'{filepath}{modelfile}')
                mi.model = dm

    def load_deepmodel(self, filepath):
        if os.path.exists(filepath):
            print(f'Load model from disk:{filepath}.')
            dm = DeepModel(self.task, self.num_classes, self.config,
                           self.preprocessor.categorical_columns,
                           self.preprocessor.continuous_columns,
                           filepath)
            return dm
        else:
            raise ValueError(f'Invalid model filename:{filepath}.')


def _fit_and_score(task, num_classes, config, categorical_columns, continuous_columns,
                   n_fold, valid_idx, X_train, y_train, X_val, y_val,
                   X_eval=None, X_test=None, model_file=None,
                   batch_size=128, epochs=1, verbose=0, callbacks=None, shuffle=True,
                   class_weight=None, sample_weight=None, initial_epoch=0,
                   steps_per_epoch=None, validation_steps=None, validation_freq=1,
                   max_queue_size=10, workers=1, use_multiprocessing=False):
    print(f'\nFold:{n_fold + 1}\n')
    model = DeepModel(task, num_classes, config, categorical_columns, continuous_columns)
    history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                        verbose=verbose, callbacks=callbacks,
                        validation_data=(X_val, y_val), shuffle=shuffle,
                        class_weight=class_weight, sample_weight=sample_weight,
                        initial_epoch=initial_epoch, steps_per_epoch=steps_per_epoch,
                        validation_steps=validation_steps, validation_freq=validation_freq,
                        max_queue_size=max_queue_size, workers=workers,
                        use_multiprocessing=use_multiprocessing)
    print(f'Fold {n_fold + 1} fitting over.')
    oof_proba = model.predict(X_val)
    eval_proba = None
    test_proba = None
    if X_eval is not None:
        eval_proba = model.predict(X_eval)
    if X_test is not None:
        test_proba = model.predict(X_test)
        if model_file is not None:
            file = f'{model_file}.test_proba.csv'
            pd.DataFrame(test_proba).to_csv(file, index=False)
    print(f'Fold {n_fold + 1} scoring over.')
    if model_file is not None:
        model.save(model_file)
        print(f'Save model to:{model_file}.')
    model.release()
    return n_fold, valid_idx, history.history, oof_proba, eval_proba, test_proba


def infer_task_type(y):
    if len(y.shape) > 1 and y.shape[-1] > 1:
        labels = list(range(y.shape[-1]))
        task = consts.TASK_MULTILABEL
        return task, labels

    uniques = set(y)
    n_unique = len(uniques)
    labels = []
    if n_unique == 2:
        print(f'2 classes detected, {uniques}, so inferred as a [binary classification] task.')
        task = consts.TASK_BINARY
        labels = sorted(uniques)
    else:
        if y.dtype == 'float':
            print('Target column type is float, so inferred as a [regression] task.')
            task = consts.TASK_REGRESSION
        else:
            if n_unique > 1000:
                if 'int' in str(y.dtype):
                    print('The number of classes exceeds 1000 and the column type is int, '
                          'so inferred as a [regression] task.')
                    task = consts.TASK_REGRESSION
                else:
                    raise ValueError('The number of classes exceeds 1000; please confirm '
                                     'whether your prediction target is correct.')
            else:
                print(f'{n_unique} classes detected, inferred as a [multiclass classification] task.')
                task = consts.TASK_MULTICLASS
                labels = sorted(uniques)
    return task, labels
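
# Sketch of `infer_task_type` (illustrative; assumes the consts values are the plain
# strings 'binary'/'regression'):
#
#   infer_task_type(np.array([0, 1, 1, 0]))       # -> ('binary', [0, 1])
#   infer_task_type(np.array([1.2, 3.4, 5.6]))    # -> ('regression', [])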


def probe_evaluate(dt, X, y, X_test, y_test, layers, score_fn={}):
    from sklearn.linear_model import LogisticRegression

    print('Extracting features of train set...')
    features_train = dt.apply(X, output_layers=layers)
    print('Extracting features of test set...')
    features_test = dt.apply(X_test, output_layers=layers)
    y = dt.preprocessor.transform_y(y)
    y_test = dt.preprocessor.transform_y(y_test)

    if not isinstance(features_train, list):
        features_train = [features_train]
        features_test = [features_test]

    result = {}
    for i, x_train in enumerate(features_train):
        print(f'Fit model for layer[{layers[i]}]...')
        clf = LogisticRegression(random_state=0).fit(x_train, y)
        y_proba = clf.predict_proba(features_test[i])[:, 1]
        y_score = clf.predict(features_test[i])
        print('Scoring...')
        if len(score_fn) == 0:
            print('Evaluating accuracy score...')
            score = clf.score(features_test[i], y_test)
            result[layers[i]] = {'accuracy': score}
        else:
            result[layers[i]] = {}
            for metric, fn in score_fn.items():
                print(f'Evaluating {metric} score...')
                if fn == roc_auc_score:
                    score = fn(y_test, y_proba)
                else:
                    score = fn(y_test, y_score)
                result[layers[i]][metric] = score
                print(f'{metric}:{score}')
    return result
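
# Usage sketch for `probe_evaluate` (illustrative): fit a LogisticRegression probe on the
# activations of the named layers and score it on the test set. 'flatten_embeddings' is a
# hypothetical layer name.
#
#   from sklearn.metrics import accuracy_score, roc_auc_score
#   scores = probe_evaluate(dt, X_train, y_train, X_test, y_test,
#                           layers=['flatten_embeddings'],
#                           score_fn={'auc': roc_auc_score, 'accuracy': accuracy_score})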