Source code for autoflow.pipeline.components.preprocessing.select.base

from copy import deepcopy
from importlib import import_module
from typing import Dict

import numpy as np
import pandas as pd
import sklearn.feature_selection

from autoflow.pipeline.components.feature_engineer_base import AutoFlowFeatureEngineerAlgorithm
from autoflow.utils.dataframe import DataFrameValuesWrapper


[docs]class SklearnSelectMixin(): max_feature_name = "_max_features-sp1_percent" def _transform_proc(self, X): if X is None: return None else: trans = self.estimator.transform(X) mask = self.estimator.get_support() columns = X.columns[mask] return pd.DataFrame(trans, columns=columns)
[docs] def before_parse_escape_hyperparameters(self, hyperparams): hyperparams = deepcopy(hyperparams) if "_select_percent" in hyperparams: _select_percent = hyperparams.pop("_select_percent") hyperparams[self.max_feature_name] = _select_percent return hyperparams
# class SelectFromModelBase(AutoPLFeatureEngineerAlgorithm, SklearnSelectMixin):
[docs]class SelectFromModelBase(SklearnSelectMixin, AutoFlowFeatureEngineerAlgorithm): class__ = "SelectFromModel" module__ = "sklearn.feature_selection" need_y = True max_feature_name = "_max_features-sp1_percent"
[docs] def after_process_hyperparams(self, hyperparams): hyperparams = super(SelectFromModelBase, self).after_process_hyperparams(hyperparams) estimator_ = hyperparams["estimator"] splitted = estimator_.split(".") class_ = splitted[-1] module_ = ".".join(splitted[:-1]) M = import_module(module_) cls = getattr(M, class_) base_estimator_hp = self.filter_invalid(cls, hyperparams) if "max_features" in base_estimator_hp: base_estimator_hp.pop("max_features") base_estimator = cls(**base_estimator_hp) hyperparams["estimator"] = base_estimator hyperparams["threshold"] = -np.inf return hyperparams
[docs]class REF_Base(SklearnSelectMixin, AutoFlowFeatureEngineerAlgorithm): class__ = "RFE" module__ = "sklearn.feature_selection" need_y = True max_feature_name = "_n_features_to_select-sp1_percent"
[docs] def after_process_hyperparams(self, hyperparams): hyperparams = super(REF_Base, self).after_process_hyperparams(hyperparams) estimator_ = hyperparams["estimator"] splitted = estimator_.split(".") class_ = splitted[-1] module_ = ".".join(splitted[:-1]) M = import_module(module_) cls = getattr(M, class_) base_estimator_hp = self.filter_invalid(cls, hyperparams) if "max_features" in base_estimator_hp: base_estimator_hp.pop("max_features") base_estimator = cls(**base_estimator_hp) hyperparams["estimator"] = base_estimator return hyperparams
[docs]class SelectPercentileBase(SklearnSelectMixin, AutoFlowFeatureEngineerAlgorithm): class__ = "GenericUnivariateSelect" module__ = "sklearn.feature_selection" need_y = True max_feature_name = "param"
[docs] def get_name2func(self): raise NotImplementedError()
[docs] def get_default_name(self): raise NotImplementedError()
[docs] def after_process_hyperparams(self, hyperparams) -> Dict: hyperparams = super(SelectPercentileBase, self).after_process_hyperparams(hyperparams) name2func = self.get_name2func() default_name = self.get_default_name() name = hyperparams.get("score_func", default_name) self.score_func = name2func[name] hyperparams.update({ "score_func": self.score_func }) return hyperparams
[docs] def before_fit_X(self, X): wrapper = DataFrameValuesWrapper(X) X = wrapper.array if X is None: return None X = deepcopy(X) if self.score_func == sklearn.feature_selection.chi2: X[X < 0] = 0.0 return wrapper.wrap_to_dataframe(X)
[docs] def before_trans_X(self, X): wrapper = DataFrameValuesWrapper(X) X = wrapper.array X = deepcopy(X) if self.score_func == sklearn.feature_selection.chi2: X[X < 0] = 0.0 return wrapper.wrap_to_dataframe(X)