Source code for sklearn_utilities.append_prediction_to_x

from __future__ import annotations

from typing import Any, Generic, Sequence

import numpy as np
from joblib import Parallel, delayed
from pandas import DataFrame, Series, concat
from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted, clone
from typing_extensions import Self

from .types import TX, TEstimator


def generate_new_prefix(X: DataFrame, prefix: str = "y_pred_") -> str:
    """Generate a new, unused column-name prefix for a prediction.

    Scans the column labels of ``X`` for names of the form
    ``{prefix}{number}`` or ``{prefix}{number}_{anything}`` and returns
    ``{prefix}{largest number found + 1}``. When no column matches, the
    numbering starts at 0.

    Parameters
    ----------
    X : DataFrame
        The frame whose column labels are inspected.
    prefix : str, optional
        Literal text that precedes the running number, by default "y_pred_"

    Returns
    -------
    str
        ``prefix`` followed by the next free running number.
    """
    import re

    # ``prefix`` is matched literally (escaped). The ``_suffix`` part is
    # optional so that a column named exactly ``{prefix}{number}`` -- which is
    # how a 1D prediction is stored -- is counted as well.
    pattern = re.compile(rf"^{re.escape(prefix)}(\d+)(?:_.+)?$")

    # -1 so that the first generated prefix ends in 0, including for frames
    # that have no columns at all.
    last_number = -1
    for column in X.columns:
        # Non-string labels (ints, tuples, ...) can never carry the prefix.
        if not isinstance(column, str):
            continue
        match = pattern.match(column)
        if match is not None:
            last_number = max(last_number, int(match.group(1)))
    return f"{prefix}{last_number + 1}"
class AppendPredictionToXSingle(BaseEstimator, TransformerMixin, Generic[TEstimator]):
    """Append the prediction of the estimator to X.

    To use multiple estimators, use AppendPredictionToX instead.
    """

    estimator: TEstimator
    estimator_: TEstimator
    """The fitted estimator."""

    def __init__(self, estimator: TEstimator, *, concat: bool = True) -> None:
        """Append the prediction of the estimator to X.

        If pandas DataFrame is given, the prediction is added as
        a new column with the name "y_pred_{estimator.__class__.__name__}_{i}"
        if the prediction is 1D, or
        "y_pred_{estimator.__class__.__name__}_{i}_{column_name}"
        if the prediction is 2D.

        To use multiple estimators, use AppendPredictionToX instead.

        Parameters
        ----------
        estimator : Any
            The estimator to be wrapped.
        concat : bool, optional
            Whether to concatenate the prediction to X, by default True
        """
        self.estimator = estimator
        self.concat = concat

    def fit(self, X: Any, y: Any = None, **fit_params: Any) -> Self:
        """Fit a clone of the wrapped estimator.

        Parameters
        ----------
        X : Any
            Training data, forwarded to the estimator's ``fit``.
        y : Any, optional
            Target values, forwarded to the estimator's ``fit``.
        **fit_params : Any
            Extra keyword arguments forwarded to the estimator's ``fit``.

        Returns
        -------
        Self
            This transformer, with ``estimator_`` set.
        """
        # Fit a clone so that the estimator passed to __init__ stays unfitted.
        self.estimator_ = clone(self.estimator).fit(X, y, **fit_params)
        return self

    def transform(self, X: TX, y: Any = None, **predict_params: Any) -> TX:
        """Append the prediction of the estimator to X.

        If pandas DataFrame is given, the prediction is added as
        a new column with the name "y_pred_{estimator.__class__.__name__}_{i}"
        if the prediction is 1D, or
        "y_pred_{estimator.__class__.__name__}_{i}_{column_name}"
        if the prediction is 2D.

        Parameters
        ----------
        X : TX
            Data to predict on. A DataFrame takes the pandas path; anything
            else is handled with NumPy.
        y : Any, optional
            Not used by this method.
        **predict_params : Any
            Extra keyword arguments forwarded to the estimator's ``predict``.

        Returns
        -------
        TX
            ``X`` with the prediction appended column-wise if ``concat`` is
            true, otherwise only the prediction (always 2D).
        """
        check_is_fitted(self, "estimator_")
        y_pred = self.estimator_.predict(X, **predict_params)

        if isinstance(X, DataFrame):
            # pandas
            prefix = f"y_pred_{self.estimator_.__class__.__name__}_"
            prefix = generate_new_prefix(X, prefix)

            if isinstance(y_pred, (Series, DataFrame)):
                # Predictions correspond to the rows of X by position. Handing
                # a pandas object to the Series/DataFrame constructor together
                # with ``index=`` would reindex it by label instead and yield
                # NaN wherever the labels differ, so relabel it first.
                y_pred = y_pred.copy()
                y_pred.index = X.index

            if y_pred.ndim == 1:
                y_pred = Series(y_pred, index=X.index, name=prefix).to_frame()
            else:
                y_pred = DataFrame(y_pred, index=X.index)
                y_pred = y_pred.add_prefix(prefix + "_")

            # Keep only characters that are safe in column names.
            y_pred.columns = y_pred.columns.str.replace(
                r"[^a-zA-Z0-9_]", "_", regex=True
            )

            if self.concat:
                return concat([X, y_pred], axis=1)
            return y_pred

        # numpy
        if y_pred.ndim == 1:
            # Make the prediction a column so it can be stacked next to X.
            y_pred = y_pred.reshape(-1, 1)
        if self.concat:
            return np.hstack([X, y_pred])
        return y_pred
class AppendPredictionToX(BaseEstimator, TransformerMixin, Generic[TEstimator]):
    """Append the prediction of the estimators to X."""

    estimators: Sequence[TEstimator]
    estimators_: Sequence[TEstimator]

    def __init__(
        self,
        estimators: Sequence[TEstimator] | TEstimator,
        *,
        concat: bool = True,
        n_jobs: int | None = -1,
    ) -> None:
        """Append the prediction of the estimators to X.

        If pandas DataFrame is given, the prediction is added as
        a new column with the name
        "y_pred_{estimator.__class__.__name__}_{i}_{estimator_index}"
        if the prediction is 1D, or
        "y_pred_{estimator.__class__.__name__}_{i}_{column_name}_{estimator_index}"
        if the prediction is 2D.

        Parameters
        ----------
        estimators : Sequence[TEstimator] | TEstimator
            The estimator(s) to be wrapped.
        concat : bool, optional
            Whether to concatenate the prediction to X, by default True
        n_jobs : int | None, optional
            The number of jobs to run in parallel, by default -1
        """
        # A single estimator is accepted for convenience and wrapped in a list.
        if not isinstance(estimators, Sequence):
            estimators = [estimators]
        self.estimators = estimators
        self.concat = concat
        self.n_jobs = n_jobs

    def fit(self, X: Any, y: Any = None, **fit_params: Any) -> Self:
        """Fit one AppendPredictionToXSingle per estimator.

        Parameters
        ----------
        X : Any
            Training data, forwarded to every estimator's ``fit``.
        y : Any, optional
            Target values, forwarded to every estimator's ``fit``.
        **fit_params : Any
            Extra keyword arguments forwarded to every estimator's ``fit``.

        Returns
        -------
        Self
            This transformer, with ``estimators_`` set.

        Raises
        ------
        RuntimeError
            If the parallel run returns no result.
        """
        # Each wrapper is created with concat=False so that it returns only
        # its own prediction; joining with X happens once, in transform().
        estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(AppendPredictionToXSingle(clone(estimator), concat=False).fit)(
                X, y, **fit_params
            )
            for estimator in self.estimators
        )
        if estimators_ is None:
            raise RuntimeError("Failed to fit estimators")
        self.estimators_ = estimators_
        return self

    def transform(self, X: TX, y: Any = None, **predict_params: Any) -> TX:
        """Append the prediction of the estimators to X.

        If pandas DataFrame is given, the prediction is added as
        a new column with the name
        "y_pred_{estimator.__class__.__name__}_{i}_{estimator_index}"
        if the prediction is 1D, or
        "y_pred_{estimator.__class__.__name__}_{i}_{column_name}_{estimator_index}"
        if the prediction is 2D.

        Parameters
        ----------
        X : TX
            Data to predict on. A DataFrame takes the pandas path; anything
            else is handled with NumPy.
        y : Any, optional
            Forwarded to each fitted wrapper's ``transform``.
        **predict_params : Any
            Extra keyword arguments forwarded to each estimator's ``predict``.

        Returns
        -------
        TX
            ``X`` with all predictions appended column-wise if ``concat`` is
            true, otherwise only the predictions.
        """
        check_is_fitted(self, "estimators_")

        transformed: list[TX] = []
        for i, estimator_ in enumerate(self.estimators_):
            y_pred = estimator_.transform(X, y, **predict_params)
            # Only DataFrames have column names to disambiguate; for NumPy
            # input the per-estimator result is an ndarray, which has no
            # ``add_suffix`` method.
            if isinstance(y_pred, DataFrame):
                y_pred = y_pred.add_suffix(f"_{i}")
            transformed.append(y_pred)

        if isinstance(X, DataFrame):
            if self.concat:
                return concat([X, *transformed], axis=1)
            return concat(transformed, axis=1)

        if self.concat:
            return np.hstack([X, *transformed])
        return np.hstack(transformed)