Source code for sklearn_utilities.eval_set

from __future__ import annotations

import importlib.util
import warnings
from typing import Any, Generic, Literal

from numpy.random import RandomState
from sklearn.model_selection import train_test_split
from typing_extensions import Self

from .estimator_wrapper import EstimatorWrapperBase
from .types import TEstimator



[docs]
class EvalSetWrapper(EstimatorWrapperBase[TEstimator], Generic[TEstimator]):
    """Wrapper that holds out a validation split and passes it as ``eval_set``.

    ``fit`` splits ``X``/``y`` with ``train_test_split``, fits the wrapped
    estimator on the training part and hands the held-out part to it through
    the ``eval_set`` fit parameter.
    """

    def __init__(
        self,
        estimator: TEstimator,
        *,
        test_size: float | int | None = None,
        train_size: float | int | None = None,
        random_state: int | RandomState | None = None,
        shuffle: bool = True,
        stratify: bool = False,
        **kwargs: Any,
    ) -> None:
        """Store the wrapped estimator and the split configuration.

        Parameters
        ----------
        estimator : Any
            The estimator to wrap. Its ``fit`` is called with an ``eval_set``
            keyword argument.
        test_size : float or int, default=None
            If float, should be between 0.0 and 1.0 and represent the proportion
            of the dataset to include in the test split. If int, represents the
            absolute number of test samples. If None, the value is set to the
            complement of the train size. If ``train_size`` is also None, it will
            be set to 0.25.
            Alias: ``validation_fraction`` (used only when ``test_size`` is None)
        train_size : float or int, default=None
            If float, should be between 0.0 and 1.0 and represent the
            proportion of the dataset to include in the train split. If
            int, represents the absolute number of train samples. If None,
            the value is automatically set to the complement of the test size.
            Alias: ``train_fraction`` (used only when ``train_size`` is None)
        random_state : int, RandomState instance or None, default=None
            Controls the shuffling applied to the data before applying the split.
            Pass an int for reproducible output across multiple function calls.
            See :term:`Glossary <random_state>`.
        shuffle : bool, optional
            Whether or not to shuffle the data before splitting. If shuffle=False
            then stratify must be False. by default True
        stratify : bool, optional
            Whether or not to stratify the split by ``y``. If stratify=True,
            y must be categorical. by default False
        **kwargs : Any
            ``validation_fraction`` : alias for ``test_size`` (``sklearn`` style)
            ``train_fraction`` : alias for ``train_size`` (``sklearn`` style)
            Any other key is stored as well but triggers a warning.
        """
        super().__init__(estimator)
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        self.shuffle = shuffle
        self.stratify = stratify
        self.kwargs = kwargs

        known_aliases = ("validation_fraction", "train_fraction")
        for key, value in kwargs.items():
            if key not in known_aliases:
                # stacklevel=2 attributes the warning to the code that
                # constructed the wrapper instead of to this module.
                warnings.warn(f"Unknown parameter: {key}: {value}", stacklevel=2)

    def fit(self, X: Any, y: Any, **fit_params: Any) -> Self:
        """Fit the estimator with `eval_set` set to the validation set.

        Parameters
        ----------
        X : Any
            The training input samples.
        y : Any
            The target values.
        **fit_params : Any
            Extra keyword arguments forwarded to the wrapped estimator's
            ``fit``. An ``eval_set`` entry given here is replaced by the
            held-out split.

        Returns
        -------
        Self
            The fitted estimator.
        """
        # Fall back to the sklearn-style aliases only when the primary
        # parameter was not given at all. Testing against None (rather than
        # truthiness) keeps an explicit falsy value such as 0 from being
        # silently swapped for the alias or the default split size.
        test_size = self.test_size
        if test_size is None:
            test_size = self.kwargs.get("validation_fraction")
        train_size = self.train_size
        if train_size is None:
            train_size = self.kwargs.get("train_fraction")

        X_train, X_valid, y_train, y_valid = train_test_split(
            X,
            y,
            test_size=test_size,
            train_size=train_size,
            random_state=self.random_state,
            shuffle=self.shuffle,
            # The boolean flag means "stratify by the target itself".
            stratify=y if self.stratify else None,
        )
        # The right-hand operand wins, so the held-out split always ends up
        # under ``eval_set``.
        merged_params = fit_params | {"eval_set": [(X_valid, y_valid)]}
        self.estimator.fit(X_train, y_train, **merged_params)
        return self




# Everything in this branch is defined only when the optional ``catboost``
# package can be located; otherwise the names below do not exist.
if importlib.util.find_spec("catboost") is not None:
    import re
    from typing import TypeVar

    import tqdm
    from catboost import CatBoost

    # Type variable restricted to CatBoost estimators.
    # NOTE(review): not referenced by the code visible in this file section.
    TCatBoost = TypeVar("TCatBoost", bound=CatBoost)


[docs]
    class CatBoostProgressBarWrapper(
        EstimatorWrapperBase[TEstimator], Generic[TEstimator]
    ):
        """A wrapper that shows the training log of the wrapped estimator as a
        `tqdm` progress bar.

        ``fit`` passes a writable object as the ``log_cout`` fit parameter of
        the wrapped estimator; every metrics line written to it advances the
        bar and updates its postfix. This class does not split the data
        itself.

        It is recommended to set `iterations` in `CatBoost.__init__` to show the
        progress bar.
        It is recommended to set `early_stopping_rounds` in `CatBoost.__init__`
        to enable early stopping."""

        def __init__(
            self,
            estimator: TEstimator,
            *,
            tqdm_cls: Literal[
                "auto",
                "autonotebook",
                "std",
                "notebook",
                "asyncio",
                "keras",
                "dask",
                "tk",
                "gui",
                "rich",
                "contrib.slack",
                "contrib.discord",
                "contrib.telegram",
                "contrib.bells",
            ]
            | type[tqdm.std.tqdm] = "auto",
            tqdm_kwargs: dict[str, Any] | None = None,
            verbose: bool = True,
        ) -> None:
            """It is recommended to set `iterations` in `CatBoost.__init__` to show the
            progress bar.

            Parameters
            ----------
            estimator : Any
                The estimator to wrap.
            tqdm_cls : Literal['auto', 'autonotebook', 'std', 'notebook', 'asyncio',
                'keras', 'dask', 'tk', 'gui', 'rich', 'contrib.slack', 'contrib.discord',
                'contrib.telegram', 'contrib.bells'] or type[tqdm.std.tqdm], optional
                The tqdm class or module name, by default 'auto'. A string is
                resolved by importing ``tqdm.<name>`` and taking its ``tqdm``
                attribute.
            tqdm_kwargs : dict[str, Any] or None, optional
                The keyword arguments passed to the tqdm class initializer.
                A ``total`` entry is ignored (a warning is emitted).
            verbose : bool, optional
                Whether to show other logs, by default True

            Examples
            --------
            >>> from catboost import CatBoostRegressor
            >>> from sklearn.datasets import make_regression
            >>> from sklearn_utilities import CatBoostProgressBarWrapper, EvalSetWrapper
            >>> X, y = make_regression()
            >>> # `iterations` is recommended to be set to show the progress bar.
            >>> # `early_stopping_rounds` should be set to enable early stopping.
            >>> estimator = CatBoostRegressor(iterations=1000, early_stopping_rounds=10)
            >>> estimator = EvalSetWrapper(estimator)
            >>> estimator = CatBoostProgressBarWrapper(estimator)
            >>> estimator.fit(X, y)
            """
            super().__init__(
                estimator,
            )
            self.tqdm_cls = tqdm_cls
            self.tqdm_kwargs = tqdm_kwargs
            self.verbose = verbose

            # Resolve the user-facing settings into the objects ``fit`` uses.
            if isinstance(tqdm_cls, str):
                tqdm_module = importlib.import_module(f"tqdm.{tqdm_cls}")
                self.tqdm_cls_ = getattr(tqdm_module, "tqdm")
            else:
                self.tqdm_cls_ = tqdm_cls
            self.tqdm_kwargs_ = tqdm_kwargs or {}
            if "total" in self.tqdm_kwargs_:
                # ``fit`` always overrides ``total`` with the iteration count.
                # stacklevel=2 points the warning at the caller's code.
                warnings.warn(
                    "'total' in tqdm_kwargs is ignored.", UserWarning, stacklevel=2
                )

        def fit(self, X: Any, y: Any, **fit_params: Any) -> Self:
            """Fit the wrapped estimator while displaying a progress bar.

            Parameters
            ----------
            X : Any
                The training input samples.
            y : Any
                The target values.
            **fit_params : Any
                Extra keyword arguments forwarded to the wrapped estimator's
                ``fit``. A ``log_cout`` entry given here is replaced by the
                progress-bar writer.

            Returns
            -------
            Self
                The fitted estimator.
            """
            # The nested class uses a conventional ``self``; the enclosing
            # wrapper is reached through this closure variable.
            wrapper = self

            class ProgressBarPrint:
                """Writable object that turns log lines into progress-bar updates."""

                def __init__(self) -> None:
                    # Created lazily on the first ``write`` call.
                    self.pbar: tqdm.std.tqdm | None = None

                def write(self, text: str) -> None:
                    if self.pbar is None:
                        params = wrapper.estimator._get_params()
                        # First truthy value among the iteration-count aliases;
                        # None leaves the bar without a known total.
                        iterations = (
                            params.get("iterations", None)
                            or params.get("n_estimators", None)
                            or params.get("num_boost_round", None)
                            or params.get("num_trees", None)
                            or params.get("num_iterations", None)
                        )  # noqa
                        self.pbar = wrapper.tqdm_cls_(
                            **(
                                wrapper.tqdm_kwargs_
                                | {
                                    "total": iterations,
                                }
                            ),
                        )

                    # Lines that do not start with "<iteration>:" are not
                    # metrics; relay them verbatim when verbose.
                    if not re.match(r"^\d+:", text):
                        if wrapper.verbose:
                            self.pbar.write(text)
                        return

                    try:
                        # 0:      learn: 221.7751345      test: 210.0125818       test1: 210.0125818
                        # best: 210.0125818 (0)   total: 196ms    remaining: 19.4s
                        # Drop the "label: " prefixes so only numbers remain.
                        stripped = re.sub(r"[^\s]*: ", "", text)
                        numbers = [
                            float(x)
                            for x in re.findall(r"[-+]?\d*\.\d+|\d+", stripped)
                        ]
                        n_iter = int(numbers[0]) + 1
                        # NOTE(review): the slices below assume ``total`` and
                        # ``remaining`` contribute exactly one number each —
                        # confirm for durations printed like "1m 5s".
                        if "(" in stripped:
                            # early stopping
                            scores = numbers[1:-4]
                            best_score = numbers[-4]
                            best_iter = int(numbers[-3])
                            scores_text = ", ".join(f"{score:g}" for score in scores)
                            # Relation of the second score to the best score.
                            if best_score == scores[1]:
                                relation = "="
                            elif best_score > scores[1]:
                                relation = "<"
                            else:
                                relation = ">"
                            self.pbar.set_postfix_str(
                                f"{scores_text}"
                                f"{relation}"
                                f"{best_score:g}@{best_iter:g}it",
                            )
                        else:
                            scores = numbers[1:-2]
                            self.pbar.set_postfix_str(
                                f"{', '.join(f'{score:g}' for score in scores)}"
                            )

                        # Advance by the difference so the bar position
                        # follows the reported iteration number.
                        self.pbar.update(n_iter - self.pbar.n)
                    except Exception:
                        # Best effort: a parsing problem must not abort
                        # training. Relay the original, unmodified line.
                        if wrapper.verbose:
                            self.pbar.write(text)

            printer = ProgressBarPrint()
            try:
                self.estimator.fit(X, y, **(fit_params | {"log_cout": printer}))
            finally:
                # Close the bar deterministically (also when fit raises)
                # instead of relying on garbage collection.
                if printer.pbar is not None:
                    printer.pbar.close()
            return self



    if __name__ == "__main__":
        # Manual demo: train a two-target CatBoost regressor through both
        # wrappers on synthetic data.
        from catboost import CatBoostRegressor
        from sklearn.datasets import make_regression
        from sklearn.metrics import mean_squared_error

        features, targets = make_regression(n_targets=2, random_state=0)
        X_train, X_test, y_train, y_test = train_test_split(features, targets)

        booster = CatBoostRegressor(
            iterations=100,
            learning_rate=0.4,
            early_stopping_rounds=10,
            objective="MultiRMSE",
        )
        # Inner wrapper supplies ``eval_set``; outer wrapper draws the bar.
        with_eval_set = EvalSetWrapper(booster)
        estimator = CatBoostProgressBarWrapper(with_eval_set)
        estimator.fit(X_train, y_train)

        # The error is computed but neither printed nor stored.
        predictions = estimator.predict(X_test)
        mean_squared_error(y_test, predictions)