# Source code for sklearn_utilities.dataset

from __future__ import annotations

import numpy as np
from numpy.random import RandomState
from pandas import DataFrame, Series

from .types import TX, TY


def _nan_capable_copy(values: np.ndarray) -> np.ndarray:
    """Return a copy of *values* whose dtype is able to store ``np.nan``."""
    # Boolean and (un)signed integer arrays cannot represent NaN: integers
    # raise on assignment and booleans silently coerce NaN to True.
    if values.dtype.kind in "biu":
        return values.astype(float)  # astype() already returns a new array
    return values.copy()


def add_missing_values(
    dataset: tuple[TX, TY],
    *,
    missing_rate_x: float = 0.5,
    missing_rate_y: float = 0.5,
    random_state: RandomState | int | None = None,
) -> tuple[TX, TY]:
    """Add missing values to a dataset.

    Every element of ``X`` (resp. ``y``) is independently replaced by
    ``np.nan`` when a uniform draw from ``[0, 1)`` falls below
    ``missing_rate_x`` (resp. ``missing_rate_y``). The inputs are not
    modified; masked copies are returned.

    Parameters
    ----------
    dataset : tuple[TX, TY]
        The dataset ``(X, y)`` to add missing values to. Pandas objects are
        converted to NumPy arrays for masking and rebuilt afterwards.
    missing_rate_x : float, optional
        The rate of missing values to add to X, by default 0.5
    missing_rate_y : float, optional
        The rate of missing values to add to y, by default 0.5
    random_state : RandomState | int | None, optional
        The random state to use, by default None. An existing
        ``RandomState`` instance is used directly (and therefore advanced);
        an ``int`` or ``None`` is used to seed a new ``RandomState``.

    Returns
    -------
    tuple[TX, TY]
        The dataset with missing values added. A ``DataFrame``/``Series``
        input is returned as the same pandas type with its original index,
        columns and name. Boolean and integer data are returned as float
        so that NaN can be stored.
    """
    # A RandomState instance is not a valid seed for RandomState(...), so it
    # must be reused as-is rather than wrapped.
    if not isinstance(random_state, RandomState):
        random_state = RandomState(random_state)

    X_full_, y_full_ = dataset
    X_full = X_full_.to_numpy() if isinstance(X_full_, DataFrame) else X_full_
    if isinstance(y_full_, (DataFrame, Series)):
        y_full = y_full_.to_numpy()
    else:
        y_full = y_full_

    # Draw the X mask before the y mask: the order fixes which random numbers
    # each mask receives, so it must stay stable for seeded reproducibility.
    X_mask = random_state.rand(*X_full.shape) < missing_rate_x
    y_mask = random_state.rand(*y_full.shape) < missing_rate_y

    X_missing = _nan_capable_copy(X_full)
    X_missing[X_mask] = np.nan
    y_missing = _nan_capable_copy(y_full)
    y_missing[y_mask] = np.nan

    # Restore the pandas containers (labels included) the caller passed in.
    if isinstance(X_full_, DataFrame):
        X_missing = DataFrame(
            X_missing, columns=X_full_.columns, index=X_full_.index
        )
    if isinstance(y_full_, Series):
        y_missing = Series(y_missing, index=y_full_.index, name=y_full_.name)
    elif isinstance(y_full_, DataFrame):
        y_missing = DataFrame(
            y_missing, columns=y_full_.columns, index=y_full_.index
        )
    return X_missing, y_missing