# Authors: Nicolas Tresegnie # Sergey Feldman # License: BSD 3 clause import numbers import warnings import numpy as np import numpy.ma as ma from scipy import sparse from scipy import stats from ..base import BaseEstimator, TransformerMixin from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils import is_scalar_nan def _check_inputs_dtype(X, missing_values): if (X.dtype.kind in ("f", "i", "u") and not isinstance(missing_values, numbers.Real)): raise ValueError("'X' and 'missing_values' types are expected to be" " both numerical. Got X.dtype={} and " " type(missing_values)={}." .format(X.dtype, type(missing_values))) def _most_frequent(array, extra_value, n_repeat): """Compute the most frequent value in a 1d array extended with [extra_value] * n_repeat, where extra_value is assumed to be not part of the array.""" # Compute the most frequent value in array only if array.size > 0: with warnings.catch_warnings(): # stats.mode raises a warning when input array contains objects due # to incapacity to detect NaNs. Irrelevant here since input array # has already been NaN-masked. warnings.simplefilter("ignore", RuntimeWarning) mode = stats.mode(array) most_frequent_value = mode[0][0] most_frequent_count = mode[1][0] else: most_frequent_value = 0 most_frequent_count = 0 # Compare to array + [extra_value] * n_repeat if most_frequent_count == 0 and n_repeat == 0: return np.nan elif most_frequent_count < n_repeat: return extra_value elif most_frequent_count > n_repeat: return most_frequent_value elif most_frequent_count == n_repeat: # Ties the breaks. Copy the behaviour of scipy.stats.mode if most_frequent_value < extra_value: return most_frequent_value else: return extra_value class _BaseImputer(TransformerMixin, BaseEstimator): """Base class for all imputers. It adds automatically support for `add_indicator`. """ def __init__(self, *, missing_values=np.nan, add_indicator=False): self.missing_values = missing_values self.add_indicator = add_indicator def _fit_indicator(self, X): """Fit a MissingIndicator.""" if self.add_indicator: self.indicator_ = MissingIndicator( missing_values=self.missing_values, error_on_new=False ) self.indicator_.fit(X) else: self.indicator_ = None def _transform_indicator(self, X): """Compute the indicator mask.' Note that X must be the original data as passed to the imputer before any imputation, since imputation may be done inplace in some cases. """ if self.add_indicator: if not hasattr(self, 'indicator_'): raise ValueError( "Make sure to call _fit_indicator before " "_transform_indicator" ) return self.indicator_.transform(X) def _concatenate_indicator(self, X_imputed, X_indicator): """Concatenate indicator mask with the imputed data.""" if not self.add_indicator: return X_imputed hstack = sparse.hstack if sparse.issparse(X_imputed) else np.hstack if X_indicator is None: raise ValueError( "Data from the missing indicator are not provided. Call " "_fit_indicator and _transform_indicator in the imputer " "implementation." ) return hstack((X_imputed, X_indicator)) def _more_tags(self): return {'allow_nan': is_scalar_nan(self.missing_values)} class SimpleImputer(_BaseImputer): """Imputation transformer for completing missing values. Read more in the :ref:`User Guide `. .. versionadded:: 0.20 `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer` estimator which is now removed. Parameters ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. strategy : string, default='mean' The imputation strategy. - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. - If "median", then replace missing values using the median along each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. .. versionadded:: 0.20 strategy="constant" for fixed value imputation. fill_value : string or numerical value, default=None When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types. verbose : integer, default=0 Controls the verbosity of the imputer. copy : boolean, default=True If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, in the following cases, a new copy will always be made, even if `copy=False`: - If X is not an array of floating values; - If X is encoded as a CSR matrix; - If add_indicator=True. add_indicator : boolean, default=False If True, a :class:`MissingIndicator` transform will stack onto output of the imputer's transform. This allows a predictive estimator to account for missingness despite imputation. If a feature has no missing values at fit/train time, the feature won't appear on the missing indicator even if there are missing values at transform/test time. Attributes ---------- statistics_ : array of shape (n_features,) The imputation fill value for each feature. Computing statistics can result in `np.nan` values. During :meth:`transform`, features corresponding to `np.nan` statistics will be discarded. indicator_ : :class:`sklearn.impute.MissingIndicator` Indicator used to add binary indicators for missing values. ``None`` if add_indicator is False. See also -------- IterativeImputer : Multivariate imputation of missing values. Examples -------- >>> import numpy as np >>> from sklearn.impute import SimpleImputer >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) SimpleImputer() >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] >>> print(imp_mean.transform(X)) [[ 7. 2. 3. ] [ 4. 3.5 6. ] [10. 3.5 9. ]] Notes ----- Columns which only contained missing values at :meth:`fit` are discarded upon :meth:`transform` if strategy is not "constant". """ @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): super().__init__( missing_values=missing_values, add_indicator=add_indicator ) self.strategy = strategy self.fill_value = fill_value self.verbose = verbose self.copy = copy def _validate_input(self, X, in_fit): allowed_strategies = ["mean", "median", "most_frequent", "constant"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " " got strategy={1}".format(allowed_strategies, self.strategy)) if self.strategy in ("most_frequent", "constant"): dtype = None else: dtype = FLOAT_DTYPES if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" try: X = self._validate_data(X, reset=in_fit, accept_sparse='csc', dtype=dtype, force_all_finite=force_all_finite, copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): new_ve = ValueError("Cannot use {} strategy with non-numeric " "data:\n{}".format(self.strategy, ve)) raise new_ve from None else: raise ve _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("SimpleImputer does not support data with dtype " "{0}. Please provide either a numeric array (with" " a floating point or integer dtype) or " "categorical data represented either as an array " "with integer dtype or an array of string values " "with an object dtype.".format(X.dtype)) return X def fit(self, X, y=None): """Fit the imputer on X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Returns ------- self : SimpleImputer """ X = self._validate_input(X, in_fit=True) super()._fit_indicator(X) # default fill_value is 0 for numerical input and "missing_value" # otherwise if self.fill_value is None: if X.dtype.kind in ("i", "u", "f"): fill_value = 0 else: fill_value = "missing_value" else: fill_value = self.fill_value # fill_value should be numerical in case of numerical input if (self.strategy == "constant" and X.dtype.kind in ("i", "u", "f") and not isinstance(fill_value, numbers.Real)): raise ValueError("'fill_value'={0} is invalid. Expected a " "numerical value when imputing numerical " "data".format(fill_value)) if sparse.issparse(X): # missing_values = 0 not allowed with sparse data as it would # force densification if self.missing_values == 0: raise ValueError("Imputation not possible when missing_values " "== 0 and input is sparse. Provide a dense " "array instead.") else: self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, fill_value) return self def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" mask_data = _get_mask(X.data, missing_values) n_implicit_zeros = X.shape[0] - np.diff(X.indptr) statistics = np.empty(X.shape[1]) if strategy == "constant": # for constant strategy, self.statistcs_ is used to store # fill_value in each column statistics.fill(fill_value) else: for i in range(X.shape[1]): column = X.data[X.indptr[i]:X.indptr[i + 1]] mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] column = column[~mask_column] # combine explicit and implicit zeros mask_zeros = _get_mask(column, 0) column = column[~mask_zeros] n_explicit_zeros = mask_zeros.sum() n_zeros = n_implicit_zeros[i] + n_explicit_zeros if strategy == "mean": s = column.size + n_zeros statistics[i] = np.nan if s == 0 else column.sum() / s elif strategy == "median": statistics[i] = _get_median(column, n_zeros) elif strategy == "most_frequent": statistics[i] = _most_frequent(column, 0, n_zeros) return statistics def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) # Mean if strategy == "mean": mean_masked = np.ma.mean(masked_X, axis=0) # Avoid the warning "Warning: converting a masked element to nan." mean = np.ma.getdata(mean_masked) mean[np.ma.getmask(mean_masked)] = np.nan return mean # Median elif strategy == "median": median_masked = np.ma.median(masked_X, axis=0) # Avoid the warning "Warning: converting a masked element to nan." median = np.ma.getdata(median_masked) median[np.ma.getmaskarray(median_masked)] = np.nan return median # Most frequent elif strategy == "most_frequent": # Avoid use of scipy.stats.mstats.mode due to the required # additional overhead and slow benchmarking performance. # See Issue 14325 and PR 14399 for full discussion. # To be able access the elements by columns X = X.transpose() mask = mask.transpose() if X.dtype.kind == "O": most_frequent = np.empty(X.shape[0], dtype=object) else: most_frequent = np.empty(X.shape[0]) for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): row_mask = np.logical_not(row_mask).astype(np.bool) row = row[row_mask] most_frequent[i] = _most_frequent(row, np.nan, 0) return most_frequent # Constant elif strategy == "constant": # for constant strategy, self.statistcs_ is used to store # fill_value in each column return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): """Impute all missing values in X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. """ check_is_fitted(self) X = self._validate_input(X, in_fit=False) X_indicator = super()._transform_indicator(X) statistics = self.statistics_ if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) # Delete the invalid columns if strategy is not constant if self.strategy == "constant": valid_statistics = statistics else: # same as np.isnan but also works for object dtypes invalid_mask = _get_mask(statistics, np.nan) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) if invalid_mask.any(): missing = np.arange(X.shape[1])[invalid_mask] if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] # Do actual imputation if sparse.issparse(X): if self.missing_values == 0: raise ValueError("Imputation not possible when missing_values " "== 0 and input is sparse. Provide a dense " "array instead.") else: mask = _get_mask(X.data, self.missing_values) indexes = np.repeat( np.arange(len(X.indptr) - 1, dtype=np.int), np.diff(X.indptr))[mask] X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False) else: mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) coordinates = np.where(mask.transpose())[::-1] X[coordinates] = values return super()._concatenate_indicator(X, X_indicator) class MissingIndicator(TransformerMixin, BaseEstimator): """Binary indicators for missing values. Note that this component typically should not be used in a vanilla :class:`Pipeline` consisting of transformers and a classifier, but rather could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. Read more in the :ref:`User Guide `. .. versionadded:: 0.20 Parameters ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For pandas' dataframes with nullable integer dtypes with missing values, `missing_values` should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. features : str, default=None Whether the imputer mask should represent all or a subset of features. - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. - If "all", the imputer mask will represent all features. sparse : boolean or "auto", default=None Whether the imputer mask format should be sparse or dense. - If "auto" (default), the imputer mask will be of same type as input. - If True, the imputer mask will be a sparse matrix. - If False, the imputer mask will be a numpy array. error_on_new : boolean, default=None If True (default), transform will raise an error when there are features with missing values in transform that have no missing values in fit. This is applicable only when ``features="missing-only"``. Attributes ---------- features_ : ndarray, shape (n_missing_features,) or (n_features,) The features indices which will be returned when calling ``transform``. They are computed during ``fit``. For ``features='all'``, it is to ``range(n_features)``. Examples -------- >>> import numpy as np >>> from sklearn.impute import MissingIndicator >>> X1 = np.array([[np.nan, 1, 3], ... [4, 0, np.nan], ... [8, 1, 0]]) >>> X2 = np.array([[5, 1, np.nan], ... [np.nan, 2, 3], ... [2, 4, 0]]) >>> indicator = MissingIndicator() >>> indicator.fit(X1) MissingIndicator() >>> X2_tr = indicator.transform(X2) >>> X2_tr array([[False, True], [ True, False], [False, False]]) """ @_deprecate_positional_args def __init__(self, *, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features self.sparse = sparse self.error_on_new = error_on_new def _get_missing_features_info(self, X): """Compute the imputer mask and the indices of the features containing missing values. Parameters ---------- X : {ndarray or sparse matrix}, shape (n_samples, n_features) The input data with missing values. Note that ``X`` has been checked in ``fit`` and ``transform`` before to call this function. Returns ------- imputer_mask : {ndarray or sparse matrix}, shape \ (n_samples, n_features) The imputer mask of the original data. features_with_missing : ndarray, shape (n_features_with_missing) The features containing missing values. """ if sparse.issparse(X): mask = _get_mask(X.data, self.missing_values) # The imputer mask will be constructed with the same sparse format # as X. sparse_constructor = (sparse.csr_matrix if X.format == 'csr' else sparse.csc_matrix) imputer_mask = sparse_constructor( (mask, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool) imputer_mask.eliminate_zeros() if self.features == 'missing-only': n_missing = imputer_mask.getnnz(axis=0) if self.sparse is False: imputer_mask = imputer_mask.toarray() elif imputer_mask.format == 'csr': imputer_mask = imputer_mask.tocsc() else: imputer_mask = _get_mask(X, self.missing_values) if self.features == 'missing-only': n_missing = imputer_mask.sum(axis=0) if self.sparse is True: imputer_mask = sparse.csc_matrix(imputer_mask) if self.features == 'all': features_indices = np.arange(X.shape[1]) else: features_indices = np.flatnonzero(n_missing) return imputer_mask, features_indices def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" X = self._validate_data(X, reset=in_fit, accept_sparse=('csc', 'csr'), dtype=None, force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " "dtype {0}. Please provide either a numeric array" " (with a floating point or integer dtype) or " "categorical data represented either as an array " "with integer dtype or an array of string values " "with an object dtype.".format(X.dtype)) if sparse.issparse(X) and self.missing_values == 0: # missing_values = 0 not allowed with sparse data as it would # force densification raise ValueError("Sparse input with missing_values=0 is " "not supported. Provide a dense " "array instead.") return X def _fit(self, X, y=None): """Fit the transformer on X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Returns ------- imputer_mask : {ndarray or sparse matrix}, shape (n_samples, \ n_features) The imputer mask of the original data. """ X = self._validate_input(X, in_fit=True) self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): raise ValueError("'features' has to be either 'missing-only' or " "'all'. Got {} instead.".format(self.features)) if not ((isinstance(self.sparse, str) and self.sparse == "auto") or isinstance(self.sparse, bool)): raise ValueError("'sparse' has to be a boolean or 'auto'. " "Got {!r} instead.".format(self.sparse)) missing_features_info = self._get_missing_features_info(X) self.features_ = missing_features_info[1] return missing_features_info[0] def fit(self, X, y=None): """Fit the transformer on X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. Returns ------- self : object Returns self. """ self._fit(X, y) return self def transform(self, X): """Generate missing values indicator for X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. Returns ------- Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \ or (n_samples, n_features_with_missing) The missing indicator for input data. The data type of ``Xt`` will be boolean. """ check_is_fitted(self) X = self._validate_input(X, in_fit=False) if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " "than during fitting.") imputer_mask, features = self._get_missing_features_info(X) if self.features == "missing-only": features_diff_fit_trans = np.setdiff1d(features, self.features_) if (self.error_on_new and features_diff_fit_trans.size > 0): raise ValueError("The features {} have missing values " "in transform but have no missing values " "in fit.".format(features_diff_fit_trans)) if self.features_.size < self._n_features: imputer_mask = imputer_mask[:, self.features_] return imputer_mask def fit_transform(self, X, y=None): """Generate missing values indicator for X. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. Returns ------- Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \ or (n_samples, n_features_with_missing) The missing indicator for input data. The data type of ``Xt`` will be boolean. """ imputer_mask = self._fit(X, y) if self.features_.size < self._n_features: imputer_mask = imputer_mask[:, self.features_] return imputer_mask def _more_tags(self): return {'allow_nan': True, 'X_types': ['2darray', 'string']}