Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
16
venv/Lib/site-packages/sklearn/impute/__init__.py
Normal file
@@ -0,0 +1,16 @@
"""Transformers for missing value imputation"""
import typing

from ._base import MissingIndicator, SimpleImputer
from ._knn import KNNImputer

if typing.TYPE_CHECKING:
    # Avoid errors in type checkers (e.g. mypy) for experimental estimators.
    # TODO: remove this check once the estimator is no longer experimental.
    from ._iterative import IterativeImputer  # noqa

__all__ = [
    'MissingIndicator',
    'SimpleImputer',
    'KNNImputer'
]
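For orientation, a minimal usage sketch of the public API exported above (the toy matrix and the calls below are illustrative only, not part of the committed files):

import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
# IterativeImputer is experimental and must be enabled before import:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
print(SimpleImputer().fit_transform(X))  # NaN in column 0 replaced by its mean, 4.0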
Binary file not shown.
Binary file not shown.
735
venv/Lib/site-packages/sklearn/impute/_base.py
Normal file
@@ -0,0 +1,735 @@
# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>
#          Sergey Feldman <sergeyfeldman@gmail.com>
# License: BSD 3 clause

import numbers
import warnings

import numpy as np
import numpy.ma as ma
from scipy import sparse
from scipy import stats

from ..base import BaseEstimator, TransformerMixin
from ..utils.sparsefuncs import _get_median
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _deprecate_positional_args
from ..utils._mask import _get_mask
from ..utils import is_scalar_nan


def _check_inputs_dtype(X, missing_values):
    if (X.dtype.kind in ("f", "i", "u") and
            not isinstance(missing_values, numbers.Real)):
        raise ValueError("'X' and 'missing_values' types are expected to be"
                         " both numerical. Got X.dtype={} and "
                         " type(missing_values)={}."
                         .format(X.dtype, type(missing_values)))


def _most_frequent(array, extra_value, n_repeat):
    """Compute the most frequent value in a 1d array extended with
    [extra_value] * n_repeat, where extra_value is assumed to be not part
    of the array."""
    # Compute the most frequent value in array only
    if array.size > 0:
        with warnings.catch_warnings():
            # stats.mode raises a warning when input array contains objects due
            # to incapacity to detect NaNs. Irrelevant here since input array
            # has already been NaN-masked.
            warnings.simplefilter("ignore", RuntimeWarning)
            mode = stats.mode(array)

        most_frequent_value = mode[0][0]
        most_frequent_count = mode[1][0]
    else:
        most_frequent_value = 0
        most_frequent_count = 0

    # Compare to array + [extra_value] * n_repeat
    if most_frequent_count == 0 and n_repeat == 0:
        return np.nan
    elif most_frequent_count < n_repeat:
        return extra_value
    elif most_frequent_count > n_repeat:
        return most_frequent_value
    elif most_frequent_count == n_repeat:
        # Tie breaking: copy the behaviour of scipy.stats.mode
        if most_frequent_value < extra_value:
            return most_frequent_value
        else:
            return extra_value
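
# A worked illustration of the tie-breaking rule above (hypothetical call,
# not part of the original file): _most_frequent(np.array([2, 2, 3]),
# extra_value=1, n_repeat=2) finds the in-array mode 2 with count 2, which
# ties with extra_value repeated twice, so the smaller candidate (1) is
# returned -- the same choice scipy.stats.mode would make.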

class _BaseImputer(TransformerMixin, BaseEstimator):
    """Base class for all imputers.

    It automatically adds support for `add_indicator`.
    """

    def __init__(self, *, missing_values=np.nan, add_indicator=False):
        self.missing_values = missing_values
        self.add_indicator = add_indicator

    def _fit_indicator(self, X):
        """Fit a MissingIndicator."""
        if self.add_indicator:
            self.indicator_ = MissingIndicator(
                missing_values=self.missing_values, error_on_new=False
            )
            self.indicator_.fit(X)
        else:
            self.indicator_ = None

    def _transform_indicator(self, X):
        """Compute the indicator mask.

        Note that X must be the original data as passed to the imputer before
        any imputation, since imputation may be done inplace in some cases.
        """
        if self.add_indicator:
            if not hasattr(self, 'indicator_'):
                raise ValueError(
                    "Make sure to call _fit_indicator before "
                    "_transform_indicator"
                )
            return self.indicator_.transform(X)

    def _concatenate_indicator(self, X_imputed, X_indicator):
        """Concatenate indicator mask with the imputed data."""
        if not self.add_indicator:
            return X_imputed

        hstack = sparse.hstack if sparse.issparse(X_imputed) else np.hstack
        if X_indicator is None:
            raise ValueError(
                "Data from the missing indicator are not provided. Call "
                "_fit_indicator and _transform_indicator in the imputer "
                "implementation."
            )

        return hstack((X_imputed, X_indicator))

    def _more_tags(self):
        return {'allow_nan': is_scalar_nan(self.missing_values)}

class SimpleImputer(_BaseImputer):
    """Imputation transformer for completing missing values.

    Read more in the :ref:`User Guide <impute>`.

    .. versionadded:: 0.20
       `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`
       estimator which is now removed.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.

    strategy : string, default='mean'
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along
          each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.

        .. versionadded:: 0.20
           strategy="constant" for fixed value imputation.

    fill_value : string or numerical value, default=None
        When strategy == "constant", fill_value is used to replace all
        occurrences of missing_values.
        If left to the default, fill_value will be 0 when imputing numerical
        data and "missing_value" for strings or object data types.

    verbose : integer, default=0
        Controls the verbosity of the imputer.

    copy : boolean, default=True
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is encoded as a CSR matrix;
        - If add_indicator=True.

    add_indicator : boolean, default=False
        If True, a :class:`MissingIndicator` transform will stack onto output
        of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on
        the missing indicator even if there are missing values at
        transform/test time.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature.
        Computing statistics can result in `np.nan` values.
        During :meth:`transform`, features corresponding to `np.nan`
        statistics will be discarded.

    indicator_ : :class:`sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        ``None`` if add_indicator is False.

    See also
    --------
    IterativeImputer : Multivariate imputation of missing values.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import SimpleImputer
    >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    SimpleImputer()
    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
    >>> print(imp_mean.transform(X))
    [[ 7.   2.   3. ]
     [ 4.   3.5  6. ]
     [10.   3.5  9. ]]

    Notes
    -----
    Columns which only contained missing values at :meth:`fit` are discarded
    upon :meth:`transform` if strategy is not "constant".

    """
    @_deprecate_positional_args
    def __init__(self, *, missing_values=np.nan, strategy="mean",
                 fill_value=None, verbose=0, copy=True, add_indicator=False):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator
        )
        self.strategy = strategy
        self.fill_value = fill_value
        self.verbose = verbose
        self.copy = copy

    def _validate_input(self, X, in_fit):
        allowed_strategies = ["mean", "median", "most_frequent", "constant"]
        if self.strategy not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy={1}".format(allowed_strategies,
                                                        self.strategy))

        if self.strategy in ("most_frequent", "constant"):
            dtype = None
        else:
            dtype = FLOAT_DTYPES

        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"

        try:
            X = self._validate_data(X, reset=in_fit,
                                    accept_sparse='csc', dtype=dtype,
                                    force_all_finite=force_all_finite,
                                    copy=self.copy)
        except ValueError as ve:
            if "could not convert" in str(ve):
                new_ve = ValueError("Cannot use {} strategy with non-numeric "
                                    "data:\n{}".format(self.strategy, ve))
                raise new_ve from None
            else:
                raise ve

        _check_inputs_dtype(X, self.missing_values)
        if X.dtype.kind not in ("i", "u", "f", "O"):
            raise ValueError("SimpleImputer does not support data with dtype "
                             "{0}. Please provide either a numeric array (with"
                             " a floating point or integer dtype) or "
                             "categorical data represented either as an array "
                             "with integer dtype or an array of string values "
                             "with an object dtype.".format(X.dtype))

        return X

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : SimpleImputer
        """
        X = self._validate_input(X, in_fit=True)
        super()._fit_indicator(X)

        # default fill_value is 0 for numerical input and "missing_value"
        # otherwise
        if self.fill_value is None:
            if X.dtype.kind in ("i", "u", "f"):
                fill_value = 0
            else:
                fill_value = "missing_value"
        else:
            fill_value = self.fill_value

        # fill_value should be numerical in case of numerical input
        if (self.strategy == "constant" and
                X.dtype.kind in ("i", "u", "f") and
                not isinstance(fill_value, numbers.Real)):
            raise ValueError("'fill_value'={0} is invalid. Expected a "
                             "numerical value when imputing numerical "
                             "data".format(fill_value))

        if sparse.issparse(X):
            # missing_values = 0 not allowed with sparse data as it would
            # force densification
            if self.missing_values == 0:
                raise ValueError("Imputation not possible when missing_values "
                                 "== 0 and input is sparse. Provide a dense "
                                 "array instead.")
            else:
                self.statistics_ = self._sparse_fit(X,
                                                    self.strategy,
                                                    self.missing_values,
                                                    fill_value)
        else:
            self.statistics_ = self._dense_fit(X,
                                               self.strategy,
                                               self.missing_values,
                                               fill_value)
        return self

    def _sparse_fit(self, X, strategy, missing_values, fill_value):
        """Fit the transformer on sparse data."""
        mask_data = _get_mask(X.data, missing_values)
        n_implicit_zeros = X.shape[0] - np.diff(X.indptr)

        statistics = np.empty(X.shape[1])

        if strategy == "constant":
            # for constant strategy, self.statistics_ is used to store
            # fill_value in each column
            statistics.fill(fill_value)
        else:
            for i in range(X.shape[1]):
                column = X.data[X.indptr[i]:X.indptr[i + 1]]
                mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]
                column = column[~mask_column]

                # combine explicit and implicit zeros
                mask_zeros = _get_mask(column, 0)
                column = column[~mask_zeros]
                n_explicit_zeros = mask_zeros.sum()
                n_zeros = n_implicit_zeros[i] + n_explicit_zeros

                if strategy == "mean":
                    s = column.size + n_zeros
                    statistics[i] = np.nan if s == 0 else column.sum() / s

                elif strategy == "median":
                    statistics[i] = _get_median(column,
                                                n_zeros)

                elif strategy == "most_frequent":
                    statistics[i] = _most_frequent(column,
                                                   0,
                                                   n_zeros)
        return statistics

    def _dense_fit(self, X, strategy, missing_values, fill_value):
        """Fit the transformer on dense data."""
        mask = _get_mask(X, missing_values)
        masked_X = ma.masked_array(X, mask=mask)

        # Mean
        if strategy == "mean":
            mean_masked = np.ma.mean(masked_X, axis=0)
            # Avoid the warning "Warning: converting a masked element to nan."
            mean = np.ma.getdata(mean_masked)
            mean[np.ma.getmask(mean_masked)] = np.nan

            return mean

        # Median
        elif strategy == "median":
            median_masked = np.ma.median(masked_X, axis=0)
            # Avoid the warning "Warning: converting a masked element to nan."
            median = np.ma.getdata(median_masked)
            median[np.ma.getmaskarray(median_masked)] = np.nan

            return median

        # Most frequent
        elif strategy == "most_frequent":
            # Avoid use of scipy.stats.mstats.mode due to the required
            # additional overhead and slow benchmarking performance.
            # See Issue 14325 and PR 14399 for full discussion.

            # To be able to access the elements by columns
            X = X.transpose()
            mask = mask.transpose()

            if X.dtype.kind == "O":
                most_frequent = np.empty(X.shape[0], dtype=object)
            else:
                most_frequent = np.empty(X.shape[0])

            for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
                row_mask = np.logical_not(row_mask).astype(bool)
                row = row[row_mask]
                most_frequent[i] = _most_frequent(row, np.nan, 0)

            return most_frequent

        # Constant
        elif strategy == "constant":
            # for constant strategy, self.statistics_ is used to store
            # fill_value in each column
            return np.full(X.shape[1], fill_value, dtype=X.dtype)

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.
        """
        check_is_fitted(self)

        X = self._validate_input(X, in_fit=False)
        X_indicator = super()._transform_indicator(X)

        statistics = self.statistics_

        if X.shape[1] != statistics.shape[0]:
            raise ValueError("X has %d features per sample, expected %d"
                             % (X.shape[1], self.statistics_.shape[0]))

        # Delete the invalid columns if strategy is not constant
        if self.strategy == "constant":
            valid_statistics = statistics
        else:
            # same as np.isnan but also works for object dtypes
            invalid_mask = _get_mask(statistics, np.nan)
            valid_mask = np.logical_not(invalid_mask)
            valid_statistics = statistics[valid_mask]
            valid_statistics_indexes = np.flatnonzero(valid_mask)

            if invalid_mask.any():
                missing = np.arange(X.shape[1])[invalid_mask]
                if self.verbose:
                    warnings.warn("Deleting features without "
                                  "observed values: %s" % missing)
                X = X[:, valid_statistics_indexes]

        # Do actual imputation
        if sparse.issparse(X):
            if self.missing_values == 0:
                raise ValueError("Imputation not possible when missing_values "
                                 "== 0 and input is sparse. Provide a dense "
                                 "array instead.")
            else:
                mask = _get_mask(X.data, self.missing_values)
                indexes = np.repeat(
                    np.arange(len(X.indptr) - 1, dtype=int),
                    np.diff(X.indptr))[mask]

                X.data[mask] = valid_statistics[indexes].astype(X.dtype,
                                                                copy=False)
        else:
            mask = _get_mask(X, self.missing_values)
            n_missing = np.sum(mask, axis=0)
            values = np.repeat(valid_statistics, n_missing)
            coordinates = np.where(mask.transpose())[::-1]

            X[coordinates] = values

        return super()._concatenate_indicator(X, X_indicator)
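
# Illustration of the class above (hypothetical values, not part of the
# original file): SimpleImputer(strategy="constant", fill_value=-1,
# add_indicator=True).fit_transform([[np.nan, 2.0], [3.0, np.nan]]) yields
# [[-1., 2., 1., 0.], [3., -1., 0., 1.]]: the two imputed columns followed by
# the stacked MissingIndicator mask for both features seen missing in fit.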

class MissingIndicator(TransformerMixin, BaseEstimator):
    """Binary indicators for missing values.

    Note that this component typically should not be used in a vanilla
    :class:`Pipeline` consisting of transformers and a classifier, but rather
    could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.

    Read more in the :ref:`User Guide <impute>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.

    features : str, default='missing-only'
        Whether the imputer mask should represent all or a subset of
        features.

        - If "missing-only" (default), the imputer mask will only represent
          features containing missing values during fit time.
        - If "all", the imputer mask will represent all features.

    sparse : boolean or "auto", default='auto'
        Whether the imputer mask format should be sparse or dense.

        - If "auto" (default), the imputer mask will be of same type as
          input.
        - If True, the imputer mask will be a sparse matrix.
        - If False, the imputer mask will be a numpy array.

    error_on_new : boolean, default=True
        If True (default), transform will raise an error when there are
        features with missing values in transform that have no missing values
        in fit. This is applicable only when ``features="missing-only"``.

    Attributes
    ----------
    features_ : ndarray, shape (n_missing_features,) or (n_features,)
        The features indices which will be returned when calling ``transform``.
        They are computed during ``fit``. For ``features='all'``, it is equal
        to ``range(n_features)``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import MissingIndicator
    >>> X1 = np.array([[np.nan, 1, 3],
    ...                [4, 0, np.nan],
    ...                [8, 1, 0]])
    >>> X2 = np.array([[5, 1, np.nan],
    ...                [np.nan, 2, 3],
    ...                [2, 4, 0]])
    >>> indicator = MissingIndicator()
    >>> indicator.fit(X1)
    MissingIndicator()
    >>> X2_tr = indicator.transform(X2)
    >>> X2_tr
    array([[False,  True],
           [ True, False],
           [False, False]])

    """
    @_deprecate_positional_args
    def __init__(self, *, missing_values=np.nan, features="missing-only",
                 sparse="auto", error_on_new=True):
        self.missing_values = missing_values
        self.features = features
        self.sparse = sparse
        self.error_on_new = error_on_new

    def _get_missing_features_info(self, X):
        """Compute the imputer mask and the indices of the features
        containing missing values.

        Parameters
        ----------
        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The input data with missing values. Note that ``X`` has been
            checked in ``fit`` and ``transform`` before to call this function.

        Returns
        -------
        imputer_mask : {ndarray or sparse matrix}, shape \
            (n_samples, n_features)
            The imputer mask of the original data.

        features_with_missing : ndarray, shape (n_features_with_missing)
            The features containing missing values.

        """
        if sparse.issparse(X):
            mask = _get_mask(X.data, self.missing_values)

            # The imputer mask will be constructed with the same sparse format
            # as X.
            sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
                                  else sparse.csc_matrix)
            imputer_mask = sparse_constructor(
                (mask, X.indices.copy(), X.indptr.copy()),
                shape=X.shape, dtype=bool)
            imputer_mask.eliminate_zeros()

            if self.features == 'missing-only':
                n_missing = imputer_mask.getnnz(axis=0)

            if self.sparse is False:
                imputer_mask = imputer_mask.toarray()
            elif imputer_mask.format == 'csr':
                imputer_mask = imputer_mask.tocsc()
        else:
            imputer_mask = _get_mask(X, self.missing_values)

            if self.features == 'missing-only':
                n_missing = imputer_mask.sum(axis=0)

            if self.sparse is True:
                imputer_mask = sparse.csc_matrix(imputer_mask)

        if self.features == 'all':
            features_indices = np.arange(X.shape[1])
        else:
            features_indices = np.flatnonzero(n_missing)

        return imputer_mask, features_indices

    def _validate_input(self, X, in_fit):
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = self._validate_data(X, reset=in_fit,
                                accept_sparse=('csc', 'csr'), dtype=None,
                                force_all_finite=force_all_finite)
        _check_inputs_dtype(X, self.missing_values)
        if X.dtype.kind not in ("i", "u", "f", "O"):
            raise ValueError("MissingIndicator does not support data with "
                             "dtype {0}. Please provide either a numeric array"
                             " (with a floating point or integer dtype) or "
                             "categorical data represented either as an array "
                             "with integer dtype or an array of string values "
                             "with an object dtype.".format(X.dtype))

        if sparse.issparse(X) and self.missing_values == 0:
            # missing_values = 0 not allowed with sparse data as it would
            # force densification
            raise ValueError("Sparse input with missing_values=0 is "
                             "not supported. Provide a dense "
                             "array instead.")

        return X

    def _fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        imputer_mask : {ndarray or sparse matrix}, shape (n_samples, \
            n_features)
            The imputer mask of the original data.

        """
        X = self._validate_input(X, in_fit=True)
        self._n_features = X.shape[1]

        if self.features not in ('missing-only', 'all'):
            raise ValueError("'features' has to be either 'missing-only' or "
                             "'all'. Got {} instead.".format(self.features))

        if not ((isinstance(self.sparse, str) and
                 self.sparse == "auto") or isinstance(self.sparse, bool)):
            raise ValueError("'sparse' has to be a boolean or 'auto'. "
                             "Got {!r} instead.".format(self.sparse))

        missing_features_info = self._get_missing_features_info(X)
        self.features_ = missing_features_info[1]

        return missing_features_info[0]

    def fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self._fit(X, y)

        return self

    def transform(self, X):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
            or (n_samples, n_features_with_missing)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        """
        check_is_fitted(self)
        X = self._validate_input(X, in_fit=False)

        if X.shape[1] != self._n_features:
            raise ValueError("X has a different number of features "
                             "than during fitting.")

        imputer_mask, features = self._get_missing_features_info(X)

        if self.features == "missing-only":
            features_diff_fit_trans = np.setdiff1d(features, self.features_)
            if (self.error_on_new and features_diff_fit_trans.size > 0):
                raise ValueError("The features {} have missing values "
                                 "in transform but have no missing values "
                                 "in fit.".format(features_diff_fit_trans))

        if self.features_.size < self._n_features:
            imputer_mask = imputer_mask[:, self.features_]

        return imputer_mask

    def fit_transform(self, X, y=None):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
            or (n_samples, n_features_with_missing)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        """
        imputer_mask = self._fit(X, y)

        if self.features_.size < self._n_features:
            imputer_mask = imputer_mask[:, self.features_]

        return imputer_mask

    def _more_tags(self):
        return {'allow_nan': True,
                'X_types': ['2darray', 'string']}
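The MissingIndicator docstring above recommends combining it with an imputer through a FeatureUnion or ColumnTransformer rather than using it alone in a Pipeline; a hedged sketch of that pattern (the toy data and estimator choice are illustrative, not part of this commit):

import numpy as np
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.tree import DecisionTreeClassifier

X = np.array([[np.nan, 1.0], [4.0, np.nan], [8.0, 1.0]])
y = np.array([0, 1, 1])

# Imputed values and missingness flags are emitted side by side.
union = FeatureUnion([('features', SimpleImputer(strategy='mean')),
                      ('indicators', MissingIndicator())])
clf = make_pipeline(union, DecisionTreeClassifier(random_state=0)).fit(X, y)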
741
venv/Lib/site-packages/sklearn/impute/_iterative.py
Normal file
@@ -0,0 +1,741 @@

from time import time
from collections import namedtuple
import warnings

from scipy import stats
import numpy as np

from ..base import clone
from ..exceptions import ConvergenceWarning
from ..preprocessing import normalize
from ..utils import (check_array, check_random_state, _safe_indexing,
                     is_scalar_nan)
from ..utils.validation import FLOAT_DTYPES, check_is_fitted
from ..utils._mask import _get_mask

from ._base import _BaseImputer
from ._base import SimpleImputer
from ._base import _check_inputs_dtype


_ImputerTriplet = namedtuple('_ImputerTriplet', ['feat_idx',
                                                 'neighbor_feat_idx',
                                                 'estimator'])


class IterativeImputer(_BaseImputer):
    """Multivariate imputer that estimates each feature from all the others.

    A strategy for imputing missing values by modeling each feature with
    missing values as a function of other features in a round-robin fashion.

    Read more in the :ref:`User Guide <iterative_imputer>`.

    .. versionadded:: 0.21

    .. note::

      This estimator is still **experimental** for now: the predictions
      and the API might change without any deprecation cycle. To use it,
      you need to explicitly import ``enable_iterative_imputer``::

        >>> # explicitly require this experimental feature
        >>> from sklearn.experimental import enable_iterative_imputer  # noqa
        >>> # now you can import normally from sklearn.impute
        >>> from sklearn.impute import IterativeImputer

    Parameters
    ----------
    estimator : estimator object, default=BayesianRidge()
        The estimator to use at each step of the round-robin imputation.
        If ``sample_posterior`` is True, the estimator must support
        ``return_std`` in its ``predict`` method.

    missing_values : int, np.nan, default=np.nan
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.

    sample_posterior : boolean, default=False
        Whether to sample from the (Gaussian) predictive posterior of the
        fitted estimator for each imputation. Estimator must support
        ``return_std`` in its ``predict`` method if set to ``True``. Set to
        ``True`` if using ``IterativeImputer`` for multiple imputations.

    max_iter : int, default=10
        Maximum number of imputation rounds to perform before returning the
        imputations computed during the final round. A round is a single
        imputation of each feature with missing values. The stopping criterion
        is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol,
        where `X_t` is `X` at iteration `t`. Note that early stopping is only
        applied if ``sample_posterior=False``.

    tol : float, default=1e-3
        Tolerance of the stopping condition.

    n_nearest_features : int, default=None
        Number of other features to use to estimate the missing values of
        each feature column. Nearness between features is measured using
        the absolute correlation coefficient between each feature pair (after
        initial imputation). To ensure coverage of features throughout the
        imputation process, the neighbor features are not necessarily nearest,
        but are drawn with probability proportional to correlation for each
        imputed target feature. Can provide significant speed-up when the
        number of features is huge. If ``None``, all features will be used.

    initial_strategy : str, default='mean'
        Which strategy to use to initialize the missing values. Same as the
        ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer`.
        Valid values: {"mean", "median", "most_frequent", or "constant"}.

    imputation_order : str, default='ascending'
        The order in which the features will be imputed. Possible values:

        "ascending"
            From features with fewest missing values to most.
        "descending"
            From features with most missing values to fewest.
        "roman"
            Left to right.
        "arabic"
            Right to left.
        "random"
            A random order for each round.

    skip_complete : boolean, default=False
        If ``True`` then features with missing values during ``transform``
        which did not have any missing values during ``fit`` will be imputed
        with the initial imputation method only. Set to ``True`` if you have
        many features with no missing values at both ``fit`` and ``transform``
        time to save compute.

    min_value : float or array-like of shape (n_features,), default=None
        Minimum possible imputed value. Broadcast to shape (n_features,) if
        scalar. If array-like, expects shape (n_features,), one min value for
        each feature. `None` (default) is converted to -np.inf.

    max_value : float or array-like of shape (n_features,), default=None
        Maximum possible imputed value. Broadcast to shape (n_features,) if
        scalar. If array-like, expects shape (n_features,), one max value for
        each feature. `None` (default) is converted to np.inf.

    verbose : int, default=0
        Verbosity flag, controls the debug messages that are issued
        as functions are evaluated. The higher, the more verbose. Can be 0, 1,
        or 2.

    random_state : int, RandomState instance or None, default=None
        The seed of the pseudo random number generator to use. Randomizes
        selection of estimator features if n_nearest_features is not None, the
        ``imputation_order`` if ``random``, and the sampling from posterior if
        ``sample_posterior`` is True. Use an integer for determinism.
        See :term:`the Glossary <random_state>`.

    add_indicator : boolean, default=False
        If True, a :class:`MissingIndicator` transform will stack onto output
        of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on
        the missing indicator even if there are missing values at
        transform/test time.

    Attributes
    ----------
    initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer`
        Imputer used to initialize the missing values.

    imputation_sequence_ : list of tuples
        Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where
        ``feat_idx`` is the current feature to be imputed,
        ``neighbor_feat_idx`` is the array of other features used to impute the
        current feature, and ``estimator`` is the trained estimator used for
        the imputation. Length is ``self.n_features_with_missing_ *
        self.n_iter_``.

    n_iter_ : int
        Number of iteration rounds that occurred. Will be less than
        ``self.max_iter`` if early stopping criterion was reached.

    n_features_with_missing_ : int
        Number of features with missing values.

    indicator_ : :class:`sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        ``None`` if add_indicator is False.

    random_state_ : RandomState instance
        RandomState instance that is generated either from a seed, the random
        number generator or by `np.random`.

    See also
    --------
    SimpleImputer : Univariate imputation of missing values.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.experimental import enable_iterative_imputer
    >>> from sklearn.impute import IterativeImputer
    >>> imp_mean = IterativeImputer(random_state=0)
    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    IterativeImputer(random_state=0)
    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
    >>> imp_mean.transform(X)
    array([[ 6.9584...,  2.       ,  3.        ],
           [ 4.       ,  2.6000...,  6.        ],
           [10.       ,  4.9999...,  9.        ]])

    Notes
    -----
    To support imputation in inductive mode we store each feature's estimator
    during the ``fit`` phase, and predict without refitting (in order) during
    the ``transform`` phase.

    Features which contain all missing values at ``fit`` are discarded upon
    ``transform``.

    References
    ----------
    .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
        Multivariate Imputation by Chained Equations in R". Journal of
        Statistical Software 45: 1-67.
        <https://www.jstatsoft.org/article/view/v045i03>`_

    .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
        Multivariate Data Suitable for use with an Electronic Computer".
        Journal of the Royal Statistical Society 22(2): 302-306.
        <https://www.jstor.org/stable/2984099>`_
    """
    def __init__(self,
                 estimator=None, *,
                 missing_values=np.nan,
                 sample_posterior=False,
                 max_iter=10,
                 tol=1e-3,
                 n_nearest_features=None,
                 initial_strategy="mean",
                 imputation_order='ascending',
                 skip_complete=False,
                 min_value=None,
                 max_value=None,
                 verbose=0,
                 random_state=None,
                 add_indicator=False):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator
        )

        self.estimator = estimator
        self.sample_posterior = sample_posterior
        self.max_iter = max_iter
        self.tol = tol
        self.n_nearest_features = n_nearest_features
        self.initial_strategy = initial_strategy
        self.imputation_order = imputation_order
        self.skip_complete = skip_complete
        self.min_value = min_value
        self.max_value = max_value
        self.verbose = verbose
        self.random_state = random_state
    def _impute_one_feature(self,
                            X_filled,
                            mask_missing_values,
                            feat_idx,
                            neighbor_feat_idx,
                            estimator=None,
                            fit_mode=True):
        """Impute a single feature from the others provided.

        This function predicts the missing values of one of the features using
        the current estimates of all the other features. The ``estimator`` must
        support ``return_std=True`` in its ``predict`` method for this function
        to work.

        Parameters
        ----------
        X_filled : ndarray
            Input data with the most recent imputations.

        mask_missing_values : ndarray
            Input data's missing indicator matrix.

        feat_idx : int
            Index of the feature currently being imputed.

        neighbor_feat_idx : ndarray
            Indices of the features to be used in imputing ``feat_idx``.

        estimator : object
            The estimator to use at this step of the round-robin imputation.
            If ``sample_posterior`` is True, the estimator must support
            ``return_std`` in its ``predict`` method.
            If None, it will be cloned from self._estimator.

        fit_mode : boolean, default=True
            Whether to fit and predict with the estimator or just predict.

        Returns
        -------
        X_filled : ndarray
            Input data with ``X_filled[missing_row_mask, feat_idx]`` updated.

        estimator : estimator with sklearn API
            The fitted estimator used to impute
            ``X_filled[missing_row_mask, feat_idx]``.
        """
        if estimator is None and fit_mode is False:
            raise ValueError("If fit_mode is False, then an already-fitted "
                             "estimator should be passed in.")

        if estimator is None:
            estimator = clone(self._estimator)

        missing_row_mask = mask_missing_values[:, feat_idx]
        if fit_mode:
            X_train = _safe_indexing(X_filled[:, neighbor_feat_idx],
                                     ~missing_row_mask)
            y_train = _safe_indexing(X_filled[:, feat_idx],
                                     ~missing_row_mask)
            estimator.fit(X_train, y_train)

        # if no missing values, don't predict
        if np.sum(missing_row_mask) == 0:
            return X_filled, estimator

        # get posterior samples if there is at least one missing value
        X_test = _safe_indexing(X_filled[:, neighbor_feat_idx],
                                missing_row_mask)
        if self.sample_posterior:
            mus, sigmas = estimator.predict(X_test, return_std=True)
            imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
            # two types of problems: (1) non-positive sigmas
            # (2) mus outside legal range of min_value and max_value
            # (results in inf sample)
            positive_sigmas = sigmas > 0
            imputed_values[~positive_sigmas] = mus[~positive_sigmas]
            mus_too_low = mus < self._min_value[feat_idx]
            imputed_values[mus_too_low] = self._min_value[feat_idx]
            mus_too_high = mus > self._max_value[feat_idx]
            imputed_values[mus_too_high] = self._max_value[feat_idx]
            # the rest can be sampled without statistical issues
            inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high
            mus = mus[inrange_mask]
            sigmas = sigmas[inrange_mask]
            a = (self._min_value[feat_idx] - mus) / sigmas
            b = (self._max_value[feat_idx] - mus) / sigmas

            truncated_normal = stats.truncnorm(a=a, b=b,
                                               loc=mus, scale=sigmas)
            imputed_values[inrange_mask] = truncated_normal.rvs(
                random_state=self.random_state_)
        else:
            imputed_values = estimator.predict(X_test)
            imputed_values = np.clip(imputed_values,
                                     self._min_value[feat_idx],
                                     self._max_value[feat_idx])

        # update the feature
        X_filled[missing_row_mask, feat_idx] = imputed_values
        return X_filled, estimator

    def _get_neighbor_feat_idx(self,
                               n_features,
                               feat_idx,
                               abs_corr_mat):
        """Get a list of other features to predict ``feat_idx``.

        If self.n_nearest_features is less than or equal to the total
        number of features, then use a probability proportional to the absolute
        correlation between ``feat_idx`` and each other feature to randomly
        choose a subsample of the other features (without replacement).

        Parameters
        ----------
        n_features : int
            Number of features in ``X``.

        feat_idx : int
            Index of the feature currently being imputed.

        abs_corr_mat : ndarray, shape (n_features, n_features)
            Absolute correlation matrix of ``X``. The diagonal has been zeroed
            out and each feature has been normalized to sum to 1. Can be None.

        Returns
        -------
        neighbor_feat_idx : array-like
            The features to use to impute ``feat_idx``.
        """
        if (self.n_nearest_features is not None and
                self.n_nearest_features < n_features):
            p = abs_corr_mat[:, feat_idx]
            neighbor_feat_idx = self.random_state_.choice(
                np.arange(n_features), self.n_nearest_features, replace=False,
                p=p)
        else:
            inds_left = np.arange(feat_idx)
            inds_right = np.arange(feat_idx + 1, n_features)
            neighbor_feat_idx = np.concatenate((inds_left, inds_right))
        return neighbor_feat_idx
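
    # Hypothetical illustration of the sampling above (not in the original
    # file): with n_features=4, feat_idx=1, n_nearest_features=2, and the
    # column abs_corr_mat[:, 1] == [0.5, 0.0, 0.3, 0.2] (diagonal entry
    # zeroed, column summing to 1), two of the features {0, 2, 3} are drawn
    # without replacement, feature 0 being the most likely; with
    # n_nearest_features=None all three other features are used.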
    def _get_ordered_idx(self, mask_missing_values):
        """Decide in what order we will update the features.

        As a homage to the MICE R package, we will have 4 main options of
        how to order the updates, and use a random order if anything else
        is specified.

        Also, this function skips features which have no missing values.

        Parameters
        ----------
        mask_missing_values : array-like, shape (n_samples, n_features)
            Input data's missing indicator matrix, where "n_samples" is the
            number of samples and "n_features" is the number of features.

        Returns
        -------
        ordered_idx : ndarray, shape (n_features,)
            The order in which to impute the features.
        """
        frac_of_missing_values = mask_missing_values.mean(axis=0)
        if self.skip_complete:
            missing_values_idx = np.flatnonzero(frac_of_missing_values)
        else:
            missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0])
        if self.imputation_order == 'roman':
            ordered_idx = missing_values_idx
        elif self.imputation_order == 'arabic':
            ordered_idx = missing_values_idx[::-1]
        elif self.imputation_order == 'ascending':
            n = len(frac_of_missing_values) - len(missing_values_idx)
            ordered_idx = np.argsort(frac_of_missing_values,
                                     kind='mergesort')[n:]
        elif self.imputation_order == 'descending':
            n = len(frac_of_missing_values) - len(missing_values_idx)
            ordered_idx = np.argsort(frac_of_missing_values,
                                     kind='mergesort')[n:][::-1]
        elif self.imputation_order == 'random':
            ordered_idx = missing_values_idx
            self.random_state_.shuffle(ordered_idx)
        else:
            raise ValueError("Got an invalid imputation order: '{0}'. It must "
                             "be one of the following: 'roman', 'arabic', "
                             "'ascending', 'descending', or "
                             "'random'.".format(self.imputation_order))
        return ordered_idx

    def _get_abs_corr_mat(self, X_filled, tolerance=1e-6):
        """Get absolute correlation matrix between features.

        Parameters
        ----------
        X_filled : ndarray, shape (n_samples, n_features)
            Input data with the most recent imputations.

        tolerance : float, default=1e-6
            ``abs_corr_mat`` can have nans, which will be replaced
            with ``tolerance``.

        Returns
        -------
        abs_corr_mat : ndarray, shape (n_features, n_features)
            Absolute correlation matrix of ``X`` at the beginning of the
            current round. The diagonal has been zeroed out and each feature's
            absolute correlations with all others have been normalized to sum
            to 1.
        """
        n_features = X_filled.shape[1]
        if (self.n_nearest_features is None or
                self.n_nearest_features >= n_features):
            return None
        with np.errstate(invalid='ignore'):
            # if a feature in the neighborhood has only a single value
            # (e.g., categorical feature), the std. dev. will be null and
            # np.corrcoef will raise a warning due to a division by zero
            abs_corr_mat = np.abs(np.corrcoef(X_filled.T))
        # np.corrcoef is not defined for features with zero std
        abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance
        # ensures exploration, i.e. at least some probability of sampling
        np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat)
        # features are not their own neighbors
        np.fill_diagonal(abs_corr_mat, 0)
        # needs to sum to 1 for np.random.choice sampling
        abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)
        return abs_corr_mat

    def _initial_imputation(self, X):
        """Perform initial imputation for input X.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input data, where "n_samples" is the number of samples and
            "n_features" is the number of features.

        Returns
        -------
        Xt : ndarray, shape (n_samples, n_features)
            Input data, where "n_samples" is the number of samples and
            "n_features" is the number of features.

        X_filled : ndarray, shape (n_samples, n_features)
            Input data with the most recent imputations.

        mask_missing_values : ndarray, shape (n_samples, n_features)
            Input data's missing indicator matrix, where "n_samples" is the
            number of samples and "n_features" is the number of features.
        """
        if is_scalar_nan(self.missing_values):
            force_all_finite = "allow-nan"
        else:
            force_all_finite = True

        X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F",
                                force_all_finite=force_all_finite)
        _check_inputs_dtype(X, self.missing_values)

        mask_missing_values = _get_mask(X, self.missing_values)
        if self.initial_imputer_ is None:
            self.initial_imputer_ = SimpleImputer(
                missing_values=self.missing_values,
                strategy=self.initial_strategy
            )
            X_filled = self.initial_imputer_.fit_transform(X)
        else:
            X_filled = self.initial_imputer_.transform(X)

        valid_mask = np.flatnonzero(np.logical_not(
            np.isnan(self.initial_imputer_.statistics_)))
        Xt = X[:, valid_mask]
        mask_missing_values = mask_missing_values[:, valid_mask]

        return Xt, X_filled, mask_missing_values

    @staticmethod
    def _validate_limit(limit, limit_type, n_features):
        """Validate the limits (min/max) of the feature values.

        Converts scalar min/max limits to vectors of shape (n_features,).

        Parameters
        ----------
        limit : scalar or array-like
            The user-specified limit (i.e., min_value or max_value).
        limit_type : string, "max" or "min"
        n_features : int
            Number of features in the dataset.

        Returns
        -------
        limit : ndarray, shape (n_features,)
            Array of limits, one for each feature.
        """
        limit_bound = np.inf if limit_type == "max" else -np.inf
        limit = limit_bound if limit is None else limit
        if np.isscalar(limit):
            limit = np.full(n_features, limit)
        limit = check_array(
            limit, force_all_finite=False, copy=False, ensure_2d=False
        )
        if not limit.shape[0] == n_features:
            raise ValueError(
                f"'{limit_type}_value' should be of "
                f"shape ({n_features},) when an array-like "
                f"is provided. Got {limit.shape}, instead."
            )
        return limit
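
    # Sketch of the broadcast behaviour above (hypothetical calls, not in the
    # original file): _validate_limit(0.0, "min", 3) returns array([0., 0.,
    # 0.]), _validate_limit(None, "max", 3) returns array([inf, inf, inf]),
    # and an array-like of the wrong length, e.g. [0.0, 1.0] with
    # n_features=3, raises ValueError.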
    def fit_transform(self, X, y=None):
        """Fit the imputer on X and return the transformed X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input data, where "n_samples" is the number of samples and
            "n_features" is the number of features.

        y : ignored

        Returns
        -------
        Xt : array-like, shape (n_samples, n_features)
            The imputed input data.
        """
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))

        if self.max_iter < 0:
            raise ValueError(
                "'max_iter' should be a positive integer. Got {} instead."
                .format(self.max_iter))

        if self.tol < 0:
            raise ValueError(
                "'tol' should be a non-negative float. Got {} instead."
                .format(self.tol)
            )

        if self.estimator is None:
            from ..linear_model import BayesianRidge
            self._estimator = BayesianRidge()
        else:
            self._estimator = clone(self.estimator)

        if hasattr(self._estimator, 'random_state'):
            self._estimator.random_state = self.random_state_

        self.imputation_sequence_ = []

        self.initial_imputer_ = None
        super()._fit_indicator(X)
        X_indicator = super()._transform_indicator(X)
        X, Xt, mask_missing_values = self._initial_imputation(X)
        if self.max_iter == 0 or np.all(mask_missing_values):
            self.n_iter_ = 0
            return super()._concatenate_indicator(Xt, X_indicator)

        # Edge case: a single feature. We return the initial imputation.
        if Xt.shape[1] == 1:
            self.n_iter_ = 0
            return super()._concatenate_indicator(Xt, X_indicator)

        self._min_value = IterativeImputer._validate_limit(
            self.min_value, "min", X.shape[1])
        self._max_value = IterativeImputer._validate_limit(
            self.max_value, "max", X.shape[1])

        if not np.all(np.greater(self._max_value, self._min_value)):
            raise ValueError(
                "One (or more) features have min_value >= max_value.")

        # order in which to impute
        # note this is probably too slow for large feature data (d > 100000)
        # and a better way would be good.
        # see: https://goo.gl/KyCNwj and subsequent comments
        ordered_idx = self._get_ordered_idx(mask_missing_values)
        self.n_features_with_missing_ = len(ordered_idx)

        abs_corr_mat = self._get_abs_corr_mat(Xt)

        n_samples, n_features = Xt.shape
        if self.verbose > 0:
            print("[IterativeImputer] Completing matrix with shape %s"
                  % (X.shape,))
        start_t = time()
        if not self.sample_posterior:
            Xt_previous = Xt.copy()
            normalized_tol = self.tol * np.max(
                np.abs(X[~mask_missing_values])
            )
        for self.n_iter_ in range(1, self.max_iter + 1):
            if self.imputation_order == 'random':
                ordered_idx = self._get_ordered_idx(mask_missing_values)

            for feat_idx in ordered_idx:
                neighbor_feat_idx = self._get_neighbor_feat_idx(n_features,
                                                                feat_idx,
                                                                abs_corr_mat)
                Xt, estimator = self._impute_one_feature(
                    Xt, mask_missing_values, feat_idx, neighbor_feat_idx,
                    estimator=None, fit_mode=True)
                estimator_triplet = _ImputerTriplet(feat_idx,
                                                    neighbor_feat_idx,
                                                    estimator)
                self.imputation_sequence_.append(estimator_triplet)

            if self.verbose > 1:
                print('[IterativeImputer] Ending imputation round '
                      '%d/%d, elapsed time %0.2f'
                      % (self.n_iter_, self.max_iter, time() - start_t))

            if not self.sample_posterior:
                inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf,
                                          axis=None)
                if self.verbose > 0:
                    print('[IterativeImputer] '
                          'Change: {}, scaled tolerance: {} '.format(
                              inf_norm, normalized_tol))
                if inf_norm < normalized_tol:
                    if self.verbose > 0:
                        print('[IterativeImputer] Early stopping criterion '
                              'reached.')
                    break
                Xt_previous = Xt.copy()
        else:
            if not self.sample_posterior:
                warnings.warn("[IterativeImputer] Early stopping criterion not"
                              " reached.", ConvergenceWarning)
        Xt[~mask_missing_values] = X[~mask_missing_values]
        return super()._concatenate_indicator(Xt, X_indicator)

    def transform(self, X):
        """Impute all missing values in X.

        Note that this is stochastic, and that if random_state is not fixed,
        repeated calls, or permuted input, will yield different results.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : array-like, shape (n_samples, n_features)
            The imputed input data.
        """
        check_is_fitted(self)

        X_indicator = super()._transform_indicator(X)
        X, Xt, mask_missing_values = self._initial_imputation(X)

        if self.n_iter_ == 0 or np.all(mask_missing_values):
            return super()._concatenate_indicator(Xt, X_indicator)

        imputations_per_round = len(self.imputation_sequence_) // self.n_iter_
        i_rnd = 0
        if self.verbose > 0:
            print("[IterativeImputer] Completing matrix with shape %s"
                  % (X.shape,))
        start_t = time()
        for it, estimator_triplet in enumerate(self.imputation_sequence_):
            Xt, _ = self._impute_one_feature(
                Xt,
                mask_missing_values,
                estimator_triplet.feat_idx,
                estimator_triplet.neighbor_feat_idx,
                estimator=estimator_triplet.estimator,
                fit_mode=False
            )
            if not (it + 1) % imputations_per_round:
                if self.verbose > 1:
                    print('[IterativeImputer] Ending imputation round '
                          '%d/%d, elapsed time %0.2f'
                          % (i_rnd + 1, self.n_iter_, time() - start_t))
                i_rnd += 1

        Xt[~mask_missing_values] = X[~mask_missing_values]

        return super()._concatenate_indicator(Xt, X_indicator)

    def fit(self, X, y=None):
        """Fit the imputer on X and return self.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input data, where "n_samples" is the number of samples and
            "n_features" is the number of features.

        y : ignored

        Returns
        -------
        self : object
            Returns self.
        """
        self.fit_transform(X)
        return self
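To make the round-robin behaviour above concrete, a short, hedged usage sketch (the RandomForestRegressor swap-in and the toy data are illustrative, not part of this commit):

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

X = np.array([[1.0, 2.0, np.nan],
              [3.0, np.nan, 9.0],
              [np.nan, 6.0, 18.0],
              [4.0, 8.0, 24.0]])

# The default estimator is BayesianRidge; any regressor can be swapped in.
imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=0),
                           max_iter=10, random_state=0)
print(imputer.fit_transform(X))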
301
venv/Lib/site-packages/sklearn/impute/_knn.py
Normal file
|
@ -0,0 +1,301 @@
|
|||
# Authors: Ashim Bhattarai <ashimb9@gmail.com>
#          Thomas J Fan <thomasjpfan@gmail.com>
# License: BSD 3 clause

import numpy as np

from ._base import _BaseImputer
from ..utils.validation import FLOAT_DTYPES
from ..metrics import pairwise_distances_chunked
from ..metrics.pairwise import _NAN_METRICS
from ..neighbors._base import _get_weights
from ..neighbors._base import _check_weights
from ..utils import check_array
from ..utils import is_scalar_nan
from ..utils._mask import _get_mask
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args


class KNNImputer(_BaseImputer):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using the mean value from
    `n_neighbors` nearest neighbors found in the training set. Two samples are
    close if the features that neither is missing are close.

    Read more in the :ref:`User Guide <knnimpute>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    missing_values : number, string, np.nan or None, default=`np.nan`
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.

    n_neighbors : int, default=5
        Number of neighboring samples to use for imputation.

    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood are
          weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - callable : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : {'nan_euclidean'} or callable, default='nan_euclidean'
        Distance metric for searching neighbors. Possible values:

        - 'nan_euclidean'
        - callable : a user-defined function which conforms to the definition
          of ``_pairwise_callable(X, Y, metric, **kwds)``. The function
          accepts two arrays, X and Y, and a `missing_values` keyword in
          `kwds` and returns a scalar distance value.

    copy : bool, default=True
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible.

    add_indicator : bool, default=False
        If True, a :class:`MissingIndicator` transform will stack onto the
        output of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on the
        missing indicator even if there are missing values at transform/test
        time.

    Attributes
    ----------
    indicator_ : :class:`sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        ``None`` if add_indicator is False.

    References
    ----------
    * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import KNNImputer
    >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2)
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])
    """
    @_deprecate_positional_args
    def __init__(self, *, missing_values=np.nan, n_neighbors=5,
                 weights="uniform", metric="nan_euclidean", copy=True,
                 add_indicator=False):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator
        )
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.copy = copy

    def _calc_impute(self, dist_pot_donors, n_neighbors,
                     fit_X_col, mask_fit_X_col):
        """Helper function to impute a single column.

        Parameters
        ----------
        dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
            Distance matrix between the receivers and potential donors from
            training set. There must be at least one non-nan distance between
            a receiver and a potential donor.

        n_neighbors : int
            Number of neighbors to consider.

        fit_X_col : ndarray of shape (n_potential_donors,)
            Column of potential donors from training set.

        mask_fit_X_col : ndarray of shape (n_potential_donors,)
            Missing mask for fit_X_col.

        Returns
        -------
        imputed_values : ndarray of shape (n_receivers,)
            Imputed values for receiver.
        """
        # Get donors
        donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1,
                                     axis=1)[:, :n_neighbors]

        # Get weight matrix from distance matrix
        donors_dist = dist_pot_donors[
            np.arange(donors_idx.shape[0])[:, None], donors_idx]

        weight_matrix = _get_weights(donors_dist, self.weights)

        # fill nans with zeros
        if weight_matrix is not None:
            weight_matrix[np.isnan(weight_matrix)] = 0.0

        # Retrieve donor values and calculate kNN average
        donors = fit_X_col.take(donors_idx)
        donors_mask = mask_fit_X_col.take(donors_idx)
        donors = np.ma.array(donors, mask=donors_mask)

        return np.ma.average(donors, axis=1, weights=weight_matrix).data

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        self : object
        """
        # Check data integrity and calling arguments
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        if self.metric not in _NAN_METRICS and not callable(self.metric):
            raise ValueError(
                "The selected metric does not support NaN values")
        if self.n_neighbors <= 0:
            raise ValueError(
                "Expected n_neighbors > 0. Got {}".format(self.n_neighbors))

        X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                                force_all_finite=force_all_finite,
                                copy=self.copy)
        super()._fit_indicator(X)

        _check_weights(self.weights)
        self._fit_X = X
        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : array-like of shape (n_samples, n_output_features)
            The imputed dataset. `n_output_features` is the number of features
            that are not always missing during `fit`.
        """

        check_is_fitted(self)
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite, copy=self.copy)
        X_indicator = super()._transform_indicator(X)

        if X.shape[1] != self._fit_X.shape[1]:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed")

        mask = _get_mask(X, self.missing_values)
        mask_fit_X = self._mask_fit_X
        valid_mask = ~np.all(mask_fit_X, axis=0)

        if not np.any(mask):
            # No missing values in X
            # Remove columns where the training data is all nan
            return X[:, valid_mask]

        row_missing_idx = np.flatnonzero(mask.any(axis=1))

        non_missing_fix_X = np.logical_not(mask_fit_X)

        # Maps from indices from X to indices in dist matrix
        dist_idx_map = np.zeros(X.shape[0], dtype=int)
        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

        def process_chunk(dist_chunk, start):
            row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]

            # Find and impute missing by column
            for col in range(X.shape[1]):
                if not valid_mask[col]:
                    # column was all missing during training
                    continue

                col_mask = mask[row_missing_chunk, col]
                if not np.any(col_mask):
                    # column has no missing values
                    continue

                potential_donors_idx, = np.nonzero(non_missing_fix_X[:, col])

                # receivers_idx are indices in X
                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

                # distances for samples that needed imputation for column
                dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start]
                               [:, potential_donors_idx])

                # receivers with all nan distances impute with mean
                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

                if all_nan_receivers_idx.size:
                    col_mean = np.ma.array(self._fit_X[:, col],
                                           mask=mask_fit_X[:, col]).mean()
                    X[all_nan_receivers_idx, col] = col_mean

                    if len(all_nan_receivers_idx) == len(receivers_idx):
                        # all receivers imputed with mean
                        continue

                    # receivers with at least one defined distance
                    receivers_idx = receivers_idx[~all_nan_dist_mask]
                    dist_subset = (dist_chunk[dist_idx_map[receivers_idx]
                                              - start]
                                   [:, potential_donors_idx])

                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
                value = self._calc_impute(
                    dist_subset,
                    n_neighbors,
                    self._fit_X[potential_donors_idx, col],
                    mask_fit_X[potential_donors_idx, col])
                X[receivers_idx, col] = value

        # process in fixed-memory chunks
        gen = pairwise_distances_chunked(
            X[row_missing_idx, :],
            self._fit_X,
            metric=self.metric,
            missing_values=self.missing_values,
            force_all_finite=force_all_finite,
            reduce_func=process_chunk)
        for chunk in gen:
            # process_chunk modifies X in place. No return value.
            pass

        return super()._concatenate_indicator(X[:, valid_mask], X_indicator)
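
The fallback in process_chunk above — receivers whose distances to every potential donor are all nan get the training column mean — can be observed directly. A small sketch (values chosen for illustration): no row shares an observed feature with any donor, so every imputed entry is a column mean of the fitted data.

    import numpy as np
    from sklearn.impute import KNNImputer

    # Rows 0 and 1 observe only column 1; row 2 observes only column 0,
    # so all pairwise distances are nan and the imputer falls back to
    # column means: 3 for column 0, and (11 + 1) / 2 = 6 for column 1.
    X = np.array([[np.nan, 11.],
                  [np.nan, 1.],
                  [3., np.nan]])
    print(KNNImputer(n_neighbors=1).fit_transform(X))
    # [[ 3. 11.]
    #  [ 3.  1.]
    #  [ 3.  6.]]
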
0
venv/Lib/site-packages/sklearn/impute/tests/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
48
venv/Lib/site-packages/sklearn/impute/tests/test_base.py
Normal file
@@ -0,0 +1,48 @@
import pytest

import numpy as np

from sklearn.impute._base import _BaseImputer


@pytest.fixture
def data():
    X = np.random.randn(10, 2)
    X[::2] = np.nan
    return X


class NoFitIndicatorImputer(_BaseImputer):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self._concatenate_indicator(X, self._transform_indicator(X))


class NoTransformIndicatorImputer(_BaseImputer):
    def fit(self, X, y=None):
        super()._fit_indicator(X)
        return self

    def transform(self, X, y=None):
        return self._concatenate_indicator(X, None)


def test_base_imputer_not_fit(data):
    imputer = NoFitIndicatorImputer(add_indicator=True)
    err_msg = "Make sure to call _fit_indicator before _transform_indicator"
    with pytest.raises(ValueError, match=err_msg):
        imputer.fit(data).transform(data)
    with pytest.raises(ValueError, match=err_msg):
        imputer.fit_transform(data)


def test_base_imputer_not_transform(data):
    imputer = NoTransformIndicatorImputer(add_indicator=True)
    err_msg = ("Call _fit_indicator and _transform_indicator in the "
               "imputer implementation")
    with pytest.raises(ValueError, match=err_msg):
        imputer.fit(data).transform(data)
    with pytest.raises(ValueError, match=err_msg):
        imputer.fit_transform(data)
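
These two failing subclasses pin down the contract _BaseImputer enforces: call _fit_indicator during fit, _transform_indicator during transform, and hand both results to _concatenate_indicator. For contrast, a compliant subclass might look like the sketch below (ColumnMeanImputer is invented here for illustration; _BaseImputer and its hooks are private API, so these names may change between versions).

    import numpy as np
    from sklearn.impute._base import _BaseImputer

    class ColumnMeanImputer(_BaseImputer):
        """Hypothetical minimal imputer honouring the indicator contract."""

        def fit(self, X, y=None):
            X = np.asarray(X, dtype=float)
            super()._fit_indicator(X)  # fit-side half of the contract
            self.statistics_ = np.nanmean(X, axis=0)
            return self

        def transform(self, X):
            X = np.asarray(X, dtype=float).copy()
            X_indicator = super()._transform_indicator(X)
            rows, cols = np.nonzero(np.isnan(X))
            X[rows, cols] = self.statistics_[cols]  # fill with column means
            return super()._concatenate_indicator(X, X_indicator)
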
115
venv/Lib/site-packages/sklearn/impute/tests/test_common.py
Normal file
@@ -0,0 +1,115 @@
import pytest

import numpy as np
from scipy import sparse

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal

from sklearn.experimental import enable_iterative_imputer  # noqa

from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer


IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()]
SPARSE_IMPUTERS = [SimpleImputer()]


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputation_missing_value_in_test_array(imputer):
    # [Non Regression Test for issue #13968] Missing values in the test set
    # should not raise an error and should yield a finite dataset
    train = [[1], [2]]
    test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    imputer.fit(train).transform(test)


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1, 0])
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputers_add_indicator(marker, imputer):
    X = np.array([
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4]
    ])
    X_true_indicator = np.array([
        [1., 0., 0., 1.],
        [0., 1., 0., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.]
    ])
    imputer.set_params(missing_values=marker, add_indicator=True)

    X_trans = imputer.fit_transform(X)
    assert_allclose(X_trans[:, -4:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    imputer.set_params(add_indicator=False)
    X_trans_no_indicator = imputer.fit_transform(X)
    assert_allclose(X_trans[:, :-4], X_trans_no_indicator)


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1])
@pytest.mark.parametrize("imputer", SPARSE_IMPUTERS)
def test_imputers_add_indicator_sparse(imputer, marker):
    X = sparse.csr_matrix([
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4]
    ])
    X_true_indicator = sparse.csr_matrix([
        [1., 0., 0., 1.],
        [0., 1., 0., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.]
    ])
    imputer.set_params(missing_values=marker, add_indicator=True)

    X_trans = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_trans[:, -4:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    imputer.set_params(add_indicator=False)
    X_trans_no_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator)


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip('pandas', minversion="1.0")
    marker = np.nan
    imputer = imputer.set_params(add_indicator=add_indicator,
                                 missing_values=marker)

    X = np.array([
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4]
    ])
    # fit on numpy array
    X_trans_expected = imputer.fit_transform(X)

    # Creates dataframe with IntegerArrays with pd.NA
    X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"])

    # fit on pandas dataframe with IntegerArrays
    X_trans = imputer.fit_transform(X_df)

    assert_allclose(X_trans_expected, X_trans)
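
The last test relies on input validation converting pd.NA entries in nullable integer columns to np.nan, which is why missing_values is left at np.nan. A standalone sketch of the same behaviour (assuming pandas >= 1.0 for the nullable "Int16" dtype):

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    df = pd.DataFrame({"a": pd.array([1, None, 3], dtype="Int16")})
    # pd.NA becomes np.nan during validation, so the default
    # missing_values=np.nan matches and the column mean (2.0) is imputed.
    print(SimpleImputer().fit_transform(df))
    # [[1.]
    #  [2.]
    #  [3.]]
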
1366
venv/Lib/site-packages/sklearn/impute/tests/test_impute.py
Normal file
File diff suppressed because it is too large
641
venv/Lib/site-packages/sklearn/impute/tests/test_knn.py
Normal file
@@ -0,0 +1,641 @@
import numpy as np
import pytest

from sklearn import config_context
from sklearn.impute import KNNImputer
from sklearn.metrics.pairwise import nan_euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils._testing import assert_allclose


@pytest.mark.parametrize("weights", ["uniform", "distance"])
@pytest.mark.parametrize("n_neighbors", range(1, 6))
def test_knn_imputer_shape(weights, n_neighbors):
    # Verify the shapes of the imputed matrix for different weights and
    # number of neighbors.
    n_rows = 10
    n_cols = 2
    X = np.random.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
    X_imputed = imputer.fit_transform(X)
    assert X_imputed.shape == (n_rows, n_cols)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_default_with_invalid_input(na):
    # Test imputation with default values and invalid input

    # Test with inf present
    X = np.array([
        [np.inf, 1, 1, 2, na],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [na, 6, 0, 5, 13],
        [na, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    with pytest.raises(ValueError, match="Input contains (infinity|NaN)"):
        KNNImputer(missing_values=na).fit(X)

    # Test with inf present in matrix passed in transform()
    X = np.array([
        [np.inf, 1, 1, 2, na],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [na, 6, 0, 5, 13],
        [na, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])

    X_fit = np.array([
        [0, 1, 1, 2, na],
        [2, 1, 2, 2, 3],
        [3, 2, 3, 3, 8],
        [na, 6, 0, 5, 13],
        [na, 7, 0, 7, 8],
        [6, 6, 2, 5, 7],
    ])
    imputer = KNNImputer(missing_values=na).fit(X_fit)
    with pytest.raises(ValueError, match="Input contains (infinity|NaN)"):
        imputer.transform(X)

    # n_neighbors must be strictly positive; 0 is rejected
    with pytest.raises(ValueError, match="Expected n_neighbors > 0"):
        KNNImputer(missing_values=na, n_neighbors=0).fit(X_fit)

    # Test with missing_values=0 when NaN present
    imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
    X = np.array([
        [np.nan, 0, 0, 0, 5],
        [np.nan, 1, 0, np.nan, 3],
        [np.nan, 2, 0, 0, 0],
        [np.nan, 6, 0, 5, 13],
    ])
    msg = (r"Input contains NaN, infinity or a value too large for "
           r"dtype\('float64'\)")
    with pytest.raises(ValueError, match=msg):
        imputer.fit(X)

    X = np.array([
        [0, 0],
        [np.nan, 2],
    ])

    # Test with a metric type without NaN support
    imputer = KNNImputer(metric="euclidean")
    bad_metric_msg = "The selected metric does not support NaN values"
    with pytest.raises(ValueError, match=bad_metric_msg):
        imputer.fit(X)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_removes_all_na_features(na):
    X = np.array([
        [1, 1, na, 1, 1, 1.],
        [2, 3, na, 2, 2, 2],
        [3, 4, na, 3, 3, na],
        [6, 4, na, na, 6, 6],
    ])
    knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X)

    X_transform = knn.transform(X)
    assert not np.isnan(X_transform).any()
    assert X_transform.shape == (4, 5)

    X_test = np.arange(0, 12).reshape(2, 6)
    X_transform = knn.transform(X_test)
    assert_allclose(X_test[:, [0, 1, 3, 4, 5]], X_transform)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_zero_nan_imputes_the_same(na):
    # Test with an imputable matrix and compare with different missing_values
    X_zero = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])

    X_nan = np.array([
        [1, na, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, na],
        [6, 6, na, 6, 6],
    ])

    X_imputed = np.array([
        [1, 2.5, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 1.5],
        [6, 6, 2.5, 6, 6],
    ])

    imputer_zero = KNNImputer(missing_values=0, n_neighbors=2,
                              weights="uniform")

    imputer_nan = KNNImputer(missing_values=na, n_neighbors=2,
                             weights="uniform")

    assert_allclose(imputer_zero.fit_transform(X_zero), X_imputed)
    assert_allclose(imputer_zero.fit_transform(X_zero),
                    imputer_nan.fit_transform(X_nan))


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_verify(na):
    # Test with an imputable matrix
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, na],
        [3, 2, 3, na],
        [na, 4, 5, 5],
        [6, na, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    X_imputed = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test when there is not enough neighbors
    X = np.array([
        [1, 0, 0, na],
        [2, 1, 2, na],
        [3, 2, 3, na],
        [4, 4, 5, na],
        [6, 7, 6, na],
        [8, 8, 8, na],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])

    # Not enough neighbors, use column mean from training
    X_impute_value = (20 + 22) / 2
    X_imputed = np.array([
        [1, 0, 0, X_impute_value],
        [2, 1, 2, X_impute_value],
        [3, 2, 3, X_impute_value],
        [4, 4, 5, X_impute_value],
        [6, 7, 6, X_impute_value],
        [8, 8, 8, X_impute_value],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test when data in fit() and transform() are different
    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 16]
    ])

    X1 = np.array([
        [1, 0],
        [3, 2],
        [4, na]
    ])

    X_2_1 = (0 + 3 + 6 + 7 + 8) / 5
    X1_imputed = np.array([
        [1, 0],
        [3, 2],
        [4, X_2_1]
    ])

    imputer = KNNImputer(missing_values=na)
    assert_allclose(imputer.fit(X).transform(X1), X1_imputed)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_one_n_neighbors(na):

    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, na],
        [7, 7],
        [na, 8],
        [14, 13]
    ])

    X_imputed = np.array([
        [0, 0],
        [4, 2],
        [4, 3],
        [5, 3],
        [7, 7],
        [7, 8],
        [14, 13]
    ])

    imputer = KNNImputer(n_neighbors=1, missing_values=na)

    assert_allclose(imputer.fit_transform(X), X_imputed)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_all_samples_are_neighbors(na):
    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, na],
        [7, 7],
        [na, 8],
        [14, 13]
    ])

    X_imputed = np.array([
        [0, 0],
        [6, 2],
        [4, 3],
        [5, 5.5],
        [7, 7],
        [6, 8],
        [14, 13]
    ])

    n_neighbors = X.shape[0] - 1
    imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na)

    assert_allclose(imputer.fit_transform(X), X_imputed)

    n_neighbors = X.shape[0]
    imputer_plus1 = KNNImputer(n_neighbors=n_neighbors, missing_values=na)
    assert_allclose(imputer_plus1.fit_transform(X), X_imputed)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_uniform(na):

    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Test with "uniform" weight (or unweighted)
    X_imputed_uniform = np.array([
        [0, 0],
        [5, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    imputer = KNNImputer(weights="uniform", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_uniform)

    # Test with "callable" weight
    def no_weight(dist):
        return None

    imputer = KNNImputer(weights=no_weight, missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_uniform)

    # Test with "callable" uniform weight
    def uniform_weight(dist):
        return np.ones_like(dist)

    imputer = KNNImputer(weights=uniform_weight, missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_uniform)


@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_distance(na):
    X = np.array([
        [0, 0],
        [na, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # Test with "distance" weight
    nn = KNeighborsRegressor(metric="euclidean", weights="distance")
    X_rows_idx = [0, 2, 3, 4, 5, 6]
    nn.fit(X[X_rows_idx, 1:], X[X_rows_idx, 0])
    knn_imputed_value = nn.predict(X[1:2, 1:])[0]

    # Manual calculation
    X_neighbors_idx = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    weights = 1 / dist[:, X_neighbors_idx].ravel()
    manual_imputed_value = np.average(X[X_neighbors_idx, 0], weights=weights)

    X_imputed_distance1 = np.array([
        [0, 0],
        [manual_imputed_value, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([
        [0, 0],
        [knn_imputed_value, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance1)
    assert_allclose(imputer.fit_transform(X), X_imputed_distance2)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [na, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3/2)*((1 - 0)**2 + (2 - 0)**2))
    dist_0_2 = np.sqrt((3/2)*((2 - 0)**2 + (3 - 0)**2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array([
        [imputed_value, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # Test with varying missingness patterns
    X = np.array([
        [1, 0, 0, 1],
        [0, na, 1, na],
        [1, 1, 1, na],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    # Get weights of donor neighbors
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    X = np.array([
        [0, 0, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7]
    ])

    dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
                              missing_values=na)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0, 0, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7]
    ])

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)


def test_knn_imputer_callable_metric():

    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values=np.nan, squared=False):
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        dist = np.nansum(np.abs(x-y))
        return dist

    X = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.]
    ])

    X_0_3 = (9 + 9) / 2
    X_3_0 = (6 + 4) / 2
    X_imputed = np.array([
        [4, 3, 3, X_0_3],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [X_3_0, 9, 11, 10.]
    ])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_allclose(imputer.fit_transform(X), X_imputed)


@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
# Note that we use working_memory=0 to ensure that chunking is tested, even
# for a small dataset. However, it should raise a UserWarning that we ignore.
@pytest.mark.filterwarnings("ignore:adhere to working_memory")
def test_knn_imputer_with_simple_example(na, working_memory):

    X = np.array([
        [0, na, 0, na],
        [1, 1, 1, na],
        [2, 2, na, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [na, 7, 7, 7]
    ])

    r0c1 = np.mean(X[1:6, 1])
    r0c3 = np.mean(X[2:-1, -1])
    r1c3 = np.mean(X[2:-1, -1])
    r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array([
        [0, r0c1, 0, r0c3],
        [1, 1, 1, r1c3],
        [2, 2, r2c2, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [r7c0, 7, 7, 7]
    ])

    with config_context(working_memory=working_memory):
        imputer_comp = KNNImputer(missing_values=na)
        assert_allclose(imputer_comp.fit_transform(X), X_imputed)


@pytest.mark.parametrize("na", [-1, np.nan])
@pytest.mark.parametrize("weights", ['uniform', 'distance'])
def test_knn_imputer_not_enough_valid_distances(na, weights):
    # Samples whose needed feature has only nan distances to donors
    X1 = np.array([
        [na, 11],
        [na, 1],
        [3, na]
    ])
    X1_imputed = np.array([
        [3, 11],
        [3, 1],
        [3, 6]
    ])

    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), X1_imputed)

    X2 = np.array([[4, na]])
    X2_imputed = np.array([[4, 6]])
    assert_allclose(knn.transform(X2), X2_imputed)


@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_drops_all_nan_features(na):
    X1 = np.array([
        [na, 1],
        [na, 2]
    ])
    knn = KNNImputer(missing_values=na, n_neighbors=1)
    X1_expected = np.array([[1], [2]])
    assert_allclose(knn.fit_transform(X1), X1_expected)

    X2 = np.array([
        [1, 2],
        [3, na]
    ])
    X2_expected = np.array([[2], [1.5]])
    assert_allclose(knn.transform(X2), X2_expected)


@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_distance_weighted_not_enough_neighbors(na,
                                                            working_memory):
    X = np.array([
        [3, na],
        [2, na],
        [na, 4],
        [5, 6],
        [6, 8],
        [na, 5]
    ])

    dist = pairwise_distances(X, metric="nan_euclidean", squared=False,
                              missing_values=na)

    X_01 = np.average(X[3:5, 1], weights=1/dist[0, 3:5])
    X_11 = np.average(X[3:5, 1], weights=1/dist[1, 3:5])
    X_20 = np.average(X[3:5, 0], weights=1/dist[2, 3:5])
    X_50 = np.average(X[3:5, 0], weights=1/dist[5, 3:5])

    X_expected = np.array([
        [3, X_01],
        [2, X_11],
        [X_20, 4],
        [5, 6],
        [6, 8],
        [X_50, 5]
    ])

    with config_context(working_memory=working_memory):
        knn_3 = KNNImputer(missing_values=na, n_neighbors=3,
                           weights='distance')
        assert_allclose(knn_3.fit_transform(X), X_expected)

        knn_4 = KNNImputer(missing_values=na, n_neighbors=4,
                           weights='distance')
        assert_allclose(knn_4.fit_transform(X), X_expected)


@pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)])
def test_knn_tags(na, allow_nan):
    knn = KNNImputer(missing_values=na)
    assert knn._get_tags()["allow_nan"] == allow_nan
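
Several tests above recompute expected values with nan_euclidean_distances, which scales the squared distance over the mutually observed coordinates by n_features / n_observed before taking the square root — the same (3/2) factor that appears in test_knn_imputer_weight_distance. A standalone check of that formula:

    import numpy as np
    from sklearn.metrics.pairwise import nan_euclidean_distances

    x = np.array([[0.0, np.nan, 2.0]])
    y = np.array([[1.0, 1.0, 1.0]])
    d = nan_euclidean_distances(x, y)

    # Two of the three coordinates are observed in both rows, so the squared
    # distance over those coordinates is scaled by 3 / 2.
    manual = np.sqrt((3 / 2) * ((0 - 1) ** 2 + (2 - 1) ** 2))
    assert np.isclose(d[0, 0], manual)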