Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

sklearn/preprocessing/__init__.py

@@ -0,0 +1,67 @@
"""
The :mod:`sklearn.preprocessing` module includes scaling, centering,
normalization, and binarization methods.
"""
from ._function_transformer import FunctionTransformer
from ._data import Binarizer
from ._data import KernelCenterer
from ._data import MinMaxScaler
from ._data import MaxAbsScaler
from ._data import Normalizer
from ._data import RobustScaler
from ._data import StandardScaler
from ._data import QuantileTransformer
from ._data import add_dummy_feature
from ._data import binarize
from ._data import normalize
from ._data import scale
from ._data import robust_scale
from ._data import maxabs_scale
from ._data import minmax_scale
from ._data import quantile_transform
from ._data import power_transform
from ._data import PowerTransformer
from ._data import PolynomialFeatures
from ._encoders import OneHotEncoder
from ._encoders import OrdinalEncoder
from ._label import label_binarize
from ._label import LabelBinarizer
from ._label import LabelEncoder
from ._label import MultiLabelBinarizer
from ._discretization import KBinsDiscretizer
__all__ = [
'Binarizer',
'FunctionTransformer',
'KBinsDiscretizer',
'KernelCenterer',
'LabelBinarizer',
'LabelEncoder',
'MultiLabelBinarizer',
'MinMaxScaler',
'MaxAbsScaler',
'QuantileTransformer',
'Normalizer',
'OneHotEncoder',
'OrdinalEncoder',
'PowerTransformer',
'RobustScaler',
'StandardScaler',
'add_dummy_feature',
'PolynomialFeatures',
'binarize',
'normalize',
'scale',
'robust_scale',
'maxabs_scale',
'minmax_scale',
'label_binarize',
'quantile_transform',
'power_transform',
]
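For context, a minimal sketch of how two of the exports listed above are typically used, one through the class API and one through the function API (illustrative, standard scikit-learn usage):

# Illustrative usage of the class API (StandardScaler) and the
# function API (minmax_scale) exported by this module.
import numpy as np
from sklearn.preprocessing import StandardScaler, minmax_scale

X = np.array([[1.0, -1.0], [2.0, 0.0], [3.0, 1.0]])
X_std = StandardScaler().fit_transform(X)  # per-column zero mean, unit variance
X_01 = minmax_scale(X)                     # per-column rescaling to [0, 1]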

File diff suppressed because it is too large.

sklearn/preprocessing/_discretization.py

@@ -0,0 +1,324 @@
# -*- coding: utf-8 -*-
# Author: Henry Lin <hlin117@gmail.com>
# Tom Dupré la Tour
# License: BSD
import numbers
import numpy as np
import warnings
from . import OneHotEncoder
from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _deprecate_positional_args
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
"""
Bin continuous data into intervals.
Read more in the :ref:`User Guide <preprocessing_discretization>`.
.. versionadded:: 0.20
Parameters
----------
n_bins : int or array-like, shape (n_features,) (default=5)
The number of bins to produce. Raises ValueError if ``n_bins < 2``.
encode : {'onehot', 'onehot-dense', 'ordinal'}, (default='onehot')
Method used to encode the transformed result.
onehot
Encode the transformed result with one-hot encoding
and return a sparse matrix. Ignored features are always
stacked to the right.
onehot-dense
Encode the transformed result with one-hot encoding
and return a dense array. Ignored features are always
stacked to the right.
ordinal
Return the bin identifier encoded as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, (default='quantile')
Strategy used to define the widths of the bins.
uniform
All bins in each feature have identical widths.
quantile
All bins in each feature have the same number of points.
kmeans
Values in each bin have the same nearest center of a 1D k-means
cluster.
Attributes
----------
    n_bins_ : int array, shape (n_features,)
        Number of bins per feature. Bins whose widths are too small
        (i.e., <= 1e-8) are removed with a warning.
    bin_edges_ : array of arrays, shape (n_features, )
        The edges of each bin. Contain arrays of varying shapes
        ``(n_bins_ + 1, )``. Ignored features will have empty arrays.
See Also
--------
sklearn.preprocessing.Binarizer : Class used to bin values as ``0`` or
``1`` based on a parameter ``threshold``.
Notes
-----
In bin edges for feature ``i``, the first and last values are used only for
``inverse_transform``. During transform, bin edges are extended to::
np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])
You can combine ``KBinsDiscretizer`` with
:class:`sklearn.compose.ColumnTransformer` if you only want to preprocess
part of the features.
``KBinsDiscretizer`` might produce constant features (e.g., when
``encode = 'onehot'`` and certain bins do not contain any data).
These features can be removed with feature selection algorithms
(e.g., :class:`sklearn.feature_selection.VarianceThreshold`).
Examples
--------
>>> X = [[-2, 1, -4, -1],
... [-1, 2, -3, -0.5],
... [ 0, 3, -2, 0.5],
... [ 1, 4, -1, 2]]
>>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
>>> est.fit(X)
KBinsDiscretizer(...)
>>> Xt = est.transform(X)
>>> Xt # doctest: +SKIP
array([[ 0., 0., 0., 0.],
[ 1., 1., 1., 0.],
[ 2., 2., 2., 1.],
[ 2., 2., 2., 2.]])
Sometimes it may be useful to convert the data back into the original
feature space. The ``inverse_transform`` function converts the binned
data into the original feature space. Each value will be equal to the mean
of the two bin edges.
>>> est.bin_edges_[0]
array([-2., -1., 0., 1.])
>>> est.inverse_transform(Xt)
array([[-1.5, 1.5, -3.5, -0.5],
[-0.5, 2.5, -2.5, -0.5],
[ 0.5, 3.5, -1.5, 0.5],
[ 0.5, 3.5, -1.5, 1.5]])
"""
@_deprecate_positional_args
def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'):
self.n_bins = n_bins
self.encode = encode
self.strategy = strategy
def fit(self, X, y=None):
"""
Fit the estimator.
Parameters
----------
X : numeric array-like, shape (n_samples, n_features)
Data to be discretized.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
self
"""
X = self._validate_data(X, dtype='numeric')
valid_encode = ('onehot', 'onehot-dense', 'ordinal')
if self.encode not in valid_encode:
raise ValueError("Valid options for 'encode' are {}. "
"Got encode={!r} instead."
.format(valid_encode, self.encode))
valid_strategy = ('uniform', 'quantile', 'kmeans')
if self.strategy not in valid_strategy:
raise ValueError("Valid options for 'strategy' are {}. "
"Got strategy={!r} instead."
.format(valid_strategy, self.strategy))
n_features = X.shape[1]
n_bins = self._validate_n_bins(n_features)
bin_edges = np.zeros(n_features, dtype=object)
for jj in range(n_features):
column = X[:, jj]
col_min, col_max = column.min(), column.max()
if col_min == col_max:
warnings.warn("Feature %d is constant and will be "
"replaced with 0." % jj)
n_bins[jj] = 1
bin_edges[jj] = np.array([-np.inf, np.inf])
continue
if self.strategy == 'uniform':
bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)
elif self.strategy == 'quantile':
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
elif self.strategy == 'kmeans':
from ..cluster import KMeans # fixes import loops
# Deterministic initialization with uniform spacing
uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
# 1D k-means procedure
km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
centers = km.fit(column[:, None]).cluster_centers_[:, 0]
# Must sort, centers may be unsorted even with sorted init
centers.sort()
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
            # Remove bins whose widths are too small (i.e., <= 1e-8)
if self.strategy in ('quantile', 'kmeans'):
mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
bin_edges[jj] = bin_edges[jj][mask]
if len(bin_edges[jj]) - 1 != n_bins[jj]:
warnings.warn('Bins whose width are too small (i.e., <= '
'1e-8) in feature %d are removed. Consider '
'decreasing the number of bins.' % jj)
n_bins[jj] = len(bin_edges[jj]) - 1
self.bin_edges_ = bin_edges
self.n_bins_ = n_bins
if 'onehot' in self.encode:
self._encoder = OneHotEncoder(
categories=[np.arange(i) for i in self.n_bins_],
sparse=self.encode == 'onehot')
            # Fit the OneHotEncoder with a toy dataset
            # so that it's ready for use after the KBinsDiscretizer is fitted
self._encoder.fit(np.zeros((1, len(self.n_bins_)), dtype=int))
return self
def _validate_n_bins(self, n_features):
"""Returns n_bins_, the number of bins per feature.
"""
orig_bins = self.n_bins
if isinstance(orig_bins, numbers.Number):
if not isinstance(orig_bins, numbers.Integral):
raise ValueError("{} received an invalid n_bins type. "
"Received {}, expected int."
.format(KBinsDiscretizer.__name__,
type(orig_bins).__name__))
if orig_bins < 2:
raise ValueError("{} received an invalid number "
"of bins. Received {}, expected at least 2."
.format(KBinsDiscretizer.__name__, orig_bins))
            return np.full(n_features, orig_bins, dtype=int)
        n_bins = check_array(orig_bins, dtype=int, copy=True,
                             ensure_2d=False)
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
raise ValueError("n_bins must be a scalar or array "
"of shape (n_features,).")
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
violating_indices = np.where(bad_nbins_value)[0]
if violating_indices.shape[0] > 0:
indices = ", ".join(str(i) for i in violating_indices)
raise ValueError("{} received an invalid number "
"of bins at indices {}. Number of bins "
"must be at least 2, and must be an int."
.format(KBinsDiscretizer.__name__, indices))
return n_bins
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : numeric array-like, shape (n_samples, n_features)
Data to be discretized.
Returns
-------
Xt : numeric array-like or sparse matrix
Data in the binned space.
"""
check_is_fitted(self)
Xt = check_array(X, copy=True, dtype=FLOAT_DTYPES)
n_features = self.n_bins_.shape[0]
if Xt.shape[1] != n_features:
raise ValueError("Incorrect number of features. Expecting {}, "
"received {}.".format(n_features, Xt.shape[1]))
bin_edges = self.bin_edges_
for jj in range(Xt.shape[1]):
# Values which are close to a bin edge are susceptible to numeric
# instability. Add eps to X so these values are binned correctly
# with respect to their decimal truncation. See documentation of
# numpy.isclose for an explanation of ``rtol`` and ``atol``.
rtol = 1.e-5
atol = 1.e-8
eps = atol + rtol * np.abs(Xt[:, jj])
Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])
np.clip(Xt, 0, self.n_bins_ - 1, out=Xt)
if self.encode == 'ordinal':
return Xt
return self._encoder.transform(Xt)
def inverse_transform(self, Xt):
"""
Transform discretized data back to original feature space.
Note that this function does not regenerate the original data
due to discretization rounding.
Parameters
----------
        Xt : numeric array-like, shape (n_samples, n_features)
Transformed data in the binned space.
Returns
-------
Xinv : numeric array-like
Data in the original feature space.
"""
check_is_fitted(self)
if 'onehot' in self.encode:
Xt = self._encoder.inverse_transform(Xt)
Xinv = check_array(Xt, copy=True, dtype=FLOAT_DTYPES)
n_features = self.n_bins_.shape[0]
if Xinv.shape[1] != n_features:
raise ValueError("Incorrect number of features. Expecting {}, "
"received {}.".format(n_features, Xinv.shape[1]))
for jj in range(n_features):
bin_edges = self.bin_edges_[jj]
bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]
return Xinv
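A short sketch contrasting how the three ``strategy`` options place bin edges on the same skewed feature (illustrative; the edges follow from the definitions in ``fit`` above):

# Compare bin edges produced by the three binning strategies.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
for strategy in ('uniform', 'quantile', 'kmeans'):
    kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    kbd.fit(X)
    # 'uniform' spaces edges evenly over [min, max]; 'quantile' equalizes
    # the number of points per bin; 'kmeans' puts edges midway between
    # neighboring 1D cluster centers.
    print(strategy, kbd.bin_edges_[0])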

sklearn/preprocessing/_encoders.py

@@ -0,0 +1,737 @@
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
# Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause
import numpy as np
from scipy import sparse
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args
from ._label import _encode, _encode_check_unknown
__all__ = [
'OneHotEncoder',
'OrdinalEncoder'
]
class _BaseEncoder(TransformerMixin, BaseEstimator):
"""
Base class for encoders that includes the code to categorize and
transform the input features.
"""
def _check_X(self, X):
"""
Perform custom check_array:
- convert list of strings to object dtype
- check for missing values for object dtype data (check_array does
not do that)
- return list of features (arrays): this list of features is
constructed feature by feature to preserve the data types
of pandas DataFrame columns, as otherwise information is lost
          and cannot be used, e.g. for the `categories_` attribute.
"""
if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):
# if not a dataframe, do normal check_array validation
X_temp = check_array(X, dtype=None)
if (not hasattr(X, 'dtype')
and np.issubdtype(X_temp.dtype, np.str_)):
X = check_array(X, dtype=np.object)
else:
X = X_temp
needs_validation = False
else:
# pandas dataframe, do validation later column by column, in order
# to keep the dtype information to be used in the encoder.
needs_validation = True
n_samples, n_features = X.shape
X_columns = []
for i in range(n_features):
Xi = self._get_feature(X, feature_idx=i)
Xi = check_array(Xi, ensure_2d=False, dtype=None,
force_all_finite=needs_validation)
X_columns.append(Xi)
return X_columns, n_samples, n_features
def _get_feature(self, X, feature_idx):
if hasattr(X, 'iloc'):
# pandas dataframes
return X.iloc[:, feature_idx]
# numpy arrays, sparse arrays
return X[:, feature_idx]
def _fit(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
if self.categories != 'auto':
if len(self.categories) != n_features:
raise ValueError("Shape mismatch: if categories is an array,"
" it has to be of shape (n_features,).")
self.categories_ = []
for i in range(n_features):
Xi = X_list[i]
if self.categories == 'auto':
cats = _encode(Xi)
else:
cats = np.array(self.categories[i], dtype=Xi.dtype)
if Xi.dtype != object:
if not np.all(np.sort(cats) == cats):
raise ValueError("Unsorted categories are not "
"supported for numerical categories")
if handle_unknown == 'error':
diff = _encode_check_unknown(Xi, cats)
if diff:
msg = ("Found unknown categories {0} in column {1}"
" during fit".format(diff, i))
raise ValueError(msg)
self.categories_.append(cats)
def _transform(self, X, handle_unknown='error'):
X_list, n_samples, n_features = self._check_X(X)
        X_int = np.zeros((n_samples, n_features), dtype=int)
        X_mask = np.ones((n_samples, n_features), dtype=bool)
if n_features != len(self.categories_):
raise ValueError(
"The number of features in X is different to the number of "
"features of the fitted data. The fitted data had {} features "
"and the X has {} features."
.format(len(self.categories_,), n_features)
)
for i in range(n_features):
Xi = X_list[i]
diff, valid_mask = _encode_check_unknown(Xi, self.categories_[i],
return_mask=True)
if not np.all(valid_mask):
if handle_unknown == 'error':
msg = ("Found unknown categories {0} in column {1}"
" during transform".format(diff, i))
raise ValueError(msg)
else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
X_mask[:, i] = valid_mask
# cast Xi into the largest string type necessary
# to handle different lengths of numpy strings
if (self.categories_[i].dtype.kind in ('U', 'S')
and self.categories_[i].itemsize > Xi.itemsize):
Xi = Xi.astype(self.categories_[i].dtype)
else:
Xi = Xi.copy()
Xi[~valid_mask] = self.categories_[i][0]
# We use check_unknown=False, since _encode_check_unknown was
# already called above.
_, encoded = _encode(Xi, self.categories_[i], encode=True,
check_unknown=False)
X_int[:, i] = encoded
return X_int, X_mask
def _more_tags(self):
return {'X_types': ['categorical']}
class OneHotEncoder(_BaseEncoder):
"""
Encode categorical features as a one-hot numeric array.
The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
encoding scheme. This creates a binary column for each category and
returns a sparse matrix or dense array (depending on the ``sparse``
    parameter).
By default, the encoder derives the categories based on the unique values
in each feature. Alternatively, you can also specify the `categories`
manually.
This encoding is needed for feeding categorical data to many scikit-learn
estimators, notably linear models and SVMs with the standard kernels.
Note: a one-hot encoding of y labels should use a LabelBinarizer
instead.
Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
.. versionchanged:: 0.20
Parameters
----------
categories : 'auto' or a list of array-like, default='auto'
Categories (unique values) per feature:
- 'auto' : Determine categories automatically from the training data.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values within a single feature, and should be sorted in case of
numeric values.
The used categories can be found in the ``categories_`` attribute.
.. versionadded:: 0.20
    drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
default=None
Specifies a methodology to use to drop one of the categories per
feature. This is useful in situations where perfectly collinear
features cause problems, such as when feeding the resulting data
into a neural network or an unregularized regression.
However, dropping one category breaks the symmetry of the original
representation and can therefore induce a bias in downstream models,
for instance for penalized linear classification or regression models.
- None : retain all features (the default).
- 'first' : drop the first category in each feature. If only one
category is present, the feature will be dropped entirely.
- 'if_binary' : drop the first category in each feature with two
categories. Features with 1 or more than 2 categories are
left intact.
- array : ``drop[i]`` is the category in feature ``X[:, i]`` that
should be dropped.
sparse : bool, default=True
        Will return a sparse matrix if set to True, else will return an array.
    dtype : number type, default=np.float64
Desired dtype of output.
handle_unknown : {'error', 'ignore'}, default='error'
Whether to raise an error or ignore if an unknown categorical feature
is present during transform (default is to raise). When this parameter
is set to 'ignore' and an unknown category is encountered during
transform, the resulting one-hot encoded columns for this feature
will be all zeros. In the inverse transform, an unknown category
will be denoted as None.
Attributes
----------
categories_ : list of arrays
The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``). This includes the category specified in ``drop``
(if any).
drop_idx_ : array of shape (n_features,)
- ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
to be dropped for each feature.
- ``drop_idx_[i] = None`` if no category is to be dropped from the
feature with index ``i``, e.g. when `drop='if_binary'` and the
feature isn't binary.
- ``drop_idx_ = None`` if all the transformed features will be
retained.
See Also
--------
sklearn.preprocessing.OrdinalEncoder : Performs an ordinal (integer)
encoding of the categorical features.
sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
dictionary items (also handles string-valued features).
sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
encoding of dictionary items or strings.
sklearn.preprocessing.LabelBinarizer : Binarizes labels in a one-vs-all
fashion.
sklearn.preprocessing.MultiLabelBinarizer : Transforms between iterable of
iterables and a multilabel format, e.g. a (samples x classes) binary
matrix indicating the presence of a class label.
Examples
--------
Given a dataset with two features, we let the encoder find the unique
values per feature and transform the data to a binary one-hot encoding.
>>> from sklearn.preprocessing import OneHotEncoder
One can discard categories not seen during `fit`:
>>> enc = OneHotEncoder(handle_unknown='ignore')
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
OneHotEncoder(handle_unknown='ignore')
>>> enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
array([[1., 0., 1., 0., 0.],
[0., 1., 0., 0., 0.]])
>>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
array([['Male', 1],
[None, 2]], dtype=object)
>>> enc.get_feature_names(['gender', 'group'])
array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],
dtype=object)
One can always drop the first column for each feature:
>>> drop_enc = OneHotEncoder(drop='first').fit(X)
>>> drop_enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
array([[0., 0., 0.],
[1., 1., 0.]])
    Or drop a column for features having only 2 categories:
>>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
>>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
array([[0., 1., 0., 0.],
[1., 0., 1., 0.]])
"""
@_deprecate_positional_args
def __init__(self, *, categories='auto', drop=None, sparse=True,
dtype=np.float64, handle_unknown='error'):
self.categories = categories
self.sparse = sparse
self.dtype = dtype
self.handle_unknown = handle_unknown
self.drop = drop
def _validate_keywords(self):
if self.handle_unknown not in ('error', 'ignore'):
msg = ("handle_unknown should be either 'error' or 'ignore', "
"got {0}.".format(self.handle_unknown))
raise ValueError(msg)
# If we have both dropped columns and ignored unknown
# values, there will be ambiguous cells. This creates difficulties
# in interpreting the model.
if self.drop is not None and self.handle_unknown != 'error':
raise ValueError(
"`handle_unknown` must be 'error' when the drop parameter is "
"specified, as both would create categories that are all "
"zero.")
def _compute_drop_idx(self):
if self.drop is None:
return None
elif isinstance(self.drop, str):
if self.drop == 'first':
                return np.zeros(len(self.categories_), dtype=object)
elif self.drop == 'if_binary':
return np.array([0 if len(cats) == 2 else None
                                 for cats in self.categories_], dtype=object)
else:
msg = (
"Wrong input for parameter `drop`. Expected "
"'first', 'if_binary', None or array of objects, got {}"
)
raise ValueError(msg.format(type(self.drop)))
else:
try:
self.drop = np.asarray(self.drop, dtype=object)
droplen = len(self.drop)
except (ValueError, TypeError):
msg = (
"Wrong input for parameter `drop`. Expected "
"'first', 'if_binary', None or array of objects, got {}"
)
raise ValueError(msg.format(type(self.drop)))
if droplen != len(self.categories_):
msg = ("`drop` should have length equal to the number "
"of features ({}), got {}")
raise ValueError(msg.format(len(self.categories_),
len(self.drop)))
missing_drops = [(i, val) for i, val in enumerate(self.drop)
if val not in self.categories_[i]]
if any(missing_drops):
msg = ("The following categories were supposed to be "
"dropped, but were not found in the training "
"data.\n{}".format(
"\n".join(
["Category: {}, Feature: {}".format(c, v)
for c, v in missing_drops])))
raise ValueError(msg)
return np.array([np.where(cat_list == val)[0][0]
for (val, cat_list) in
zip(self.drop, self.categories_)],
                            dtype=object)
def fit(self, X, y=None):
"""
Fit OneHotEncoder to X.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to determine the categories of each feature.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
self
"""
self._validate_keywords()
self._fit(X, handle_unknown=self.handle_unknown)
self.drop_idx_ = self._compute_drop_idx()
return self
def fit_transform(self, X, y=None):
"""
Fit OneHotEncoder to X, then transform X.
Equivalent to fit(X).transform(X) but more convenient.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
X_out : sparse matrix if sparse=True else a 2-d array
Transformed input.
"""
self._validate_keywords()
return super().fit_transform(X, y)
def transform(self, X):
"""
Transform X using one-hot encoding.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
Returns
-------
X_out : sparse matrix if sparse=True else a 2-d array
Transformed input.
"""
check_is_fitted(self)
# validation of X happens in _check_X called by _transform
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
n_samples, n_features = X_int.shape
if self.drop_idx_ is not None:
to_drop = self.drop_idx_.copy()
# We remove all the dropped categories from mask, and decrement all
# categories that occur after them to avoid an empty column.
keep_cells = X_int != to_drop
n_values = []
for i, cats in enumerate(self.categories_):
n_cats = len(cats)
# drop='if_binary' but feature isn't binary
if to_drop[i] is None:
# set to cardinality to not drop from X_int
to_drop[i] = n_cats
n_values.append(n_cats)
else: # dropped
n_values.append(n_cats - 1)
to_drop = to_drop.reshape(1, -1)
X_int[X_int > to_drop] -= 1
X_mask &= keep_cells
else:
n_values = [len(cats) for cats in self.categories_]
mask = X_mask.ravel()
feature_indices = np.cumsum([0] + n_values)
indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = np.empty(n_samples + 1, dtype=int)
indptr[0] = 0
np.sum(X_mask, axis=1, out=indptr[1:])
np.cumsum(indptr[1:], out=indptr[1:])
data = np.ones(indptr[-1])
out = sparse.csr_matrix((data, indices, indptr),
shape=(n_samples, feature_indices[-1]),
dtype=self.dtype)
if not self.sparse:
return out.toarray()
else:
return out
def inverse_transform(self, X):
"""
Convert the data back to the original representation.
In case unknown categories are encountered (all zeros in the
one-hot encoding), ``None`` is used to represent this category.
Parameters
----------
X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
The transformed data.
Returns
-------
X_tr : array-like, shape [n_samples, n_features]
Inverse transformed array.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse='csr')
n_samples, _ = X.shape
n_features = len(self.categories_)
if self.drop_idx_ is None:
n_transformed_features = sum(len(cats)
for cats in self.categories_)
else:
n_transformed_features = sum(
len(cats) - 1 if to_drop is not None else len(cats)
for cats, to_drop in zip(self.categories_, self.drop_idx_)
)
# validate shape of passed X
msg = ("Shape of the passed X data is not correct. Expected {0} "
"columns, got {1}.")
if X.shape[1] != n_transformed_features:
raise ValueError(msg.format(n_transformed_features, X.shape[1]))
# create resulting array of appropriate dtype
dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
X_tr = np.empty((n_samples, n_features), dtype=dt)
j = 0
found_unknown = {}
for i in range(n_features):
if self.drop_idx_ is None or self.drop_idx_[i] is None:
cats = self.categories_[i]
else:
cats = np.delete(self.categories_[i], self.drop_idx_[i])
n_categories = len(cats)
# Only happens if there was a column with a unique
# category. In this case we just fill the column with this
# unique category value.
if n_categories == 0:
X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]
j += n_categories
continue
sub = X[:, j:j + n_categories]
# for sparse X argmax returns 2D matrix, ensure 1D array
labels = np.asarray(sub.argmax(axis=1)).flatten()
X_tr[:, i] = cats[labels]
if self.handle_unknown == 'ignore':
unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
# ignored unknown categories: we have a row of all zero
if unknown.any():
found_unknown[i] = unknown
# drop will either be None or handle_unknown will be error. If
# self.drop_idx_ is not None, then we can safely assume that all of
# the nulls in each column are the dropped value
elif self.drop_idx_ is not None:
dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
if dropped.any():
X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]
j += n_categories
# if ignored are found: potentially need to upcast result to
# insert None values
if found_unknown:
if X_tr.dtype != object:
X_tr = X_tr.astype(object)
for idx, mask in found_unknown.items():
X_tr[mask, idx] = None
return X_tr
def get_feature_names(self, input_features=None):
"""
Return feature names for output features.
Parameters
----------
input_features : list of str of shape (n_features,)
String names for input features if available. By default,
"x0", "x1", ... "xn_features" is used.
Returns
-------
output_feature_names : ndarray of shape (n_output_features,)
Array of feature names.
"""
check_is_fitted(self)
cats = self.categories_
if input_features is None:
input_features = ['x%d' % i for i in range(len(cats))]
elif len(input_features) != len(self.categories_):
raise ValueError(
"input_features should have length equal to number of "
"features ({}), got {}".format(len(self.categories_),
len(input_features)))
feature_names = []
for i in range(len(cats)):
names = [
input_features[i] + '_' + str(t) for t in cats[i]]
if self.drop_idx_ is not None and self.drop_idx_[i] is not None:
names.pop(self.drop_idx_[i])
feature_names.extend(names)
return np.array(feature_names, dtype=object)
class OrdinalEncoder(_BaseEncoder):
"""
Encode categorical features as an integer array.
The input to this transformer should be an array-like of integers or
strings, denoting the values taken on by categorical (discrete) features.
The features are converted to ordinal integers. This results in
a single column of integers (0 to n_categories - 1) per feature.
Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
.. versionadded:: 0.20
Parameters
----------
categories : 'auto' or a list of array-like, default='auto'
Categories (unique values) per feature:
- 'auto' : Determine categories automatically from the training data.
- list : ``categories[i]`` holds the categories expected in the ith
column. The passed categories should not mix strings and numeric
values, and should be sorted in case of numeric values.
The used categories can be found in the ``categories_`` attribute.
dtype : number type, default np.float64
Desired dtype of output.
Attributes
----------
categories_ : list of arrays
The categories of each feature determined during fitting
(in order of the features in X and corresponding with the output
of ``transform``).
See Also
--------
sklearn.preprocessing.OneHotEncoder : Performs a one-hot encoding of
categorical features.
sklearn.preprocessing.LabelEncoder : Encodes target labels with values
between 0 and n_classes-1.
Examples
--------
Given a dataset with two features, we let the encoder find the unique
values per feature and transform the data to an ordinal encoding.
>>> from sklearn.preprocessing import OrdinalEncoder
>>> enc = OrdinalEncoder()
>>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
>>> enc.fit(X)
OrdinalEncoder()
>>> enc.categories_
[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
>>> enc.transform([['Female', 3], ['Male', 1]])
array([[0., 2.],
[1., 0.]])
>>> enc.inverse_transform([[1, 0], [0, 1]])
array([['Male', 1],
['Female', 2]], dtype=object)
"""
@_deprecate_positional_args
def __init__(self, *, categories='auto', dtype=np.float64):
self.categories = categories
self.dtype = dtype
def fit(self, X, y=None):
"""
Fit the OrdinalEncoder to X.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to determine the categories of each feature.
y : None
Ignored. This parameter exists only for compatibility with
:class:`sklearn.pipeline.Pipeline`.
Returns
-------
self
"""
self._fit(X)
return self
def transform(self, X):
"""
Transform X to ordinal codes.
Parameters
----------
X : array-like, shape [n_samples, n_features]
The data to encode.
Returns
-------
X_out : sparse matrix or a 2-d array
Transformed input.
"""
X_int, _ = self._transform(X)
return X_int.astype(self.dtype, copy=False)
def inverse_transform(self, X):
"""
Convert the data back to the original representation.
Parameters
----------
X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
The transformed data.
Returns
-------
X_tr : array-like, shape [n_samples, n_features]
Inverse transformed array.
"""
check_is_fitted(self)
X = check_array(X, accept_sparse='csr')
n_samples, _ = X.shape
n_features = len(self.categories_)
# validate shape of passed X
msg = ("Shape of the passed X data is not correct. Expected {0} "
"columns, got {1}.")
if X.shape[1] != n_features:
raise ValueError(msg.format(n_features, X.shape[1]))
# create resulting array of appropriate dtype
dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
X_tr = np.empty((n_samples, n_features), dtype=dt)
for i in range(n_features):
labels = X[:, i].astype('int64', copy=False)
X_tr[:, i] = self.categories_[i][labels]
return X_tr
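A brief sketch of the ``drop='if_binary'`` behavior documented above (illustrative; all names are real parameters of this class):

# drop='if_binary' drops one column only for features with two categories.
from sklearn.preprocessing import OneHotEncoder

X = [['Male', 1], ['Female', 3], ['Female', 2]]
enc = OneHotEncoder(drop='if_binary', sparse=False).fit(X)
print(enc.drop_idx_)                           # [0 None]
print(enc.get_feature_names(['gender', 'group']))
# ['gender_Male' 'group_1' 'group_2' 'group_3']
print(enc.transform([['Female', 2]]))          # [[0. 0. 1. 0.]]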

sklearn/preprocessing/_function_transformer.py

@@ -0,0 +1,175 @@
import warnings
from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import _allclose_dense_sparse
from ..utils.validation import _deprecate_positional_args
def _identity(X):
"""The identity function.
"""
return X
class FunctionTransformer(TransformerMixin, BaseEstimator):
"""Constructs a transformer from an arbitrary callable.
A FunctionTransformer forwards its X (and optionally y) arguments to a
user-defined function or function object and returns the result of this
function. This is useful for stateless transformations such as taking the
log of frequencies, doing custom scaling, etc.
Note: If a lambda is used as the function, then the resulting
transformer will not be pickleable.
.. versionadded:: 0.17
Read more in the :ref:`User Guide <function_transformer>`.
Parameters
----------
    func : callable, default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.
    inverse_func : callable, default=None
        The callable to use for the inverse transformation. This will be
        passed the same arguments as inverse transform, with args and
        kwargs forwarded. If inverse_func is None, then inverse_func
        will be the identity function.
    validate : bool, default=False
Indicate that the input X array should be checked before calling
``func``. The possibilities are:
- If False, there is no input validation.
- If True, then X will be converted to a 2-dimensional NumPy array or
sparse matrix. If the conversion is not possible an exception is
raised.
.. versionchanged:: 0.22
The default of ``validate`` changed from True to False.
    accept_sparse : bool, default=False
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is False,
        sparse matrix inputs will cause an exception to be raised.
    check_inverse : bool, default=True
        Whether to check that ``func`` followed by ``inverse_func`` leads to
        the original inputs. It can be used for a sanity check, raising a
        warning when the condition is not fulfilled.
        .. versionadded:: 0.20
    kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to func.
        .. versionadded:: 0.18
    inv_kw_args : dict, default=None
        Dictionary of additional keyword arguments to pass to inverse_func.
        .. versionadded:: 0.18
Examples
--------
>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log1p)
>>> X = np.array([[0, 1], [2, 3]])
>>> transformer.transform(X)
array([[0. , 0.6931...],
[1.0986..., 1.3862...]])
"""
@_deprecate_positional_args
def __init__(self, func=None, inverse_func=None, *, validate=False,
accept_sparse=False, check_inverse=True, kw_args=None,
inv_kw_args=None):
self.func = func
self.inverse_func = inverse_func
self.validate = validate
self.accept_sparse = accept_sparse
self.check_inverse = check_inverse
self.kw_args = kw_args
self.inv_kw_args = inv_kw_args
def _check_input(self, X):
if self.validate:
return self._validate_data(X, accept_sparse=self.accept_sparse)
return X
def _check_inverse_transform(self, X):
"""Check that func and inverse_func are the inverse."""
idx_selected = slice(None, None, max(1, X.shape[0] // 100))
X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
warnings.warn("The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'.", UserWarning)
def fit(self, X, y=None):
"""Fit transformer by checking X.
If ``validate`` is ``True``, ``X`` will be checked.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
self
"""
X = self._check_input(X)
if (self.check_inverse and not (self.func is None or
self.inverse_func is None)):
self._check_inverse_transform(X)
return self
def transform(self, X):
"""Transform X using the forward function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
X_out : array-like, shape (n_samples, n_features)
Transformed input.
"""
return self._transform(X, func=self.func, kw_args=self.kw_args)
def inverse_transform(self, X):
"""Transform X using the inverse function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Input array.
Returns
-------
X_out : array-like, shape (n_samples, n_features)
Transformed input.
"""
return self._transform(X, func=self.inverse_func,
kw_args=self.inv_kw_args)
def _transform(self, X, func=None, kw_args=None):
X = self._check_input(X)
if func is None:
func = _identity
return func(X, **(kw_args if kw_args else {}))
def _more_tags(self):
return {'no_validation': not self.validate,
'stateless': True}
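A quick sketch of the round-trip check that ``fit`` performs when ``check_inverse=True`` (illustrative):

# fit() compares inverse_func(func(X)) against X and warns on mismatch.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

X = np.array([[0.0, 1.0], [2.0, 3.0]])
FunctionTransformer(np.log1p, inverse_func=np.expm1).fit(X)  # silent round-trip
FunctionTransformer(np.log1p, inverse_func=np.exp).fit(X)
# UserWarning: The provided functions are not strictly inverse of each other.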

File diff suppressed because it is too large.

sklearn/preprocessing/data.py

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _data # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.preprocessing.data'
correct_import_path = 'sklearn.preprocessing'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_data, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

sklearn/preprocessing/label.py

@@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _label # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.preprocessing.label'
correct_import_path = 'sklearn.preprocessing'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_label, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
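Both generated shims rely on module-level ``__getattr__`` (PEP 562). A generic sketch of the pattern, using hypothetical names (``mypkg`` and ``_new_home`` are placeholders, not sklearn modules):

# Deprecation-shim pattern: forward attribute access to the module that
# actually holds the code, warning on use of the old import path.
import warnings
from mypkg import _new_home  # hypothetical new location of the code

def __getattr__(name):
    # PEP 562: invoked for attributes not found in this module (Python 3.7+).
    warnings.warn("mypkg.old_home is deprecated; import from mypkg instead",
                  FutureWarning)
    return getattr(_new_home, name)

On Python < 3.7, module-level ``__getattr__`` is not honored, which is why the generated files above fall back to wrapping the module with ``Pep562``.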

sklearn/preprocessing/setup.py

@@ -0,0 +1,20 @@
import os
def configuration(parent_package='', top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
config = Configuration('preprocessing', parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
config.add_extension('_csr_polynomial_expansion',
sources=['_csr_polynomial_expansion.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage('tests')
return config
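Such ``configuration`` functions follow the numpy.distutils convention; when the file is run directly, the standard entry point (the same pattern scikit-learn uses in its other setup.py files) is:

if __name__ == '__main__':
    from numpy.distutils.core import setup
    # Build this subpackage standalone, e.g. `python setup.py build_ext --inplace`.
    setup(**configuration(top_path='').todict())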

sklearn/preprocessing/tests/test_common.py

@@ -0,0 +1,158 @@
import warnings
import pytest
import numpy as np
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.preprocessing import maxabs_scale
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import scale
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import robust_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
iris = load_iris()
def _get_valid_samples_by_column(X, col):
"""Get non NaN samples in column of X"""
return X[:, [col]][~np.isnan(X[:, col])]
@pytest.mark.parametrize(
"est, func, support_sparse, strictly_positive",
[(MaxAbsScaler(), maxabs_scale, True, False),
(MinMaxScaler(), minmax_scale, False, False),
(StandardScaler(), scale, False, False),
(StandardScaler(with_mean=False), scale, True, False),
(PowerTransformer('yeo-johnson'), power_transform, False, False),
(PowerTransformer('box-cox'), power_transform, False, True),
(QuantileTransformer(n_quantiles=10), quantile_transform, True, False),
(RobustScaler(), robust_scale, False, False),
(RobustScaler(with_centering=False), robust_scale, True, False)]
)
def test_missing_value_handling(est, func, support_sparse, strictly_positive):
    # check that the preprocessing method lets NaN pass through
rng = np.random.RandomState(42)
X = iris.data.copy()
n_missing = 50
X[rng.randint(X.shape[0], size=n_missing),
rng.randint(X.shape[1], size=n_missing)] = np.nan
if strictly_positive:
X += np.nanmin(X) + 0.1
X_train, X_test = train_test_split(X, random_state=1)
# sanity check
assert not np.all(np.isnan(X_train), axis=0).any()
assert np.any(np.isnan(X_train), axis=0).all()
assert np.any(np.isnan(X_test), axis=0).all()
X_test[:, 0] = np.nan # make sure this boundary case is tested
with pytest.warns(None) as records:
Xt = est.fit(X_train).transform(X_test)
# ensure no warnings are raised
assert len(records) == 0
# missing values should still be missing, and only them
assert_array_equal(np.isnan(Xt), np.isnan(X_test))
# check that the function leads to the same results as the class
with pytest.warns(None) as records:
Xt_class = est.transform(X_train)
assert len(records) == 0
Xt_func = func(X_train, **est.get_params())
assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class))
assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)])
    # check that the inverse transform keeps NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can use equal_nan=True in recent versions of numpy.
    # For the moment, we just check that non-NaN values are almost equal.
assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])
for i in range(X.shape[1]):
# train only on non-NaN
est.fit(_get_valid_samples_by_column(X_train, i))
# check transforming with NaN works even when training without NaN
with pytest.warns(None) as records:
Xt_col = est.transform(X_test[:, [i]])
assert len(records) == 0
assert_allclose(Xt_col, Xt[:, [i]])
# check non-NaN is handled as before - the 1st column is all nan
if not np.isnan(X_test[:, i]).all():
Xt_col_nonan = est.transform(
_get_valid_samples_by_column(X_test, i))
assert_array_equal(Xt_col_nonan,
Xt_col[~np.isnan(Xt_col.squeeze())])
if support_sparse:
est_dense = clone(est)
est_sparse = clone(est)
with pytest.warns(None) as records:
Xt_dense = est_dense.fit(X_train).transform(X_test)
Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
assert len(records) == 0
for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
sparse.bsr_matrix, sparse.coo_matrix,
sparse.dia_matrix, sparse.dok_matrix,
sparse.lil_matrix):
# check that the dense and sparse inputs lead to the same results
# precompute the matrix to avoid catching side warnings
X_train_sp = sparse_constructor(X_train)
X_test_sp = sparse_constructor(X_test)
with pytest.warns(None) as records:
warnings.simplefilter('ignore', PendingDeprecationWarning)
Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp)
assert len(records) == 0
assert_allclose(Xt_sp.A, Xt_dense)
with pytest.warns(None) as records:
warnings.simplefilter('ignore', PendingDeprecationWarning)
Xt_inv_sp = est_sparse.inverse_transform(Xt_sp)
assert len(records) == 0
assert_allclose(Xt_inv_sp.A, Xt_inv_dense)
@pytest.mark.parametrize(
"est, func",
[(MaxAbsScaler(), maxabs_scale),
(MinMaxScaler(), minmax_scale),
(StandardScaler(), scale),
(StandardScaler(with_mean=False), scale),
(PowerTransformer('yeo-johnson'), power_transform),
(PowerTransformer('box-cox'), power_transform,),
(QuantileTransformer(n_quantiles=3), quantile_transform),
(RobustScaler(), robust_scale),
(RobustScaler(with_centering=False), robust_scale)]
)
def test_missing_value_pandas_na_support(est, func):
# Test pandas IntegerArray with pd.NA
pd = pytest.importorskip('pandas', minversion="1.0")
X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1],
[np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8],
[1, 2, 3, 4, 5, 6, 7, 8]]).T
# Creates dataframe with IntegerArrays with pd.NA
X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c'])
X_df['c'] = X_df['c'].astype('int')
X_trans = est.fit_transform(X)
X_df_trans = est.fit_transform(X_df)
assert_allclose(X_trans, X_df_trans)
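Condensed, the core property these tests assert is that NaN entries pass through fit/transform untouched while the fitted statistics ignore them; for example:

# NaNs keep their positions through transform; statistics are NaN-aware.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, np.nan], [2.0, 0.0], [3.0, 1.0]])
Xt = StandardScaler().fit_transform(X)
assert np.array_equal(np.isnan(Xt), np.isnan(X))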

File diff suppressed because it is too large.

sklearn/preprocessing/tests/test_discretization.py

@@ -0,0 +1,283 @@
import pytest
import numpy as np
import scipy.sparse as sp
import warnings
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import (
assert_array_almost_equal,
assert_array_equal,
assert_warns_message
)
X = [[-2, 1.5, -4, -1],
[-1, 2.5, -3, -0.5],
[0, 3.5, -2, 0.5],
[1, 4.5, -1, 2]]
@pytest.mark.parametrize(
'strategy, expected',
[('uniform', [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
('quantile', [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]])])
def test_fit_transform(strategy, expected):
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
est.fit(X)
assert_array_equal(expected, est.transform(X))
def test_valid_n_bins():
KBinsDiscretizer(n_bins=2).fit_transform(X)
KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(int)
def test_invalid_n_bins():
est = KBinsDiscretizer(n_bins=1)
err_msg = ("KBinsDiscretizer received an invalid "
"number of bins. Received 1, expected at least 2.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
est = KBinsDiscretizer(n_bins=1.1)
err_msg = ("KBinsDiscretizer received an invalid "
"n_bins type. Received float, expected int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
def test_invalid_n_bins_array():
# Bad shape
n_bins = np.full((2, 4), 2.)
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Incorrect number of features
n_bins = [1, 2, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Bad bin values
n_bins = [1, 2, 2, 1]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = ("KBinsDiscretizer received an invalid number of bins "
"at indices 0, 3. Number of bins must be at least 2, "
"and must be an int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
# Float bin values
n_bins = [2.1, 2, 2.1, 2]
est = KBinsDiscretizer(n_bins=n_bins)
err_msg = ("KBinsDiscretizer received an invalid number of bins "
"at indices 0, 2. Number of bins must be at least 2, "
"and must be an int.")
with pytest.raises(ValueError, match=err_msg):
est.fit_transform(X)
@pytest.mark.parametrize(
'strategy, expected',
[('uniform', [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
('kmeans', [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
('quantile', [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]])])
def test_fit_transform_n_bins_array(strategy, expected):
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
strategy=strategy).fit(X)
assert_array_equal(expected, est.transform(X))
# test the shape of bin_edges_
n_features = np.array(X).shape[1]
assert est.bin_edges_.shape == (n_features, )
for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
assert bin_edges.shape == (n_bins + 1, )
def test_invalid_n_features():
est = KBinsDiscretizer(n_bins=3).fit(X)
bad_X = np.arange(25).reshape(5, -1)
err_msg = "Incorrect number of features. Expecting 4, received 5"
with pytest.raises(ValueError, match=err_msg):
est.transform(bad_X)
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_same_min_max(strategy):
warnings.simplefilter("always")
X = np.array([[1, -2],
[1, -1],
[1, 0],
[1, 1]])
est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
assert_warns_message(UserWarning,
"Feature 0 is constant and will be replaced "
"with 0.", est.fit, X)
assert est.n_bins_[0] == 1
# replace the feature with zeros
Xt = est.transform(X)
assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_transform_1d_behavior():
X = np.arange(4)
est = KBinsDiscretizer(n_bins=2)
with pytest.raises(ValueError):
est.fit(X)
est = KBinsDiscretizer(n_bins=2)
est.fit(X.reshape(-1, 1))
with pytest.raises(ValueError):
est.transform(X)
@pytest.mark.parametrize('i', range(1, 9))
def test_numeric_stability(i):
X_init = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
Xt_expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)
# Test up to discretizing nano units
X = X_init / 10**i
Xt = KBinsDiscretizer(n_bins=2, encode='ordinal').fit_transform(X)
assert_array_equal(Xt_expected, Xt)
def test_invalid_encode_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
err_msg = (r"Valid options for 'encode' are "
r"\('onehot', 'onehot-dense', 'ordinal'\). "
r"Got encode='invalid-encode' instead.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
def test_encode_options():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='ordinal').fit(X)
Xt_1 = est.transform(X)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot-dense').fit(X)
Xt_2 = est.transform(X)
assert not sp.issparse(Xt_2)
assert_array_equal(OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]],
sparse=False)
.fit_transform(Xt_1), Xt_2)
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
encode='onehot').fit(X)
Xt_3 = est.transform(X)
assert sp.issparse(Xt_3)
assert_array_equal(OneHotEncoder(
categories=[np.arange(i) for i in [2, 3, 3, 3]],
sparse=True)
.fit_transform(Xt_1).toarray(),
Xt_3.toarray())
def test_invalid_strategy_option():
est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
err_msg = (r"Valid options for 'strategy' are "
r"\('uniform', 'quantile', 'kmeans'\). "
r"Got strategy='invalid-strategy' instead.")
with pytest.raises(ValueError, match=err_msg):
est.fit(X)
@pytest.mark.parametrize(
'strategy, expected_2bins, expected_3bins, expected_5bins',
[('uniform', [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 2, 2], [0, 0, 1, 1, 4, 4]),
('kmeans', [0, 0, 0, 0, 1, 1], [0, 0, 1, 1, 2, 2], [0, 0, 1, 2, 3, 4]),
('quantile', [0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2], [0, 1, 2, 3, 4, 4])])
def test_nonuniform_strategies(
strategy, expected_2bins, expected_3bins, expected_5bins):
X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)
# with 2 bins
est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_2bins, Xt.ravel())
# with 3 bins
est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_3bins, Xt.ravel())
# with 5 bins
est = KBinsDiscretizer(n_bins=5, strategy=strategy, encode='ordinal')
Xt = est.fit_transform(X)
assert_array_equal(expected_5bins, Xt.ravel())
@pytest.mark.parametrize(
'strategy, expected_inv',
[('uniform', [[-1.5, 2., -3.5, -0.5], [-0.5, 3., -2.5, -0.5],
[0.5, 4., -1.5, 0.5], [0.5, 4., -1.5, 1.5]]),
('kmeans', [[-1.375, 2.125, -3.375, -0.5625],
[-1.375, 2.125, -3.375, -0.5625],
[-0.125, 3.375, -2.125, 0.5625],
[0.75, 4.25, -1.25, 1.625]]),
('quantile', [[-1.5, 2., -3.5, -0.75], [-0.5, 3., -2.5, 0.],
[0.5, 4., -1.5, 1.25], [0.5, 4., -1.5, 1.25]])])
@pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
def test_inverse_transform(strategy, encode, expected_inv):
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
Xt = kbd.fit_transform(X)
Xinv = kbd.inverse_transform(Xt)
assert_array_almost_equal(expected_inv, Xinv)
@pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
def test_transform_outside_fit_range(strategy):
X = np.array([0, 1, 2, 3])[:, None]
kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
kbd.fit(X)
X2 = np.array([-2, 5])[:, None]
X2t = kbd.transform(X2)
assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
assert_array_equal(X2t.min(axis=0), [0])
def test_overwrite():
X = np.array([0, 1, 2, 3])[:, None]
X_before = X.copy()
est = KBinsDiscretizer(n_bins=3, encode="ordinal")
Xt = est.fit_transform(X)
assert_array_equal(X, X_before)
Xt_before = Xt.copy()
Xinv = est.inverse_transform(Xt)
assert_array_equal(Xt, Xt_before)
assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
@pytest.mark.parametrize(
'strategy, expected_bin_edges',
[('quantile', [0, 1, 3]), ('kmeans', [0, 1.5, 3])])
def test_redundant_bins(strategy, expected_bin_edges):
X = [[0], [0], [0], [0], [3], [3]]
kbd = KBinsDiscretizer(n_bins=3, strategy=strategy)
msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
"are removed. Consider decreasing the number of bins.")
assert_warns_message(UserWarning, msg, kbd.fit, X)
assert_array_almost_equal(kbd.bin_edges_[0], expected_bin_edges)
def test_percentile_numeric_stability():
X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
Xt = np.array([0, 0, 4]).reshape(-1, 1)
kbd = KBinsDiscretizer(n_bins=10, encode='ordinal',
strategy='quantile')
msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
"are removed. Consider decreasing the number of bins.")
assert_warns_message(UserWarning, msg, kbd.fit, X)
assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
assert_array_almost_equal(kbd.transform(X), Xt)
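As ``test_overwrite`` and ``test_inverse_transform`` above illustrate, ``inverse_transform`` maps each ordinal code to the midpoint of its bin; a compact check:

# Each code is inverse-mapped to the center of its bin.
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([0, 1, 2, 3])[:, None]
est = KBinsDiscretizer(n_bins=3, encode='ordinal').fit(X)  # strategy='quantile'
print(est.bin_edges_[0])                                   # [0. 1. 2. 3.]
print(est.inverse_transform(est.transform(X)).ravel())     # [0.5 1.5 2.5 2.5]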

sklearn/preprocessing/tests/test_encoders.py

@@ -0,0 +1,698 @@
# -*- coding: utf-8 -*-
import re
import numpy as np
from scipy import sparse
import pytest
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_allclose
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
def test_one_hot_encoder_sparse_dense():
# check that sparse and dense will give the same results
X = np.array([[3, 2, 1], [0, 1, 1]])
enc_sparse = OneHotEncoder()
enc_dense = OneHotEncoder(sparse=False)
X_trans_sparse = enc_sparse.fit_transform(X)
X_trans_dense = enc_dense.fit_transform(X)
assert X_trans_sparse.shape == (2, 5)
assert X_trans_dense.shape == (2, 5)
assert sparse.issparse(X_trans_sparse)
assert not sparse.issparse(X_trans_dense)
# check outcome
assert_array_equal(X_trans_sparse.toarray(), [[0., 1., 0., 1., 1.],
[1., 0., 1., 0., 1.]])
assert_array_equal(X_trans_sparse.toarray(), X_trans_dense)
def test_one_hot_encoder_diff_n_features():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
X2 = np.array([[1, 0]])
enc = OneHotEncoder()
enc.fit(X)
err_msg = ("The number of features in X is different to the number of "
"features of the fitted data.")
with pytest.raises(ValueError, match=err_msg):
enc.transform(X2)
def test_one_hot_encoder_handle_unknown():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
X2 = np.array([[4, 1, 1]])
# Test that one hot encoder raises error for unknown features
# present during transform.
oh = OneHotEncoder(handle_unknown='error')
oh.fit(X)
with pytest.raises(ValueError, match='Found unknown categories'):
oh.transform(X2)
# Test the ignore option, ignores unknown features (giving all 0's)
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
X2_passed = X2.copy()
assert_array_equal(
oh.transform(X2_passed).toarray(),
np.array([[0., 0., 0., 0., 1., 0., 0.]]))
# ensure transformed data was not modified in place
assert_allclose(X2, X2_passed)
# Raise error if handle_unknown is neither ignore or error.
oh = OneHotEncoder(handle_unknown='42')
with pytest.raises(ValueError, match='handle_unknown should be either'):
oh.fit(X)
def test_one_hot_encoder_not_fitted():
X = np.array([['a'], ['b']])
enc = OneHotEncoder(categories=['a', 'b'])
msg = ("This OneHotEncoder instance is not fitted yet. "
"Call 'fit' with appropriate arguments before using this "
"estimator.")
with pytest.raises(NotFittedError, match=msg):
enc.transform(X)
def test_one_hot_encoder_handle_unknown_strings():
X = np.array(['11111111', '22', '333', '4444']).reshape((-1, 1))
X2 = np.array(['55555', '22']).reshape((-1, 1))
# Non-regression test for issue #12470
# Test the ignore option when categories are a numpy string dtype,
# particularly when the known category strings are longer
# than the unknown category strings
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
X2_passed = X2.copy()
assert_array_equal(
oh.transform(X2_passed).toarray(),
np.array([[0., 0., 0., 0.], [0., 1., 0., 0.]]))
# ensure transformed data was not modified in place
assert_array_equal(X2, X2_passed)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
X = np.asarray([[0, 1]], dtype=input_dtype).T
X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)
oh = OneHotEncoder(categories='auto', dtype=output_dtype)
assert_array_equal(oh.fit_transform(X).toarray(), X_expected)
assert_array_equal(oh.fit(X).transform(X).toarray(), X_expected)
oh = OneHotEncoder(categories='auto', dtype=output_dtype, sparse=False)
assert_array_equal(oh.fit_transform(X), X_expected)
assert_array_equal(oh.fit(X).transform(X), X_expected)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)
oh = OneHotEncoder(dtype=output_dtype)
assert_array_equal(oh.fit_transform(X_df).toarray(), X_expected)
assert_array_equal(oh.fit(X_df).transform(X_df).toarray(), X_expected)
oh = OneHotEncoder(dtype=output_dtype, sparse=False)
assert_array_equal(oh.fit_transform(X_df), X_expected)
assert_array_equal(oh.fit(X_df).transform(X_df), X_expected)
def test_one_hot_encoder_feature_names():
enc = OneHotEncoder()
X = [['Male', 1, 'girl', 2, 3],
['Female', 41, 'girl', 1, 10],
['Male', 51, 'boy', 12, 3],
['Male', 91, 'girl', 21, 30]]
enc.fit(X)
feature_names = enc.get_feature_names()
assert isinstance(feature_names, np.ndarray)
assert_array_equal(['x0_Female', 'x0_Male',
'x1_1', 'x1_41', 'x1_51', 'x1_91',
'x2_boy', 'x2_girl',
'x3_1', 'x3_2', 'x3_12', 'x3_21',
'x4_3',
'x4_10', 'x4_30'], feature_names)
feature_names2 = enc.get_feature_names(['one', 'two',
'three', 'four', 'five'])
assert_array_equal(['one_Female', 'one_Male',
'two_1', 'two_41', 'two_51', 'two_91',
'three_boy', 'three_girl',
'four_1', 'four_2', 'four_12', 'four_21',
'five_3', 'five_10', 'five_30'], feature_names2)
with pytest.raises(ValueError, match="input_features should have length"):
enc.get_feature_names(['one', 'two'])
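# A minimal sketch, not an upstream test: get_feature_names emits one name per
# category, prefixed by the default 'x<i>' or by caller-supplied column names.
def _sketch_feature_name_prefixes():
    enc = OneHotEncoder().fit([['cat', 0], ['dog', 1]])
    assert_array_equal(enc.get_feature_names(),
                       ['x0_cat', 'x0_dog', 'x1_0', 'x1_1'])
    assert_array_equal(enc.get_feature_names(['pet', 'flag']),
                       ['pet_cat', 'pet_dog', 'flag_0', 'flag_1'])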
def test_one_hot_encoder_feature_names_unicode():
enc = OneHotEncoder()
X = np.array([['c❤t1', 'dat2']], dtype=object).T
enc.fit(X)
feature_names = enc.get_feature_names()
assert_array_equal(['x0_c❤t1', 'x0_dat2'], feature_names)
feature_names = enc.get_feature_names(input_features=['n👍me'])
assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)
def test_one_hot_encoder_set_params():
X = np.array([[1, 2]]).T
oh = OneHotEncoder()
# set params on not yet fitted object
oh.set_params(categories=[[0, 1, 2, 3]])
assert oh.get_params()['categories'] == [[0, 1, 2, 3]]
assert oh.fit_transform(X).toarray().shape == (2, 4)
# set params on already fitted object
oh.set_params(categories=[[0, 1, 2, 3, 4]])
assert oh.fit_transform(X).toarray().shape == (2, 5)
def check_categorical_onehot(X):
enc = OneHotEncoder(categories='auto')
Xtr1 = enc.fit_transform(X)
enc = OneHotEncoder(categories='auto', sparse=False)
Xtr2 = enc.fit_transform(X)
assert_allclose(Xtr1.toarray(), Xtr2)
assert sparse.isspmatrix_csr(Xtr1)
return Xtr1.toarray()
@pytest.mark.parametrize("X", [
[['def', 1, 55], ['abc', 2, 55]],
np.array([[10, 1, 55], [5, 2, 55]]),
np.array([['b', 'A', 'cat'], ['a', 'B', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_one_hot_encoder(X):
Xtr = check_categorical_onehot(np.array(X)[:, [0]])
assert_allclose(Xtr, [[0, 1], [1, 0]])
Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])
Xtr = OneHotEncoder(categories='auto').fit_transform(X)
assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
@pytest.mark.parametrize('sparse_', [False, True])
@pytest.mark.parametrize('drop', [None, 'first'])
def test_one_hot_encoder_inverse(sparse_, drop):
X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
enc = OneHotEncoder(sparse=sparse_, drop=drop)
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
assert_array_equal(enc.inverse_transform(X_tr), exp)
X = [[2, 55], [1, 55], [3, 55]]
enc = OneHotEncoder(sparse=sparse_, categories='auto',
drop=drop)
X_tr = enc.fit_transform(X)
exp = np.array(X)
assert_array_equal(enc.inverse_transform(X_tr), exp)
if drop is None:
# with unknown categories
# drop is incompatible with handle_unknown=ignore
X = [['abc', 2, 55], ['def', 1, 55], ['abc', 3, 55]]
enc = OneHotEncoder(sparse=sparse_, handle_unknown='ignore',
categories=[['abc', 'def'], [1, 2],
[54, 55, 56]])
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
exp[2, 1] = None
assert_array_equal(enc.inverse_transform(X_tr), exp)
# with an otherwise numerical output, still object if unknown
X = [[2, 55], [1, 55], [3, 55]]
enc = OneHotEncoder(sparse=sparse_, categories=[[1, 2], [54, 56]],
handle_unknown='ignore')
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
exp[2, 0] = None
exp[:, 1] = None
assert_array_equal(enc.inverse_transform(X_tr), exp)
# incorrect shape raises
X_tr = np.array([[0, 1, 1], [1, 0, 1]])
msg = re.escape('Shape of the passed X data is not correct')
with pytest.raises(ValueError, match=msg):
enc.inverse_transform(X_tr)
def test_one_hot_encoder_inverse_if_binary():
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop='if_binary', sparse=False)
X_tr = ohe.fit_transform(X)
assert_array_equal(ohe.inverse_transform(X_tr), X)
# check that resetting the drop option without refitting does not throw an error
@pytest.mark.parametrize('drop', ['if_binary', 'first', None])
@pytest.mark.parametrize('reset_drop', ['if_binary', 'first', None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
X = np.array([['Male', 1],
['Female', 3],
['Female', 2]], dtype=object)
ohe = OneHotEncoder(drop=drop, sparse=False)
ohe.fit(X)
X_tr = ohe.transform(X)
feature_names = ohe.get_feature_names()
ohe.set_params(drop=reset_drop)
assert_array_equal(ohe.inverse_transform(X_tr), X)
assert_allclose(ohe.transform(X), X_tr)
assert_array_equal(ohe.get_feature_names(), feature_names)
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
@pytest.mark.parametrize("X", [
[1, 2],
np.array([3., 4.])
])
def test_X_is_not_1D(X, method):
oh = OneHotEncoder()
msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)
@pytest.mark.parametrize("method", ['fit', 'fit_transform'])
def test_X_is_not_1D_pandas(method):
pd = pytest.importorskip('pandas')
X = pd.Series([6, 3, 4, 6])
oh = OneHotEncoder()
msg = ("Expected 2D array, got 1D array instead")
with pytest.raises(ValueError, match=msg):
getattr(oh, method)(X)
@pytest.mark.parametrize("X, cat_exp, cat_dtype", [
([['abc', 55], ['def', 55]], [['abc', 'def'], [55]], np.object_),
(np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
(np.array([['A', 'cat'], ['B', 'cat']], dtype=object),
[['A', 'B'], ['cat']], np.object_),
(np.array([['A', 'cat'], ['B', 'cat']]),
[['A', 'B'], ['cat']], np.str_)
], ids=['mixed', 'numeric', 'object', 'string'])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
# order of categories should not depend on order of samples
for Xi in [X, X[::-1]]:
enc = OneHotEncoder(categories='auto')
enc.fit(Xi)
# assert enc.categories == 'auto'
assert isinstance(enc.categories_, list)
for res, exp in zip(enc.categories_, cat_exp):
assert res.tolist() == exp
assert np.issubdtype(res.dtype, cat_dtype)
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[['a', 'b', 'c']], np.object_),
(np.array([[1, 2]], dtype='int64').T,
np.array([[1, 4]], dtype='int64').T,
[[1, 2, 3]], np.int64),
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype):
enc = OneHotEncoder(categories=cats)
exp = np.array([[1., 0., 0.],
[0., 1., 0.]])
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert list(enc.categories[0]) == list(cats[0])
assert enc.categories_[0].tolist() == list(cats[0])
# manually specified categories should have same dtype as
# the data when coerced from lists
assert enc.categories_[0].dtype == cat_dtype
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OneHotEncoder(categories=cats)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
enc = OneHotEncoder(categories=cats, handle_unknown='ignore')
exp = np.array([[1., 0., 0.], [0., 0., 0.]])
assert_array_equal(enc.fit(X2).transform(X2).toarray(), exp)
def test_one_hot_encoder_unsorted_categories():
X = np.array([['a', 'b']], dtype=object).T
enc = OneHotEncoder(categories=[['b', 'a', 'c']])
exp = np.array([[0., 1., 0.],
[1., 0., 0.]])
assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['b', 'a', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)
# unsorted passed categories still raise for numerical values
X = np.array([[1, 2]]).T
enc = OneHotEncoder(categories=[[2, 1, 3]])
msg = 'Unsorted categories are not supported'
with pytest.raises(ValueError, match=msg):
enc.fit_transform(X)
def test_one_hot_encoder_specified_categories_mixed_columns():
# multiple columns
X = np.array([['a', 'b'], [0, 2]], dtype=object).T
enc = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
exp = np.array([[1., 0., 0., 1., 0., 0.],
[0., 1., 0., 0., 0., 1.]])
assert_array_equal(enc.fit_transform(X).toarray(), exp)
assert enc.categories_[0].tolist() == ['a', 'b', 'c']
assert np.issubdtype(enc.categories_[0].dtype, np.object_)
assert enc.categories_[1].tolist() == [0, 1, 2]
# integer categories but from object dtype data
assert np.issubdtype(enc.categories_[1].dtype, np.object_)
def test_one_hot_encoder_pandas():
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
Xtr = check_categorical_onehot(X_df)
assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
@pytest.mark.parametrize("drop, expected_names",
[('first', ['x0_c', 'x2_b']),
('if_binary', ['x0_c', 'x1_2', 'x2_b']),
(['c', 2, 'b'], ['x0_b', 'x2_a'])],
ids=['first', 'binary', 'manual'])
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
X = [['c', 2, 'a'],
['b', 2, 'b']]
ohe = OneHotEncoder(drop=drop)
ohe.fit(X)
feature_names = ohe.get_feature_names()
assert isinstance(feature_names, np.ndarray)
assert_array_equal(expected_names, feature_names)
def test_one_hot_encoder_drop_equals_if_binary():
# Canonical case
X = [[10, 'yes'],
[20, 'no'],
[30, 'yes']]
expected = np.array([[1., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 0., 1., 1.]])
expected_drop_idx = np.array([None, 0])
ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
assert_allclose(result, expected)
# with only one cat, the behaviour is equivalent to drop=None
X = [['true', 'a'],
['false', 'a'],
['false', 'a']]
expected = np.array([[1., 1.],
[0., 1.],
[0., 1.]])
expected_drop_idx = np.array([0, None])
ohe = OneHotEncoder(drop='if_binary', sparse=False)
result = ohe.fit_transform(X)
assert_array_equal(ohe.drop_idx_, expected_drop_idx)
assert_allclose(result, expected)
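# A minimal sketch, not an upstream test: drop='if_binary' keeps one column
# for two-category features and leaves other cardinalities fully expanded.
def _sketch_if_binary_column_count():
    X = [['yes', 'a'], ['no', 'b'], ['yes', 'c']]
    ohe = OneHotEncoder(drop='if_binary', sparse=False)
    # feature 0 is binary -> 1 column; feature 1 has 3 categories -> 3 columns
    assert ohe.fit_transform(X).shape == (3, 4)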
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
@pytest.mark.parametrize("as_data_frame", [False, True],
ids=['array', 'dataframe'])
@pytest.mark.parametrize("handle_unknown", ['error', 'ignore'])
def test_one_hot_encoder_raise_missing(X, as_data_frame, handle_unknown):
if as_data_frame:
pd = pytest.importorskip('pandas')
X = pd.DataFrame(X)
ohe = OneHotEncoder(categories='auto', handle_unknown=handle_unknown)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit(X)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)
if as_data_frame:
X_partial = X.iloc[:1, :]
else:
X_partial = X[:1, :]
ohe.fit(X_partial)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
@pytest.mark.parametrize("X", [
[['abc', 2, 55], ['def', 1, 55]],
np.array([[10, 2, 55], [20, 1, 55]]),
np.array([['a', 'B', 'cat'], ['b', 'A', 'cat']], dtype=object)
], ids=['mixed', 'numeric', 'object'])
def test_ordinal_encoder(X):
enc = OrdinalEncoder()
exp = np.array([[0, 1, 0],
[1, 0, 0]], dtype='int64')
assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
enc = OrdinalEncoder(dtype='int64')
assert_array_equal(enc.fit_transform(X), exp)
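# A minimal sketch, not an upstream test: OrdinalEncoder maps every value to
# its index within the per-feature sorted category list, one output column per
# input column.
def _sketch_ordinal_codes():
    enc = OrdinalEncoder()
    codes = enc.fit_transform([['low'], ['high'], ['medium']])
    # lexicographic category order: 'high' < 'low' < 'medium'
    assert_array_equal(codes.ravel(), [1., 0., 2.])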
@pytest.mark.parametrize("X, X2, cats, cat_dtype", [
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[['a', 'b', 'c']], np.object_),
(np.array([[1, 2]], dtype='int64').T,
np.array([[1, 4]], dtype='int64').T,
[[1, 2, 3]], np.int64),
(np.array([['a', 'b']], dtype=object).T,
np.array([['a', 'd']], dtype=object).T,
[np.array(['a', 'b', 'c'])], np.object_),
], ids=['object', 'numeric', 'object-string-cat'])
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
enc = OrdinalEncoder(categories=cats)
exp = np.array([[0.], [1.]])
assert_array_equal(enc.fit_transform(X), exp)
assert list(enc.categories[0]) == list(cats[0])
assert enc.categories_[0].tolist() == list(cats[0])
# manually specified categories should have same dtype as
# the data when coerced from lists
assert enc.categories_[0].dtype == cat_dtype
# when specifying categories manually, unknown categories should already
# raise when fitting
enc = OrdinalEncoder(categories=cats)
with pytest.raises(ValueError, match="Found unknown categories"):
enc.fit(X2)
def test_ordinal_encoder_inverse():
X = [['abc', 2, 55], ['def', 1, 55]]
enc = OrdinalEncoder()
X_tr = enc.fit_transform(X)
exp = np.array(X, dtype=object)
assert_array_equal(enc.inverse_transform(X_tr), exp)
# incorrect shape raises
X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
msg = re.escape('Shape of the passed X data is not correct')
with pytest.raises(ValueError, match=msg):
enc.inverse_transform(X_tr)
@pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
np.array([['a', np.nan]], dtype=object).T],
ids=['numeric', 'object'])
def test_ordinal_encoder_raise_missing(X):
ohe = OrdinalEncoder()
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit(X)
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.fit_transform(X)
ohe.fit(X[:1, :])
with pytest.raises(ValueError, match="Input contains NaN"):
ohe.transform(X)
def test_ordinal_encoder_raise_categories_shape():
X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
cats = ['Low', 'Medium', 'High']
enc = OrdinalEncoder(categories=cats)
msg = ("Shape mismatch: if categories is an array,")
with pytest.raises(ValueError, match=msg):
enc.fit(X)
def test_encoder_dtypes():
# check that dtypes are preserved when determining categories
enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]], dtype='float64')
for X in [np.array([[1, 2], [3, 4]], dtype='int64'),
np.array([[1, 2], [3, 4]], dtype='float64'),
np.array([['a', 'b'], ['c', 'd']]), # string dtype
np.array([[1, 'a'], [3, 'b']], dtype='object')]:
enc.fit(X)
assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = [[1, 2], [3, 4]]
enc.fit(X)
assert all([np.issubdtype(enc.categories_[i].dtype, np.integer)
for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = [[1, 'a'], [3, 'b']]
enc.fit(X)
assert all([enc.categories_[i].dtype == 'object' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
def test_encoder_dtypes_pandas():
# check dtype (similar to test_categorical_encoder_dtypes for dataframes)
pd = pytest.importorskip('pandas')
enc = OneHotEncoder(categories='auto')
exp = np.array([[1., 0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0., 1.]], dtype='float64')
X = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}, dtype='int64')
enc.fit(X)
assert all([enc.categories_[i].dtype == 'int64' for i in range(2)])
assert_array_equal(enc.transform(X).toarray(), exp)
X = pd.DataFrame({'A': [1, 2], 'B': ['a', 'b'], 'C': [3., 4.]})
X_type = [X['A'].dtype, X['B'].dtype, X['C'].dtype]
enc.fit(X)
assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
assert_array_equal(enc.transform(X).toarray(), exp)
def test_one_hot_encoder_warning():
enc = OneHotEncoder()
X = [['Male', 1], ['Female', 3]]
np.testing.assert_no_warnings(enc.fit_transform, X)
def test_one_hot_encoder_drop_manual():
cats_to_drop = ['def', 12, 3, 56]
enc = OneHotEncoder(drop=cats_to_drop)
X = [['abc', 12, 2, 55],
['def', 12, 1, 55],
['def', 12, 3, 56]]
trans = enc.fit_transform(X).toarray()
exp = [[1, 0, 1, 1],
[0, 1, 0, 1],
[0, 0, 0, 0]]
assert_array_equal(trans, exp)
dropped_cats = [cat[feature]
for cat, feature in zip(enc.categories_,
enc.drop_idx_)]
assert_array_equal(dropped_cats, cats_to_drop)
assert_array_equal(np.array(X, dtype=object),
enc.inverse_transform(trans))
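# A minimal sketch, not an upstream test: drop_idx_ stores, per feature, the
# position of the dropped category inside categories_, so the dropped label
# itself stays recoverable after fitting.
def _sketch_drop_idx_lookup():
    ohe = OneHotEncoder(drop=['b']).fit([['a'], ['b']])
    assert ohe.categories_[0][ohe.drop_idx_[0]] == 'b'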
@pytest.mark.parametrize(
"X_fit, params, err_msg",
[([["Male"], ["Female"]], {'drop': 'second'},
"Wrong input for parameter `drop`"),
([["Male"], ["Female"]], {'drop': 'first', 'handle_unknown': 'ignore'},
"`handle_unknown` must be 'error'"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': np.asarray('b', dtype=object)},
"Wrong input for parameter `drop`"),
([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]],
{'drop': ['ghi', 3, 59]},
"The following categories were supposed")]
)
def test_one_hot_encoder_invalid_params(X_fit, params, err_msg):
enc = OneHotEncoder(**params)
with pytest.raises(ValueError, match=err_msg):
enc.fit(X_fit)
@pytest.mark.parametrize('drop', [['abc', 3], ['abc', 3, 41, 'a']])
def test_invalid_drop_length(drop):
enc = OneHotEncoder(drop=drop)
err_msg = "`drop` should have length equal to the number"
with pytest.raises(ValueError, match=err_msg):
enc.fit([['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])
@pytest.mark.parametrize("density", [True, False],
ids=['sparse', 'dense'])
@pytest.mark.parametrize("drop", ['first',
['a', 2, 'b']],
ids=['first', 'manual'])
def test_categories(density, drop):
ohe_base = OneHotEncoder(sparse=density)
ohe_test = OneHotEncoder(sparse=density, drop=drop)
X = [['c', 1, 'a'],
['a', 2, 'b']]
ohe_base.fit(X)
ohe_test.fit(X)
assert_array_equal(ohe_base.categories_, ohe_test.categories_)
if drop == 'first':
assert_array_equal(ohe_test.drop_idx_, 0)
else:
for drop_cat, drop_idx, cat_list in zip(drop,
ohe_test.drop_idx_,
ohe_test.categories_):
assert cat_list[int(drop_idx)] == drop_cat
assert isinstance(ohe_test.drop_idx_, np.ndarray)
assert ohe_test.drop_idx_.dtype == np.object
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
assert 'categorical' in Encoder()._get_tags()['X_types']
@pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
def test_encoders_does_not_support_none_values(Encoder):
values = [["a"], [None]]
with pytest.raises(TypeError, match="Encoders require their input to be "
"uniformly strings or numbers."):
Encoder().fit(values)

View file

@ -0,0 +1,160 @@
import pytest
import numpy as np
from scipy import sparse
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils._testing import (assert_array_equal,
assert_allclose_dense_sparse)
from sklearn.utils._testing import assert_warns_message, assert_no_warnings
def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
def _func(X, *args, **kwargs):
args_store.append(X)
args_store.extend(args)
kwargs_store.update(kwargs)
return func(X)
return _func
def test_delegate_to_func():
# (args|kwargs)_store will hold the positional and keyword arguments
# passed to the function inside the FunctionTransformer.
args_store = []
kwargs_store = {}
X = np.arange(10).reshape((5, 2))
assert_array_equal(
FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
X, 'transform should have returned X unchanged',
)
# The function should only have received X.
assert args_store == [X], ('Incorrect positional arguments passed to '
'func: {args}'.format(args=args_store))
assert not kwargs_store, ('Unexpected keyword arguments passed to '
'func: {args}'.format(args=kwargs_store))
# reset the argument stores.
args_store[:] = []
kwargs_store.clear()
transformed = FunctionTransformer(
_make_func(args_store, kwargs_store),
).transform(X)
assert_array_equal(transformed, X,
err_msg='transform should have returned X unchanged')
# The function should have received X
assert args_store == [X], ('Incorrect positional arguments passed '
'to func: {args}'.format(args=args_store))
assert not kwargs_store, ('Unexpected keyword arguments passed to '
'func: {args}'.format(args=kwargs_store))
def test_np_log():
X = np.arange(10).reshape((5, 2))
# Test that the numpy.log1p example still works.
assert_array_equal(
FunctionTransformer(np.log1p).transform(X),
np.log1p(X),
)
def test_kw_arg():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
# Test that rounding is correct
assert_array_equal(F.transform(X),
np.around(X, decimals=3))
def test_kw_arg_update():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args['decimals'] = 1
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_kw_arg_reset():
X = np.linspace(0, 1, num=10).reshape((5, 2))
F = FunctionTransformer(np.around, kw_args=dict(decimals=3))
F.kw_args = dict(decimals=1)
# Test that rounding is correct
assert_array_equal(F.transform(X), np.around(X, decimals=1))
def test_inverse_transform():
X = np.array([1, 4, 9, 16]).reshape((2, 2))
# Test that inverse_transform works correctly
F = FunctionTransformer(
func=np.sqrt,
inverse_func=np.around, inv_kw_args=dict(decimals=3),
)
assert_array_equal(
F.inverse_transform(F.transform(X)),
np.around(np.sqrt(X), decimals=3),
)
def test_check_inverse():
X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
X_list = [X_dense,
sparse.csr_matrix(X_dense),
sparse.csc_matrix(X_dense)]
for X in X_list:
if sparse.issparse(X):
accept_sparse = True
else:
accept_sparse = False
trans = FunctionTransformer(func=np.sqrt,
inverse_func=np.around,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True)
assert_warns_message(UserWarning,
"The provided functions are not strictly"
" inverse of each other. If you are sure you"
" want to proceed regardless, set"
" 'check_inverse=False'.",
trans.fit, X)
trans = FunctionTransformer(func=np.expm1,
inverse_func=np.log1p,
accept_sparse=accept_sparse,
check_inverse=True,
validate=True)
Xt = assert_no_warnings(trans.fit_transform, X)
assert_allclose_dense_sparse(X, trans.inverse_transform(Xt))
# check that the inverse check is skipped when either func or inverse_func
# is not provided.
trans = FunctionTransformer(func=np.expm1, inverse_func=None,
check_inverse=True, validate=True)
assert_no_warnings(trans.fit, X_dense)
trans = FunctionTransformer(func=None, inverse_func=np.expm1,
check_inverse=True, validate=True)
assert_no_warnings(trans.fit, X_dense)
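# A minimal sketch, not an upstream test: an exact func/inverse_func pair such
# as log1p/expm1 passes the (subsampled) check_inverse consistency probe with
# no warning and round-trips the data up to float tolerance.
def _sketch_exact_inverse_pair():
    X = np.abs(np.random.RandomState(0).randn(10, 2))
    ft = FunctionTransformer(func=np.log1p, inverse_func=np.expm1,
                             check_inverse=True, validate=True)
    Xt = assert_no_warnings(ft.fit_transform, X)
    assert_allclose_dense_sparse(X, ft.inverse_transform(Xt))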
def test_function_transformer_frame():
pd = pytest.importorskip('pandas')
X_df = pd.DataFrame(np.random.randn(100, 10))
transformer = FunctionTransformer()
X_df_trans = transformer.fit_transform(X_df)
assert hasattr(X_df_trans, 'loc')

View file

@ -0,0 +1,656 @@
import numpy as np
import pytest
from scipy.sparse import issparse
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
from sklearn.utils.multiclass import type_of_target
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings
from sklearn.utils import _to_object_array
from sklearn.preprocessing._label import LabelBinarizer
from sklearn.preprocessing._label import MultiLabelBinarizer
from sklearn.preprocessing._label import LabelEncoder
from sklearn.preprocessing._label import label_binarize
from sklearn.preprocessing._label import _inverse_binarize_thresholding
from sklearn.preprocessing._label import _inverse_binarize_multiclass
from sklearn.preprocessing._label import _encode
from sklearn import datasets
iris = datasets.load_iris()
def toarray(a):
if hasattr(a, "toarray"):
a = a.toarray()
return a
def test_label_binarizer():
# one-class case defaults to negative label
# For dense case:
inp = ["pos", "pos", "pos", "pos"]
lb = LabelBinarizer(sparse_output=False)
expected = np.array([[0, 0, 0, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
# For sparse case:
lb = LabelBinarizer(sparse_output=True)
got = lb.fit_transform(inp)
assert issparse(got)
assert_array_equal(lb.classes_, ["pos"])
assert_array_equal(expected, got.toarray())
assert_array_equal(lb.inverse_transform(got.toarray()), inp)
lb = LabelBinarizer(sparse_output=False)
# two-class case
inp = ["neg", "pos", "pos", "neg"]
expected = np.array([[0, 1, 1, 0]]).T
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ["neg", "pos"])
assert_array_equal(expected, got)
to_invert = np.array([[1, 0],
[0, 1],
[0, 1],
[1, 0]])
assert_array_equal(lb.inverse_transform(to_invert), inp)
# multi-class case
inp = ["spam", "ham", "eggs", "ham", "0"]
expected = np.array([[0, 0, 0, 1],
[0, 0, 1, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[1, 0, 0, 0]])
got = lb.fit_transform(inp)
assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
def test_label_binarizer_unseen_labels():
lb = LabelBinarizer()
expected = np.array([[1, 0, 0],
[0, 1, 0],
[0, 0, 1]])
got = lb.fit_transform(['b', 'd', 'e'])
assert_array_equal(expected, got)
expected = np.array([[0, 0, 0],
[1, 0, 0],
[0, 0, 0],
[0, 1, 0],
[0, 0, 1],
[0, 0, 0]])
got = lb.transform(['a', 'b', 'c', 'd', 'e', 'f'])
assert_array_equal(expected, got)
def test_label_binarizer_set_label_encoding():
lb = LabelBinarizer(neg_label=-2, pos_label=0)
# two-class case with pos_label=0
inp = np.array([0, 1, 1, 0])
expected = np.array([[-2, 0, 0, -2]]).T
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
lb = LabelBinarizer(neg_label=-2, pos_label=2)
# multi-class case
inp = np.array([3, 2, 1, 2, 0])
expected = np.array([[-2, -2, -2, +2],
[-2, -2, +2, -2],
[-2, +2, -2, -2],
[-2, -2, +2, -2],
[+2, -2, -2, -2]])
got = lb.fit_transform(inp)
assert_array_equal(expected, got)
assert_array_equal(lb.inverse_transform(got), inp)
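# A minimal sketch, not an upstream test: neg_label/pos_label only rescale the
# 0/1 indicator matrix, so inverse_transform still recovers the original
# labels unchanged.
def _sketch_neg_pos_rescaling():
    lb = LabelBinarizer(neg_label=-1, pos_label=1)
    got = lb.fit_transform([0, 1, 2])
    assert_array_equal(got, [[1, -1, -1], [-1, 1, -1], [-1, -1, 1]])
    assert_array_equal(lb.inverse_transform(got), [0, 1, 2])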
@ignore_warnings
def test_label_binarizer_errors():
# Check that invalid arguments yield ValueError
one_class = np.array([0, 0, 0, 0])
lb = LabelBinarizer().fit(one_class)
multi_label = [(2, 3), (0,), (0, 2)]
with pytest.raises(ValueError):
lb.transform(multi_label)
lb = LabelBinarizer()
with pytest.raises(ValueError):
lb.transform([])
with pytest.raises(ValueError):
lb.inverse_transform([])
with pytest.raises(ValueError):
LabelBinarizer(neg_label=2, pos_label=1)
with pytest.raises(ValueError):
LabelBinarizer(neg_label=2, pos_label=2)
with pytest.raises(ValueError):
LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
# Fail on y_type
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo", classes=[1, 2],
threshold=0)
# Sequence of seq type should raise ValueError
y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
with pytest.raises(ValueError):
LabelBinarizer().fit_transform(y_seq_of_seqs)
# Fail on the number of classes
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
output_type="foo",
classes=[1, 2, 3],
threshold=0)
# Fail on the dimension of 'binary'
with pytest.raises(ValueError):
_inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
output_type="binary",
classes=[1, 2, 3],
threshold=0)
# Fail on multioutput data
with pytest.raises(ValueError):
LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
with pytest.raises(ValueError):
label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize(
"values, classes, unknown",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64'), np.array([4], dtype='int64')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object),
np.array(['d'], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']), np.array(['d']))],
ids=['int64', 'object', 'str'])
def test_label_encoder(values, classes, unknown):
# Test LabelEncoder's transform, fit_transform and
# inverse_transform methods
le = LabelEncoder()
le.fit(values)
assert_array_equal(le.classes_, classes)
assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
le = LabelEncoder()
ret = le.fit_transform(values)
assert_array_equal(ret, [1, 0, 2, 0, 2])
with pytest.raises(ValueError, match="unseen labels"):
le.transform(unknown)
def test_label_encoder_negative_ints():
le = LabelEncoder()
le.fit([1, 1, 4, 5, -1, 0])
assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
[1, 2, 3, 3, 4, 0, 0])
assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
[0, 1, 4, 4, 5, -1, -1])
with pytest.raises(ValueError):
le.transform([0, 6])
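# A minimal sketch, not an upstream test: classes_ is the sorted unique set of
# the fitted labels, and transform is a positional lookup into it.
def _sketch_label_encoder_lookup():
    le = LabelEncoder().fit(['paris', 'tokyo', 'paris'])
    assert_array_equal(le.classes_, ['paris', 'tokyo'])
    assert_array_equal(le.transform(['tokyo', 'paris']), [1, 0])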
@pytest.mark.parametrize("dtype", ['str', 'object'])
def test_label_encoder_str_bad_shape(dtype):
le = LabelEncoder()
le.fit(np.array(["apple", "orange"], dtype=dtype))
msg = "should be a 1d array"
with pytest.raises(ValueError, match=msg):
le.transform("apple")
def test_label_encoder_errors():
# Check that invalid arguments yield ValueError
le = LabelEncoder()
with pytest.raises(ValueError):
le.transform([])
with pytest.raises(ValueError):
le.inverse_transform([])
# Fail on unseen labels
le = LabelEncoder()
le.fit([1, 2, 3, -1, 1])
msg = "contains previously unseen labels"
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2])
with pytest.raises(ValueError, match=msg):
le.inverse_transform([-2, -3, -4])
# Fail on inverse_transform("")
msg = r"should be a 1d array.+shape \(\)"
with pytest.raises(ValueError, match=msg):
le.inverse_transform("")
@pytest.mark.parametrize(
"values",
[np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['b', 'a', 'c', 'a', 'c'])],
ids=['int64', 'object', 'str'])
def test_label_encoder_empty_array(values):
le = LabelEncoder()
le.fit(values)
# test empty transform
transformed = le.transform([])
assert_array_equal(np.array([]), transformed)
# test empty inverse transform
inverse_transformed = le.inverse_transform([])
assert_array_equal(np.array([]), inverse_transformed)
def test_sparse_output_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
inverse = inputs[0]()
for sparse_output in [True, False]:
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit_transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer(sparse_output=sparse_output)
got = mlb.fit(inp()).transform(inp())
assert issparse(got) == sparse_output
if sparse_output:
# verify CSR assumption that indices and indptr have same dtype
assert got.indices.dtype == got.indptr.dtype
got = got.toarray()
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
with pytest.raises(ValueError):
mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1],
[2, 0, 0],
[1, 1, 0]])))
def test_multilabel_binarizer():
# test input as iterable of iterables
inputs = [
lambda: [(2, 3), (1,), (1, 2)],
lambda: ({2, 3}, {1}, {1, 2}),
lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
inverse = inputs[0]()
for inp in inputs:
# With fit_transform
mlb = MultiLabelBinarizer()
got = mlb.fit_transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
# With fit
mlb = MultiLabelBinarizer()
got = mlb.fit(inp()).transform(inp())
assert_array_equal(indicator_mat, got)
assert_array_equal([1, 2, 3], mlb.classes_)
assert mlb.inverse_transform(got) == inverse
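# A minimal sketch, not an upstream test: MultiLabelBinarizer treats each
# sample as a set of labels, so one row may switch on several columns at once.
def _sketch_multilabel_rows():
    mlb = MultiLabelBinarizer()
    got = mlb.fit_transform([{'news', 'sports'}, {'news'}])
    assert_array_equal(mlb.classes_, ['news', 'sports'])
    assert_array_equal(got, [[1, 1], [1, 0]])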
def test_multilabel_binarizer_empty_sample():
mlb = MultiLabelBinarizer()
y = [[1, 2], [1], []]
Y = np.array([[1, 1],
[1, 0],
[0, 0]])
assert_array_equal(mlb.fit_transform(y), Y)
def test_multilabel_binarizer_unknown_class():
mlb = MultiLabelBinarizer()
y = [[1, 2]]
Y = np.array([[1, 0], [0, 1]])
w = 'unknown class(es) [0, 4] will be ignored'
matrix = assert_warns_message(UserWarning, w,
mlb.fit(y).transform, [[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
Y = np.array([[1, 0, 0], [0, 1, 0]])
mlb = MultiLabelBinarizer(classes=[1, 2, 3])
matrix = assert_warns_message(UserWarning, w,
mlb.fit(y).transform, [[4, 1], [2, 0]])
assert_array_equal(matrix, Y)
def test_multilabel_binarizer_given_classes():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# fit().transform()
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, [1, 3, 2])
# ensure works with extra class
mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
assert_array_equal(mlb.fit_transform(inp),
np.hstack(([[0], [0], [0]], indicator_mat)))
assert_array_equal(mlb.classes_, [4, 1, 3, 2])
# ensure fit is no-op as iterable is not consumed
inp = iter(inp)
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
# ensure a ValueError is thrown if given duplicate classes
err_msg = "The classes argument contains duplicate classes. Remove " \
"these duplicates before passing them to MultiLabelBinarizer."
mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
with pytest.raises(ValueError, match=err_msg):
mlb.fit(inp)
def test_multilabel_binarizer_multiple_calls():
inp = [(2, 3), (1,), (1, 2)]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 0, 1]])
indicator_mat2 = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
# first call
mlb = MultiLabelBinarizer(classes=[1, 3, 2])
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
# second call change class
mlb.classes = [1, 2, 3]
assert_array_equal(mlb.fit_transform(inp), indicator_mat2)
def test_multilabel_binarizer_same_length_sequence():
# Ensure sequences of the same length are not interpreted as a 2-d array
inp = [[1], [0], [2]]
indicator_mat = np.array([[0, 1, 0],
[1, 0, 0],
[0, 0, 1]])
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
def test_multilabel_binarizer_non_integer_labels():
tuple_classes = _to_object_array([(1,), (2,), (3,)])
inputs = [
([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']),
([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']),
([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
]
indicator_mat = np.array([[0, 1, 1],
[1, 0, 0],
[1, 1, 0]])
for inp, classes in inputs:
# fit_transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
# fit().transform()
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
assert_array_equal(mlb.classes_, classes)
assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
mlb = MultiLabelBinarizer()
with pytest.raises(TypeError):
mlb.fit_transform([({}), ({}, {'a': 'b'})])
def test_multilabel_binarizer_non_unique():
inp = [(1, 1, 1, 0)]
indicator_mat = np.array([[1, 1]])
mlb = MultiLabelBinarizer()
assert_array_equal(mlb.fit_transform(inp), indicator_mat)
def test_multilabel_binarizer_inverse_validation():
inp = [(1, 1, 1, 0)]
mlb = MultiLabelBinarizer()
mlb.fit_transform(inp)
# Not binary
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 3]]))
# The following binary cases are fine, however
mlb.inverse_transform(np.array([[0, 0]]))
mlb.inverse_transform(np.array([[1, 1]]))
mlb.inverse_transform(np.array([[1, 0]]))
# Wrong shape
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1]]))
with pytest.raises(ValueError):
mlb.inverse_transform(np.array([[1, 1, 1]]))
def test_label_binarize_with_class_order():
out = label_binarize([1, 6], classes=[1, 2, 4, 6])
expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
assert_array_equal(out, expected)
# Modified class order
out = label_binarize([1, 6], classes=[1, 6, 4, 2])
expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
assert_array_equal(out, expected)
out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
expected = np.array([[0, 0, 1, 0],
[0, 0, 0, 1],
[0, 1, 0, 0],
[1, 0, 0, 0]])
assert_array_equal(out, expected)
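# A minimal sketch, not an upstream test: the column layout of label_binarize
# follows the classes argument verbatim, even for unsorted string classes.
def _sketch_label_binarize_columns():
    out = label_binarize(['b', 'a'], classes=['b', 'c', 'a'])
    assert_array_equal(out, [[1, 0, 0], [0, 0, 1]])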
def check_binarized_results(y, classes, pos_label, neg_label, expected):
for sparse_output in [True, False]:
if ((pos_label == 0 or neg_label != 0) and sparse_output):
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output)
continue
# check label_binarize
binarized = label_binarize(y, classes=classes, neg_label=neg_label,
pos_label=pos_label,
sparse_output=sparse_output)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
# check inverse
y_type = type_of_target(y)
if y_type == "multiclass":
inversed = _inverse_binarize_multiclass(binarized, classes=classes)
else:
inversed = _inverse_binarize_thresholding(binarized,
output_type=y_type,
classes=classes,
threshold=((neg_label +
pos_label) /
2.))
assert_array_equal(toarray(inversed), toarray(y))
# Check label binarizer
lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
sparse_output=sparse_output)
binarized = lb.fit_transform(y)
assert_array_equal(toarray(binarized), expected)
assert issparse(binarized) == sparse_output
inverse_output = lb.inverse_transform(binarized)
assert_array_equal(toarray(inverse_output), toarray(y))
assert issparse(inverse_output) == issparse(y)
def test_label_binarize_binary():
y = [0, 1, 0]
classes = [0, 1]
pos_label = 2
neg_label = -1
expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
# Binary case where sparse_output = True will not result in a ValueError
y = [0, 1, 0]
classes = [0, 1]
pos_label = 3
neg_label = 0
expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
check_binarized_results(y, classes, pos_label, neg_label, expected)
def test_label_binarize_multiclass():
y = [0, 1, 2]
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = 2 * np.eye(3)
check_binarized_results(y, classes, pos_label, neg_label, expected)
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
sparse_output=True)
def test_label_binarize_multilabel():
y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
classes = [0, 1, 2]
pos_label = 2
neg_label = 0
expected = pos_label * y_ind
y_sparse = [sparse_matrix(y_ind)
for sparse_matrix in [coo_matrix, csc_matrix, csr_matrix,
dok_matrix, lil_matrix]]
for y in [y_ind] + y_sparse:
check_binarized_results(y, classes, pos_label, neg_label,
expected)
with pytest.raises(ValueError):
label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label,
sparse_output=True)
def test_invalid_input_label_binarize():
with pytest.raises(ValueError):
label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
with pytest.raises(ValueError, match="continuous target data is not "):
label_binarize([1.2, 2.7], classes=[0, 1])
with pytest.raises(ValueError, match="mismatch with the labels"):
label_binarize([[1, 3]], classes=[1, 2, 3])
def test_inverse_binarize_multiclass():
got = _inverse_binarize_multiclass(csr_matrix([[0, 1, 0],
[-1, 0, -1],
[0, 0, 0]]),
np.arange(3))
assert_array_equal(got, np.array([1, 1, 0]))
@pytest.mark.parametrize(
"values, expected",
[(np.array([2, 1, 3, 1, 3], dtype='int64'),
np.array([1, 2, 3], dtype='int64')),
(np.array(['b', 'a', 'c', 'a', 'c'], dtype=object),
np.array(['a', 'b', 'c'], dtype=object)),
(np.array(['b', 'a', 'c', 'a', 'c']),
np.array(['a', 'b', 'c']))],
ids=['int64', 'object', 'str'])
def test_encode_util(values, expected):
uniques = _encode(values)
assert_array_equal(uniques, expected)
uniques, encoded = _encode(values, encode=True)
assert_array_equal(uniques, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
_, encoded = _encode(values, uniques, encode=True)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
def test_encode_check_unknown():
# test for the check_unknown parameter of _encode()
uniques = np.array([1, 2, 3])
values = np.array([1, 2, 3, 4])
# Default is True, raise error
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=True)
# don't raise an error if check_unknown=False
_encode(values, uniques, encode=True, check_unknown=False)
# parameter is ignored for object dtype
uniques = np.array(['a', 'b', 'c'], dtype=object)
values = np.array(['a', 'b', 'c', 'd'], dtype=object)
with pytest.raises(ValueError,
match='y contains previously unseen labels'):
_encode(values, uniques, encode=True, check_unknown=False)
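# A minimal sketch, not an upstream test: the public LabelEncoder exposes the
# same contract _encode enforces above -- unseen values at transform time
# raise instead of silently mapping to some code.
def _sketch_unseen_raises():
    le = LabelEncoder().fit([1, 2, 3])
    with pytest.raises(ValueError, match='unseen labels'):
        le.transform([4])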