Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/compose/_column_transformer.py
+++ b/venv/Lib/site-packages/sklearn/compose/_column_transformer.py
@@ -0,0 +1,862 @@
+"""
+The :mod:`sklearn.compose._column_transformer` module implements utilities
+to work with heterogeneous data and to apply different transformers to
+different columns.
+"""
+# Author: Andreas Mueller
+#         Joris Van den Bossche
+# License: BSD
+import warnings
+from itertools import chain
+
+import numbers
+import numpy as np
+from scipy import sparse
+from joblib import Parallel, delayed
+
+from ..base import clone, TransformerMixin
+from ..utils._estimator_html_repr import _VisualBlock
+from ..pipeline import _fit_transform_one, _transform_one, _name_estimators
+from ..preprocessing import FunctionTransformer
+from ..utils import Bunch
+from ..utils import _safe_indexing
+from ..utils import _get_column_indices
+from ..utils import _determine_key_type
+from ..utils.metaestimators import _BaseComposition
+from ..utils.validation import check_array, check_is_fitted
+from ..utils.validation import _deprecate_positional_args
+
+
+__all__ = [
+    'ColumnTransformer', 'make_column_transformer', 'make_column_selector'
+]
+
+
+# Friendlier replacement for the "Expected 2D array, got 1D array instead"
+# ValueError: ColumnTransformer._fit_transform raises a ValueError with this
+# text when that message surfaces while running the transformers.
+_ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. "
+                     "Try to specify the column selection as a list of one "
+                     "item instead of a scalar.")
+
+
+class ColumnTransformer(TransformerMixin, _BaseComposition):
+    """Applies transformers to columns of an array or pandas DataFrame.
+
+    This estimator allows different columns or column subsets of the input
+    to be transformed separately and the features generated by each transformer
+    will be concatenated to form a single feature space.
+    This is useful for heterogeneous or columnar data, to combine several
+    feature extraction mechanisms or transformations into a single transformer.
+
+    Read more in the :ref:`User Guide <column_transformer>`.
+
+    .. versionadded:: 0.20
+
+    Parameters
+    ----------
+    transformers : list of tuples
+        List of (name, transformer, columns) tuples specifying the
+        transformer objects to be applied to subsets of the data.
+
+        name : str
+            Like in Pipeline and FeatureUnion, this allows the transformer and
+            its parameters to be set using ``set_params`` and searched in grid
+            search.
+        transformer : {'drop', 'passthrough'} or estimator
+            Estimator must support :term:`fit` and :term:`transform`.
+            Special-cased strings 'drop' and 'passthrough' are accepted as
+            well, to indicate to drop the columns or to pass them through
+            untransformed, respectively.
+        columns :  str, array-like of str, int, array-like of int, \
+                array-like of bool, slice or callable
+            Indexes the data on its second axis. Integers are interpreted as
+            positional columns, while strings can reference DataFrame columns
+            by name.  A scalar string or int should be used where
+            ``transformer`` expects X to be a 1d array-like (vector),
+            otherwise a 2d array will be passed to the transformer.
+            A callable is passed the input data `X` and can return any of the
+            above. To select multiple columns by name or dtype, you can use
+            :obj:`make_column_selector`.
+
+    remainder : {'drop', 'passthrough'} or estimator, default='drop'
+        By default, only the specified columns in `transformers` are
+        transformed and combined in the output, and the non-specified
+        columns are dropped. (default of ``'drop'``).
+        By specifying ``remainder='passthrough'``, all remaining columns that
+        were not specified in `transformers` will be automatically passed
+        through. This subset of columns is concatenated with the output of
+        the transformers.
+        By setting ``remainder`` to be an estimator, the remaining
+        non-specified columns will use the ``remainder`` estimator. The
+        estimator must support :term:`fit` and :term:`transform`.
+        Note that using this feature requires that the DataFrame columns
+        input at :term:`fit` and :term:`transform` have identical order.
+
+    sparse_threshold : float, default=0.3
+        If the output of the different transformers contains sparse matrices,
+        these will be stacked as a sparse matrix if the overall density is
+        lower than this value. Use ``sparse_threshold=0`` to always return
+        dense.  When the transformed output consists of all dense data, the
+        stacked result will be dense, and this keyword will be ignored.
+
+    n_jobs : int, default=None
+        Number of jobs to run in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    transformer_weights : dict, default=None
+        Multiplicative weights for features per transformer. The output of the
+        transformer is multiplied by these weights. Keys are transformer names,
+        values the weights.
+
+    verbose : bool, default=False
+        If True, the time elapsed while fitting each transformer will be
+        printed as it is completed.
+
+    Attributes
+    ----------
+    transformers_ : list
+        The collection of fitted transformers as tuples of
+        (name, fitted_transformer, column). `fitted_transformer` can be an
+        estimator, 'drop', or 'passthrough'. In case there were no columns
+        selected, this will be the unfitted transformer.
+        If there are remaining columns, the final element is a tuple of the
+        form:
+        ('remainder', transformer, remaining_columns) corresponding to the
+        ``remainder`` parameter. If there are remaining columns, then
+        ``len(transformers_)==len(transformers)+1``, otherwise
+        ``len(transformers_)==len(transformers)``.
+
+    named_transformers_ : :class:`~sklearn.utils.Bunch`
+        Read-only attribute to access any transformer by given name.
+        Keys are transformer names and values are the fitted transformer
+        objects.
+
+    sparse_output_ : bool
+        Boolean flag indicating whether the output of ``transform`` is a
+        sparse matrix or a dense numpy array, which depends on the output
+        of the individual transformers and the `sparse_threshold` keyword.
+
+    Notes
+    -----
+    The order of the columns in the transformed feature matrix follows the
+    order of how the columns are specified in the `transformers` list.
+    Columns of the original feature matrix that are not specified are
+    dropped from the resulting transformed feature matrix, unless specified
+    in the `passthrough` keyword. Those columns specified with `passthrough`
+    are added at the right to the output of the transformers.
+
+    See also
+    --------
+    sklearn.compose.make_column_transformer : convenience function for
+        combining the outputs of multiple transformer objects applied to
+        column subsets of the original feature space.
+    sklearn.compose.make_column_selector : convenience function for selecting
+        columns based on datatype or the columns name with a regex pattern.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.compose import ColumnTransformer
+    >>> from sklearn.preprocessing import Normalizer
+    >>> ct = ColumnTransformer(
+    ...     [("norm1", Normalizer(norm='l1'), [0, 1]),
+    ...      ("norm2", Normalizer(norm='l1'), slice(2, 4))])
+    >>> X = np.array([[0., 1., 2., 2.],
+    ...               [1., 1., 0., 1.]])
+    >>> # Normalizer scales each row of X to unit norm. A separate scaling
+    >>> # is applied for the two first and two last elements of each
+    >>> # row independently.
+    >>> ct.fit_transform(X)
+    array([[0. , 1. , 0.5, 0.5],
+           [0.5, 0.5, 0. , 1. ]])
+
+    """
+    _required_parameters = ['transformers']
+
+    @_deprecate_positional_args
+    def __init__(self,
+                 transformers, *,
+                 remainder='drop',
+                 sparse_threshold=0.3,
+                 n_jobs=None,
+                 transformer_weights=None,
+                 verbose=False):
+        # Every argument is stored unmodified; nothing is validated here.
+        # The checks run from ``fit_transform`` through the ``_validate_*``
+        # helpers.
+        self.transformers = transformers
+        self.remainder = remainder
+        self.sparse_threshold = sparse_threshold
+        self.n_jobs = n_jobs
+        self.transformer_weights = transformer_weights
+        self.verbose = verbose
+
+    @property
+    def _transformers(self):
+        """
+        Internal list of transformer only containing the name and
+        transformers, dropping the columns. This is for the implementation
+        of get_params via BaseComposition._get_params which expects lists
+        of tuples of len 2.
+        """
+        return [(name, trans) for name, trans, _ in self.transformers]
+
+    @_transformers.setter
+    def _transformers(self, value):
+        self.transformers = [
+            (name, trans, col) for ((name, trans), (_, _, col))
+            in zip(value, self.transformers)]
+
+    def get_params(self, deep=True):
+        """Get parameters for this estimator.
+
+        Parameters
+        ----------
+        deep : bool, default=True
+            If True, will return the parameters for this estimator and
+            contained subobjects that are estimators.
+
+        Returns
+        -------
+        params : dict
+            Parameter names mapped to their values.
+        """
+        # Delegates to the composition helper, pointing it at the
+        # two-element ``_transformers`` view rather than ``transformers``.
+        return self._get_params('_transformers', deep=deep)
+
+    def set_params(self, **kwargs):
+        """Set the parameters of this estimator.
+
+        Valid parameter keys can be listed with ``get_params()``.
+
+        Parameters
+        ----------
+        **kwargs : dict
+            Parameter names mapped to the values to set; forwarded
+            unchanged to ``self._set_params``.
+
+        Returns
+        -------
+        self
+        """
+        # Delegates to the composition helper, pointing it at the
+        # two-element ``_transformers`` view rather than ``transformers``.
+        self._set_params('_transformers', **kwargs)
+        return self
+
+    def _iter(self, fitted=False, replace_strings=False):
+        """
+        Generate (name, trans, column, weight) tuples.
+
+        If fitted=True, use the fitted transformers, else use the
+        user specified transformers updated with converted column names
+        and potentially appended with transformer for remainder.
+
+        """
+        if fitted:
+            transformers = self.transformers_
+        else:
+            # interleave the validated column specifiers
+            transformers = [
+                (name, trans, column) for (name, trans, _), column
+                in zip(self.transformers, self._columns)
+            ]
+            # add transformer tuple for remainder
+            if self._remainder[2] is not None:
+                transformers = chain(transformers, [self._remainder])
+        get_weight = (self.transformer_weights or {}).get
+
+        for name, trans, column in transformers:
+            if replace_strings:
+                # replace 'passthrough' with identity transformer and
+                # skip in case of 'drop'
+                if trans == 'passthrough':
+                    trans = FunctionTransformer(
+                        accept_sparse=True, check_inverse=False
+                    )
+                elif trans == 'drop':
+                    continue
+                elif _is_empty_column_selection(column):
+                    continue
+
+            yield (name, trans, column, get_weight(name))
+
+    def _validate_transformers(self):
+        if not self.transformers:
+            return
+
+        names, transformers, _ = zip(*self.transformers)
+
+        # validate names
+        self._validate_names(names)
+
+        # validate estimators
+        for t in transformers:
+            if t in ('drop', 'passthrough'):
+                continue
+            if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
+                    hasattr(t, "transform")):
+                raise TypeError("All estimators should implement fit and "
+                                "transform, or can be 'drop' or 'passthrough' "
+                                "specifiers. '%s' (type %s) doesn't." %
+                                (t, type(t)))
+
+    def _validate_column_callables(self, X):
+        """
+        Converts callable column specifications.
+        """
+        columns = []
+        for _, _, column in self.transformers:
+            if callable(column):
+                column = column(X)
+            columns.append(column)
+        self._columns = columns
+
+    def _validate_remainder(self, X):
+        """
+        Validates ``remainder`` and defines ``_remainder`` targeting
+        the remaining columns.
+        """
+        is_transformer = ((hasattr(self.remainder, "fit")
+                           or hasattr(self.remainder, "fit_transform"))
+                          and hasattr(self.remainder, "transform"))
+        if (self.remainder not in ('drop', 'passthrough')
+                and not is_transformer):
+            raise ValueError(
+                "The remainder keyword needs to be one of 'drop', "
+                "'passthrough', or estimator. '%s' was passed instead" %
+                self.remainder)
+
+        # Make it possible to check for reordered named columns on transform
+        self._has_str_cols = any(_determine_key_type(cols) == 'str'
+                                 for cols in self._columns)
+        if hasattr(X, 'columns'):
+            self._df_columns = X.columns
+
+        self._n_features = X.shape[1]
+        cols = []
+        for columns in self._columns:
+            cols.extend(_get_column_indices(X, columns))
+
+        remaining_idx = sorted(set(range(self._n_features)) - set(cols))
+        self._remainder = ('remainder', self.remainder, remaining_idx or None)
+
+    @property
+    def named_transformers_(self):
+        """Access the fitted transformer by name.
+
+        Read-only attribute to access any transformer by given name.
+        Keys are transformer names and values are the fitted transformer
+        objects.
+
+        """
+        # Use Bunch object to improve autocomplete
+        return Bunch(**{name: trans for name, trans, _
+                        in self.transformers_})
+
+    def get_feature_names(self):
+        """Get feature names from all transformers.
+
+        Returns
+        -------
+        feature_names : list of strings
+            Names of the features produced by transform.
+        """
+        check_is_fitted(self)
+        feature_names = []
+        for name, trans, column, _ in self._iter(fitted=True):
+            if trans == 'drop' or (
+                    hasattr(column, '__len__') and not len(column)):
+                continue
+            if trans == 'passthrough':
+                if hasattr(self, '_df_columns'):
+                    if ((not isinstance(column, slice))
+                            and all(isinstance(col, str) for col in column)):
+                        feature_names.extend(column)
+                    else:
+                        feature_names.extend(self._df_columns[column])
+                else:
+                    indices = np.arange(self._n_features)
+                    feature_names.extend(['x%d' % i for i in indices[column]])
+                continue
+            if not hasattr(trans, 'get_feature_names'):
+                raise AttributeError("Transformer %s (type %s) does not "
+                                     "provide get_feature_names."
+                                     % (str(name), type(trans).__name__))
+            feature_names.extend([name + "__" + f for f in
+                                  trans.get_feature_names()])
+        return feature_names
+
+    def _update_fitted_transformers(self, transformers):
+        """
+        Build ``self.transformers_`` from the fitted transformers.
+
+        ``transformers`` is consumed in the order produced by
+        ``self._iter()``: one item per 'passthrough' entry (discarded, the
+        string is stored instead) and one per estimator with a non-empty
+        column selection. 'drop' entries and estimators with an empty
+        selection consume nothing and are stored unchanged.
+        """
+        # transformers are fitted; excludes 'drop' cases
+        fitted_transformers = iter(transformers)
+        transformers_ = []
+
+        for name, old, column, _ in self._iter():
+            if old == 'drop':
+                trans = 'drop'
+            elif old == 'passthrough':
+                # FunctionTransformer is present in list of transformers,
+                # so get next transformer, but save original string
+                next(fitted_transformers)
+                trans = 'passthrough'
+            elif _is_empty_column_selection(column):
+                # nothing was fitted for this entry, keep the unfitted object
+                trans = old
+            else:
+                trans = next(fitted_transformers)
+            transformers_.append((name, trans, column))
+
+        # sanity check that transformers is exhausted
+        assert not list(fitted_transformers)
+        self.transformers_ = transformers_
+
+    def _validate_output(self, result):
+        """
+        Ensure that the output of each transformer is 2D. Otherwise
+        hstack can raise an error or produce incorrect results.
+        """
+        names = [name for name, _, _, _ in self._iter(fitted=True,
+                                                      replace_strings=True)]
+        for Xs, name in zip(result, names):
+            if not getattr(Xs, 'ndim', 0) == 2:
+                raise ValueError(
+                    "The output of the '{0}' transformer should be 2D (scipy "
+                    "matrix, array, or pandas DataFrame).".format(name))
+
+    def _validate_features(self, n_features, feature_names):
+        """Ensures feature counts and names are the same during fit and
+        transform.
+
+        TODO: It should raise an error from v0.24
+        """
+
+        if ((self._feature_names_in is None or feature_names is None)
+                and self._n_features == n_features):
+            return
+
+        neg_col_present = np.any([_is_negative_indexing(col)
+                                  for col in self._columns])
+        if neg_col_present and self._n_features != n_features:
+            raise RuntimeError("At least one negative column was used to "
+                               "indicate columns, and the new data's number "
+                               "of columns does not match the data given "
+                               "during fit. "
+                               "Please make sure the data during fit and "
+                               "transform have the same number of columns.")
+
+        if (self._n_features != n_features or
+                np.any(self._feature_names_in != np.asarray(feature_names))):
+            warnings.warn("Given feature/column names or counts do not match "
+                          "the ones for the data given during fit. This will "
+                          "fail from v0.24.",
+                          FutureWarning)
+
+    def _log_message(self, name, idx, total):
+        if not self.verbose:
+            return None
+        return '(%d of %d) Processing %s' % (idx, total, name)
+
+    def _fit_transform(self, X, y, func, fitted=False):
+        """
+        Private function to fit and/or transform on demand.
+
+        Return value (transformers and/or transformed X data) depends
+        on the passed function.
+        ``fitted=True`` ensures the fitted transformers are used.
+        """
+        transformers = list(
+            self._iter(fitted=fitted, replace_strings=True))
+        try:
+            return Parallel(n_jobs=self.n_jobs)(
+                delayed(func)(
+                    transformer=clone(trans) if not fitted else trans,
+                    X=_safe_indexing(X, column, axis=1),
+                    y=y,
+                    weight=weight,
+                    message_clsname='ColumnTransformer',
+                    message=self._log_message(name, idx, len(transformers)))
+                for idx, (name, trans, column, weight) in enumerate(
+                        self._iter(fitted=fitted, replace_strings=True), 1))
+        except ValueError as e:
+            if "Expected 2D array, got 1D array instead" in str(e):
+                raise ValueError(_ERR_MSG_1DCOLUMN)
+            else:
+                raise
+
+    def fit(self, X, y=None):
+        """Fit all transformers using X.
+
+        Parameters
+        ----------
+        X : {array-like, dataframe} of shape (n_samples, n_features)
+            Input data, of which specified subsets are used to fit the
+            transformers.
+
+        y : array-like of shape (n_samples,...), default=None
+            Targets for supervised learning.
+
+        Returns
+        -------
+        self : ColumnTransformer
+            This estimator
+
+        """
+        # we use fit_transform to make sure to set sparse_output_ (for which we
+        # need the transformed data) to have consistent output type in
+        # transform; the transformed data itself is discarded here
+        self.fit_transform(X, y=y)
+        return self
+
+    def fit_transform(self, X, y=None):
+        """Fit all transformers, transform the data and concatenate results.
+
+        Parameters
+        ----------
+        X : {array-like, dataframe} of shape (n_samples, n_features)
+            Input data, of which specified subsets are used to fit the
+            transformers.
+
+        y : array-like of shape (n_samples,), default=None
+            Targets for supervised learning.
+
+        Returns
+        -------
+        X_t : {array-like, sparse matrix} of \
+                shape (n_samples, sum_n_components)
+            hstack of results of transformers. sum_n_components is the
+            sum of n_components (output dimension) over transformers. If
+            any result is a sparse matrix, everything will be converted to
+            sparse matrices.
+
+        """
+        # TODO: this should be `feature_names_in_` when we start having it
+        if hasattr(X, "columns"):
+            self._feature_names_in = np.asarray(X.columns)
+        else:
+            self._feature_names_in = None
+        X = _check_X(X)
+        # set n_features_in_ attribute
+        self._check_n_features(X, reset=True)
+        self._validate_transformers()
+        self._validate_column_callables(X)
+        self._validate_remainder(X)
+
+        result = self._fit_transform(X, y, _fit_transform_one)
+
+        if not result:
+            self._update_fitted_transformers([])
+            # All transformers are None
+            return np.zeros((X.shape[0], 0))
+
+        Xs, transformers = zip(*result)
+
+        # determine if concatenated output will be sparse or not
+        if any(sparse.issparse(X) for X in Xs):
+            nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)
+            total = sum(X.shape[0] * X.shape[1] if sparse.issparse(X)
+                        else X.size for X in Xs)
+            density = nnz / total
+            self.sparse_output_ = density < self.sparse_threshold
+        else:
+            self.sparse_output_ = False
+
+        self._update_fitted_transformers(transformers)
+        self._validate_output(Xs)
+
+        return self._hstack(list(Xs))
+
+    def transform(self, X):
+        """Transform X separately by each transformer, concatenate results.
+
+        Parameters
+        ----------
+        X : {array-like, dataframe} of shape (n_samples, n_features)
+            The data to be transformed by subset.
+
+        Returns
+        -------
+        X_t : {array-like, sparse matrix} of \
+                shape (n_samples, sum_n_components)
+            hstack of results of transformers. sum_n_components is the
+            sum of n_components (output dimension) over transformers. If
+            any result is a sparse matrix, everything will be converted to
+            sparse matrices.
+
+        """
+        check_is_fitted(self)
+        X = _check_X(X)
+        if hasattr(X, "columns"):
+            X_feature_names = np.asarray(X.columns)
+        else:
+            X_feature_names = None
+
+        if self._n_features > X.shape[1]:
+            raise ValueError('Number of features of the input must be equal '
+                             'to or greater than that of the fitted '
+                             'transformer. Transformer n_features is {0} '
+                             'and input n_features is {1}.'
+                             .format(self._n_features, X.shape[1]))
+
+        # No column reordering allowed for named cols combined with remainder
+        # TODO: remove this mechanism in 0.24, once we enforce strict column
+        # name order and count. See #14237 for details.
+        if (self._remainder[2] is not None and
+                hasattr(self, '_df_columns') and
+                self._has_str_cols and
+                hasattr(X, 'columns')):
+            n_cols_fit = len(self._df_columns)
+            n_cols_transform = len(X.columns)
+            if (n_cols_transform >= n_cols_fit and
+                    any(X.columns[:n_cols_fit] != self._df_columns)):
+                raise ValueError('Column ordering must be equal for fit '
+                                 'and for transform when using the '
+                                 'remainder keyword')
+
+        # TODO: also call _check_n_features(reset=False) in 0.24
+        self._validate_features(X.shape[1], X_feature_names)
+        Xs = self._fit_transform(X, None, _transform_one, fitted=True)
+        self._validate_output(Xs)
+
+        if not Xs:
+            # All transformers are None
+            return np.zeros((X.shape[0], 0))
+
+        return self._hstack(list(Xs))
+
+    def _hstack(self, Xs):
+        """Stacks Xs horizontally.
+
+        This allows subclasses to control the stacking behavior, while reusing
+        everything else from ColumnTransformer.
+
+        Parameters
+        ----------
+        Xs : list of {array-like, sparse matrix, dataframe}
+        """
+        if self.sparse_output_:
+            try:
+                # since all columns should be numeric before stacking them
+                # in a sparse matrix, `check_array` is used for the
+                # dtype conversion if necessary.
+                converted_Xs = [check_array(X,
+                                            accept_sparse=True,
+                                            force_all_finite=False)
+                                for X in Xs]
+            except ValueError:
+                raise ValueError("For a sparse output, all columns should"
+                                 " be a numeric or convertible to a numeric.")
+
+            return sparse.hstack(converted_Xs).tocsr()
+        else:
+            Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
+            return np.hstack(Xs)
+
+    def _sk_visual_block_(self):
+        # Describes this estimator as a 'parallel' _VisualBlock: one entry
+        # per user-specified transformer, labelled with its name and with
+        # its column specifier as the name detail. Built from
+        # ``self.transformers``, so the remainder is not included.
+        names, transformers, name_details = zip(*self.transformers)
+        return _VisualBlock('parallel', transformers,
+                            names=names, name_details=name_details)
+
+
+def _check_X(X):
+    """Use check_array only on lists and other non-array-likes / sparse"""
+    if hasattr(X, '__array__') or sparse.issparse(X):
+        return X
+    return check_array(X, force_all_finite='allow-nan', dtype=np.object)
+
+
+def _is_empty_column_selection(column):
+    """
+    Return True if the column selection is empty (empty list or all-False
+    boolean array).
+
+    """
+    if hasattr(column, 'dtype') and np.issubdtype(column.dtype, np.bool_):
+        return not column.any()
+    elif hasattr(column, '__len__'):
+        return len(column) == 0
+    else:
+        return False
+
+
+def _get_transformer_list(estimators):
+    """
+    Construct (name, trans, column) tuples from list
+
+    """
+    transformers, columns = zip(*estimators)
+    names, _ = zip(*_name_estimators(transformers))
+
+    transformer_list = list(zip(names, transformers, columns))
+    return transformer_list
+
+
+def make_column_transformer(*transformers, **kwargs):
+    """Construct a ColumnTransformer from the given transformers.
+
+    This is a shorthand for the ColumnTransformer constructor; it does not
+    require, and does not permit, naming the transformers. Instead, they will
+    be given names automatically based on their types. It also does not allow
+    weighting with ``transformer_weights``.
+
+    Read more in the :ref:`User Guide <make_column_transformer>`.
+
+    Parameters
+    ----------
+    *transformers : tuples
+        Tuples of the form (transformer, columns) specifying the
+        transformer objects to be applied to subsets of the data.
+
+        transformer : {'drop', 'passthrough'} or estimator
+            Estimator must support :term:`fit` and :term:`transform`.
+            Special-cased strings 'drop' and 'passthrough' are accepted as
+            well, to indicate to drop the columns or to pass them through
+            untransformed, respectively.
+        columns : str,  array-like of str, int, array-like of int, slice, \
+                array-like of bool or callable
+            Indexes the data on its second axis. Integers are interpreted as
+            positional columns, while strings can reference DataFrame columns
+            by name. A scalar string or int should be used where
+            ``transformer`` expects X to be a 1d array-like (vector),
+            otherwise a 2d array will be passed to the transformer.
+            A callable is passed the input data `X` and can return any of the
+            above. To select multiple columns by name or dtype, you can use
+            :obj:`make_column_selector`.
+
+    remainder : {'drop', 'passthrough'} or estimator, default='drop'
+        By default, only the specified columns in `transformers` are
+        transformed and combined in the output, and the non-specified
+        columns are dropped. (default of ``'drop'``).
+        By specifying ``remainder='passthrough'``, all remaining columns that
+        were not specified in `transformers` will be automatically passed
+        through. This subset of columns is concatenated with the output of
+        the transformers.
+        By setting ``remainder`` to be an estimator, the remaining
+        non-specified columns will use the ``remainder`` estimator. The
+        estimator must support :term:`fit` and :term:`transform`.
+
+    sparse_threshold : float, default=0.3
+        If the transformed output consists of a mix of sparse and dense data,
+        it will be stacked as a sparse matrix if the density is lower than this
+        value. Use ``sparse_threshold=0`` to always return dense.
+        When the transformed output consists of all sparse or all dense data,
+        the stacked result will be sparse or dense, respectively, and this
+        keyword will be ignored.
+
+    n_jobs : int, default=None
+        Number of jobs to run in parallel.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    verbose : bool, default=False
+        If True, the time elapsed while fitting each transformer will be
+        printed as it is completed.
+
+    Returns
+    -------
+    ct : ColumnTransformer
+
+    See also
+    --------
+    sklearn.compose.ColumnTransformer : Class that allows combining the
+        outputs of multiple transformer objects used on column subsets
+        of the data into a single feature space.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
+    >>> from sklearn.compose import make_column_transformer
+    >>> make_column_transformer(
+    ...     (StandardScaler(), ['numerical_column']),
+    ...     (OneHotEncoder(), ['categorical_column']))
+    ColumnTransformer(transformers=[('standardscaler', StandardScaler(...),
+                                     ['numerical_column']),
+                                    ('onehotencoder', OneHotEncoder(...),
+                                     ['categorical_column'])])
+
+    """
+    # transformer_weights keyword is not passed through because the user
+    # would need to know the automatically generated names of the transformers
+    n_jobs = kwargs.pop('n_jobs', None)
+    remainder = kwargs.pop('remainder', 'drop')
+    sparse_threshold = kwargs.pop('sparse_threshold', 0.3)
+    verbose = kwargs.pop('verbose', False)
+    if kwargs:
+        raise TypeError('Unknown keyword arguments: "{}"'
+                        .format(list(kwargs.keys())[0]))
+    transformer_list = _get_transformer_list(transformers)
+    return ColumnTransformer(transformer_list, n_jobs=n_jobs,
+                             remainder=remainder,
+                             sparse_threshold=sparse_threshold,
+                             verbose=verbose)
+
+
+def _is_negative_indexing(key):
+    # TODO: remove in v0.24
+    def is_neg(x): return isinstance(x, numbers.Integral) and x < 0
+    if isinstance(key, slice):
+        return is_neg(key.start) or is_neg(key.stop)
+    elif _determine_key_type(key) == 'int':
+        return np.any(np.asarray(key) < 0)
+    return False
+
+
+class make_column_selector:
+    """Create a callable to select columns to be used with
+    :class:`ColumnTransformer`.
+
+    :func:`make_column_selector` can select columns based on datatype or the
+    columns name with a regex. When using multiple selection criteria, **all**
+    criteria must match for a column to be selected.
+
+    Parameters
+    ----------
+    pattern : str, default=None
+        Name of columns containing this regex pattern will be included. If
+        None, column selection will not be selected based on pattern.
+
+    dtype_include : column dtype or list of column dtypes, default=None
+        A selection of dtypes to include. For more details, see
+        :meth:`pandas.DataFrame.select_dtypes`.
+
+    dtype_exclude : column dtype or list of column dtypes, default=None
+        A selection of dtypes to exclude. For more details, see
+        :meth:`pandas.DataFrame.select_dtypes`.
+
+    Returns
+    -------
+    selector : callable
+        Callable for column selection to be used by a
+        :class:`ColumnTransformer`.
+
+    See also
+    --------
+    sklearn.compose.ColumnTransformer : Class that allows combining the
+        outputs of multiple transformer objects used on column subsets
+        of the data into a single feature space.
+
+    Examples
+    --------
+    >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder
+    >>> from sklearn.compose import make_column_transformer
+    >>> from sklearn.compose import make_column_selector
+    >>> import pandas as pd  # doctest: +SKIP
+    >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'],
+    ...                   'rating': [5, 3, 4, 5]})  # doctest: +SKIP
+    >>> ct = make_column_transformer(
+    ...       (StandardScaler(),
+    ...        make_column_selector(dtype_include=np.number)),  # rating
+    ...       (OneHotEncoder(),
+    ...        make_column_selector(dtype_include=object)))  # city
+    >>> ct.fit_transform(X)  # doctest: +SKIP
+    array([[ 0.90453403,  1.        ,  0.        ,  0.        ],
+           [-1.50755672,  1.        ,  0.        ,  0.        ],
+           [-0.30151134,  0.        ,  1.        ,  0.        ],
+           [ 0.90453403,  0.        ,  0.        ,  1.        ]])
+    """
+    @_deprecate_positional_args
+    def __init__(self, pattern=None, *, dtype_include=None,
+                 dtype_exclude=None):
+        self.pattern = pattern
+        self.dtype_include = dtype_include
+        self.dtype_exclude = dtype_exclude
+
+    def __call__(self, df):
+        if not hasattr(df, 'iloc'):
+            raise ValueError("make_column_selector can only be applied to "
+                             "pandas dataframes")
+        df_row = df.iloc[:1]
+        if self.dtype_include is not None or self.dtype_exclude is not None:
+            df_row = df_row.select_dtypes(include=self.dtype_include,
+                                          exclude=self.dtype_exclude)
+        cols = df_row.columns
+        if self.pattern is not None:
+            cols = cols[cols.str.contains(self.pattern, regex=True)]
+        return cols.tolist()