Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/feature_extraction/_hash.py
+++ b/venv/Lib/site-packages/sklearn/feature_extraction/_hash.py
@ -0,0 +1,173 @@
+# Author: Lars Buitinck
+# License: BSD 3 clause
+
+import numbers
+
+import numpy as np
+import scipy.sparse as sp
+
+from ..utils import IS_PYPY
+from ..utils.validation import _deprecate_positional_args
+from ..base import BaseEstimator, TransformerMixin
+
+if not IS_PYPY:
+    from ._hashing_fast import transform as _hashing_transform
+else:
+    def _hashing_transform(*args, **kwargs):
+        raise NotImplementedError(
+                'FeatureHasher is not compatible with PyPy (see '
+                'https://github.com/scikit-learn/scikit-learn/issues/11540 '
+                'for the status updates).')
+
+
+def _iteritems(d):
+    """Like d.iteritems, but accepts any collections.Mapping."""
+    return d.iteritems() if hasattr(d, "iteritems") else d.items()
+
+
+class FeatureHasher(TransformerMixin, BaseEstimator):
+    """Implements feature hashing, aka the hashing trick.
+
+    This class turns sequences of symbolic feature names (strings) into
+    scipy.sparse matrices, using a hash function to compute the matrix column
+    corresponding to a name. The hash function employed is the signed 32-bit
+    version of Murmurhash3.
+
+    Feature names of type byte string are used as-is. Unicode strings are
+    converted to UTF-8 first, but no Unicode normalization is done.
+    Feature values must be (finite) numbers.
+
+    This class is a low-memory alternative to DictVectorizer and
+    CountVectorizer, intended for large-scale (online) learning and situations
+    where memory is tight, e.g. when running prediction code on embedded
+    devices.
+
+    Read more in the :ref:`User Guide <feature_hashing>`.
+
+    .. versionadded:: 0.13
+
+    Parameters
+    ----------
+    n_features : int, default=2**20
+        The number of features (columns) in the output matrices. Small numbers
+        of features are likely to cause hash collisions, but large numbers
+        will cause larger coefficient dimensions in linear learners.
+    input_type : {"dict", "pair"}, default="dict"
+        Either "dict" (the default) to accept dictionaries over
+        (feature_name, value); "pair" to accept pairs of (feature_name, value);
+        or "string" to accept single strings.
+        feature_name should be a string, while value should be a number.
+        In the case of "string", a value of 1 is implied.
+        The feature_name is hashed to find the appropriate column for the
+        feature. The value's sign might be flipped in the output (but see
+        non_negative, below).
+    dtype : numpy dtype, default=np.float64
+        The type of feature values. Passed to scipy.sparse matrix constructors
+        as the dtype argument. Do not set this to bool, np.boolean or any
+        unsigned integer type.
+    alternate_sign : bool, default=True
+        When True, an alternating sign is added to the features as to
+        approximately conserve the inner product in the hashed space even for
+        small n_features. This approach is similar to sparse random projection.
+
+    .. versionchanged:: 0.19
+        ``alternate_sign`` replaces the now deprecated ``non_negative``
+        parameter.
+
+    Examples
+    --------
+    >>> from sklearn.feature_extraction import FeatureHasher
+    >>> h = FeatureHasher(n_features=10)
+    >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
+    >>> f = h.transform(D)
+    >>> f.toarray()
+    array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
+           [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])
+
+    See also
+    --------
+    DictVectorizer : vectorizes string-valued features using a hash table.
+    sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features.
+    """
+    @_deprecate_positional_args
+    def __init__(self, n_features=(2 ** 20), *, input_type="dict",
+                 dtype=np.float64, alternate_sign=True):
+        self._validate_params(n_features, input_type)
+
+        self.dtype = dtype
+        self.input_type = input_type
+        self.n_features = n_features
+        self.alternate_sign = alternate_sign
+
+    @staticmethod
+    def _validate_params(n_features, input_type):
+        # strangely, np.int16 instances are not instances of Integral,
+        # while np.int64 instances are...
+        if not isinstance(n_features, numbers.Integral):
+            raise TypeError("n_features must be integral, got %r (%s)."
+                            % (n_features, type(n_features)))
+        elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1:
+            raise ValueError("Invalid number of features (%d)." % n_features)
+
+        if input_type not in ("dict", "pair", "string"):
+            raise ValueError("input_type must be 'dict', 'pair' or 'string',"
+                             " got %r." % input_type)
+
+    def fit(self, X=None, y=None):
+        """No-op.
+
+        This method doesn't do anything. It exists purely for compatibility
+        with the scikit-learn transformer API.
+
+        Parameters
+        ----------
+        X : ndarray
+
+        Returns
+        -------
+        self : FeatureHasher
+
+        """
+        # repeat input validation for grid search (which calls set_params)
+        self._validate_params(self.n_features, self.input_type)
+        return self
+
+    def transform(self, raw_X):
+        """Transform a sequence of instances to a scipy.sparse matrix.
+
+        Parameters
+        ----------
+        raw_X : iterable over iterable over raw features, length = n_samples
+            Samples. Each sample must be iterable an (e.g., a list or tuple)
+            containing/generating feature names (and optionally values, see
+            the input_type constructor argument) which will be hashed.
+            raw_X need not support the len function, so it can be the result
+            of a generator; n_samples is determined on the fly.
+
+        Returns
+        -------
+        X : sparse matrix of shape (n_samples, n_features)
+            Feature matrix, for use with estimators or further transformers.
+
+        """
+        raw_X = iter(raw_X)
+        if self.input_type == "dict":
+            raw_X = (_iteritems(d) for d in raw_X)
+        elif self.input_type == "string":
+            raw_X = (((f, 1) for f in x) for x in raw_X)
+        indices, indptr, values = \
+            _hashing_transform(raw_X, self.n_features, self.dtype,
+                               self.alternate_sign, seed=0)
+        n_samples = indptr.shape[0] - 1
+
+        if n_samples == 0:
+            raise ValueError("Cannot vectorize empty sequence.")
+
+        X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
+                          shape=(n_samples, self.n_features))
+        X.sum_duplicates()  # also sorts the indices
+
+        return X
+
+    def _more_tags(self):
+        return {'X_types': [self.input_type]}