Uploaded Test files
This commit is contained in: parent f584ad9d97, commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
@@ -0,0 +1,102 @@
# Author: Lars Buitinck
# License: 3-clause BSD

import numpy as np

from ..base import BaseEstimator
from ._base import SelectorMixin
from ..utils.sparsefuncs import mean_variance_axis, min_max_axis
from ..utils.validation import check_is_fitted


class VarianceThreshold(SelectorMixin, BaseEstimator):
    """Feature selector that removes all low-variance features.

    This feature selection algorithm looks only at the features (X), not the
    desired outputs (y), and can thus be used for unsupervised learning.

    Read more in the :ref:`User Guide <variance_threshold>`.

    Parameters
    ----------
    threshold : float, optional
        Features with a training-set variance lower than this threshold will
        be removed. The default is to keep all features with non-zero variance,
        i.e. remove the features that have the same value in all samples.

    Attributes
    ----------
    variances_ : array, shape (n_features,)
        Variances of individual features.

    Notes
    -----
    Allows NaN in the input.

    Examples
    --------
    The following dataset has integer features, two of which are the same
    in every sample. These are removed with the default setting for threshold::

        >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
        >>> selector = VarianceThreshold()
        >>> selector.fit_transform(X)
        array([[2, 0],
               [1, 4],
               [1, 1]])
    """

    def __init__(self, threshold=0.):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn empirical variances from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Sample vectors from which to compute variances.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self
        """
        X = self._validate_data(X, accept_sparse=('csr', 'csc'),
                                dtype=np.float64,
                                force_all_finite='allow-nan')

        if hasattr(X, "toarray"):   # sparse matrix
            _, self.variances_ = mean_variance_axis(X, axis=0)
            if self.threshold == 0:
                mins, maxes = min_max_axis(X, axis=0)
                peak_to_peaks = maxes - mins
        else:
            self.variances_ = np.nanvar(X, axis=0)
            if self.threshold == 0:
                peak_to_peaks = np.ptp(X, axis=0)

        if self.threshold == 0:
            # Use peak-to-peak to avoid numeric precision issues
            # for constant features
            compare_arr = np.array([self.variances_, peak_to_peaks])
            self.variances_ = np.nanmin(compare_arr, axis=0)

        if np.all(~np.isfinite(self.variances_) |
                  (self.variances_ <= self.threshold)):
            msg = "No feature in X meets the variance threshold {0:.5f}"
            if X.shape[0] == 1:
                msg += " (X contains only one sample)"
            raise ValueError(msg.format(self.threshold))

        return self

    def _get_support_mask(self):
        check_is_fitted(self)

        return self.variances_ > self.threshold

    def _more_tags(self):
        return {'allow_nan': True}
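Below is a short usage sketch, not part of the committed file above: it assumes scikit-learn is installed and that this class is exposed under its public import path sklearn.feature_selection.VarianceThreshold. The values in the comments follow from the variance computed in fit() (np.nanvar, i.e. population variance).

# Usage sketch (illustration only, not part of the diff above).
import numpy as np
from scipy import sparse
from sklearn.feature_selection import VarianceThreshold

# One constant column, one clearly varying column, one barely varying column.
X = np.array([[0.0, 2.0, 0.1],
              [0.0, 1.0, 0.2],
              [0.0, 1.0, 0.1]])

# Keep only features whose training-set variance is strictly above 0.01.
selector = VarianceThreshold(threshold=0.01)
X_reduced = selector.fit_transform(X)

print(selector.variances_)     # per-feature variances from fit(): [0., 0.2222..., 0.0022...]
print(selector.get_support())  # mask from _get_support_mask(): [False, True, False]
print(X_reduced.shape)         # (3, 1): only the middle column survives

# Sparse input exercises the mean_variance_axis / min_max_axis branch; with the
# default threshold=0.0 only the constant (all-zero) column is dropped.
X_sparse = sparse.csr_matrix(X)
print(VarianceThreshold().fit_transform(X_sparse).shape)  # (3, 2)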