Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/covariance/_elliptic_envelope.py
+++ b/venv/Lib/site-packages/sklearn/covariance/_elliptic_envelope.py
@ -0,0 +1,230 @@
+# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
+#
+# License: BSD 3 clause
+
+import numpy as np
+from . import MinCovDet
+from ..utils.validation import check_is_fitted, check_array
+from ..utils.validation import _deprecate_positional_args
+from ..metrics import accuracy_score
+from ..base import OutlierMixin
+
+
+class EllipticEnvelope(OutlierMixin, MinCovDet):
+    """An object for detecting outliers in a Gaussian distributed dataset.
+
+    Read more in the :ref:`User Guide <outlier_detection>`.
+
+    Parameters
+    ----------
+    store_precision : bool, default=True
+        Specify if the estimated precision is stored.
+
+    assume_centered : bool, default=False
+        If True, the support of robust location and covariance estimates
+        is computed, and a covariance estimate is recomputed from it,
+        without centering the data.
+        Useful to work with data whose mean is significantly equal to
+        zero but is not exactly zero.
+        If False, the robust location and covariance are directly computed
+        with the FastMCD algorithm without additional treatment.
+
+    support_fraction : float, default=None
+        The proportion of points to be included in the support of the raw
+        MCD estimate. If None, the minimum value of support_fraction will
+        be used within the algorithm: `[n_sample + n_features + 1] / 2`.
+        Range is (0, 1).
+
+    contamination : float, default=0.1
+        The amount of contamination of the data set, i.e. the proportion
+        of outliers in the data set. Range is (0, 0.5).
+
+    random_state : int or RandomState instance, default=None
+        Determines the pseudo random number generator for shuffling
+        the data. Pass an int for reproducible results across multiple function
+        calls. See :term: `Glossary <random_state>`.
+
+    Attributes
+    ----------
+    location_ : ndarray of shape (n_features,)
+        Estimated robust location
+
+    covariance_ : ndarray of shape (n_features, n_features)
+        Estimated robust covariance matrix
+
+    precision_ : ndarray of shape (n_features, n_features)
+        Estimated pseudo inverse matrix.
+        (stored only if store_precision is True)
+
+    support_ : ndarray of shape (n_samples,)
+        A mask of the observations that have been used to compute the
+        robust estimates of location and shape.
+
+    offset_ : float
+        Offset used to define the decision function from the raw scores.
+        We have the relation: ``decision_function = score_samples - offset_``.
+        The offset depends on the contamination parameter and is defined in
+        such a way we obtain the expected number of outliers (samples with
+        decision function < 0) in training.
+
+        .. versionadded:: 0.20
+
+    raw_location_ : ndarray of shape (n_features,)
+        The raw robust estimated location before correction and re-weighting.
+
+    raw_covariance_ : ndarray of shape (n_features, n_features)
+        The raw robust estimated covariance before correction and re-weighting.
+
+    raw_support_ : ndarray of shape (n_samples,)
+        A mask of the observations that have been used to compute
+        the raw robust estimates of location and shape, before correction
+        and re-weighting.
+
+    dist_ : ndarray of shape (n_samples,)
+        Mahalanobis distances of the training set (on which :meth:`fit` is
+        called) observations.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.covariance import EllipticEnvelope
+    >>> true_cov = np.array([[.8, .3],
+    ...                      [.3, .4]])
+    >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
+    ...                                                  cov=true_cov,
+    ...                                                  size=500)
+    >>> cov = EllipticEnvelope(random_state=0).fit(X)
+    >>> # predict returns 1 for an inlier and -1 for an outlier
+    >>> cov.predict([[0, 0],
+    ...              [3, 3]])
+    array([ 1, -1])
+    >>> cov.covariance_
+    array([[0.7411..., 0.2535...],
+           [0.2535..., 0.3053...]])
+    >>> cov.location_
+    array([0.0813... , 0.0427...])
+
+    See Also
+    --------
+    EmpiricalCovariance, MinCovDet
+
+    Notes
+    -----
+    Outlier detection from covariance estimation may break or not
+    perform well in high-dimensional settings. In particular, one will
+    always take care to work with ``n_samples > n_features ** 2``.
+
+    References
+    ----------
+    .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
+       minimum covariance determinant estimator" Technometrics 41(3), 212
+       (1999)
+    """
+    @_deprecate_positional_args
+    def __init__(self, *, store_precision=True, assume_centered=False,
+                 support_fraction=None, contamination=0.1,
+                 random_state=None):
+        super().__init__(
+            store_precision=store_precision,
+            assume_centered=assume_centered,
+            support_fraction=support_fraction,
+            random_state=random_state)
+        self.contamination = contamination
+
+    def fit(self, X, y=None):
+        """Fit the EllipticEnvelope model.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+        """
+        super().fit(X)
+        self.offset_ = np.percentile(-self.dist_, 100. * self.contamination)
+        return self
+
+    def decision_function(self, X):
+        """Compute the decision function of the given observations.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        decision : ndarray of shape (n_samples, )
+            Decision function of the samples.
+            It is equal to the shifted Mahalanobis distances.
+            The threshold for being an outlier is 0, which ensures a
+            compatibility with other outlier detection algorithms.
+        """
+        check_is_fitted(self)
+        negative_mahal_dist = self.score_samples(X)
+        return negative_mahal_dist - self.offset_
+
+    def score_samples(self, X):
+        """Compute the negative Mahalanobis distances.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        negative_mahal_distances : array-like of shape (n_samples,)
+            Opposite of the Mahalanobis distances.
+        """
+        check_is_fitted(self)
+        return -self.mahalanobis(X)
+
+    def predict(self, X):
+        """
+        Predict the labels (1 inlier, -1 outlier) of X according to the
+        fitted model.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data matrix.
+
+        Returns
+        -------
+        is_inlier : ndarray of shape (n_samples,)
+            Returns -1 for anomalies/outliers and +1 for inliers.
+        """
+        X = check_array(X)
+        is_inlier = np.full(X.shape[0], -1, dtype=int)
+        values = self.decision_function(X)
+        is_inlier[values >= 0] = 1
+
+        return is_inlier
+
+    def score(self, X, y, sample_weight=None):
+        """Returns the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for X.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) w.r.t. y.
+        """
+        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)