Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
+++ b/venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+"""
+Nearest Centroid Classification
+"""
+
+# Author: Robert Layton <robertlayton@gmail.com>
+#         Olivier Grisel <olivier.grisel@ensta.org>
+#
+# License: BSD 3 clause
+
+import warnings
+import numpy as np
+from scipy import sparse as sp
+
+from ..base import BaseEstimator, ClassifierMixin
+from ..metrics.pairwise import pairwise_distances
+from ..preprocessing import LabelEncoder
+from ..utils.validation import check_array, check_is_fitted
+from ..utils.validation import _deprecate_positional_args
+from ..utils.sparsefuncs import csc_median_axis_0
+from ..utils.multiclass import check_classification_targets
+
+
+class NearestCentroid(ClassifierMixin, BaseEstimator):
+    """Nearest centroid classifier.
+
+    Each class is represented by its centroid, with test samples classified to
+    the class with the nearest centroid.
+
+    Read more in the :ref:`User Guide <nearest_centroid_classifier>`.
+
+    Parameters
+    ----------
+    metric : str or callable
+        The metric to use when calculating distance between instances in a
+        feature array. If metric is a string or callable, it must be one of
+        the options allowed by metrics.pairwise.pairwise_distances for its
+        metric parameter.
+        The centroids for the samples corresponding to each class is the point
+        from which the sum of the distances (according to the metric) of all
+        samples that belong to that particular class are minimized.
+        If the "manhattan" metric is provided, this centroid is the median and
+        for all other metrics, the centroid is now set to be the mean.
+
+        .. versionchanged:: 0.19
+            ``metric='precomputed'`` was deprecated and now raises an error
+
+    shrink_threshold : float, default=None
+        Threshold for shrinking centroids to remove features.
+
+    Attributes
+    ----------
+    centroids_ : array-like of shape (n_classes, n_features)
+        Centroid of each class.
+
+    classes_ : array of shape (n_classes,)
+        The unique classes labels.
+
+    Examples
+    --------
+    >>> from sklearn.neighbors import NearestCentroid
+    >>> import numpy as np
+    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    >>> y = np.array([1, 1, 1, 2, 2, 2])
+    >>> clf = NearestCentroid()
+    >>> clf.fit(X, y)
+    NearestCentroid()
+    >>> print(clf.predict([[-0.8, -1]]))
+    [1]
+
+    See also
+    --------
+    sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier
+
+    Notes
+    -----
+    When used for text classification with tf-idf vectors, this classifier is
+    also known as the Rocchio classifier.
+
+    References
+    ----------
+    Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
+    multiple cancer types by shrunken centroids of gene expression. Proceedings
+    of the National Academy of Sciences of the United States of America,
+    99(10), 6567-6572. The National Academy of Sciences.
+
+    """
+
+    @_deprecate_positional_args
+    def __init__(self, metric='euclidean', *, shrink_threshold=None):
+        self.metric = metric
+        self.shrink_threshold = shrink_threshold
+
+    def fit(self, X, y):
+        """
+        Fit the NearestCentroid model according to the given training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+            Note that centroid shrinking cannot be used with sparse matrices.
+        y : array-like of shape (n_samples,)
+            Target values (integers)
+        """
+        if self.metric == 'precomputed':
+            raise ValueError("Precomputed is not supported.")
+        # If X is sparse and the metric is "manhattan", store it in a csc
+        # format is easier to calculate the median.
+        if self.metric == 'manhattan':
+            X, y = self._validate_data(X, y, accept_sparse=['csc'])
+        else:
+            X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
+        is_X_sparse = sp.issparse(X)
+        if is_X_sparse and self.shrink_threshold:
+            raise ValueError("threshold shrinking not supported"
+                             " for sparse input")
+        check_classification_targets(y)
+
+        n_samples, n_features = X.shape
+        le = LabelEncoder()
+        y_ind = le.fit_transform(y)
+        self.classes_ = classes = le.classes_
+        n_classes = classes.size
+        if n_classes < 2:
+            raise ValueError('The number of classes has to be greater than'
+                             ' one; got %d class' % (n_classes))
+
+        # Mask mapping each class to its members.
+        self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
+        # Number of clusters in each class.
+        nk = np.zeros(n_classes)
+
+        for cur_class in range(n_classes):
+            center_mask = y_ind == cur_class
+            nk[cur_class] = np.sum(center_mask)
+            if is_X_sparse:
+                center_mask = np.where(center_mask)[0]
+
+            # XXX: Update other averaging methods according to the metrics.
+            if self.metric == "manhattan":
+                # NumPy does not calculate median of sparse matrices.
+                if not is_X_sparse:
+                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
+                else:
+                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
+            else:
+                if self.metric != 'euclidean':
+                    warnings.warn("Averaging for metrics other than "
+                                  "euclidean and manhattan not supported. "
+                                  "The average is set to be the mean."
+                                  )
+                self.centroids_[cur_class] = X[center_mask].mean(axis=0)
+
+        if self.shrink_threshold:
+            dataset_centroid_ = np.mean(X, axis=0)
+
+            # m parameter for determining deviation
+            m = np.sqrt((1. / nk) - (1. / n_samples))
+            # Calculate deviation using the standard deviation of centroids.
+            variance = (X - self.centroids_[y_ind]) ** 2
+            variance = variance.sum(axis=0)
+            s = np.sqrt(variance / (n_samples - n_classes))
+            s += np.median(s)  # To deter outliers from affecting the results.
+            mm = m.reshape(len(m), 1)  # Reshape to allow broadcasting.
+            ms = mm * s
+            deviation = ((self.centroids_ - dataset_centroid_) / ms)
+            # Soft thresholding: if the deviation crosses 0 during shrinking,
+            # it becomes zero.
+            signs = np.sign(deviation)
+            deviation = (np.abs(deviation) - self.shrink_threshold)
+            np.clip(deviation, 0, None, out=deviation)
+            deviation *= signs
+            # Now adjust the centroids using the deviation
+            msd = ms * deviation
+            self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
+        return self
+
+    def predict(self, X):
+        """Perform classification on an array of test vectors X.
+
+        The predicted class C for each sample in X is returned.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+
+        Returns
+        -------
+        C : ndarray of shape (n_samples,)
+
+        Notes
+        -----
+        If the metric constructor parameter is "precomputed", X is assumed to
+        be the distance matrix between the data to be predicted and
+        ``self.centroids_``.
+        """
+        check_is_fitted(self)
+
+        X = check_array(X, accept_sparse='csr')
+        return self.classes_[pairwise_distances(
+            X, self.centroids_, metric=self.metric).argmin(axis=1)]