Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/cluster/_mean_shift.py
+++ b/venv/Lib/site-packages/sklearn/cluster/_mean_shift.py
@ -0,0 +1,465 @@
+"""Mean shift clustering algorithm.
+
+Mean shift clustering aims to discover *blobs* in a smooth density of
+samples. It is a centroid based algorithm, which works by updating candidates
+for centroids to be the mean of the points within a given region. These
+candidates are then filtered in a post-processing stage to eliminate
+near-duplicates to form the final set of centroids.
+
+Seeding is performed using a binning technique for scalability.
+"""
+
+# Authors: Conrad Lee <conradlee@gmail.com>
+#          Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#          Gael Varoquaux <gael.varoquaux@normalesup.org>
+#          Martino Sorbaro <martino.sorbaro@ed.ac.uk>
+
+import numpy as np
+import warnings
+from joblib import Parallel, delayed
+
+from collections import defaultdict
+from ..utils.validation import check_is_fitted, _deprecate_positional_args
+from ..utils import check_random_state, gen_batches, check_array
+from ..base import BaseEstimator, ClusterMixin
+from ..neighbors import NearestNeighbors
+from ..metrics.pairwise import pairwise_distances_argmin
+
+
+@_deprecate_positional_args
+def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0,
+                       n_jobs=None):
+    """Estimate the bandwidth to use with the mean-shift algorithm.
+
+    That this function takes time at least quadratic in n_samples. For large
+    datasets, it's wise to set that parameter to a small value.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Input points.
+
+    quantile : float, default=0.3
+        should be between [0, 1]
+        0.5 means that the median of all pairwise distances is used.
+
+    n_samples : int, default=None
+        The number of samples to use. If not given, all samples are used.
+
+    random_state : int, RandomState instance, default=None
+        The generator used to randomly select the samples from input points
+        for bandwidth estimation. Use an int to make the randomness
+        deterministic.
+        See :term:`Glossary <random_state>`.
+
+    n_jobs : int, default=None
+        The number of parallel jobs to run for neighbors search.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    Returns
+    -------
+    bandwidth : float
+        The bandwidth parameter.
+    """
+    X = check_array(X)
+
+    random_state = check_random_state(random_state)
+    if n_samples is not None:
+        idx = random_state.permutation(X.shape[0])[:n_samples]
+        X = X[idx]
+    n_neighbors = int(X.shape[0] * quantile)
+    if n_neighbors < 1:  # cannot fit NearestNeighbors with n_neighbors = 0
+        n_neighbors = 1
+    nbrs = NearestNeighbors(n_neighbors=n_neighbors,
+                            n_jobs=n_jobs)
+    nbrs.fit(X)
+
+    bandwidth = 0.
+    for batch in gen_batches(len(X), 500):
+        d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
+        bandwidth += np.max(d, axis=1).sum()
+
+    return bandwidth / X.shape[0]
+
+
+# separate function for each seed's iterative loop
+def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
+    # For each seed, climb gradient until convergence or max_iter
+    bandwidth = nbrs.get_params()['radius']
+    stop_thresh = 1e-3 * bandwidth  # when mean has converged
+    completed_iterations = 0
+    while True:
+        # Find mean of points within bandwidth
+        i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
+                                       return_distance=False)[0]
+        points_within = X[i_nbrs]
+        if len(points_within) == 0:
+            break  # Depending on seeding strategy this condition may occur
+        my_old_mean = my_mean  # save the old mean
+        my_mean = np.mean(points_within, axis=0)
+        # If converged or at max_iter, adds the cluster
+        if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or
+                completed_iterations == max_iter):
+            break
+        completed_iterations += 1
+    return tuple(my_mean), len(points_within), completed_iterations
+
+
+@_deprecate_positional_args
+def mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False,
+               min_bin_freq=1, cluster_all=True, max_iter=300,
+               n_jobs=None):
+    """Perform mean shift clustering of data using a flat kernel.
+
+    Read more in the :ref:`User Guide <mean_shift>`.
+
+    Parameters
+    ----------
+
+    X : array-like of shape (n_samples, n_features)
+        Input data.
+
+    bandwidth : float, default=None
+        Kernel bandwidth.
+
+        If bandwidth is not given, it is determined using a heuristic based on
+        the median of all pairwise distances. This will take quadratic time in
+        the number of samples. The sklearn.cluster.estimate_bandwidth function
+        can be used to do this more efficiently.
+
+    seeds : array-like of shape (n_seeds, n_features) or None
+        Point used as initial kernel locations. If None and bin_seeding=False,
+        each data point is used as a seed. If None and bin_seeding=True,
+        see bin_seeding.
+
+    bin_seeding : boolean, default=False
+        If true, initial kernel locations are not locations of all
+        points, but rather the location of the discretized version of
+        points, where points are binned onto a grid whose coarseness
+        corresponds to the bandwidth. Setting this option to True will speed
+        up the algorithm because fewer seeds will be initialized.
+        Ignored if seeds argument is not None.
+
+    min_bin_freq : int, default=1
+       To speed up the algorithm, accept only those bins with at least
+       min_bin_freq points as seeds.
+
+    cluster_all : bool, default=True
+        If true, then all points are clustered, even those orphans that are
+        not within any kernel. Orphans are assigned to the nearest kernel.
+        If false, then orphans are given cluster label -1.
+
+    max_iter : int, default=300
+        Maximum number of iterations, per seed point before the clustering
+        operation terminates (for that seed point), if has not converged yet.
+
+    n_jobs : int, default=None
+        The number of jobs to use for the computation. This works by computing
+        each of the n_init runs in parallel.
+
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+        .. versionadded:: 0.17
+           Parallel Execution using *n_jobs*.
+
+    Returns
+    -------
+
+    cluster_centers : array, shape=[n_clusters, n_features]
+        Coordinates of cluster centers.
+
+    labels : array, shape=[n_samples]
+        Cluster labels for each point.
+
+    Notes
+    -----
+    For an example, see :ref:`examples/cluster/plot_mean_shift.py
+    <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
+
+    """
+    model = MeanShift(bandwidth=bandwidth, seeds=seeds,
+                      min_bin_freq=min_bin_freq,
+                      bin_seeding=bin_seeding,
+                      cluster_all=cluster_all, n_jobs=n_jobs,
+                      max_iter=max_iter).fit(X)
+    return model.cluster_centers_, model.labels_
+
+
+def get_bin_seeds(X, bin_size, min_bin_freq=1):
+    """Finds seeds for mean_shift.
+
+    Finds seeds by first binning data onto a grid whose lines are
+    spaced bin_size apart, and then choosing those bins with at least
+    min_bin_freq points.
+
+    Parameters
+    ----------
+
+    X : array-like of shape (n_samples, n_features)
+        Input points, the same points that will be used in mean_shift.
+
+    bin_size : float
+        Controls the coarseness of the binning. Smaller values lead
+        to more seeding (which is computationally more expensive). If you're
+        not sure how to set this, set it to the value of the bandwidth used
+        in clustering.mean_shift.
+
+    min_bin_freq : int, default=1
+        Only bins with at least min_bin_freq will be selected as seeds.
+        Raising this value decreases the number of seeds found, which
+        makes mean_shift computationally cheaper.
+
+    Returns
+    -------
+    bin_seeds : array-like of shape (n_samples, n_features)
+        Points used as initial kernel positions in clustering.mean_shift.
+    """
+    if bin_size == 0:
+        return X
+
+    # Bin points
+    bin_sizes = defaultdict(int)
+    for point in X:
+        binned_point = np.round(point / bin_size)
+        bin_sizes[tuple(binned_point)] += 1
+
+    # Select only those bins as seeds which have enough members
+    bin_seeds = np.array([point for point, freq in bin_sizes.items() if
+                          freq >= min_bin_freq], dtype=np.float32)
+    if len(bin_seeds) == len(X):
+        warnings.warn("Binning data failed with provided bin_size=%f,"
+                      " using data points as seeds." % bin_size)
+        return X
+    bin_seeds = bin_seeds * bin_size
+    return bin_seeds
+
+
+class MeanShift(ClusterMixin, BaseEstimator):
+    """Mean shift clustering using a flat kernel.
+
+    Mean shift clustering aims to discover "blobs" in a smooth density of
+    samples. It is a centroid-based algorithm, which works by updating
+    candidates for centroids to be the mean of the points within a given
+    region. These candidates are then filtered in a post-processing stage to
+    eliminate near-duplicates to form the final set of centroids.
+
+    Seeding is performed using a binning technique for scalability.
+
+    Read more in the :ref:`User Guide <mean_shift>`.
+
+    Parameters
+    ----------
+    bandwidth : float, default=None
+        Bandwidth used in the RBF kernel.
+
+        If not given, the bandwidth is estimated using
+        sklearn.cluster.estimate_bandwidth; see the documentation for that
+        function for hints on scalability (see also the Notes, below).
+
+    seeds : array-like of shape (n_samples, n_features), default=None
+        Seeds used to initialize kernels. If not set,
+        the seeds are calculated by clustering.get_bin_seeds
+        with bandwidth as the grid size and default values for
+        other parameters.
+
+    bin_seeding : bool, default=False
+        If true, initial kernel locations are not locations of all
+        points, but rather the location of the discretized version of
+        points, where points are binned onto a grid whose coarseness
+        corresponds to the bandwidth. Setting this option to True will speed
+        up the algorithm because fewer seeds will be initialized.
+        The default value is False.
+        Ignored if seeds argument is not None.
+
+    min_bin_freq : int, default=1
+       To speed up the algorithm, accept only those bins with at least
+       min_bin_freq points as seeds.
+
+    cluster_all : bool, default=True
+        If true, then all points are clustered, even those orphans that are
+        not within any kernel. Orphans are assigned to the nearest kernel.
+        If false, then orphans are given cluster label -1.
+
+    n_jobs : int, default=None
+        The number of jobs to use for the computation. This works by computing
+        each of the n_init runs in parallel.
+
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    max_iter : int, default=300
+        Maximum number of iterations, per seed point before the clustering
+        operation terminates (for that seed point), if has not converged yet.
+
+        .. versionadded:: 0.22
+
+    Attributes
+    ----------
+    cluster_centers_ : array, [n_clusters, n_features]
+        Coordinates of cluster centers.
+
+    labels_ : array of shape (n_samples,)
+        Labels of each point.
+
+    n_iter_ : int
+        Maximum number of iterations performed on each seed.
+
+        .. versionadded:: 0.22
+
+    Examples
+    --------
+    >>> from sklearn.cluster import MeanShift
+    >>> import numpy as np
+    >>> X = np.array([[1, 1], [2, 1], [1, 0],
+    ...               [4, 7], [3, 5], [3, 6]])
+    >>> clustering = MeanShift(bandwidth=2).fit(X)
+    >>> clustering.labels_
+    array([1, 1, 1, 0, 0, 0])
+    >>> clustering.predict([[0, 0], [5, 5]])
+    array([1, 0])
+    >>> clustering
+    MeanShift(bandwidth=2)
+
+    Notes
+    -----
+
+    Scalability:
+
+    Because this implementation uses a flat kernel and
+    a Ball Tree to look up members of each kernel, the complexity will tend
+    towards O(T*n*log(n)) in lower dimensions, with n the number of samples
+    and T the number of points. In higher dimensions the complexity will
+    tend towards O(T*n^2).
+
+    Scalability can be boosted by using fewer seeds, for example by using
+    a higher value of min_bin_freq in the get_bin_seeds function.
+
+    Note that the estimate_bandwidth function is much less scalable than the
+    mean shift algorithm and will be the bottleneck if it is used.
+
+    References
+    ----------
+
+    Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
+    feature space analysis". IEEE Transactions on Pattern Analysis and
+    Machine Intelligence. 2002. pp. 603-619.
+
+    """
+    @_deprecate_positional_args
+    def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False,
+                 min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300):
+        self.bandwidth = bandwidth
+        self.seeds = seeds
+        self.bin_seeding = bin_seeding
+        self.cluster_all = cluster_all
+        self.min_bin_freq = min_bin_freq
+        self.n_jobs = n_jobs
+        self.max_iter = max_iter
+
+    def fit(self, X, y=None):
+        """Perform clustering.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Samples to cluster.
+
+        y : Ignored
+
+        """
+        X = self._validate_data(X)
+        bandwidth = self.bandwidth
+        if bandwidth is None:
+            bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)
+        elif bandwidth <= 0:
+            raise ValueError("bandwidth needs to be greater than zero or None,"
+                             " got %f" % bandwidth)
+
+        seeds = self.seeds
+        if seeds is None:
+            if self.bin_seeding:
+                seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
+            else:
+                seeds = X
+        n_samples, n_features = X.shape
+        center_intensity_dict = {}
+
+        # We use n_jobs=1 because this will be used in nested calls under
+        # parallel calls to _mean_shift_single_seed so there is no need for
+        # for further parallelism.
+        nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
+
+        # execute iterations on all seeds in parallel
+        all_res = Parallel(n_jobs=self.n_jobs)(
+            delayed(_mean_shift_single_seed)
+            (seed, X, nbrs, self.max_iter) for seed in seeds)
+        # copy results in a dictionary
+        for i in range(len(seeds)):
+            if all_res[i][1]:  # i.e. len(points_within) > 0
+                center_intensity_dict[all_res[i][0]] = all_res[i][1]
+
+        self.n_iter_ = max([x[2] for x in all_res])
+
+        if not center_intensity_dict:
+            # nothing near seeds
+            raise ValueError("No point was within bandwidth=%f of any seed."
+                             " Try a different seeding strategy \
+                             or increase the bandwidth."
+                             % bandwidth)
+
+        # POST PROCESSING: remove near duplicate points
+        # If the distance between two kernels is less than the bandwidth,
+        # then we have to remove one because it is a duplicate. Remove the
+        # one with fewer points.
+
+        sorted_by_intensity = sorted(center_intensity_dict.items(),
+                                     key=lambda tup: (tup[1], tup[0]),
+                                     reverse=True)
+        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
+        unique = np.ones(len(sorted_centers), dtype=np.bool)
+        nbrs = NearestNeighbors(radius=bandwidth,
+                                n_jobs=self.n_jobs).fit(sorted_centers)
+        for i, center in enumerate(sorted_centers):
+            if unique[i]:
+                neighbor_idxs = nbrs.radius_neighbors([center],
+                                                      return_distance=False)[0]
+                unique[neighbor_idxs] = 0
+                unique[i] = 1  # leave the current point as unique
+        cluster_centers = sorted_centers[unique]
+
+        # ASSIGN LABELS: a point belongs to the cluster that it is closest to
+        nbrs = NearestNeighbors(n_neighbors=1,
+                                n_jobs=self.n_jobs).fit(cluster_centers)
+        labels = np.zeros(n_samples, dtype=np.int)
+        distances, idxs = nbrs.kneighbors(X)
+        if self.cluster_all:
+            labels = idxs.flatten()
+        else:
+            labels.fill(-1)
+            bool_selector = distances.flatten() <= bandwidth
+            labels[bool_selector] = idxs.flatten()[bool_selector]
+
+        self.cluster_centers_, self.labels_ = cluster_centers, labels
+        return self
+
+    def predict(self, X):
+        """Predict the closest cluster each sample in X belongs to.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape=[n_samples, n_features]
+            New data to predict.
+
+        Returns
+        -------
+        labels : array, shape [n_samples,]
+            Index of the cluster each sample belongs to.
+        """
+        check_is_fitted(self)
+
+        return pairwise_distances_argmin(X, self.cluster_centers_)