Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
474
venv/Lib/site-packages/sklearn/cluster/_affinity_propagation.py
Normal file
474
venv/Lib/site-packages/sklearn/cluster/_affinity_propagation.py
Normal file
|
@ -0,0 +1,474 @@
|
|||
"""Affinity Propagation clustering algorithm."""
|
||||
|
||||
# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
|
||||
# Gael Varoquaux gael.varoquaux@normalesup.org
|
||||
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..base import BaseEstimator, ClusterMixin
|
||||
from ..utils import as_float_array, check_array, check_random_state
|
||||
from ..utils.validation import check_is_fitted, _deprecate_positional_args
|
||||
from ..metrics import euclidean_distances
|
||||
from ..metrics import pairwise_distances_argmin
|
||||
|
||||
|
||||
def _equal_similarities_and_preferences(S, preference):
    """Tell whether all preferences and all off-diagonal similarities agree.

    Parameters
    ----------
    S : array-like, shape (n_samples, n_samples)
        Matrix of similarities. The diagonal is ignored.

    preference : array-like or float
        Preference value(s). A scalar is treated as a single preference.

    Returns
    -------
    bool
        True when every entry of ``preference`` equals its first entry and
        every off-diagonal entry of ``S`` equals the first off-diagonal
        entry. A matrix without off-diagonal entries (e.g. 1 x 1) counts as
        having equal similarities.
    """
    preference = np.asarray(preference)
    if not np.all(preference == preference.flat[0]):
        return False

    S = np.asarray(S)
    # Boolean mask selecting everything but the diagonal of S.
    off_diagonal = np.ones(S.shape, dtype=bool)
    np.fill_diagonal(off_diagonal, False)
    similarities = S[off_diagonal]

    # Nothing to compare: avoid indexing the first element of an empty array.
    if similarities.size == 0:
        return True

    return bool(np.all(similarities == similarities[0]))
|
||||
|
||||
|
||||
@_deprecate_positional_args
def affinity_propagation(S, *, preference=None, convergence_iter=15,
                         max_iter=200, damping=0.5, copy=True, verbose=False,
                         return_n_iter=False, random_state='warn'):
    """Perform Affinity Propagation Clustering of data

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------

    S : array-like, shape (n_samples, n_samples)
        Matrix of similarities between points

    preference : array-like, shape (n_samples,) or float, optional
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number of
        exemplars, i.e. of clusters, is influenced by the input preferences
        value. If the preferences are not passed as arguments, they will be
        set to the median of the input similarities (resulting in a moderate
        number of clusters). For a smaller amount of clusters, this can be set
        to the minimum value of the similarities.

    convergence_iter : int, optional, default: 15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    max_iter : int, optional, default: 200
        Maximum number of iterations. If it is not positive, no iteration
        is run and the result is reported as non-converged.

    damping : float, optional, default: 0.5
        Damping factor, must satisfy ``0.5 <= damping < 1``.

    copy : boolean, optional, default: True
        If copy is False, the affinity matrix is modified inplace by the
        algorithm, for memory efficiency

    verbose : boolean, optional, default: False
        The verbosity level

    return_n_iter : bool, default False
        Whether or not to return the number of iterations.

    random_state : int, RandomState instance or None, default: 0
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            this parameter was previously hardcoded as 0.

    Returns
    -------

    cluster_centers_indices : array, shape (n_clusters,)
        index of clusters centers

    labels : array, shape (n_samples,)
        cluster labels for each point

    n_iter : int
        number of iterations run. Returned only if `return_n_iter` is
        set to True.

    Raises
    ------
    ValueError
        If ``S`` is not square or ``damping`` is outside ``[0.5, 1)``.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

    When the algorithm does not converge, it returns an empty array as
    ``cluster_center_indices`` and ``-1`` as label for each training sample.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, a single cluster center
    and label ``0`` for every sample will be returned. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------
    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007
    """
    S = as_float_array(S, copy=copy)
    n_samples = S.shape[0]

    if S.shape[0] != S.shape[1]:
        raise ValueError("S must be a square array (shape=%s)" % repr(S.shape))

    if preference is None:
        preference = np.median(S)
    if damping < 0.5 or damping >= 1:
        raise ValueError('damping must be >= 0.5 and < 1')

    preference = np.array(preference)

    if (n_samples == 1 or
            _equal_similarities_and_preferences(S, preference)):
        # It makes no sense to run the algorithm in this case, so return 1 or
        # n_samples clusters, depending on preferences
        warnings.warn("All samples have mutually equal similarities. "
                      "Returning arbitrary cluster center(s).")
        # S.flat[n_samples - 1] is the last entry of the first row, i.e. an
        # off-diagonal similarity whenever n_samples > 1.
        if preference.flat[0] >= S.flat[n_samples - 1]:
            # Every sample is its own exemplar.
            trivial = (np.arange(n_samples), np.arange(n_samples))
        else:
            # One cluster, with sample 0 as the exemplar.
            trivial = (np.array([0]), np.array([0] * n_samples))
        return trivial + (0,) if return_n_iter else trivial

    if random_state == 'warn':
        warnings.warn(("'random_state' has been introduced in 0.23. "
                       "It will be set to None starting from 0.25 which "
                       "means that results will differ at every function "
                       "call. Set 'random_state' to None to silence this "
                       "warning, or to 0 to keep the behavior of versions "
                       "<0.23."),
                      FutureWarning)
        random_state = 0
    random_state = check_random_state(random_state)

    # Place preference on the diagonal of S
    S.flat[::(n_samples + 1)] = preference

    A = np.zeros((n_samples, n_samples))
    R = np.zeros((n_samples, n_samples))  # Initialize messages
    # Intermediate results
    tmp = np.zeros((n_samples, n_samples))

    # Remove degeneracies
    S += ((np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) *
          random_state.randn(n_samples, n_samples))

    # Execute parallel affinity propagation updates.
    # ``e`` is a ring buffer holding the exemplar decisions of the last
    # ``convergence_iter`` iterations, one column per iteration.
    e = np.zeros((n_samples, convergence_iter))

    ind = np.arange(n_samples)

    # Defaults used when the loop body never runs (max_iter <= 0): no
    # exemplars were found and zero iterations were performed.
    it = -1
    E = np.zeros(n_samples, dtype=bool)

    for it in range(max_iter):
        # tmp = A + S; compute responsibilities
        np.add(A, S, tmp)
        I = np.argmax(tmp, axis=1)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        # Mask the row maxima so the next max yields the second-largest value
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)

        # tmp = Rnew
        np.subtract(S, Y[:, None], tmp)
        tmp[ind, I] = S[ind, I] - Y2

        # Damping: R = damping * R + (1 - damping) * Rnew
        tmp *= 1 - damping
        R *= damping
        R += tmp

        # tmp = Rp; compute availabilities
        np.maximum(R, 0, tmp)
        tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1]

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[::n_samples + 1] = dA

        # Damping: A = damping * A + (1 - damping) * Anew (tmp holds -Anew)
        tmp *= 1 - damping
        A *= damping
        A -= tmp

        # Check for convergence
        E = (np.diag(A) + np.diag(R)) > 0
        e[:, it % convergence_iter] = E
        K = np.sum(E, axis=0)

        if it >= convergence_iter:
            se = np.sum(e, axis=1)
            # Converged when every sample was an exemplar in all, or in
            # none, of the last ``convergence_iter`` iterations.
            unconverged = (np.sum((se == convergence_iter) + (se == 0))
                           != n_samples)
            if not unconverged and (K > 0):
                never_converged = False
                if verbose:
                    print("Converged after %d iterations." % it)
                break
    else:
        # The loop ran out of iterations without hitting ``break``.
        never_converged = True
        if verbose:
            print("Did not converge")

    I = np.flatnonzero(E)
    K = I.size  # Identify exemplars

    if K > 0 and not never_converged:
        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)  # Identify clusters
        # Refine the final set of exemplars and clusters and return results
        for k in range(K):
            ii = np.where(c == k)[0]
            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
            I[k] = ii[j]

        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)
        labels = I[c]
        # Reduce labels to a sorted, gapless, list
        cluster_centers_indices = np.unique(labels)
        labels = np.searchsorted(cluster_centers_indices, labels)
    else:
        warnings.warn("Affinity propagation did not converge, this model "
                      "will not have any cluster centers.", ConvergenceWarning)
        labels = np.array([-1] * n_samples)
        # Empty integer array, matching the documented return type and the
        # array returned on the converged path.
        cluster_centers_indices = np.array([], dtype=np.intp)

    if return_n_iter:
        return cluster_centers_indices, labels, it + 1
    else:
        return cluster_centers_indices, labels
|
||||
|
||||
|
||||
###############################################################################
|
||||
|
||||
class AffinityPropagation(ClusterMixin, BaseEstimator):
    """Perform Affinity Propagation Clustering of data.

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------
    damping : float, default=0.5
        Damping factor (between 0.5 and 1) is the extent to
        which the current value is maintained relative to
        incoming values (weighted 1 - damping). This in order
        to avoid numerical oscillations when updating these
        values (messages).

    max_iter : int, default=200
        Maximum number of iterations.

    convergence_iter : int, default=15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    copy : bool, default=True
        Make a copy of input data.

    preference : array-like of shape (n_samples,) or float, default=None
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number
        of exemplars, ie of clusters, is influenced by the input
        preferences value. If the preferences are not passed as arguments,
        they will be set to the median of the input similarities.

    affinity : {'euclidean', 'precomputed'}, default='euclidean'
        Which affinity to use. At the moment 'precomputed' and
        ``euclidean`` are supported. 'euclidean' uses the
        negative squared euclidean distance between points.

    verbose : bool, default=False
        Whether to be verbose.

    random_state : int, RandomState instance or None, default=0
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            this parameter was previously hardcoded as 0.

    Attributes
    ----------
    cluster_centers_indices_ : ndarray of shape (n_clusters,)
        Indices of cluster centers

    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Cluster centers (if affinity != ``precomputed``).

    labels_ : ndarray of shape (n_samples,)
        Labels of each point

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the affinity matrix used in ``fit``.

    n_iter_ : int
        Number of iterations taken to converge.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

    The algorithmic complexity of affinity propagation is quadratic
    in the number of points.

    When ``fit`` does not converge, ``cluster_centers_`` becomes an empty
    array and all training samples will be labelled as ``-1``. In addition,
    ``predict`` will then label every sample as ``-1``.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, ``fit`` will result in
    a single cluster center and label ``0`` for every sample. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------

    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007

    Examples
    --------
    >>> from sklearn.cluster import AffinityPropagation
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> clustering = AffinityPropagation(random_state=5).fit(X)
    >>> clustering
    AffinityPropagation(random_state=5)
    >>> clustering.labels_
    array([0, 0, 0, 1, 1, 1])
    >>> clustering.predict([[0, 0], [4, 4]])
    array([0, 1])
    >>> clustering.cluster_centers_
    array([[1, 2],
           [4, 2]])
    """
    @_deprecate_positional_args
    def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15,
                 copy=True, preference=None, affinity='euclidean',
                 verbose=False, random_state='warn'):

        # Parameters are stored untouched; they are only consumed in ``fit``.
        self.damping = damping
        self.max_iter = max_iter
        self.convergence_iter = convergence_iter
        self.copy = copy
        self.verbose = verbose
        self.preference = preference
        self.affinity = affinity
        self.random_state = random_state

    @property
    def _pairwise(self):
        # True when ``fit`` expects a precomputed (n_samples, n_samples)
        # affinity matrix instead of a feature matrix.
        return self.affinity == "precomputed"

    def fit(self, X, y=None):
        """Fit the clustering from features, or affinity matrix.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features), or \
            array-like, shape (n_samples, n_samples)
            Training instances to cluster, or similarities / affinities between
            instances if ``affinity='precomputed'``. If a sparse feature matrix
            is provided, it will be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``affinity`` is neither ``'precomputed'`` nor ``'euclidean'``.
        """
        # A precomputed affinity matrix must be dense; feature matrices may
        # be sparse (CSR).
        if self.affinity == "precomputed":
            accept_sparse = False
        else:
            accept_sparse = 'csr'
        X = self._validate_data(X, accept_sparse=accept_sparse)
        if self.affinity == "precomputed":
            self.affinity_matrix_ = X
        elif self.affinity == "euclidean":
            self.affinity_matrix_ = -euclidean_distances(X, squared=True)
        else:
            raise ValueError("Affinity must be 'precomputed' or "
                             "'euclidean'. Got %s instead"
                             % str(self.affinity))

        self.cluster_centers_indices_, self.labels_, self.n_iter_ = \
            affinity_propagation(
                self.affinity_matrix_, preference=self.preference,
                max_iter=self.max_iter,
                convergence_iter=self.convergence_iter, damping=self.damping,
                copy=self.copy, verbose=self.verbose, return_n_iter=True,
                random_state=self.random_state)

        # Cluster centers only exist in feature space, so they are not
        # available for a precomputed affinity matrix.
        if self.affinity != "precomputed":
            self.cluster_centers_ = X[self.cluster_centers_indices_].copy()

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            New data to predict. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        labels : ndarray, shape (n_samples,)
            Cluster labels.

        Raises
        ------
        ValueError
            If the estimator has no ``cluster_centers_`` attribute, which is
            the case when it was fitted with ``affinity='precomputed'``.
        """
        check_is_fitted(self)
        # Accept CSR input here as well, consistently with ``fit`` and with
        # the documented contract of this method.
        X = check_array(X, accept_sparse='csr')
        if not hasattr(self, "cluster_centers_"):
            raise ValueError("Predict method is not supported when "
                             "affinity='precomputed'.")

        if self.cluster_centers_.shape[0] > 0:
            return pairwise_distances_argmin(X, self.cluster_centers_)
        else:
            warnings.warn("This model does not have any cluster centers "
                          "because affinity propagation did not converge. "
                          "Labeling every sample as '-1'.", ConvergenceWarning)
            return np.array([-1] * X.shape[0])

    def fit_predict(self, X, y=None):
        """Fit the clustering from features or affinity matrix, and return
        cluster labels.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features), or \
            array-like, shape (n_samples, n_samples)
            Training instances to cluster, or similarities / affinities between
            instances if ``affinity='precomputed'``. If a sparse feature matrix
            is provided, it will be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray, shape (n_samples,)
            Cluster labels.
        """
        # Delegates to the parent implementation.
        return super().fit_predict(X, y)
|
Loading…
Add table
Add a link
Reference in a new issue