Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions


@@ -0,0 +1,42 @@
"""
The :mod:`sklearn.cluster` module gathers popular unsupervised clustering
algorithms.
"""
from ._spectral import spectral_clustering, SpectralClustering
from ._mean_shift import (mean_shift, MeanShift,
estimate_bandwidth, get_bin_seeds)
from ._affinity_propagation import affinity_propagation, AffinityPropagation
from ._agglomerative import (ward_tree, AgglomerativeClustering,
linkage_tree, FeatureAgglomeration)
from ._kmeans import k_means, KMeans, MiniBatchKMeans
from ._dbscan import dbscan, DBSCAN
from ._optics import (OPTICS, cluster_optics_dbscan, compute_optics_graph,
cluster_optics_xi)
from ._bicluster import SpectralBiclustering, SpectralCoclustering
from ._birch import Birch
__all__ = ['AffinityPropagation',
'AgglomerativeClustering',
'Birch',
'DBSCAN',
'OPTICS',
'cluster_optics_dbscan',
'cluster_optics_xi',
'compute_optics_graph',
'KMeans',
'FeatureAgglomeration',
'MeanShift',
'MiniBatchKMeans',
'SpectralClustering',
'affinity_propagation',
'dbscan',
'estimate_bandwidth',
'get_bin_seeds',
'k_means',
'linkage_tree',
'mean_shift',
'spectral_clustering',
'ward_tree',
'SpectralBiclustering',
'SpectralCoclustering']
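
The names re-exported above form the module's public API. As an illustrative sketch of how a few of them fit together, assuming a scikit-learn installation matching this snapshot (toy data and parameter values are arbitrary):

import numpy as np
from sklearn.cluster import DBSCAN, KMeans, MeanShift, estimate_bandwidth

# Two well-separated blobs of three points each.
X = np.array([[1., 2.], [1., 4.], [1., 0.],
              [10., 2.], [10., 4.], [10., 0.]])

km = KMeans(n_clusters=2, random_state=0).fit(X)   # centroid-based
db = DBSCAN(eps=3, min_samples=2).fit(X)            # density-based
ms = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.5)).fit(X)
print(km.labels_, db.labels_, ms.labels_)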


@@ -0,0 +1,474 @@
"""Affinity Propagation clustering algorithm."""
# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
# Gael Varoquaux gael.varoquaux@normalesup.org
# License: BSD 3 clause
import numpy as np
import warnings
from ..exceptions import ConvergenceWarning
from ..base import BaseEstimator, ClusterMixin
from ..utils import as_float_array, check_array, check_random_state
from ..utils.validation import check_is_fitted, _deprecate_positional_args
from ..metrics import euclidean_distances
from ..metrics import pairwise_distances_argmin
def _equal_similarities_and_preferences(S, preference):
def all_equal_preferences():
return np.all(preference == preference.flat[0])
def all_equal_similarities():
# Create mask to ignore diagonal of S
mask = np.ones(S.shape, dtype=bool)
np.fill_diagonal(mask, 0)
return np.all(S[mask].flat == S[mask].flat[0])
return all_equal_preferences() and all_equal_similarities()
@_deprecate_positional_args
def affinity_propagation(S, *, preference=None, convergence_iter=15,
max_iter=200, damping=0.5, copy=True, verbose=False,
return_n_iter=False, random_state='warn'):
"""Perform Affinity Propagation Clustering of data
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
S : array-like, shape (n_samples, n_samples)
Matrix of similarities between points
preference : array-like, shape (n_samples,) or float, optional
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number of
exemplars, i.e. of clusters, is influenced by the input preferences
value. If the preferences are not passed as arguments, they will be
set to the median of the input similarities (resulting in a moderate
number of clusters). For a smaller amount of clusters, this can be set
to the minimum value of the similarities.
convergence_iter : int, optional, default: 15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
max_iter : int, optional, default: 200
Maximum number of iterations
damping : float, optional, default: 0.5
Damping factor between 0.5 and 1.
copy : boolean, optional, default: True
If copy is False, the affinity matrix is modified inplace by the
algorithm, for memory efficiency
verbose : boolean, optional, default: False
The verbosity level
return_n_iter : bool, default False
Whether or not to return the number of iterations.
random_state : int or RandomState instance, default=0
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Returns
-------
cluster_centers_indices : array, shape (n_clusters,)
index of clusters centers
labels : array, shape (n_samples,)
cluster labels for each point
n_iter : int
number of iterations run. Returned only if `return_n_iter` is
set to True.
Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
When the algorithm does not converge, it returns an empty array as
``cluster_centers_indices`` and ``-1`` as label for each training sample.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, a single cluster center
and label ``0`` for every sample will be returned. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
"""
S = as_float_array(S, copy=copy)
n_samples = S.shape[0]
if S.shape[0] != S.shape[1]:
raise ValueError("S must be a square array (shape=%s)" % repr(S.shape))
if preference is None:
preference = np.median(S)
if damping < 0.5 or damping >= 1:
raise ValueError('damping must be >= 0.5 and < 1')
preference = np.array(preference)
if (n_samples == 1 or
_equal_similarities_and_preferences(S, preference)):
# It makes no sense to run the algorithm in this case, so return 1 or
# n_samples clusters, depending on preferences
warnings.warn("All samples have mutually equal similarities. "
"Returning arbitrary cluster center(s).")
if preference.flat[0] >= S.flat[n_samples - 1]:
return ((np.arange(n_samples), np.arange(n_samples), 0)
if return_n_iter
else (np.arange(n_samples), np.arange(n_samples)))
else:
return ((np.array([0]), np.array([0] * n_samples), 0)
if return_n_iter
else (np.array([0]), np.array([0] * n_samples)))
if random_state == 'warn':
warnings.warn(("'random_state' has been introduced in 0.23. "
"It will be set to None starting from 0.25 which "
"means that results will differ at every function "
"call. Set 'random_state' to None to silence this "
"warning, or to 0 to keep the behavior of versions "
"<0.23."),
FutureWarning)
random_state = 0
random_state = check_random_state(random_state)
# Place preference on the diagonal of S
S.flat[::(n_samples + 1)] = preference
A = np.zeros((n_samples, n_samples))
R = np.zeros((n_samples, n_samples)) # Initialize messages
# Intermediate results
tmp = np.zeros((n_samples, n_samples))
# Remove degeneracies
S += ((np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) *
random_state.randn(n_samples, n_samples))
# Execute parallel affinity propagation updates
e = np.zeros((n_samples, convergence_iter))
ind = np.arange(n_samples)
for it in range(max_iter):
# tmp = A + S; compute responsibilities
np.add(A, S, tmp)
I = np.argmax(tmp, axis=1)
Y = tmp[ind, I] # np.max(A + S, axis=1)
tmp[ind, I] = -np.inf
Y2 = np.max(tmp, axis=1)
# tmp = Rnew
np.subtract(S, Y[:, None], tmp)
tmp[ind, I] = S[ind, I] - Y2
# Damping
tmp *= 1 - damping
R *= damping
R += tmp
# tmp = Rp; compute availabilities
np.maximum(R, 0, tmp)
tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1]
# tmp = -Anew
tmp -= np.sum(tmp, axis=0)
dA = np.diag(tmp).copy()
tmp.clip(0, np.inf, tmp)
tmp.flat[::n_samples + 1] = dA
# Damping
tmp *= 1 - damping
A *= damping
A -= tmp
# Check for convergence
E = (np.diag(A) + np.diag(R)) > 0
e[:, it % convergence_iter] = E
K = np.sum(E, axis=0)
if it >= convergence_iter:
se = np.sum(e, axis=1)
unconverged = (np.sum((se == convergence_iter) + (se == 0))
!= n_samples)
if (not unconverged and (K > 0)) or (it == max_iter):
never_converged = False
if verbose:
print("Converged after %d iterations." % it)
break
else:
never_converged = True
if verbose:
print("Did not converge")
I = np.flatnonzero(E)
K = I.size # Identify exemplars
if K > 0 and not never_converged:
c = np.argmax(S[:, I], axis=1)
c[I] = np.arange(K) # Identify clusters
# Refine the final set of exemplars and clusters and return results
for k in range(K):
ii = np.where(c == k)[0]
j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
I[k] = ii[j]
c = np.argmax(S[:, I], axis=1)
c[I] = np.arange(K)
labels = I[c]
# Reduce labels to a sorted, gapless list
cluster_centers_indices = np.unique(labels)
labels = np.searchsorted(cluster_centers_indices, labels)
else:
warnings.warn("Affinity propagation did not converge, this model "
"will not have any cluster centers.", ConvergenceWarning)
labels = np.array([-1] * n_samples)
cluster_centers_indices = []
if return_n_iter:
return cluster_centers_indices, labels, it + 1
else:
return cluster_centers_indices, labels
###############################################################################
class AffinityPropagation(ClusterMixin, BaseEstimator):
"""Perform Affinity Propagation Clustering of data.
Read more in the :ref:`User Guide <affinity_propagation>`.
Parameters
----------
damping : float, default=0.5
Damping factor (between 0.5 and 1) is the extent to
which the current value is maintained relative to
incoming values (weighted 1 - damping). This is done in order
to avoid numerical oscillations when updating these
values (messages).
max_iter : int, default=200
Maximum number of iterations.
convergence_iter : int, default=15
Number of iterations with no change in the number
of estimated clusters that stops the convergence.
copy : bool, default=True
Make a copy of input data.
preference : array-like of shape (n_samples,) or float, default=None
Preferences for each point - points with larger values of
preferences are more likely to be chosen as exemplars. The number
of exemplars, i.e. of clusters, is influenced by the input
preferences value. If the preferences are not passed as arguments,
they will be set to the median of the input similarities.
affinity : {'euclidean', 'precomputed'}, default='euclidean'
Which affinity to use. At the moment 'precomputed' and
'euclidean' are supported. 'euclidean' uses the
negative squared euclidean distance between points.
verbose : bool, default=False
Whether to be verbose.
random_state : int or RandomState instance, default=0
Pseudo-random number generator to control the starting state.
Use an int for reproducible results across function calls.
See the :term:`Glossary <random_state>`.
.. versionadded:: 0.23
this parameter was previously hardcoded as 0.
Attributes
----------
cluster_centers_indices_ : ndarray of shape (n_clusters,)
Indices of cluster centers
cluster_centers_ : ndarray of shape (n_clusters, n_features)
Cluster centers (if affinity != ``precomputed``).
labels_ : ndarray of shape (n_samples,)
Labels of each point
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
Stores the affinity matrix used in ``fit``.
n_iter_ : int
Number of iterations taken to converge.
Notes
-----
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
The algorithmic complexity of affinity propagation is quadratic
in the number of points.
When ``fit`` does not converge, ``cluster_centers_`` becomes an empty
array and all training samples will be labelled as ``-1``. In addition,
``predict`` will then label every sample as ``-1``.
When all training samples have equal similarities and equal preferences,
the assignment of cluster centers and labels depends on the preference.
If the preference is smaller than the similarities, ``fit`` will result in
a single cluster center and label ``0`` for every sample. Otherwise, every
training sample becomes its own cluster center and is assigned a unique
label.
References
----------
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
Between Data Points", Science Feb. 2007
Examples
--------
>>> from sklearn.cluster import AffinityPropagation
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> clustering = AffinityPropagation(random_state=5).fit(X)
>>> clustering
AffinityPropagation(random_state=5)
>>> clustering.labels_
array([0, 0, 0, 1, 1, 1])
>>> clustering.predict([[0, 0], [4, 4]])
array([0, 1])
>>> clustering.cluster_centers_
array([[1, 2],
[4, 2]])
"""
@_deprecate_positional_args
def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15,
copy=True, preference=None, affinity='euclidean',
verbose=False, random_state='warn'):
self.damping = damping
self.max_iter = max_iter
self.convergence_iter = convergence_iter
self.copy = copy
self.verbose = verbose
self.preference = preference
self.affinity = affinity
self.random_state = random_state
@property
def _pairwise(self):
return self.affinity == "precomputed"
def fit(self, X, y=None):
"""Fit the clustering from features, or affinity matrix.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features), or \
array-like, shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse feature matrix
is provided, it will be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
"""
if self.affinity == "precomputed":
accept_sparse = False
else:
accept_sparse = 'csr'
X = self._validate_data(X, accept_sparse=accept_sparse)
if self.affinity == "precomputed":
self.affinity_matrix_ = X
elif self.affinity == "euclidean":
self.affinity_matrix_ = -euclidean_distances(X, squared=True)
else:
raise ValueError("Affinity must be 'precomputed' or "
"'euclidean'. Got %s instead"
% str(self.affinity))
self.cluster_centers_indices_, self.labels_, self.n_iter_ = \
affinity_propagation(
self.affinity_matrix_, preference=self.preference,
max_iter=self.max_iter,
convergence_iter=self.convergence_iter, damping=self.damping,
copy=self.copy, verbose=self.verbose, return_n_iter=True,
random_state=self.random_state)
if self.affinity != "precomputed":
self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
return self
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features)
New data to predict. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.
Returns
-------
labels : ndarray, shape (n_samples,)
Cluster labels.
"""
check_is_fitted(self)
X = check_array(X)
if not hasattr(self, "cluster_centers_"):
raise ValueError("Predict method is not supported when "
"affinity='precomputed'.")
if self.cluster_centers_.shape[0] > 0:
return pairwise_distances_argmin(X, self.cluster_centers_)
else:
warnings.warn("This model does not have any cluster centers "
"because affinity propagation did not converge. "
"Labeling every sample as '-1'.", ConvergenceWarning)
return np.array([-1] * X.shape[0])
def fit_predict(self, X, y=None):
"""Fit the clustering from features or affinity matrix, and return
cluster labels.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features), or \
array-like, shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse feature matrix
is provided, it will be converted into a sparse ``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray, shape (n_samples,)
Cluster labels.
"""
return super().fit_predict(X, y)
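
As a hedged usage sketch, the function-level API above can also be driven directly with a precomputed similarity matrix, mirroring what ``AffinityPropagation`` computes internally for ``affinity='euclidean'``; the toy data below is arbitrary:

import numpy as np
from sklearn.cluster import affinity_propagation
from sklearn.metrics import euclidean_distances

X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]], dtype=float)
# Similarity = negative squared Euclidean distance, as used by the
# estimator's 'euclidean' affinity.
S = -euclidean_distances(X, squared=True)
cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
print(cluster_centers_indices, labels)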

File diff suppressed because it is too large.


@@ -0,0 +1,546 @@
"""Spectral biclustering algorithms."""
# Authors : Kemal Eren
# License: BSD 3 clause
from abc import ABCMeta, abstractmethod
import warnings
import numpy as np
from scipy.linalg import norm
from scipy.sparse import dia_matrix, issparse
from scipy.sparse.linalg import eigsh, svds
from . import KMeans, MiniBatchKMeans
from ..base import BaseEstimator, BiclusterMixin
from ..utils import check_random_state
from ..utils.extmath import (make_nonnegative, randomized_svd,
safe_sparse_dot)
from ..utils.validation import assert_all_finite, _deprecate_positional_args
__all__ = ['SpectralCoclustering',
'SpectralBiclustering']
def _scale_normalize(X):
"""Normalize ``X`` by scaling rows and columns independently.
Returns the normalized matrix and the row and column scaling
factors.
"""
X = make_nonnegative(X)
row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
row_diag = np.where(np.isnan(row_diag), 0, row_diag)
col_diag = np.where(np.isnan(col_diag), 0, col_diag)
if issparse(X):
n_rows, n_cols = X.shape
r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
an = r * X * c
else:
an = row_diag[:, np.newaxis] * X * col_diag
return an, row_diag, col_diag
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
"""Normalize rows and columns of ``X`` simultaneously so that all
rows sum to one constant and all columns sum to a different
constant.
"""
# According to paper, this can also be done more efficiently with
# deviation reduction and balancing algorithms.
X = make_nonnegative(X)
X_scaled = X
for _ in range(max_iter):
X_new, _, _ = _scale_normalize(X_scaled)
if issparse(X):
# compare with the newly scaled matrix, as in the dense branch below
dist = norm(X_scaled.data - X_new.data)
else:
dist = norm(X_scaled - X_new)
X_scaled = X_new
if dist is not None and dist < tol:
break
return X_scaled
def _log_normalize(X):
"""Normalize ``X`` according to Kluger's log-interactions scheme."""
X = make_nonnegative(X, min_value=1)
if issparse(X):
raise ValueError("Cannot compute log of a sparse matrix,"
" because log(x) diverges to -infinity as x"
" goes to 0.")
L = np.log(X)
row_avg = L.mean(axis=1)[:, np.newaxis]
col_avg = L.mean(axis=0)
avg = L.mean()
return L - row_avg - col_avg + avg
class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
"""Base class for spectral biclustering."""
@abstractmethod
def __init__(self, n_clusters=3, svd_method="randomized",
n_svd_vecs=None, mini_batch=False, init="k-means++",
n_init=10, n_jobs='deprecated', random_state=None):
self.n_clusters = n_clusters
self.svd_method = svd_method
self.n_svd_vecs = n_svd_vecs
self.mini_batch = mini_batch
self.init = init
self.n_init = n_init
self.n_jobs = n_jobs
self.random_state = random_state
def _check_parameters(self):
legal_svd_methods = ('randomized', 'arpack')
if self.svd_method not in legal_svd_methods:
raise ValueError("Unknown SVD method: '{0}'. svd_method must be"
" one of {1}.".format(self.svd_method,
legal_svd_methods))
def fit(self, X, y=None):
"""Creates a biclustering for X.
Parameters
----------
X : array-like, shape (n_samples, n_features)
y : Ignored
"""
if self.n_jobs != 'deprecated':
warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
" removed in 0.25.", FutureWarning)
X = self._validate_data(X, accept_sparse='csr', dtype=np.float64)
self._check_parameters()
self._fit(X)
return self
def _svd(self, array, n_components, n_discard):
"""Returns first `n_components` left and right singular
vectors u and v, discarding the first `n_discard`.
"""
if self.svd_method == 'randomized':
kwargs = {}
if self.n_svd_vecs is not None:
kwargs['n_oversamples'] = self.n_svd_vecs
u, _, vt = randomized_svd(array, n_components,
random_state=self.random_state,
**kwargs)
elif self.svd_method == 'arpack':
u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
if np.any(np.isnan(vt)):
# some eigenvalues of A * A.T are negative, causing
# sqrt() to be np.nan. This causes some vectors in vt
# to be np.nan.
A = safe_sparse_dot(array.T, array)
random_state = check_random_state(self.random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, A.shape[0])
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
vt = v.T
if np.any(np.isnan(u)):
A = safe_sparse_dot(array, array.T)
random_state = check_random_state(self.random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, A.shape[0])
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
assert_all_finite(u)
assert_all_finite(vt)
u = u[:, n_discard:]
vt = vt[n_discard:]
return u, vt.T
def _k_means(self, data, n_clusters):
if self.mini_batch:
model = MiniBatchKMeans(n_clusters,
init=self.init,
n_init=self.n_init,
random_state=self.random_state)
else:
model = KMeans(n_clusters, init=self.init,
n_init=self.n_init, n_jobs=self.n_jobs,
random_state=self.random_state)
model.fit(data)
centroid = model.cluster_centers_
labels = model.labels_
return centroid, labels
class SpectralCoclustering(BaseSpectral):
"""Spectral Co-Clustering algorithm (Dhillon, 2001).
Clusters rows and columns of an array `X` to solve the relaxed
normalized cut of the bipartite graph created from `X` as follows:
the edge between row vertex `i` and column vertex `j` has weight
`X[i, j]`.
The resulting bicluster structure is block-diagonal, since each
row and each column belongs to exactly one bicluster.
Supports sparse matrices, as long as they are nonnegative.
Read more in the :ref:`User Guide <spectral_coclustering>`.
Parameters
----------
n_clusters : int, default=3
The number of biclusters to find.
svd_method : {'randomized', 'arpack'}, default='randomized'
Selects the algorithm for finding singular vectors. May be
'randomized' or 'arpack'. If 'randomized', use
:func:`sklearn.utils.extmath.randomized_svd`, which may be faster
for large matrices. If 'arpack', use
:func:`scipy.sparse.linalg.svds`, which is more accurate, but
possibly slower in some cases.
n_svd_vecs : int, default=None
Number of vectors to use in calculating the SVD. Corresponds
to `ncv` when `svd_method='arpack'` and `n_oversamples` when
`svd_method` is 'randomized'.
mini_batch : bool, default=False
Whether to use mini-batch k-means, which is faster but may get
different results.
init : {'k-means++', 'random'} or ndarray of shape \
(n_clusters, n_features), default='k-means++'
Method for initialization of k-means algorithm; defaults to
'k-means++'.
n_init : int, default=10
Number of random initializations that are tried with the
k-means algorithm.
If mini-batch k-means is used, the best initialization is
chosen and the algorithm runs once. Otherwise, the algorithm
is run for each initialization and the best solution chosen.
n_jobs : int, default=None
The number of jobs to use for the computation. This works by breaking
down the pairwise matrix into n_jobs even slices and computing them in
parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. deprecated:: 0.23
``n_jobs`` was deprecated in version 0.23 and will be removed in
0.25.
random_state : int, RandomState instance, default=None
Used for randomizing the singular value decomposition and the k-means
initialization. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Attributes
----------
rows_ : array-like of shape (n_row_clusters, n_rows)
Results of the clustering. `rows[i, r]` is True if
cluster `i` contains row `r`. Available only after calling ``fit``.
columns_ : array-like of shape (n_column_clusters, n_columns)
Results of the clustering, like `rows`.
row_labels_ : array-like of shape (n_rows,)
The bicluster label of each row.
column_labels_ : array-like of shape (n_cols,)
The bicluster label of each column.
Examples
--------
>>> from sklearn.cluster import SpectralCoclustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
>>> clustering.row_labels_ #doctest: +SKIP
array([0, 1, 1, 0, 0, 0], dtype=int32)
>>> clustering.column_labels_ #doctest: +SKIP
array([0, 0], dtype=int32)
>>> clustering
SpectralCoclustering(n_clusters=2, random_state=0)
References
----------
* Dhillon, Inderjit S, 2001. `Co-clustering documents and words using
bipartite spectral graph partitioning
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.140.3011>`__.
"""
@_deprecate_positional_args
def __init__(self, n_clusters=3, *, svd_method='randomized',
n_svd_vecs=None, mini_batch=False, init='k-means++',
n_init=10, n_jobs='deprecated', random_state=None):
super().__init__(n_clusters,
svd_method,
n_svd_vecs,
mini_batch,
init,
n_init,
n_jobs,
random_state)
def _fit(self, X):
normalized_data, row_diag, col_diag = _scale_normalize(X)
n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
u, v = self._svd(normalized_data, n_sv, n_discard=1)
z = np.vstack((row_diag[:, np.newaxis] * u,
col_diag[:, np.newaxis] * v))
_, labels = self._k_means(z, self.n_clusters)
n_rows = X.shape[0]
self.row_labels_ = labels[:n_rows]
self.column_labels_ = labels[n_rows:]
self.rows_ = np.vstack([self.row_labels_ == c
for c in range(self.n_clusters)])
self.columns_ = np.vstack([self.column_labels_ == c
for c in range(self.n_clusters)])
class SpectralBiclustering(BaseSpectral):
"""Spectral biclustering (Kluger, 2003).
Partitions rows and columns under the assumption that the data has
an underlying checkerboard structure. For instance, if there are
two row partitions and three column partitions, each row will
belong to three biclusters, and each column will belong to two
biclusters. The outer product of the corresponding row and column
label vectors gives this checkerboard structure.
Read more in the :ref:`User Guide <spectral_biclustering>`.
Parameters
----------
n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
The number of row and column clusters in the checkerboard
structure.
method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
Method of normalizing and converting singular vectors into
biclusters. May be one of 'scale', 'bistochastic', or 'log'.
The authors recommend using 'log'. If the data is sparse,
however, log normalization will not work, which is why the
default is 'bistochastic'.
.. warning::
if `method='log'`, the data must not be sparse.
n_components : int, default=6
Number of singular vectors to check.
n_best : int, default=3
Number of best singular vectors to which to project the data
for clustering.
svd_method : {'randomized', 'arpack'}, default='randomized'
Selects the algorithm for finding singular vectors. May be
'randomized' or 'arpack'. If 'randomized', uses
:func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
for large matrices. If 'arpack', uses
`scipy.sparse.linalg.svds`, which is more accurate, but
possibly slower in some cases.
n_svd_vecs : int, default=None
Number of vectors to use in calculating the SVD. Corresponds
to `ncv` when `svd_method='arpack'` and `n_oversamples` when
`svd_method` is 'randomized'.
mini_batch : bool, default=False
Whether to use mini-batch k-means, which is faster but may get
different results.
init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
default='k-means++'
Method for initialization of k-means algorithm; defaults to
'k-means++'.
n_init : int, default=10
Number of random initializations that are tried with the
k-means algorithm.
If mini-batch k-means is used, the best initialization is
chosen and the algorithm runs once. Otherwise, the algorithm
is run for each initialization and the best solution chosen.
n_jobs : int, default=None
The number of jobs to use for the computation. This works by breaking
down the pairwise matrix into n_jobs even slices and computing them in
parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. deprecated:: 0.23
``n_jobs`` was deprecated in version 0.23 and will be removed in
0.25.
random_state : int, RandomState instance, default=None
Used for randomizing the singular value decomposition and the k-means
initialization. Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Attributes
----------
rows_ : array-like of shape (n_row_clusters, n_rows)
Results of the clustering. `rows[i, r]` is True if
cluster `i` contains row `r`. Available only after calling ``fit``.
columns_ : array-like of shape (n_column_clusters, n_columns)
Results of the clustering, like `rows`.
row_labels_ : array-like of shape (n_rows,)
Row partition labels.
column_labels_ : array-like of shape (n_cols,)
Column partition labels.
Examples
--------
>>> from sklearn.cluster import SpectralBiclustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
>>> clustering.row_labels_
array([1, 1, 1, 0, 0, 0], dtype=int32)
>>> clustering.column_labels_
array([0, 1], dtype=int32)
>>> clustering
SpectralBiclustering(n_clusters=2, random_state=0)
References
----------
* Kluger, Yuval, et. al., 2003. `Spectral biclustering of microarray
data: coclustering genes and conditions
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.135.1608>`__.
"""
@_deprecate_positional_args
def __init__(self, n_clusters=3, *, method='bistochastic',
n_components=6, n_best=3, svd_method='randomized',
n_svd_vecs=None, mini_batch=False, init='k-means++',
n_init=10, n_jobs='deprecated', random_state=None):
super().__init__(n_clusters,
svd_method,
n_svd_vecs,
mini_batch,
init,
n_init,
n_jobs,
random_state)
self.method = method
self.n_components = n_components
self.n_best = n_best
def _check_parameters(self):
super()._check_parameters()
legal_methods = ('bistochastic', 'scale', 'log')
if self.method not in legal_methods:
raise ValueError("Unknown method: '{0}'. method must be"
" one of {1}.".format(self.method, legal_methods))
try:
int(self.n_clusters)
except TypeError:
try:
r, c = self.n_clusters
int(r)
int(c)
except (ValueError, TypeError):
raise ValueError("Incorrect parameter n_clusters has value:"
" {}. It should either be a single integer"
" or an iterable with two integers:"
" (n_row_clusters, n_column_clusters)")
if self.n_components < 1:
raise ValueError("Parameter n_components must be greater than 0,"
" but its value is {}".format(self.n_components))
if self.n_best < 1:
raise ValueError("Parameter n_best must be greater than 0,"
" but its value is {}".format(self.n_best))
if self.n_best > self.n_components:
raise ValueError("n_best cannot be larger than"
" n_components, but {} > {}"
"".format(self.n_best, self.n_components))
def _fit(self, X):
n_sv = self.n_components
if self.method == 'bistochastic':
normalized_data = _bistochastic_normalize(X)
n_sv += 1
elif self.method == 'scale':
normalized_data, _, _ = _scale_normalize(X)
n_sv += 1
elif self.method == 'log':
normalized_data = _log_normalize(X)
n_discard = 0 if self.method == 'log' else 1
u, v = self._svd(normalized_data, n_sv, n_discard)
ut = u.T
vt = v.T
try:
n_row_clusters, n_col_clusters = self.n_clusters
except TypeError:
n_row_clusters = n_col_clusters = self.n_clusters
best_ut = self._fit_best_piecewise(ut, self.n_best,
n_row_clusters)
best_vt = self._fit_best_piecewise(vt, self.n_best,
n_col_clusters)
self.row_labels_ = self._project_and_cluster(X, best_vt.T,
n_row_clusters)
self.column_labels_ = self._project_and_cluster(X.T, best_ut.T,
n_col_clusters)
self.rows_ = np.vstack([self.row_labels_ == label
for label in range(n_row_clusters)
for _ in range(n_col_clusters)])
self.columns_ = np.vstack([self.column_labels_ == label
for _ in range(n_row_clusters)
for label in range(n_col_clusters)])
def _fit_best_piecewise(self, vectors, n_best, n_clusters):
"""Find the ``n_best`` vectors that are best approximated by piecewise
constant vectors.
The piecewise vectors are found by k-means; the best is chosen
according to Euclidean distance.
"""
def make_piecewise(v):
centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
return centroid[labels].ravel()
piecewise_vectors = np.apply_along_axis(make_piecewise,
axis=1, arr=vectors)
dists = np.apply_along_axis(norm, axis=1,
arr=(vectors - piecewise_vectors))
result = vectors[np.argsort(dists)[:n_best]]
return result
def _project_and_cluster(self, data, vectors, n_clusters):
"""Project ``data`` to ``vectors`` and cluster the result."""
projected = safe_sparse_dot(data, vectors)
_, labels = self._k_means(projected, n_clusters)
return labels
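
A minimal end-to-end sketch of the class above: ``SpectralCoclustering`` recovering planted biclusters from a shuffled block matrix. It assumes ``make_biclusters`` and ``consensus_score`` are available from the same scikit-learn snapshot; the shape, noise and seed are arbitrary.

from sklearn.cluster import SpectralCoclustering
from sklearn.datasets import make_biclusters
from sklearn.metrics import consensus_score

# Planted block structure with noise; rows/cols are the ground-truth
# bicluster indicator matrices, permuted consistently with the data.
data, rows, cols = make_biclusters(shape=(300, 300), n_clusters=5,
                                   noise=5, shuffle=True, random_state=0)
model = SpectralCoclustering(n_clusters=5, random_state=0).fit(data)
score = consensus_score(model.biclusters_, (rows, cols))
print("consensus score: %.2f" % score)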


@@ -0,0 +1,658 @@
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# Joel Nothman <joel.nothman@gmail.com>
# License: BSD 3 clause
import warnings
import numbers
import numpy as np
from scipy import sparse
from math import sqrt
from ..metrics import pairwise_distances_argmin
from ..metrics.pairwise import euclidean_distances
from ..base import TransformerMixin, ClusterMixin, BaseEstimator
from ..utils import check_array
from ..utils.extmath import row_norms
from ..utils.validation import check_is_fitted, _deprecate_positional_args
from ..exceptions import ConvergenceWarning
from . import AgglomerativeClustering
def _iterate_sparse_X(X):
"""This little hack returns a densified row when iterating over a sparse
matrix, instead of constructing a sparse matrix for every row, which
is expensive.
"""
n_samples = X.shape[0]
X_indices = X.indices
X_data = X.data
X_indptr = X.indptr
for i in range(n_samples):
row = np.zeros(X.shape[1])
startptr, endptr = X_indptr[i], X_indptr[i + 1]
nonzero_indices = X_indices[startptr:endptr]
row[nonzero_indices] = X_data[startptr:endptr]
yield row
def _split_node(node, threshold, branching_factor):
"""The node has to be split if there is no place for a new subcluster
in the node.
1. Two empty nodes and two empty subclusters are initialized.
2. The pair of most distant subclusters is found.
3. The properties of the empty subclusters and nodes are updated
according to which of the two distant subclusters each existing
subcluster is nearest to.
4. The two nodes are set as children to the two subclusters.
"""
new_subcluster1 = _CFSubcluster()
new_subcluster2 = _CFSubcluster()
new_node1 = _CFNode(
threshold=threshold, branching_factor=branching_factor,
is_leaf=node.is_leaf,
n_features=node.n_features)
new_node2 = _CFNode(
threshold=threshold, branching_factor=branching_factor,
is_leaf=node.is_leaf,
n_features=node.n_features)
new_subcluster1.child_ = new_node1
new_subcluster2.child_ = new_node2
if node.is_leaf:
if node.prev_leaf_ is not None:
node.prev_leaf_.next_leaf_ = new_node1
new_node1.prev_leaf_ = node.prev_leaf_
new_node1.next_leaf_ = new_node2
new_node2.prev_leaf_ = new_node1
new_node2.next_leaf_ = node.next_leaf_
if node.next_leaf_ is not None:
node.next_leaf_.prev_leaf_ = new_node2
dist = euclidean_distances(
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)
n_clusters = dist.shape[0]
farthest_idx = np.unravel_index(
dist.argmax(), (n_clusters, n_clusters))
node1_dist, node2_dist = dist[(farthest_idx,)]
node1_closer = node1_dist < node2_dist
for idx, subcluster in enumerate(node.subclusters_):
if node1_closer[idx]:
new_node1.append_subcluster(subcluster)
new_subcluster1.update(subcluster)
else:
new_node2.append_subcluster(subcluster)
new_subcluster2.update(subcluster)
return new_subcluster1, new_subcluster2
class _CFNode:
"""Each node in a CFTree is called a CFNode.
The CFNode can have a maximum of branching_factor
number of CFSubclusters.
Parameters
----------
threshold : float
Threshold needed for a new subcluster to enter a CFSubcluster.
branching_factor : int
Maximum number of CF subclusters in each node.
is_leaf : bool
We need to know if the CFNode is a leaf or not, in order to
retrieve the final subclusters.
n_features : int
The number of features.
Attributes
----------
subclusters_ : list
List of subclusters for a particular CFNode.
prev_leaf_ : _CFNode
Previous leaf node. Useful only if is_leaf is True.
next_leaf_ : _CFNode
Next leaf node. Useful only if is_leaf is True, in order to retrieve
the final subclusters.
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
Manipulate ``init_centroids_`` throughout rather than centroids_ since
the centroids are just a view of ``init_centroids_``.
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
Manipulate ``init_sq_norm_`` throughout, similar to ``init_centroids_``.
centroids_ : ndarray of shape (branching_factor + 1, n_features)
View of ``init_centroids_``.
squared_norm_ : ndarray of shape (branching_factor + 1,)
View of ``init_sq_norm_``.
"""
def __init__(self, *, threshold, branching_factor, is_leaf, n_features):
self.threshold = threshold
self.branching_factor = branching_factor
self.is_leaf = is_leaf
self.n_features = n_features
# The list of subclusters, centroids and squared norms
# to manipulate throughout.
self.subclusters_ = []
self.init_centroids_ = np.zeros((branching_factor + 1, n_features))
self.init_sq_norm_ = np.zeros((branching_factor + 1))
self.squared_norm_ = []
self.prev_leaf_ = None
self.next_leaf_ = None
def append_subcluster(self, subcluster):
n_samples = len(self.subclusters_)
self.subclusters_.append(subcluster)
self.init_centroids_[n_samples] = subcluster.centroid_
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
# Keep centroids_ and squared_norm_ as views of init_centroids_ and
# init_sq_norm_, so that updating the latter is sufficient.
self.centroids_ = self.init_centroids_[:n_samples + 1, :]
self.squared_norm_ = self.init_sq_norm_[:n_samples + 1]
def update_split_subclusters(self, subcluster,
new_subcluster1, new_subcluster2):
"""Remove a subcluster from a node and update it with the
split subclusters.
"""
ind = self.subclusters_.index(subcluster)
self.subclusters_[ind] = new_subcluster1
self.init_centroids_[ind] = new_subcluster1.centroid_
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
self.append_subcluster(new_subcluster2)
def insert_cf_subcluster(self, subcluster):
"""Insert a new subcluster into the node."""
if not self.subclusters_:
self.append_subcluster(subcluster)
return False
threshold = self.threshold
branching_factor = self.branching_factor
# We need to find the closest subcluster among all the
# subclusters so that we can insert our new subcluster.
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
dist_matrix *= -2.
dist_matrix += self.squared_norm_
closest_index = np.argmin(dist_matrix)
closest_subcluster = self.subclusters_[closest_index]
# If the subcluster has a child, we need a recursive strategy.
if closest_subcluster.child_ is not None:
split_child = closest_subcluster.child_.insert_cf_subcluster(
subcluster)
if not split_child:
# If it is determined that the child need not be split, we
# can just update the closest_subcluster
closest_subcluster.update(subcluster)
self.init_centroids_[closest_index] = \
self.subclusters_[closest_index].centroid_
self.init_sq_norm_[closest_index] = \
self.subclusters_[closest_index].sq_norm_
return False
# The child had to be split: redistribute the subclusters in
# the child node and add a new subcluster in the parent
# node to accommodate the new child.
else:
new_subcluster1, new_subcluster2 = _split_node(
closest_subcluster.child_, threshold, branching_factor)
self.update_split_subclusters(
closest_subcluster, new_subcluster1, new_subcluster2)
if len(self.subclusters_) > self.branching_factor:
return True
return False
# good to go!
else:
merged = closest_subcluster.merge_subcluster(
subcluster, self.threshold)
if merged:
self.init_centroids_[closest_index] = \
closest_subcluster.centroid_
self.init_sq_norm_[closest_index] = \
closest_subcluster.sq_norm_
return False
# not close to any other subclusters, and we still
# have space, so add.
elif len(self.subclusters_) < self.branching_factor:
self.append_subcluster(subcluster)
return False
# We do not have enough space nor is it closer to an
# other subcluster. We need to split.
else:
self.append_subcluster(subcluster)
return True
class _CFSubcluster:
"""Each subcluster in a CFNode is called a CFSubcluster.
A CFSubcluster can have a CFNode as its child.
Parameters
----------
linear_sum : ndarray of shape (n_features,), default=None
Sample. This is kept optional to allow initialization of empty
subclusters.
Attributes
----------
n_samples_ : int
Number of samples that belong to each subcluster.
linear_sum_ : ndarray
Linear sum of all the samples in a subcluster. Prevents holding
all sample data in memory.
squared_sum_ : float
Sum of the squared l2 norms of all samples belonging to a subcluster.
centroid_ : ndarray of shape (n_features,)
Centroid of the subcluster. Prevent recomputing of centroids when
``CFNode.centroids_`` is called.
child_ : _CFNode
Child node of the subcluster. Once a given _CFNode is set as the child
of this subcluster, it is stored in ``self.child_``.
sq_norm_ : float
Squared norm of the subcluster. Used to prevent recomputing when
pairwise minimum distances are computed.
"""
def __init__(self, *, linear_sum=None):
if linear_sum is None:
self.n_samples_ = 0
self.squared_sum_ = 0.0
self.centroid_ = self.linear_sum_ = 0
else:
self.n_samples_ = 1
self.centroid_ = self.linear_sum_ = linear_sum
self.squared_sum_ = self.sq_norm_ = np.dot(
self.linear_sum_, self.linear_sum_)
self.child_ = None
def update(self, subcluster):
self.n_samples_ += subcluster.n_samples_
self.linear_sum_ += subcluster.linear_sum_
self.squared_sum_ += subcluster.squared_sum_
self.centroid_ = self.linear_sum_ / self.n_samples_
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
def merge_subcluster(self, nominee_cluster, threshold):
"""Check if a cluster is worthy enough to be merged. If
yes then merge.
"""
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
new_n = self.n_samples_ + nominee_cluster.n_samples_
new_centroid = (1 / new_n) * new_ls
new_norm = np.dot(new_centroid, new_centroid)
dot_product = (-2 * new_n) * new_norm
sq_radius = (new_ss + dot_product) / new_n + new_norm
if sq_radius <= threshold ** 2:
(self.n_samples_, self.linear_sum_, self.squared_sum_,
self.centroid_, self.sq_norm_) = \
new_n, new_ls, new_ss, new_centroid, new_norm
return True
return False
@property
def radius(self):
"""Return radius of the subcluster"""
dot_product = -2 * np.dot(self.linear_sum_, self.centroid_)
return sqrt(
((self.squared_sum_ + dot_product) / self.n_samples_) +
self.sq_norm_)
class Birch(ClusterMixin, TransformerMixin, BaseEstimator):
"""Implements the Birch clustering algorithm.
It is a memory-efficient, online-learning algorithm provided as an
alternative to :class:`MiniBatchKMeans`. It constructs a tree
data structure with the cluster centroids being read off the leaf.
These can be either the final cluster centroids or can be provided as input
to another clustering algorithm such as :class:`AgglomerativeClustering`.
Read more in the :ref:`User Guide <birch>`.
.. versionadded:: 0.16
Parameters
----------
threshold : float, default=0.5
The radius of the subcluster obtained by merging a new sample and the
closest subcluster should be smaller than the threshold. Otherwise a new
subcluster is started. Setting this value to be very low promotes
splitting and vice-versa.
branching_factor : int, default=50
Maximum number of CF subclusters in each node. If a new sample enters
such that the number of subclusters exceeds the branching_factor, then
that node is split into two nodes with the subclusters redistributed
in each. The parent subcluster of that node is removed and two new
subclusters are added as parents of the 2 split nodes.
n_clusters : int, instance of sklearn.cluster model, default=3
Number of clusters after the final clustering step, which treats the
subclusters from the leaves as new samples.
- `None` : the final clustering step is not performed and the
subclusters are returned as they are.
- :mod:`sklearn.cluster` Estimator : If a model is provided, the model
is fit treating the subclusters as new samples and the initial data
is mapped to the label of the closest subcluster.
- `int` : the model fit is :class:`AgglomerativeClustering` with
`n_clusters` set to be equal to the int.
compute_labels : bool, default=True
Whether or not to compute labels for each fit.
copy : bool, default=True
Whether or not to make a copy of the given data. If set to False,
the initial data will be overwritten.
Attributes
----------
root_ : _CFNode
Root of the CFTree.
dummy_leaf_ : _CFNode
Start pointer to all the leaves.
subcluster_centers_ : ndarray
Centroids of all subclusters read directly from the leaves.
subcluster_labels_ : ndarray
Labels assigned to the centroids of the subclusters after
they are clustered globally.
labels_ : ndarray of shape (n_samples,)
Array of labels assigned to the input data.
If partial_fit is used instead of fit, they are assigned to the
last batch of data.
See Also
--------
MiniBatchKMeans
Alternative implementation that does incremental updates
of the centers' positions using mini-batches.
Notes
-----
The tree data structure consists of nodes with each node consisting of
a number of subclusters. The maximum number of subclusters in a node
is determined by the branching factor. Each subcluster maintains a
linear sum, squared sum and the number of samples in that subcluster.
In addition, each subcluster can also have a node as its child, if the
subcluster is not a member of a leaf node.
For a new point entering the root, it is merged with the subcluster closest
to it and the linear sum, squared sum and the number of samples of that
subcluster are updated. This is done recursively till the properties of
the leaf node are updated.
References
----------
* Tian Zhang, Raghu Ramakrishnan, Miron Livny
BIRCH: An efficient data clustering method for large databases.
https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
* Roberto Perdisci
JBirch - Java implementation of BIRCH clustering algorithm
https://code.google.com/archive/p/jbirch
Examples
--------
>>> from sklearn.cluster import Birch
>>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
>>> brc = Birch(n_clusters=None)
>>> brc.fit(X)
Birch(n_clusters=None)
>>> brc.predict(X)
array([0, 0, 0, 1, 1, 1])
"""
@_deprecate_positional_args
def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3,
compute_labels=True, copy=True):
self.threshold = threshold
self.branching_factor = branching_factor
self.n_clusters = n_clusters
self.compute_labels = compute_labels
self.copy = copy
def fit(self, X, y=None):
"""
Build a CF Tree for the input data.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Fitted estimator.
"""
self.fit_, self.partial_fit_ = True, False
return self._fit(X)
def _fit(self, X):
X = self._validate_data(X, accept_sparse='csr', copy=self.copy)
threshold = self.threshold
branching_factor = self.branching_factor
if branching_factor <= 1:
raise ValueError("Branching_factor should be greater than one.")
n_samples, n_features = X.shape
# If partial_fit is called for the first time or fit is called, we
# start a new tree.
partial_fit = getattr(self, 'partial_fit_')
has_root = getattr(self, 'root_', None)
if getattr(self, 'fit_') or (partial_fit and not has_root):
# The first root is the leaf. Manipulate this object throughout.
self.root_ = _CFNode(threshold=threshold,
branching_factor=branching_factor,
is_leaf=True,
n_features=n_features)
# To enable getting back subclusters.
self.dummy_leaf_ = _CFNode(threshold=threshold,
branching_factor=branching_factor,
is_leaf=True, n_features=n_features)
self.dummy_leaf_.next_leaf_ = self.root_
self.root_.prev_leaf_ = self.dummy_leaf_
# Cannot vectorize. Enough to convince us to use cython.
if not sparse.issparse(X):
iter_func = iter
else:
iter_func = _iterate_sparse_X
for sample in iter_func(X):
subcluster = _CFSubcluster(linear_sum=sample)
split = self.root_.insert_cf_subcluster(subcluster)
if split:
new_subcluster1, new_subcluster2 = _split_node(
self.root_, threshold, branching_factor)
del self.root_
self.root_ = _CFNode(threshold=threshold,
branching_factor=branching_factor,
is_leaf=False,
n_features=n_features)
self.root_.append_subcluster(new_subcluster1)
self.root_.append_subcluster(new_subcluster2)
centroids = np.concatenate([
leaf.centroids_ for leaf in self._get_leaves()])
self.subcluster_centers_ = centroids
self._global_clustering(X)
return self
def _get_leaves(self):
"""
Retrieve the leaves of the CF Node.
Returns
-------
leaves : list of shape (n_leaves,)
List of the leaf nodes.
"""
leaf_ptr = self.dummy_leaf_.next_leaf_
leaves = []
while leaf_ptr is not None:
leaves.append(leaf_ptr)
leaf_ptr = leaf_ptr.next_leaf_
return leaves
def partial_fit(self, X=None, y=None):
"""
Online learning. Prevents rebuilding of CFTree from scratch.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), \
default=None
Input data. If X is not provided, only the global clustering
step is done.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
Fitted estimator.
"""
self.partial_fit_, self.fit_ = True, False
if X is None:
# Perform just the final global clustering step.
self._global_clustering()
return self
else:
self._check_fit(X)
return self._fit(X)
def _check_fit(self, X):
check_is_fitted(self)
if (hasattr(self, 'subcluster_centers_') and
X.shape[1] != self.subcluster_centers_.shape[1]):
raise ValueError(
"Training data and predicted data do "
"not have the same number of features.")
def predict(self, X):
"""
Predict data using the ``centroids_`` of subclusters.
Avoid computation of the row norms of X.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
Returns
-------
labels : ndarray of shape (n_samples,)
Labelled data.
"""
X = check_array(X, accept_sparse='csr')
self._check_fit(X)
kwargs = {'Y_norm_squared': self._subcluster_norms}
return self.subcluster_labels_[
pairwise_distances_argmin(X,
self.subcluster_centers_,
metric_kwargs=kwargs)
]
def transform(self, X):
"""
Transform X into subcluster centroids dimension.
Each dimension represents the distance from the sample point to each
cluster centroid.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
Returns
-------
X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
Transformed data.
"""
check_is_fitted(self)
return euclidean_distances(X, self.subcluster_centers_)
def _global_clustering(self, X=None):
"""
Global clustering for the subclusters obtained after fitting
"""
clusterer = self.n_clusters
centroids = self.subcluster_centers_
compute_labels = (X is not None) and self.compute_labels
# Preprocessing for the global clustering.
not_enough_centroids = False
if isinstance(clusterer, numbers.Integral):
clusterer = AgglomerativeClustering(
n_clusters=self.n_clusters)
# There is no need to perform the global clustering step.
if len(centroids) < self.n_clusters:
not_enough_centroids = True
elif (clusterer is not None and not
hasattr(clusterer, 'fit_predict')):
raise ValueError("n_clusters should be an instance of "
"ClusterMixin or an int")
# To use in predict to avoid recalculation.
self._subcluster_norms = row_norms(
self.subcluster_centers_, squared=True)
if clusterer is None or not_enough_centroids:
self.subcluster_labels_ = np.arange(len(centroids))
if not_enough_centroids:
warnings.warn(
"Number of subclusters found (%d) by Birch is less "
"than (%d). Decrease the threshold."
% (len(centroids), self.n_clusters), ConvergenceWarning)
else:
# The global clustering step that clusters the subclusters of
# the leaves. It assumes the centroids of the subclusters as
# samples and finds the final centroids.
self.subcluster_labels_ = clusterer.fit_predict(
self.subcluster_centers_)
if compute_labels:
self.labels_ = self.predict(X)
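
A small streaming sketch of the online usage described in ``partial_fit`` above: the CF tree is grown batch by batch and the global clustering step is rerun after each call. The synthetic blobs and parameter values below are arbitrary and assume a scikit-learn installation matching this snapshot.

import numpy as np
from sklearn.cluster import Birch

rng = np.random.RandomState(0)
X = np.concatenate([rng.normal(center, 0.2, size=(100, 2))
                    for center in ((0., 0.), (3., 3.))])
brc = Birch(threshold=0.5, n_clusters=2)
# Feed the data in chunks; each call inserts new samples into the
# existing CF tree instead of rebuilding it from scratch.
for batch in np.array_split(X, 4):
    brc.partial_fit(batch)
print(brc.predict(X[:5]))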


@@ -0,0 +1,392 @@
# -*- coding: utf-8 -*-
"""
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
"""
# Author: Robert Layton <robertlayton@gmail.com>
# Joel Nothman <joel.nothman@gmail.com>
# Lars Buitinck
#
# License: BSD 3 clause
import numpy as np
import warnings
from scipy import sparse
from ..base import BaseEstimator, ClusterMixin
from ..utils.validation import _check_sample_weight, _deprecate_positional_args
from ..neighbors import NearestNeighbors
from ._dbscan_inner import dbscan_inner
@_deprecate_positional_args
def dbscan(X, eps=0.5, *, min_samples=5, metric='minkowski',
metric_params=None, algorithm='auto', leaf_size=30, p=2,
sample_weight=None, n_jobs=None):
"""Perform DBSCAN clustering from vector array or distance matrix.
Read more in the :ref:`User Guide <dbscan>`.
Parameters
----------
X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
A feature array, or array of distances between samples if
``metric='precomputed'``.
eps : float, default=0.5
The maximum distance between two samples for one to be considered
as in the neighborhood of the other. This is not a maximum bound
on the distances of points within a cluster. This is the most
important DBSCAN parameter to choose appropriately for your data set
and distance function.
min_samples : int, default=5
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.
metric : str or callable, default='minkowski'
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit.
X may be a :term:`sparse graph <sparse graph>`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.19
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.
leaf_size : int, default=30
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.
p : float, default=2
The power of the Minkowski metric to be used to calculate distance
between points.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with negative
weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search. ``None`` means
1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
using all processors. See :term:`Glossary <n_jobs>` for more details.
If precomputed distances are used, parallel execution is not available
and thus n_jobs will have no effect.
Returns
-------
core_samples : ndarray of shape (n_core_samples,)
Indices of core samples.
labels : ndarray of shape (n_samples,)
Cluster labels for each point. Noisy samples are given the label -1.
See also
--------
DBSCAN
An estimator interface for this clustering algorithm.
OPTICS
A similar estimator interface clustering at multiple values of eps. Our
implementation is optimized for memory usage.
Notes
-----
For an example, see :ref:`examples/cluster/plot_dbscan.py
<sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while original DBSCAN had memory complexity O(n). It may attract a higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.
One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.
:class:`~sklearn.cluster.OPTICS` provides a similar
clustering with lower memory usage.
References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
ACM Transactions on Database Systems (TODS), 42(3), 19.
"""
est = DBSCAN(eps=eps, min_samples=min_samples, metric=metric,
metric_params=metric_params, algorithm=algorithm,
leaf_size=leaf_size, p=p, n_jobs=n_jobs)
est.fit(X, sample_weight=sample_weight)
return est.core_sample_indices_, est.labels_
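# A minimal usage sketch of this functional interface. The toy data mirrors the
# example in the DBSCAN class docstring below; since ``dbscan`` simply wraps the
# estimator, the outputs agree with ``DBSCAN(...).fit(X)``.
#
import numpy as np
from sklearn.cluster import dbscan

X = np.array([[1, 2], [2, 2], [2, 3],
              [8, 7], [8, 8], [25, 80]])
# eps/min_samples chosen so two dense groups form and the last point is noise.
core_indices, labels = dbscan(X, eps=3, min_samples=2)
# core_indices -> array([0, 1, 2, 3, 4]); labels -> array([ 0,  0,  0,  1,  1, -1])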
class DBSCAN(ClusterMixin, BaseEstimator):
"""Perform DBSCAN clustering from vector array or distance matrix.
DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
Finds core samples of high density and expands clusters from them.
Good for data which contains clusters of similar density.
Read more in the :ref:`User Guide <dbscan>`.
Parameters
----------
eps : float, default=0.5
The maximum distance between two samples for one to be considered
as in the neighborhood of the other. This is not a maximum bound
on the distances of points within a cluster. This is the most
important DBSCAN parameter to choose appropriately for your data set
and distance function.
min_samples : int, default=5
The number of samples (or total weight) in a neighborhood for a point
to be considered as a core point. This includes the point itself.
metric : string, or callable, default='euclidean'
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square. X may be a :term:`sparse graph <sparse graph>`, in which
case only "nonzero" elements may be considered neighbors for DBSCAN.
.. versionadded:: 0.17
metric *precomputed* to accept precomputed sparse matrix.
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.19
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
The algorithm to be used by the NearestNeighbors module
to compute pointwise distances and find nearest neighbors.
See NearestNeighbors module documentation for details.
leaf_size : int, default=30
Leaf size passed to BallTree or cKDTree. This can affect the speed
of the construction and query, as well as the memory required
to store the tree. The optimal value depends
on the nature of the problem.
p : float, default=None
The power of the Minkowski metric to be used to calculate distance
between points.
n_jobs : int, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
core_sample_indices_ : ndarray of shape (n_core_samples,)
Indices of core samples.
components_ : ndarray of shape (n_core_samples, n_features)
Copy of each core sample found by training.
labels_ : ndarray of shape (n_samples)
Cluster labels for each point in the dataset given to fit().
Noisy samples are given the label -1.
Examples
--------
>>> from sklearn.cluster import DBSCAN
>>> import numpy as np
>>> X = np.array([[1, 2], [2, 2], [2, 3],
... [8, 7], [8, 8], [25, 80]])
>>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
>>> clustering.labels_
array([ 0, 0, 0, 1, 1, -1])
>>> clustering
DBSCAN(eps=3, min_samples=2)
See also
--------
OPTICS
A similar clustering at multiple values of eps. Our implementation
is optimized for memory usage.
Notes
-----
For an example, see :ref:`examples/cluster/plot_dbscan.py
<sphx_glr_auto_examples_cluster_plot_dbscan.py>`.
This implementation bulk-computes all neighborhood queries, which increases
the memory complexity to O(n.d) where d is the average number of neighbors,
while original DBSCAN had memory complexity O(n). It may attract a higher
memory complexity when querying these nearest neighborhoods, depending
on the ``algorithm``.
One way to avoid the query complexity is to pre-compute sparse
neighborhoods in chunks using
:func:`NearestNeighbors.radius_neighbors_graph
<sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
``mode='distance'``, then using ``metric='precomputed'`` here.
Another way to reduce memory and computation time is to remove
(near-)duplicate points and use ``sample_weight`` instead.
:class:`cluster.OPTICS` provides a similar clustering with lower memory
usage.
References
----------
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
In: Proceedings of the 2nd International Conference on Knowledge Discovery
and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
ACM Transactions on Database Systems (TODS), 42(3), 19.
"""
@_deprecate_positional_args
def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean',
metric_params=None, algorithm='auto', leaf_size=30, p=None,
n_jobs=None):
self.eps = eps
self.min_samples = min_samples
self.metric = metric
self.metric_params = metric_params
self.algorithm = algorithm
self.leaf_size = leaf_size
self.p = p
self.n_jobs = n_jobs
def fit(self, X, y=None, sample_weight=None):
"""Perform DBSCAN clustering from features, or distance matrix.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``. If a sparse matrix is provided, it will
be converted into a sparse ``csr_matrix``.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with a
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
"""
X = self._validate_data(X, accept_sparse='csr')
if not self.eps > 0.0:
raise ValueError("eps must be positive.")
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)
# Calculate neighborhood for all samples. This leaves the original
# point in, which needs to be considered later (i.e. point i is in the
# neighborhood of point i; while true, it is useless information).
if self.metric == 'precomputed' and sparse.issparse(X):
# set the diagonal to explicit values, as a point is its own
# neighbor
with warnings.catch_warnings():
warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place
neighbors_model = NearestNeighbors(
radius=self.eps, algorithm=self.algorithm,
leaf_size=self.leaf_size, metric=self.metric,
metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)
neighbors_model.fit(X)
# This has worst case O(n^2) memory complexity
neighborhoods = neighbors_model.radius_neighbors(X,
return_distance=False)
if sample_weight is None:
n_neighbors = np.array([len(neighbors)
for neighbors in neighborhoods])
else:
n_neighbors = np.array([np.sum(sample_weight[neighbors])
for neighbors in neighborhoods])
# Initially, all samples are noise.
labels = np.full(X.shape[0], -1, dtype=np.intp)
# A list of all core samples found.
core_samples = np.asarray(n_neighbors >= self.min_samples,
dtype=np.uint8)
dbscan_inner(core_samples, neighborhoods, labels)
self.core_sample_indices_ = np.where(core_samples)[0]
self.labels_ = labels
if len(self.core_sample_indices_):
# fix for scipy sparse indexing issue
self.components_ = X[self.core_sample_indices_].copy()
else:
# no core samples
self.components_ = np.empty((0, X.shape[1]))
return self
def fit_predict(self, X, y=None, sample_weight=None):
"""Perform DBSCAN clustering from features or distance matrix,
and return cluster labels.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``. If a sparse matrix is provided, it will
be converted into a sparse ``csr_matrix``.
sample_weight : array-like of shape (n_samples,), default=None
Weight of each sample, such that a sample with a weight of at least
``min_samples`` is by itself a core sample; a sample with a
negative weight may inhibit its eps-neighbor from being core.
Note that weights are absolute, and default to 1.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray of shape (n_samples,)
Cluster labels. Noisy samples are given the label -1.
"""
self.fit(X, sample_weight=sample_weight)
return self.labels_
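# As suggested in the Notes above, memory use can be bounded by precomputing a
# sparse radius-neighbors graph and passing it with ``metric='precomputed'``.
# A rough sketch; the data, ``eps`` and ``min_samples`` below are illustrative,
# not tuned values.
#
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(200, 2)
eps = 0.1  # illustrative radius

# Sparse CSR matrix holding only distances within eps; only stored entries are
# considered as candidate neighbors when metric='precomputed'.
graph = NearestNeighbors(radius=eps).fit(X).radius_neighbors_graph(
    X, mode='distance')
labels = DBSCAN(eps=eps, min_samples=5, metric='precomputed').fit_predict(graph)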

View file

@@ -0,0 +1,77 @@
"""
Feature agglomeration. Base classes and functions for performing feature
agglomeration.
"""
# Author: V. Michel, A. Gramfort
# License: BSD 3 clause
import numpy as np
from ..base import TransformerMixin
from ..utils import check_array
from ..utils.validation import check_is_fitted
from scipy.sparse import issparse
###############################################################################
# Mixin class for feature agglomeration.
class AgglomerationTransform(TransformerMixin):
"""
A class for feature agglomeration via the transform interface
"""
def transform(self, X):
"""
Transform a new matrix using the built clustering
Parameters
----------
X : array-like of shape (n_samples, n_features) or (n_samples,)
An M by N array of M observations in N dimensions or a length
M array of M one-dimensional observations.
Returns
-------
Y : array, shape = [n_samples, n_clusters] or [n_clusters]
The pooled values for each feature cluster.
"""
check_is_fitted(self)
X = check_array(X)
if len(self.labels_) != X.shape[1]:
raise ValueError("X has a different number of features than "
"during fitting.")
if self.pooling_func == np.mean and not issparse(X):
size = np.bincount(self.labels_)
n_samples = X.shape[0]
# a fast way to compute the mean of grouped features
nX = np.array([np.bincount(self.labels_, X[i, :]) / size
for i in range(n_samples)])
else:
nX = [self.pooling_func(X[:, self.labels_ == l], axis=1)
for l in np.unique(self.labels_)]
nX = np.array(nX).T
return nX
def inverse_transform(self, Xred):
"""
Inverse the transformation.
Return an array of size n_features with the values of Xred assigned
to each group of features.
Parameters
----------
Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,)
The values to be assigned to each cluster of features.
Returns
-------
X : array, shape=[n_samples, n_features] or [n_features]
An array of shape [n_samples, n_features] (or a vector of size
n_features) with the values of Xred assigned to each group of features.
"""
check_is_fitted(self)
unil, inverse = np.unique(self.labels_, return_inverse=True)
return Xred[..., inverse]
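# This mixin is consumed by FeatureAgglomeration. A small sketch with toy
# values of the pooled transform and its inverse: each feature cluster is
# reduced to its mean and then broadcast back to the original feature layout.
#
import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.array([[0., 1., 1., 4.],
              [2., 3., 3., 8.]])
agglo = FeatureAgglomeration(n_clusters=2).fit(X)
Xred = agglo.transform(X)              # shape (2, 2): one pooled value per feature cluster
Xback = agglo.inverse_transform(Xred)  # shape (2, 4): cluster means broadcast back to features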

View file

@@ -0,0 +1,23 @@
# cython: language_level=3
from cython cimport floating
cimport numpy as np
cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil
cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
floating, bint) nogil
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cpdef void _relocate_empty_clusters_sparse(
floating[::1], int[::1], int[::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cdef void _average_centers(floating[:, ::1], floating[::1])
cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1])

File diff suppressed because it is too large

View file

@@ -0,0 +1,465 @@
"""Mean shift clustering algorithm.
Mean shift clustering aims to discover *blobs* in a smooth density of
samples. It is a centroid based algorithm, which works by updating candidates
for centroids to be the mean of the points within a given region. These
candidates are then filtered in a post-processing stage to eliminate
near-duplicates to form the final set of centroids.
Seeding is performed using a binning technique for scalability.
"""
# Authors: Conrad Lee <conradlee@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Martino Sorbaro <martino.sorbaro@ed.ac.uk>
import numpy as np
import warnings
from joblib import Parallel, delayed
from collections import defaultdict
from ..utils.validation import check_is_fitted, _deprecate_positional_args
from ..utils import check_random_state, gen_batches, check_array
from ..base import BaseEstimator, ClusterMixin
from ..neighbors import NearestNeighbors
from ..metrics.pairwise import pairwise_distances_argmin
@_deprecate_positional_args
def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0,
n_jobs=None):
"""Estimate the bandwidth to use with the mean-shift algorithm.
Note that this function takes time at least quadratic in n_samples. For
large datasets, it is wise to set the n_samples parameter to a small value.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input points.
quantile : float, default=0.3
Should be between [0, 1]; 0.5 means that the median of all pairwise
distances is used.
n_samples : int, default=None
The number of samples to use. If not given, all samples are used.
random_state : int, RandomState instance, default=0
The generator used to randomly select the samples from input points
for bandwidth estimation. Use an int to make the randomness
deterministic.
See :term:`Glossary <random_state>`.
n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
bandwidth : float
The bandwidth parameter.
"""
X = check_array(X)
random_state = check_random_state(random_state)
if n_samples is not None:
idx = random_state.permutation(X.shape[0])[:n_samples]
X = X[idx]
n_neighbors = int(X.shape[0] * quantile)
if n_neighbors < 1: # cannot fit NearestNeighbors with n_neighbors = 0
n_neighbors = 1
nbrs = NearestNeighbors(n_neighbors=n_neighbors,
n_jobs=n_jobs)
nbrs.fit(X)
bandwidth = 0.
for batch in gen_batches(len(X), 500):
d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
bandwidth += np.max(d, axis=1).sum()
return bandwidth / X.shape[0]
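# A short sketch of this estimate on synthetic blobs; the quantile and
# subsample size are illustrative values, not recommendations.
#
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
# Subsampling keeps the (at least) quadratic cost of the estimate manageable.
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=200, random_state=0)
labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(X)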
# separate function for each seed's iterative loop
def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
# For each seed, climb gradient until convergence or max_iter
bandwidth = nbrs.get_params()['radius']
stop_thresh = 1e-3 * bandwidth # when mean has converged
completed_iterations = 0
while True:
# Find mean of points within bandwidth
i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
return_distance=False)[0]
points_within = X[i_nbrs]
if len(points_within) == 0:
break # Depending on seeding strategy this condition may occur
my_old_mean = my_mean # save the old mean
my_mean = np.mean(points_within, axis=0)
# If converged or at max_iter, add the cluster
if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or
completed_iterations == max_iter):
break
completed_iterations += 1
return tuple(my_mean), len(points_within), completed_iterations
@_deprecate_positional_args
def mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False,
min_bin_freq=1, cluster_all=True, max_iter=300,
n_jobs=None):
"""Perform mean shift clustering of data using a flat kernel.
Read more in the :ref:`User Guide <mean_shift>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data.
bandwidth : float, default=None
Kernel bandwidth.
If bandwidth is not given, it is determined using a heuristic based on
the median of all pairwise distances. This will take quadratic time in
the number of samples. The sklearn.cluster.estimate_bandwidth function
can be used to do this more efficiently.
seeds : array-like of shape (n_seeds, n_features) or None
Points used as initial kernel locations. If None and bin_seeding=False,
each data point is used as a seed. If None and bin_seeding=True,
see bin_seeding.
bin_seeding : boolean, default=False
If true, initial kernel locations are not locations of all
points, but rather the location of the discretized version of
points, where points are binned onto a grid whose coarseness
corresponds to the bandwidth. Setting this option to True will speed
up the algorithm because fewer seeds will be initialized.
Ignored if seeds argument is not None.
min_bin_freq : int, default=1
To speed up the algorithm, accept only those bins with at least
min_bin_freq points as seeds.
cluster_all : bool, default=True
If true, then all points are clustered, even those orphans that are
not within any kernel. Orphans are assigned to the nearest kernel.
If false, then orphans are given cluster label -1.
max_iter : int, default=300
Maximum number of iterations per seed point before the clustering
operation terminates (for that seed point), if it has not converged yet.
n_jobs : int, default=None
The number of jobs to use for the computation. This works by running
the search over seeds in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. versionadded:: 0.17
Parallel Execution using *n_jobs*.
Returns
-------
cluster_centers : array, shape=[n_clusters, n_features]
Coordinates of cluster centers.
labels : array, shape=[n_samples]
Cluster labels for each point.
Notes
-----
For an example, see :ref:`examples/cluster/plot_mean_shift.py
<sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
"""
model = MeanShift(bandwidth=bandwidth, seeds=seeds,
min_bin_freq=min_bin_freq,
bin_seeding=bin_seeding,
cluster_all=cluster_all, n_jobs=n_jobs,
max_iter=max_iter).fit(X)
return model.cluster_centers_, model.labels_
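# A minimal sketch of the functional interface, mirroring the toy data used in
# the MeanShift class example further down.
#
import numpy as np
from sklearn.cluster import mean_shift

X = np.array([[1, 1], [2, 1], [1, 0],
              [4, 7], [3, 5], [3, 6]])
centers, labels = mean_shift(X, bandwidth=2)
# labels -> array([1, 1, 1, 0, 0, 0]); centers holds one row per flat-kernel mode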
def get_bin_seeds(X, bin_size, min_bin_freq=1):
"""Finds seeds for mean_shift.
Finds seeds by first binning data onto a grid whose lines are
spaced bin_size apart, and then choosing those bins with at least
min_bin_freq points.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input points, the same points that will be used in mean_shift.
bin_size : float
Controls the coarseness of the binning. Smaller values lead
to more seeding (which is computationally more expensive). If you're
not sure how to set this, set it to the value of the bandwidth used
in clustering.mean_shift.
min_bin_freq : int, default=1
Only bins with at least min_bin_freq will be selected as seeds.
Raising this value decreases the number of seeds found, which
makes mean_shift computationally cheaper.
Returns
-------
bin_seeds : array-like of shape (n_samples, n_features)
Points used as initial kernel positions in clustering.mean_shift.
"""
if bin_size == 0:
return X
# Bin points
bin_sizes = defaultdict(int)
for point in X:
binned_point = np.round(point / bin_size)
bin_sizes[tuple(binned_point)] += 1
# Select only those bins as seeds which have enough members
bin_seeds = np.array([point for point, freq in bin_sizes.items() if
freq >= min_bin_freq], dtype=np.float32)
if len(bin_seeds) == len(X):
warnings.warn("Binning data failed with provided bin_size=%f,"
" using data points as seeds." % bin_size)
return X
bin_seeds = bin_seeds * bin_size
return bin_seeds
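# A tiny sketch of the binning: with a hypothetical bin_size of 1, the four
# points below collapse onto two grid nodes, each with enough members to
# become a seed.
#
import numpy as np
from sklearn.cluster import get_bin_seeds

X = np.array([[1.0, 1.1], [1.2, 0.9],
              [5.0, 5.1], [4.9, 5.2]])
seeds = get_bin_seeds(X, bin_size=1, min_bin_freq=2)
# seeds -> roughly array([[1., 1.], [5., 5.]]) (row order may differ)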
class MeanShift(ClusterMixin, BaseEstimator):
"""Mean shift clustering using a flat kernel.
Mean shift clustering aims to discover "blobs" in a smooth density of
samples. It is a centroid-based algorithm, which works by updating
candidates for centroids to be the mean of the points within a given
region. These candidates are then filtered in a post-processing stage to
eliminate near-duplicates to form the final set of centroids.
Seeding is performed using a binning technique for scalability.
Read more in the :ref:`User Guide <mean_shift>`.
Parameters
----------
bandwidth : float, default=None
Bandwidth used in the flat kernel.
If not given, the bandwidth is estimated using
sklearn.cluster.estimate_bandwidth; see the documentation for that
function for hints on scalability (see also the Notes, below).
seeds : array-like of shape (n_samples, n_features), default=None
Seeds used to initialize kernels. If not set,
the seeds are calculated by clustering.get_bin_seeds
with bandwidth as the grid size and default values for
other parameters.
bin_seeding : bool, default=False
If true, initial kernel locations are not locations of all
points, but rather the location of the discretized version of
points, where points are binned onto a grid whose coarseness
corresponds to the bandwidth. Setting this option to True will speed
up the algorithm because fewer seeds will be initialized.
The default value is False.
Ignored if seeds argument is not None.
min_bin_freq : int, default=1
To speed up the algorithm, accept only those bins with at least
min_bin_freq points as seeds.
cluster_all : bool, default=True
If true, then all points are clustered, even those orphans that are
not within any kernel. Orphans are assigned to the nearest kernel.
If false, then orphans are given cluster label -1.
n_jobs : int, default=None
The number of jobs to use for the computation. This works by running
the search over seeds in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_iter : int, default=300
Maximum number of iterations per seed point before the clustering
operation terminates (for that seed point), if it has not converged yet.
.. versionadded:: 0.22
Attributes
----------
cluster_centers_ : array, [n_clusters, n_features]
Coordinates of cluster centers.
labels_ : array of shape (n_samples,)
Labels of each point.
n_iter_ : int
Maximum number of iterations performed on each seed.
.. versionadded:: 0.22
Examples
--------
>>> from sklearn.cluster import MeanShift
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = MeanShift(bandwidth=2).fit(X)
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])
>>> clustering.predict([[0, 0], [5, 5]])
array([1, 0])
>>> clustering
MeanShift(bandwidth=2)
Notes
-----
Scalability:
Because this implementation uses a flat kernel and
a Ball Tree to look up members of each kernel, the complexity will tend
towards O(T*n*log(n)) in lower dimensions, with n the number of samples
and T the number of points. In higher dimensions the complexity will
tend towards O(T*n^2).
Scalability can be boosted by using fewer seeds, for example by using
a higher value of min_bin_freq in the get_bin_seeds function.
Note that the estimate_bandwidth function is much less scalable than the
mean shift algorithm and will be the bottleneck if it is used.
References
----------
Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
feature space analysis". IEEE Transactions on Pattern Analysis and
Machine Intelligence. 2002. pp. 603-619.
"""
@_deprecate_positional_args
def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False,
min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300):
self.bandwidth = bandwidth
self.seeds = seeds
self.bin_seeding = bin_seeding
self.cluster_all = cluster_all
self.min_bin_freq = min_bin_freq
self.n_jobs = n_jobs
self.max_iter = max_iter
def fit(self, X, y=None):
"""Perform clustering.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Samples to cluster.
y : Ignored
"""
X = self._validate_data(X)
bandwidth = self.bandwidth
if bandwidth is None:
bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)
elif bandwidth <= 0:
raise ValueError("bandwidth needs to be greater than zero or None,"
" got %f" % bandwidth)
seeds = self.seeds
if seeds is None:
if self.bin_seeding:
seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
else:
seeds = X
n_samples, n_features = X.shape
center_intensity_dict = {}
# We use n_jobs=1 because this will be used in nested calls under
# parallel calls to _mean_shift_single_seed so there is no need
# for further parallelism.
nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
# execute iterations on all seeds in parallel
all_res = Parallel(n_jobs=self.n_jobs)(
delayed(_mean_shift_single_seed)
(seed, X, nbrs, self.max_iter) for seed in seeds)
# copy results in a dictionary
for i in range(len(seeds)):
if all_res[i][1]: # i.e. len(points_within) > 0
center_intensity_dict[all_res[i][0]] = all_res[i][1]
self.n_iter_ = max([x[2] for x in all_res])
if not center_intensity_dict:
# nothing near seeds
raise ValueError("No point was within bandwidth=%f of any seed."
" Try a different seeding strategy \
or increase the bandwidth."
% bandwidth)
# POST PROCESSING: remove near duplicate points
# If the distance between two kernels is less than the bandwidth,
# then we have to remove one because it is a duplicate. Remove the
# one with fewer points.
sorted_by_intensity = sorted(center_intensity_dict.items(),
key=lambda tup: (tup[1], tup[0]),
reverse=True)
sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=bandwidth,
n_jobs=self.n_jobs).fit(sorted_centers)
for i, center in enumerate(sorted_centers):
if unique[i]:
neighbor_idxs = nbrs.radius_neighbors([center],
return_distance=False)[0]
unique[neighbor_idxs] = 0
unique[i] = 1 # leave the current point as unique
cluster_centers = sorted_centers[unique]
# ASSIGN LABELS: a point belongs to the cluster that it is closest to
nbrs = NearestNeighbors(n_neighbors=1,
n_jobs=self.n_jobs).fit(cluster_centers)
labels = np.zeros(n_samples, dtype=int)
distances, idxs = nbrs.kneighbors(X)
if self.cluster_all:
labels = idxs.flatten()
else:
labels.fill(-1)
bool_selector = distances.flatten() <= bandwidth
labels[bool_selector] = idxs.flatten()[bool_selector]
self.cluster_centers_, self.labels_ = cluster_centers, labels
return self
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to.
Parameters
----------
X : {array-like, sparse matrix}, shape=[n_samples, n_features]
New data to predict.
Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
"""
check_is_fitted(self)
return pairwise_distances_argmin(X, self.cluster_centers_)

View file

@@ -0,0 +1,928 @@
# -*- coding: utf-8 -*-
"""Ordering Points To Identify the Clustering Structure (OPTICS)
These routines execute the OPTICS algorithm, and implement various
cluster extraction methods of the ordered list.
Authors: Shane Grigsby <refuge@rocktalus.com>
Adrin Jalali <adrinjalali@gmail.com>
Erich Schubert <erich@debian.org>
Hanmin Qin <qinhanmin2005@sina.com>
License: BSD 3 clause
"""
import warnings
import numpy as np
from ..utils import gen_batches, get_chunk_n_rows
from ..utils.validation import _deprecate_positional_args
from ..neighbors import NearestNeighbors
from ..base import BaseEstimator, ClusterMixin
from ..metrics import pairwise_distances
class OPTICS(ClusterMixin, BaseEstimator):
"""Estimate clustering structure from vector array.
OPTICS (Ordering Points To Identify the Clustering Structure), closely
related to DBSCAN, finds core samples of high density and expands clusters
from them [1]_. Unlike DBSCAN, it keeps the cluster hierarchy for a variable
neighborhood radius, and is better suited for usage on large datasets than
the current sklearn implementation of DBSCAN.
Clusters are then extracted using a DBSCAN-like method
(cluster_method = 'dbscan') or an automatic
technique proposed in [1]_ (cluster_method = 'xi').
This implementation deviates from the original OPTICS by first performing
k-nearest-neighborhood searches on all points to identify core sizes, then
computing only the distances to unprocessed points when constructing the
cluster order. Note that we do not employ a heap to manage the expansion
candidates, so the time complexity will be O(n^2).
Read more in the :ref:`User Guide <optics>`.
Parameters
----------
min_samples : int > 1 or float between 0 and 1 (default=5)
The number of samples in a neighborhood for a point to be considered as
a core point. Also, up and down steep regions can't have more than
``min_samples`` consecutive non-steep points. Expressed as an absolute
number or a fraction of the number of samples (rounded to be at least
2).
max_eps : float, optional (default=np.inf)
The maximum distance between two samples for one to be considered as
in the neighborhood of the other. Default value of ``np.inf`` will
identify clusters across all scales; reducing ``max_eps`` will result
in shorter run times.
metric : str or callable, optional (default='minkowski')
Metric to use for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.
If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string. If metric is
"precomputed", X is assumed to be a distance matrix and must be square.
Valid values for metric are:
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule']
See the documentation for scipy.spatial.distance for details on these
metrics.
p : int, optional (default=2)
Parameter for the Minkowski metric from
:func:`sklearn.metrics.pairwise_distances`. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, optional (default=None)
Additional keyword arguments for the metric function.
cluster_method : str, optional (default='xi')
The extraction method used to extract clusters using the calculated
reachability and ordering. Possible values are "xi" and "dbscan".
eps : float, optional (default=None)
The maximum distance between two samples for one to be considered as
in the neighborhood of the other. By default it assumes the same value
as ``max_eps``.
Used only when ``cluster_method='dbscan'``.
xi : float, between 0 and 1, optional (default=0.05)
Determines the minimum steepness on the reachability plot that
constitutes a cluster boundary. For example, an upwards point in the
reachability plot is defined by the ratio from one point to its
successor being at most 1-xi.
Used only when ``cluster_method='xi'``.
predecessor_correction : bool, optional (default=True)
Correct clusters according to the predecessors calculated by OPTICS
[2]_. This parameter has minimal effect on most datasets.
Used only when ``cluster_method='xi'``.
min_cluster_size : int > 1 or float between 0 and 1 (default=None)
Minimum number of samples in an OPTICS cluster, expressed as an
absolute number or a fraction of the number of samples (rounded to be
at least 2). If ``None``, the value of ``min_samples`` is used instead.
Used only when ``cluster_method='xi'``.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method. (default)
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, optional (default=30)
Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
affect the speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
labels_ : array, shape (n_samples,)
Cluster labels for each point in the dataset given to fit().
Noisy samples and points which are not included in a leaf cluster
of ``cluster_hierarchy_`` are labeled as -1.
reachability_ : array, shape (n_samples,)
Reachability distances per sample, indexed by object order. Use
``clust.reachability_[clust.ordering_]`` to access in cluster order.
ordering_ : array, shape (n_samples,)
The cluster ordered list of sample indices.
core_distances_ : array, shape (n_samples,)
Distance at which each sample becomes a core point, indexed by object
order. Points which will never be core have a distance of inf. Use
``clust.core_distances_[clust.ordering_]`` to access in cluster order.
predecessor_ : array, shape (n_samples,)
Point that a sample was reached from, indexed by object order.
Seed points have a predecessor of -1.
cluster_hierarchy_ : array, shape (n_clusters, 2)
The list of clusters in the form of ``[start, end]`` in each row, with
all indices inclusive. The clusters are ordered according to
``(end, -start)`` (ascending) so that larger clusters encompassing
smaller clusters come after those smaller ones. Since ``labels_`` does
not reflect the hierarchy, usually
``len(cluster_hierarchy_) > len(np.unique(optics.labels_))``. Please also
note that these indices are of the ``ordering_``, i.e.
``X[ordering_][start:end + 1]`` form a cluster.
Only available when ``cluster_method='xi'``.
See Also
--------
DBSCAN
A similar clustering for a specified neighborhood radius (eps).
Our implementation is optimized for runtime.
References
----------
.. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
and Jörg Sander. "OPTICS: ordering points to identify the clustering
structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
.. [2] Schubert, Erich, Michael Gertz.
"Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
Examples
--------
>>> from sklearn.cluster import OPTICS
>>> import numpy as np
>>> X = np.array([[1, 2], [2, 5], [3, 6],
... [8, 7], [8, 8], [7, 3]])
>>> clustering = OPTICS(min_samples=2).fit(X)
>>> clustering.labels_
array([0, 0, 0, 1, 1, 1])
"""
@_deprecate_positional_args
def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski',
p=2, metric_params=None, cluster_method='xi', eps=None,
xi=0.05, predecessor_correction=True, min_cluster_size=None,
algorithm='auto', leaf_size=30, n_jobs=None):
self.max_eps = max_eps
self.min_samples = min_samples
self.min_cluster_size = min_cluster_size
self.algorithm = algorithm
self.metric = metric
self.metric_params = metric_params
self.p = p
self.leaf_size = leaf_size
self.cluster_method = cluster_method
self.eps = eps
self.xi = xi
self.predecessor_correction = predecessor_correction
self.n_jobs = n_jobs
def fit(self, X, y=None):
"""Perform OPTICS clustering.
Extracts an ordered list of points and reachability distances, and
performs initial clustering using ``max_eps`` distance specified at
OPTICS object instantiation.
Parameters
----------
X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
if metric=precomputed
A feature array, or array of distances between samples if
metric='precomputed'.
y : ignored
Ignored.
Returns
-------
self : instance of OPTICS
The instance.
"""
X = self._validate_data(X, dtype=float)
if self.cluster_method not in ['dbscan', 'xi']:
raise ValueError("cluster_method should be one of"
" 'dbscan' or 'xi' but is %s" %
self.cluster_method)
(self.ordering_, self.core_distances_, self.reachability_,
self.predecessor_) = compute_optics_graph(
X=X, min_samples=self.min_samples, algorithm=self.algorithm,
leaf_size=self.leaf_size, metric=self.metric,
metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
max_eps=self.max_eps)
# Extract clusters from the calculated orders and reachability
if self.cluster_method == 'xi':
labels_, clusters_ = cluster_optics_xi(
reachability=self.reachability_,
predecessor=self.predecessor_,
ordering=self.ordering_,
min_samples=self.min_samples,
min_cluster_size=self.min_cluster_size,
xi=self.xi,
predecessor_correction=self.predecessor_correction)
self.cluster_hierarchy_ = clusters_
elif self.cluster_method == 'dbscan':
if self.eps is None:
eps = self.max_eps
else:
eps = self.eps
if eps > self.max_eps:
raise ValueError('Specify an epsilon smaller than %s. Got %s.'
% (self.max_eps, eps))
labels_ = cluster_optics_dbscan(
reachability=self.reachability_,
core_distances=self.core_distances_,
ordering=self.ordering_, eps=eps)
self.labels_ = labels_
return self
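# A sketch of fitting with the DBSCAN-like extraction instead of the default
# xi method; the data and eps value are illustrative only.
#
import numpy as np
from sklearn.cluster import OPTICS

X = np.array([[1, 2], [2, 5], [3, 6],
              [8, 7], [8, 8], [7, 3]])
clust = OPTICS(min_samples=2, cluster_method='dbscan', eps=2).fit(X)
# clust.labels_ holds the eps=2 extraction, while reachability_, ordering_
# and core_distances_ remain available for re-extraction at other eps values.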
def _validate_size(size, n_samples, param_name):
if size <= 0 or (size != int(size) and size > 1):
raise ValueError('%s must be a positive integer '
'or a float between 0 and 1. Got %r' %
(param_name, size))
elif size > n_samples:
raise ValueError('%s must be no greater than the'
' number of samples (%d). Got %d' %
(param_name, n_samples, size))
# OPTICS helper functions
def _compute_core_distances_(X, neighbors, min_samples, working_memory):
"""Compute the k-th nearest neighbor of each sample
Equivalent to neighbors.kneighbors(X, min_samples)[0][:, -1]
but with more memory efficiency.
Parameters
----------
X : array, shape (n_samples, n_features)
The data.
neighbors : NearestNeighbors instance
The fitted nearest neighbors estimator.
working_memory : int, optional
The sought maximum memory for temporary distance matrix chunks.
When None (default), the value of
``sklearn.get_config()['working_memory']`` is used.
Returns
-------
core_distances : array, shape (n_samples,)
Distance at which each sample becomes a core point.
Points which will never be core have a distance of inf.
"""
n_samples = X.shape[0]
core_distances = np.empty(n_samples)
core_distances.fill(np.nan)
chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples,
max_n_rows=n_samples,
working_memory=working_memory)
slices = gen_batches(n_samples, chunk_n_rows)
for sl in slices:
core_distances[sl] = neighbors.kneighbors(
X[sl], min_samples)[0][:, -1]
return core_distances
@_deprecate_positional_args
def compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params,
algorithm, leaf_size, n_jobs):
"""Computes the OPTICS reachability graph.
Read more in the :ref:`User Guide <optics>`.
Parameters
----------
X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
if metric=precomputed.
A feature array, or array of distances between samples if
metric='precomputed'
min_samples : int > 1 or float between 0 and 1
The number of samples in a neighborhood for a point to be considered
as a core point. Expressed as an absolute number or a fraction of the
number of samples (rounded to be at least 2).
max_eps : float, optional (default=np.inf)
The maximum distance between two samples for one to be considered as
in the neighborhood of the other. Default value of ``np.inf`` will
identify clusters across all scales; reducing ``max_eps`` will result
in shorter run times.
metric : string or callable, optional (default='minkowski')
Metric to use for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.
If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string. If metric is
"precomputed", X is assumed to be a distance matrix and must be square.
Valid values for metric are:
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule']
See the documentation for scipy.spatial.distance for details on these
metrics.
p : integer, optional (default=2)
Parameter for the Minkowski metric from
:func:`sklearn.metrics.pairwise_distances`. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric_params : dict, optional (default=None)
Additional keyword arguments for the metric function.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method. (default)
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, optional (default=30)
Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
affect the speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
ordering_ : array, shape (n_samples,)
The cluster ordered list of sample indices.
core_distances_ : array, shape (n_samples,)
Distance at which each sample becomes a core point, indexed by object
order. Points which will never be core have a distance of inf. Use
``clust.core_distances_[clust.ordering_]`` to access in cluster order.
reachability_ : array, shape (n_samples,)
Reachability distances per sample, indexed by object order. Use
``clust.reachability_[clust.ordering_]`` to access in cluster order.
predecessor_ : array, shape (n_samples,)
Point that a sample was reached from, indexed by object order.
Seed points have a predecessor of -1.
References
----------
.. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
and Jörg Sander. "OPTICS: ordering points to identify the clustering
structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
"""
n_samples = X.shape[0]
_validate_size(min_samples, n_samples, 'min_samples')
if min_samples <= 1:
min_samples = max(2, int(min_samples * n_samples))
# Start all points as 'unprocessed' ##
reachability_ = np.empty(n_samples)
reachability_.fill(np.inf)
predecessor_ = np.empty(n_samples, dtype=int)
predecessor_.fill(-1)
nbrs = NearestNeighbors(n_neighbors=min_samples,
algorithm=algorithm,
leaf_size=leaf_size,
metric=metric,
metric_params=metric_params,
p=p,
n_jobs=n_jobs)
nbrs.fit(X)
# Here we first do a kNN query for each point, this differs from
# the original OPTICS that only used epsilon range queries.
# TODO: handle working_memory somehow?
core_distances_ = _compute_core_distances_(X=X, neighbors=nbrs,
min_samples=min_samples,
working_memory=None)
# OPTICS puts an upper limit on these, use inf for undefined.
core_distances_[core_distances_ > max_eps] = np.inf
# Main OPTICS loop. Not parallelizable. The order that entries are
# written to the 'ordering_' list is important!
# Note that this implementation is O(n^2) theoretically, but
# supposedly with very low constant factors.
processed = np.zeros(X.shape[0], dtype=bool)
ordering = np.zeros(X.shape[0], dtype=int)
for ordering_idx in range(X.shape[0]):
# Choose next based on smallest reachability distance
# (And prefer smaller ids on ties, possibly np.inf!)
index = np.where(processed == 0)[0]
point = index[np.argmin(reachability_[index])]
processed[point] = True
ordering[ordering_idx] = point
if core_distances_[point] != np.inf:
_set_reach_dist(core_distances_=core_distances_,
reachability_=reachability_,
predecessor_=predecessor_,
point_index=point,
processed=processed, X=X, nbrs=nbrs,
metric=metric, metric_params=metric_params,
p=p, max_eps=max_eps)
if np.all(np.isinf(reachability_)):
warnings.warn("All reachability values are inf. Set a larger"
" max_eps or all data will be considered outliers.",
UserWarning)
return ordering, core_distances_, reachability_, predecessor_
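# A sketch of calling the graph computation directly; every parameter after X
# is keyword-only and must be supplied explicitly (the values below are
# illustrative defaults, the data is a toy array).
#
import numpy as np
from sklearn.cluster import compute_optics_graph

X = np.array([[1., 2.], [2., 5.], [3., 6.],
              [8., 7.], [8., 8.], [7., 3.]])
ordering, core_distances, reachability, predecessor = compute_optics_graph(
    X, min_samples=2, max_eps=np.inf, metric='minkowski', p=2,
    metric_params=None, algorithm='auto', leaf_size=30, n_jobs=None)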
def _set_reach_dist(core_distances_, reachability_, predecessor_,
point_index, processed, X, nbrs, metric, metric_params,
p, max_eps):
P = X[point_index:point_index + 1]
# Assume that radius_neighbors is faster without distances
# and we don't need all distances, nevertheless, this means
# we may be doing some work twice.
indices = nbrs.radius_neighbors(P, radius=max_eps,
return_distance=False)[0]
# Getting indices of neighbors that have not been processed
unproc = np.compress(~np.take(processed, indices), indices)
# Neighbors of current point are already processed.
if not unproc.size:
return
# Only compute distances to unprocessed neighbors:
if metric == 'precomputed':
dists = X[point_index, unproc]
else:
_params = dict() if metric_params is None else metric_params.copy()
if metric == 'minkowski' and 'p' not in _params:
# the same logic as neighbors, p is ignored if explicitly set
# in the dict params
_params['p'] = p
dists = pairwise_distances(P, np.take(X, unproc, axis=0),
metric=metric, n_jobs=None,
**_params).ravel()
rdists = np.maximum(dists, core_distances_[point_index])
improved = np.where(rdists < np.take(reachability_, unproc))
reachability_[unproc[improved]] = rdists[improved]
predecessor_[unproc[improved]] = point_index
@_deprecate_positional_args
def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
"""Performs DBSCAN extraction for an arbitrary epsilon.
Extracting the clusters runs in linear time. Note that this results in
``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with
similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.
Parameters
----------
reachability : array, shape (n_samples,)
Reachability distances calculated by OPTICS (``reachability_``)
core_distances : array, shape (n_samples,)
Distances at which points become core (``core_distances_``)
ordering : array, shape (n_samples,)
OPTICS ordered point indices (``ordering_``)
eps : float
DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results
will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close
to one another.
Returns
-------
labels_ : array, shape (n_samples,)
The estimated labels.
"""
n_samples = len(core_distances)
labels = np.zeros(n_samples, dtype=int)
far_reach = reachability > eps
near_core = core_distances <= eps
labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1
labels[far_reach & ~near_core] = -1
return labels
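# Because extraction is decoupled from the graph computation, flat labels at a
# smaller eps can be re-derived from an already fitted OPTICS model without
# recomputing neighborhoods; a sketch with illustrative data and eps.
#
import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_dbscan

X = np.array([[1, 2], [2, 5], [3, 6],
              [8, 7], [8, 8], [7, 3]])
clust = OPTICS(min_samples=2).fit(X)
labels_eps2 = cluster_optics_dbscan(reachability=clust.reachability_,
                                    core_distances=clust.core_distances_,
                                    ordering=clust.ordering_, eps=2)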
def cluster_optics_xi(*, reachability, predecessor, ordering, min_samples,
min_cluster_size=None, xi=0.05,
predecessor_correction=True):
"""Automatically extract clusters according to the Xi-steep method.
Parameters
----------
reachability : array, shape (n_samples,)
Reachability distances calculated by OPTICS (`reachability_`)
predecessor : array, shape (n_samples,)
Predecessors calculated by OPTICS.
ordering : array, shape (n_samples,)
OPTICS ordered point indices (`ordering_`)
min_samples : int > 1 or float between 0 and 1
The same as the min_samples given to OPTICS. Up and down steep regions
can't have more than ``min_samples`` consecutive non-steep points.
Expressed as an absolute number or a fraction of the number of samples
(rounded to be at least 2).
min_cluster_size : int > 1 or float between 0 and 1 (default=None)
Minimum number of samples in an OPTICS cluster, expressed as an
absolute number or a fraction of the number of samples (rounded to be
at least 2). If ``None``, the value of ``min_samples`` is used instead.
xi : float, between 0 and 1, optional (default=0.05)
Determines the minimum steepness on the reachability plot that
constitutes a cluster boundary. For example, an upwards point in the
reachability plot is defined by the ratio from one point to its
successor being at most 1-xi.
predecessor_correction : bool, optional (default=True)
Correct clusters based on the calculated predecessors.
Returns
-------
labels : array, shape (n_samples)
The labels assigned to samples. Points which are not included
in any cluster are labeled as -1.
clusters : array, shape (n_clusters, 2)
The list of clusters in the form of ``[start, end]`` in each row, with
all indices inclusive. The clusters are ordered according to ``(end,
-start)`` (ascending) so that larger clusters encompassing smaller
clusters come after such nested smaller clusters. Since ``labels`` does
not reflect the hierarchy, usually
``len(clusters) > len(np.unique(labels))``.
"""
n_samples = len(reachability)
_validate_size(min_samples, n_samples, 'min_samples')
if min_samples <= 1:
min_samples = max(2, int(min_samples * n_samples))
if min_cluster_size is None:
min_cluster_size = min_samples
_validate_size(min_cluster_size, n_samples, 'min_cluster_size')
if min_cluster_size <= 1:
min_cluster_size = max(2, int(min_cluster_size * n_samples))
clusters = _xi_cluster(reachability[ordering], predecessor[ordering],
ordering, xi,
min_samples, min_cluster_size,
predecessor_correction)
labels = _extract_xi_labels(ordering, clusters)
return labels, clusters
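# The xi extraction can likewise be re-run on a fitted OPTICS model with a
# different steepness threshold; a sketch with illustrative data and xi.
#
import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_xi

X = np.array([[1, 2], [2, 5], [3, 6],
              [8, 7], [8, 8], [7, 3]])
clust = OPTICS(min_samples=2).fit(X)
labels, hierarchy = cluster_optics_xi(reachability=clust.reachability_,
                                      predecessor=clust.predecessor_,
                                      ordering=clust.ordering_,
                                      min_samples=2, xi=0.1)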
def _extend_region(steep_point, xward_point, start, min_samples):
"""Extend the area until it's maximal.
It's the same function for both upward and downward regions, depending on
the given input parameters. Assuming:
- steep_{upward/downward}: bool array indicating whether a point is a
steep {upward/downward};
- upward/downward: bool array indicating whether a point is
upward/downward;
To extend an upward region, ``steep_point=steep_upward`` and
``xward_point=downward`` are expected, and to extend a downward region,
``steep_point=steep_downward`` and ``xward_point=upward``.
Parameters
----------
steep_point : bool array, shape (n_samples)
True if the point is steep downward (upward).
xward_point : bool array, shape (n_samples)
True if the point is an upward (respectively downward) point.
start : integer
The start of the xward region.
min_samples : integer
The same as the min_samples given to OPTICS. Up and down steep
regions can't have more than ``min_samples`` consecutive non-steep
points.
Returns
-------
index : integer
The current index iterating over all the samples, i.e. where we are up
to in our search.
end : integer
The end of the region, which can be behind the index. The region
includes the ``end`` index.
"""
n_samples = len(steep_point)
non_xward_points = 0
index = start
end = start
# find a maximal area
while index < n_samples:
if steep_point[index]:
non_xward_points = 0
end = index
elif not xward_point[index]:
# it's not a steep point, but still goes up.
non_xward_points += 1
# region should include no more than min_samples consecutive
# non steep xward points.
if non_xward_points > min_samples:
break
else:
return end
index += 1
return end
def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot):
"""Update steep down areas (SDAs) using the new maximum in between (mib)
value, and the given complement of xi, i.e. ``1 - xi``.
"""
if np.isinf(mib):
return []
res = [sda for sda in sdas
if mib <= reachability_plot[sda['start']] * xi_complement]
for sda in res:
sda['mib'] = max(sda['mib'], mib)
return res
def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e):
"""Correct for predecessors.
Applies Algorithm 2 of [1]_.
Input parameters are ordered by the computed OPTICS ordering.
.. [1] Schubert, Erich, Michael Gertz.
"Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
"""
while s < e:
if reachability_plot[s] > reachability_plot[e]:
return s, e
p_e = ordering[predecessor_plot[e]]
for i in range(s, e):
if p_e == ordering[i]:
return s, e
e -= 1
return None, None
def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
min_cluster_size, predecessor_correction):
"""Automatically extract clusters according to the Xi-steep method.
This is roughly an implementation of Figure 19 of the OPTICS paper.
Parameters
----------
reachability_plot : array, shape (n_samples)
The reachability plot, i.e. reachability ordered according to
the calculated ordering, all computed by OPTICS.
predecessor_plot : array, shape (n_samples)
Predecessors ordered according to the calculated ordering.
xi : float, between 0 and 1
Determines the minimum steepness on the reachability plot that
constitutes a cluster boundary. For example, an upwards point in the
reachability plot is defined by the ratio from one point to its
successor being at most 1-xi.
min_samples : int > 1
The same as the min_samples given to OPTICS. Up and down steep regions
can't have more than ``min_samples`` consecutive non-steep points.
min_cluster_size : int > 1
Minimum number of samples in an OPTICS cluster.
predecessor_correction : bool
Correct clusters based on the calculated predecessors.
Returns
-------
clusters : array, shape (n_clusters, 2)
The list of clusters in the form of [start, end] in each row, with all
indices inclusive. The clusters are ordered in a way that larger
clusters encompassing smaller clusters come after those smaller
clusters.
"""
# Our implementation adds an inf to the end of the reachability plot;
# this helps to find potential clusters at the end of the
# reachability plot even if there's no upward region at the end of it.
reachability_plot = np.hstack((reachability_plot, np.inf))
xi_complement = 1 - xi
sdas = [] # steep down areas, introduced in section 4.3.2 of the paper
clusters = []
index = 0
mib = 0. # maximum in between, section 4.3.2
# Our implementation corrects a mistake in the original
# paper, i.e., in Definition 9 steep downward point,
# r(p) * (1 - xi) <= r(p + 1) should be
# r(p) * (1 - xi) >= r(p + 1)
with np.errstate(invalid='ignore'):
ratio = reachability_plot[:-1] / reachability_plot[1:]
steep_upward = ratio <= xi_complement
steep_downward = ratio >= 1 / xi_complement
downward = ratio > 1
upward = ratio < 1
# the following loop is almost exactly as in Figure 19 of the paper.
# it jumps over the areas which are not either steep down or up areas
for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):
# just continue if steep_index has been a part of a discovered xward
# area.
if steep_index < index:
continue
mib = max(mib, np.max(reachability_plot[index:steep_index + 1]))
# steep downward areas
if steep_downward[steep_index]:
sdas = _update_filter_sdas(sdas, mib, xi_complement,
reachability_plot)
D_start = steep_index
D_end = _extend_region(steep_downward, upward,
D_start, min_samples)
D = {'start': D_start, 'end': D_end, 'mib': 0.}
sdas.append(D)
index = D_end + 1
mib = reachability_plot[index]
# steep upward areas
else:
sdas = _update_filter_sdas(sdas, mib, xi_complement,
reachability_plot)
U_start = steep_index
U_end = _extend_region(steep_upward, downward, U_start,
min_samples)
index = U_end + 1
mib = reachability_plot[index]
U_clusters = []
for D in sdas:
c_start = D['start']
c_end = U_end
# line (**), sc2*
if reachability_plot[c_end + 1] * xi_complement < D['mib']:
continue
# Definition 11: criterion 4
D_max = reachability_plot[D['start']]
if D_max * xi_complement >= reachability_plot[c_end + 1]:
# Find the first index from the left side which is almost
# at the same level as the end of the detected cluster.
while (reachability_plot[c_start + 1] >
reachability_plot[c_end + 1]
and c_start < D['end']):
c_start += 1
elif reachability_plot[c_end + 1] * xi_complement >= D_max:
# Find the first index from the right side which is almost
# at the same level as the beginning of the detected
# cluster.
# Our implementation corrects a mistake in the original
# paper, i.e., in Definition 11 4c, r(x) < r(sD) should be
# r(x) > r(sD).
while (reachability_plot[c_end - 1] > D_max
and c_end > U_start):
c_end -= 1
# predecessor correction
if predecessor_correction:
c_start, c_end = _correct_predecessor(reachability_plot,
predecessor_plot,
ordering,
c_start,
c_end)
if c_start is None:
continue
# Definition 11: criterion 3.a
if c_end - c_start + 1 < min_cluster_size:
continue
# Definition 11: criterion 1
if c_start > D['end']:
continue
# Definition 11: criterion 2
if c_end < U_start:
continue
U_clusters.append((c_start, c_end))
# add smaller clusters first.
U_clusters.reverse()
clusters.extend(U_clusters)
return np.array(clusters)
def _extract_xi_labels(ordering, clusters):
"""Extracts the labels from the clusters returned by `_xi_cluster`.
We rely on the fact that clusters are stored
with the smaller clusters coming before the larger ones.
Parameters
----------
ordering : array, shape (n_samples)
The ordering of points calculated by OPTICS
clusters : array, shape (n_clusters, 2)
List of clusters i.e. (start, end) tuples,
as returned by `_xi_cluster`.
Returns
-------
labels : array, shape (n_samples)
"""
labels = np.full(len(ordering), -1, dtype=int)
label = 0
for c in clusters:
if not np.any(labels[c[0]:(c[1] + 1)] != -1):
labels[c[0]:(c[1] + 1)] = label
label += 1
labels[ordering] = labels.copy()
return labels
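# NOTE (illustrative addition, not part of this file): a minimal, hedged usage
# sketch of the xi extraction implemented above. OPTICS with the default
# cluster_method='xi' runs _xi_cluster/_extract_xi_labels internally; the
# sample data and parameter values below are assumptions chosen only for
# demonstration.
import numpy as np
from sklearn.cluster import OPTICS

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 6])
opt = OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.1).fit(X_demo)
# labels_ are derived from the (start, end) intervals found by _xi_cluster;
# -1 marks points that fall in no xi-cluster.
print(np.unique(opt.labels_))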

View file

@ -0,0 +1,552 @@
# -*- coding: utf-8 -*-
"""Algorithms for spectral clustering"""
# Author: Gael Varoquaux gael.varoquaux@normalesup.org
# Brian Cheung
# Wei LI <kuantkid@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
from ..base import BaseEstimator, ClusterMixin
from ..utils import check_random_state, as_float_array
from ..utils.validation import _deprecate_positional_args
from ..metrics.pairwise import pairwise_kernels
from ..neighbors import kneighbors_graph, NearestNeighbors
from ..manifold import spectral_embedding
from ._kmeans import k_means
@_deprecate_positional_args
def discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20,
random_state=None):
"""Search for a partition matrix (clustering) which is closest to the
eigenvector embedding.
Parameters
----------
vectors : array-like, shape: (n_samples, n_clusters)
The embedding space of the samples.
copy : boolean, optional, default: True
Whether to copy vectors, or perform in-place normalization.
max_svd_restarts : int, optional, default: 30
Maximum number of attempts to restart SVD if convergence fails
n_iter_max : int, optional, default: 20
Maximum number of iterations to attempt in rotation and partition
matrix search if machine precision convergence is not reached
random_state : int, RandomState instance, default=None
Determines random number generation for rotation matrix initialization.
Use an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
Returns
-------
labels : array of integers, shape: n_samples
The labels of the clusters.
References
----------
- Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
Notes
-----
The eigenvector embedding is used to iteratively search for the
closest discrete partition. First, the eigenvector embedding is
normalized to the space of partition matrices. An optimal discrete
partition matrix closest to this normalized embedding multiplied by
an initial rotation is calculated. Fixing this discrete partition
matrix, an optimal rotation matrix is calculated. These two
calculations are performed until convergence. The discrete partition
matrix is returned as the clustering solution. Used in spectral
clustering, this method tends to be faster and more robust to random
initialization than k-means.
"""
from scipy.sparse import csc_matrix
from scipy.linalg import LinAlgError
random_state = check_random_state(random_state)
vectors = as_float_array(vectors, copy=copy)
eps = np.finfo(float).eps
n_samples, n_components = vectors.shape
# Normalize the eigenvectors to an equal length of a vector of ones.
# Reorient the eigenvectors to point in the negative direction with respect
# to the first element. This may have to do with constraining the
# eigenvectors to lie in a specific quadrant to make the discretization
# search easier.
norm_ones = np.sqrt(n_samples)
for i in range(vectors.shape[1]):
vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) \
* norm_ones
if vectors[0, i] != 0:
vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])
# Normalize the rows of the eigenvectors. Samples should lie on the unit
# hypersphere centered at the origin. This transforms the samples in the
# embedding space to the space of partition matrices.
vectors = vectors / np.sqrt((vectors ** 2).sum(axis=1))[:, np.newaxis]
svd_restarts = 0
has_converged = False
# If there is an exception we try to randomize and rerun SVD again
# do this max_svd_restarts times.
while (svd_restarts < max_svd_restarts) and not has_converged:
# Initialize first column of rotation matrix with a row of the
# eigenvectors
rotation = np.zeros((n_components, n_components))
rotation[:, 0] = vectors[random_state.randint(n_samples), :].T
# To initialize the rest of the rotation matrix, find the rows
# of the eigenvectors that are as orthogonal to each other as
# possible
c = np.zeros(n_samples)
for j in range(1, n_components):
# Accumulate c to ensure row is as orthogonal as possible to
# previous picks as well as current one
c += np.abs(np.dot(vectors, rotation[:, j - 1]))
rotation[:, j] = vectors[c.argmin(), :].T
last_objective_value = 0.0
n_iter = 0
while not has_converged:
n_iter += 1
t_discrete = np.dot(vectors, rotation)
labels = t_discrete.argmax(axis=1)
vectors_discrete = csc_matrix(
(np.ones(len(labels)), (np.arange(0, n_samples), labels)),
shape=(n_samples, n_components))
t_svd = vectors_discrete.T * vectors
try:
    U, S, Vh = np.linalg.svd(t_svd)
except LinAlgError:
    # count the restart and retry with a new random rotation
    svd_restarts += 1
    print("SVD did not converge, randomizing and trying again")
    break
ncut_value = 2.0 * (n_samples - S.sum())
if ((abs(ncut_value - last_objective_value) < eps) or
(n_iter > n_iter_max)):
has_converged = True
else:
# otherwise calculate rotation and continue
last_objective_value = ncut_value
rotation = np.dot(Vh.T, U.T)
if not has_converged:
raise LinAlgError('SVD did not converge')
return labels
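# NOTE (illustrative addition, not part of this file): a hedged sketch showing
# discretize applied directly to a tiny embedding. The private import path and
# the toy matrix are assumptions; in normal use this helper is called by
# spectral_clustering with assign_labels='discretize'.
import numpy as np
from sklearn.cluster._spectral import discretize

# Two well-separated groups in a 2-component embedding space.
embedding = np.array([[1.0, 0.0],
                      [0.9, 0.1],
                      [0.0, 1.0],
                      [0.1, 0.9]])
demo_labels = discretize(embedding, random_state=0)
print(demo_labels)  # two groups, e.g. [0 0 1 1] (label ids may be swapped)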
@_deprecate_positional_args
def spectral_clustering(affinity, *, n_clusters=8, n_components=None,
eigen_solver=None, random_state=None, n_init=10,
eigen_tol=0.0, assign_labels='kmeans'):
"""Apply clustering to a projection of the normalized Laplacian.
In practice Spectral Clustering is very useful when the structure of
the individual clusters is highly non-convex or more generally when
a measure of the center and spread of the cluster is not a suitable
description of the complete cluster. For instance, when clusters are
nested circles on the 2D plane.
If affinity is the adjacency matrix of a graph, this method can be
used to find normalized graph cuts.
Read more in the :ref:`User Guide <spectral_clustering>`.
Parameters
----------
affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
The affinity matrix describing the relationship of the samples to
embed. **Must be symmetric**.
Possible examples:
- adjacency matrix of a graph,
- heat kernel of the pairwise distance matrix of the samples,
- symmetric k-nearest neighbours connectivity matrix of the samples.
n_clusters : integer, optional
Number of clusters to extract.
n_components : integer, optional, default is n_clusters
Number of eigen vectors to use for the spectral embedding
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities
random_state : int, RandomState instance, default=None
A pseudo random number generator used for the initialization of the
lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by
the K-Means initialization. Use an int to make the randomness
deterministic.
See :term:`Glossary <random_state>`.
n_init : int, optional, default: 10
Number of time the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
eigen_tol : float, optional, default: 0.0
Stopping criterion for eigendecomposition of the Laplacian matrix
when using arpack eigen_solver.
assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
The strategy to use to assign labels in the embedding
space. There are two ways to assign labels after the laplacian
embedding. k-means can be applied and is a popular choice. But it can
also be sensitive to initialization. Discretization is another
approach which is less sensitive to random initialization. See
the 'Multiclass spectral clustering' paper referenced below for
more details on the discretization approach.
Returns
-------
labels : array of integers, shape: n_samples
The labels of the clusters.
References
----------
- Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
- A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
- Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
Notes
-----
The graph should contain only one connected component; otherwise
the results make little sense.
This algorithm solves the normalized cut for k=2: it is a
normalized spectral clustering.
"""
if assign_labels not in ('kmeans', 'discretize'):
raise ValueError("The 'assign_labels' parameter should be "
"'kmeans' or 'discretize', but '%s' was given"
% assign_labels)
random_state = check_random_state(random_state)
n_components = n_clusters if n_components is None else n_components
# The first eigen vector is constant only for fully connected graphs
# and should be kept for spectral clustering (drop_first = False)
# See spectral_embedding documentation.
maps = spectral_embedding(affinity, n_components=n_components,
eigen_solver=eigen_solver,
random_state=random_state,
eigen_tol=eigen_tol, drop_first=False)
if assign_labels == 'kmeans':
_, labels, _ = k_means(maps, n_clusters, random_state=random_state,
n_init=n_init)
else:
labels = discretize(maps, random_state=random_state)
return labels
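# NOTE (illustrative addition, not part of this file): a hedged usage sketch of
# the function above with a precomputed, symmetric affinity built from an RBF
# kernel. The data, gamma and n_clusters values are illustrative assumptions.
import numpy as np
from sklearn.cluster import spectral_clustering
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(30, 2), rng.randn(30, 2) + 5])
affinity_demo = rbf_kernel(X_demo, gamma=1.0)  # symmetric, non-negative
labels_demo = spectral_clustering(affinity_demo, n_clusters=2, random_state=0)
print(np.bincount(labels_demo))  # roughly two groups of 30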
class SpectralClustering(ClusterMixin, BaseEstimator):
"""Apply clustering to a projection of the normalized Laplacian.
In practice Spectral Clustering is very useful when the structure of
the individual clusters is highly non-convex or more generally when
a measure of the center and spread of the cluster is not a suitable
description of the complete cluster. For instance when clusters are
nested circles on the 2D plane.
If affinity is the adjacency matrix of a graph, this method can be
used to find normalized graph cuts.
When calling ``fit``, an affinity matrix is constructed using either
a kernel function such as the Gaussian (aka RBF) kernel with the
euclidean distance ``d(X, X)``::
np.exp(-gamma * d(X,X) ** 2)
or a k-nearest neighbors connectivity matrix.
Alternatively, using ``precomputed``, a user-provided affinity
matrix can be used.
Read more in the :ref:`User Guide <spectral_clustering>`.
Parameters
----------
n_clusters : integer, optional
The dimension of the projection subspace.
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities.
n_components : integer, optional, default=n_clusters
Number of eigen vectors to use for the spectral embedding
random_state : int, RandomState instance, default=None
A pseudo random number generator used for the initialization of the
lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by
the K-Means initialization. Use an int to make the randomness
deterministic.
See :term:`Glossary <random_state>`.
n_init : int, optional, default: 10
Number of time the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
gamma : float, default=1.0
Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
Ignored for ``affinity='nearest_neighbors'``.
affinity : string or callable, default 'rbf'
How to construct the affinity matrix.
- 'nearest_neighbors' : construct the affinity matrix by computing a
graph of nearest neighbors.
- 'rbf' : construct the affinity matrix using a radial basis function
(RBF) kernel.
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
- 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
of precomputed nearest neighbors, and constructs the affinity matrix
by selecting the ``n_neighbors`` nearest neighbors.
- one of the kernels supported by
:func:`~sklearn.metrics.pairwise_kernels`.
Only kernels that produce similarity scores (non-negative values that
increase with similarity) should be used. This property is not checked
by the clustering algorithm.
n_neighbors : integer
Number of neighbors to use when constructing the affinity matrix using
the nearest neighbors method. Ignored for ``affinity='rbf'``.
eigen_tol : float, optional, default: 0.0
Stopping criterion for eigendecomposition of the Laplacian matrix
when ``eigen_solver='arpack'``.
assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
The strategy to use to assign labels in the embedding
space. There are two ways to assign labels after the laplacian
embedding. k-means can be applied and is a popular choice. But it can
also be sensitive to initialization. Discretization is another approach
which is less sensitive to random initialization.
degree : float, default=3
Degree of the polynomial kernel. Ignored by other kernels.
coef0 : float, default=1
Zero coefficient for polynomial and sigmoid kernels.
Ignored by other kernels.
kernel_params : dictionary of string to any, optional
Parameters (keyword arguments) and values for kernel passed as
callable object. Ignored by other kernels.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
affinity_matrix_ : array-like, shape (n_samples, n_samples)
Affinity matrix used for clustering. Available only after calling
``fit``.
labels_ : array, shape (n_samples,)
Labels of each point
Examples
--------
>>> from sklearn.cluster import SpectralClustering
>>> import numpy as np
>>> X = np.array([[1, 1], [2, 1], [1, 0],
... [4, 7], [3, 5], [3, 6]])
>>> clustering = SpectralClustering(n_clusters=2,
... assign_labels="discretize",
... random_state=0).fit(X)
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])
>>> clustering
SpectralClustering(assign_labels='discretize', n_clusters=2,
random_state=0)
Notes
-----
If you have an affinity matrix, such as a distance matrix,
for which 0 means identical elements, and high values mean
very dissimilar elements, it can be transformed into a
similarity matrix that is well suited for the algorithm by
applying the Gaussian (RBF, heat) kernel::
np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
Where ``delta`` is a free parameter representing the width of the Gaussian
kernel.
Another alternative is to take a symmetric version of the k
nearest neighbors connectivity matrix of the points.
If the pyamg package is installed, it is used: this greatly
speeds up computation.
References
----------
- Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
- A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
- Multiclass spectral clustering, 2003
Stella X. Yu, Jianbo Shi
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
"""
@_deprecate_positional_args
def __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None,
random_state=None, n_init=10, gamma=1., affinity='rbf',
n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans',
degree=3, coef0=1, kernel_params=None, n_jobs=None):
self.n_clusters = n_clusters
self.eigen_solver = eigen_solver
self.n_components = n_components
self.random_state = random_state
self.n_init = n_init
self.gamma = gamma
self.affinity = affinity
self.n_neighbors = n_neighbors
self.eigen_tol = eigen_tol
self.assign_labels = assign_labels
self.degree = degree
self.coef0 = coef0
self.kernel_params = kernel_params
self.n_jobs = n_jobs
def fit(self, X, y=None):
"""Perform spectral clustering from features, or affinity matrix.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features), or \
array-like, shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse matrix is
provided in a format other than ``csr_matrix``, ``csc_matrix``,
or ``coo_matrix``, it will be converted into a sparse
``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self
"""
X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=np.float64, ensure_min_samples=2)
allow_squared = self.affinity in ["precomputed",
"precomputed_nearest_neighbors"]
if X.shape[0] == X.shape[1] and not allow_squared:
warnings.warn("The spectral clustering API has changed. ``fit``"
"now constructs an affinity matrix from data. To use"
" a custom affinity matrix, "
"set ``affinity=precomputed``.")
if self.affinity == 'nearest_neighbors':
connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors,
include_self=True,
n_jobs=self.n_jobs)
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
elif self.affinity == 'precomputed_nearest_neighbors':
estimator = NearestNeighbors(n_neighbors=self.n_neighbors,
n_jobs=self.n_jobs,
metric="precomputed").fit(X)
connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
elif self.affinity == 'precomputed':
self.affinity_matrix_ = X
else:
params = self.kernel_params
if params is None:
params = {}
if not callable(self.affinity):
params['gamma'] = self.gamma
params['degree'] = self.degree
params['coef0'] = self.coef0
self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity,
filter_params=True,
**params)
random_state = check_random_state(self.random_state)
self.labels_ = spectral_clustering(self.affinity_matrix_,
n_clusters=self.n_clusters,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
random_state=random_state,
n_init=self.n_init,
eigen_tol=self.eigen_tol,
assign_labels=self.assign_labels)
return self
def fit_predict(self, X, y=None):
"""Perform spectral clustering from features, or affinity matrix,
and return cluster labels.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features), or \
array-like, shape (n_samples, n_samples)
Training instances to cluster, or similarities / affinities between
instances if ``affinity='precomputed'``. If a sparse matrix is
provided in a format other than ``csr_matrix``, ``csc_matrix``,
or ``coo_matrix``, it will be converted into a sparse
``csr_matrix``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
labels : ndarray, shape (n_samples,)
Cluster labels.
"""
return super().fit_predict(X, y)
@property
def _pairwise(self):
return self.affinity in ["precomputed",
"precomputed_nearest_neighbors"]

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _affinity_propagation # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.affinity_propagation_'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_affinity_propagation, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)
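# NOTE (illustrative addition, not part of this file): the generated shims in
# this package rely on the PEP 562 module-level __getattr__ hook. Below is a
# hypothetical, self-contained sketch of the same forwarding pattern; the
# stand-in module and attribute names are made up for illustration.
import types
import warnings

_new_impl = types.SimpleNamespace(some_function=lambda: 42)  # stand-in target

def __getattr__(name):
    # Warn on access, then forward the lookup to the new implementation.
    warnings.warn("this import path is deprecated; import from the new "
                  "location instead", FutureWarning)
    return getattr(_new_impl, name)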

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _bicluster # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.bicluster'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_bicluster, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _birch # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.birch'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_birch, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _dbscan # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.dbscan_'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_dbscan, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _agglomerative # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.hierarchical'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_agglomerative, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _kmeans # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.k_means_'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_kmeans, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _mean_shift # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.mean_shift_'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_mean_shift, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _optics # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.optics_'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_optics, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,50 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause
import os
import numpy
def configuration(parent_package='', top_path=None):
from numpy.distutils.misc_util import Configuration
libraries = []
if os.name == 'posix':
libraries.append('m')
config = Configuration('cluster', parent_package, top_path)
config.add_extension('_dbscan_inner',
sources=['_dbscan_inner.pyx'],
include_dirs=[numpy.get_include()],
language="c++")
config.add_extension('_hierarchical_fast',
sources=['_hierarchical_fast.pyx'],
language="c++",
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension('_k_means_fast',
sources=['_k_means_fast.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension('_k_means_lloyd',
sources=['_k_means_lloyd.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension('_k_means_elkan',
sources=['_k_means_elkan.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage('tests')
return config
if __name__ == '__main__':
from numpy.distutils.core import setup
setup(**configuration(top_path='').todict())

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _spectral # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.cluster.spectral'
correct_import_path = 'sklearn.cluster'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_spectral, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,28 @@
"""
Common utilities for testing clustering.
"""
import numpy as np
###############################################################################
# Generate sample data
def generate_clustered_data(seed=0, n_clusters=3, n_features=2,
n_samples_per_cluster=20, std=.4):
prng = np.random.RandomState(seed)
# the data is voluntarily shifted away from zero to check clustering
# algorithm robustness with regard to non-centered data
means = np.array([[1, 1, 1, 0],
[-1, -1, 0, 1],
[1, -1, 1, 1],
[-1, 1, 1, 0],
]) + 10
X = np.empty((0, n_features))
for i in range(n_clusters):
X = np.r_[X, means[i][:n_features]
+ std * prng.randn(n_samples_per_cluster, n_features)]
return X
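# NOTE (illustrative addition, not part of this file): hedged usage sketch of
# the helper above; with the defaults it returns 3 clusters of 20
# two-dimensional points each.
from sklearn.cluster.tests.common import generate_clustered_data

X_demo = generate_clustered_data(seed=0, n_clusters=3, n_features=2,
                                 n_samples_per_cluster=20, std=.4)
print(X_demo.shape)  # (60, 2)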

View file

@ -0,0 +1,246 @@
"""
Testing for Clustering methods
"""
import numpy as np
import pytest
from scipy.sparse import csr_matrix
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import (
assert_array_equal, assert_warns,
assert_warns_message, assert_no_warnings)
from sklearn.cluster import AffinityPropagation
from sklearn.cluster._affinity_propagation import (
_equal_similarities_and_preferences
)
from sklearn.cluster import affinity_propagation
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances
n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=60, n_features=2, centers=centers,
cluster_std=0.4, shuffle=True, random_state=0)
def test_affinity_propagation():
# Affinity Propagation algorithm
# Compute similarities
S = -euclidean_distances(X, squared=True)
preference = np.median(S) * 10
# Compute Affinity Propagation
cluster_centers_indices, labels = affinity_propagation(
S, preference=preference, random_state=39)
n_clusters_ = len(cluster_centers_indices)
assert n_clusters == n_clusters_
af = AffinityPropagation(preference=preference, affinity="precomputed",
random_state=28)
labels_precomputed = af.fit(S).labels_
af = AffinityPropagation(preference=preference, verbose=True,
random_state=37)
labels = af.fit(X).labels_
assert_array_equal(labels, labels_precomputed)
cluster_centers_indices = af.cluster_centers_indices_
n_clusters_ = len(cluster_centers_indices)
assert np.unique(labels).size == n_clusters_
assert n_clusters == n_clusters_
# Test also with no copy
_, labels_no_copy = affinity_propagation(S, preference=preference,
copy=False, random_state=74)
assert_array_equal(labels, labels_no_copy)
# Test input validation
with pytest.raises(ValueError):
affinity_propagation(S[:, :-1])
with pytest.raises(ValueError):
affinity_propagation(S, damping=0)
af = AffinityPropagation(affinity="unknown", random_state=78)
with pytest.raises(ValueError):
af.fit(X)
af_2 = AffinityPropagation(affinity='precomputed', random_state=21)
with pytest.raises(TypeError):
af_2.fit(csr_matrix((3, 3)))
def test_affinity_propagation_predict():
# Test AffinityPropagation.predict
af = AffinityPropagation(affinity="euclidean", random_state=63)
labels = af.fit_predict(X)
labels2 = af.predict(X)
assert_array_equal(labels, labels2)
def test_affinity_propagation_predict_error():
# Test exception in AffinityPropagation.predict
# Not fitted.
af = AffinityPropagation(affinity="euclidean")
with pytest.raises(ValueError):
af.predict(X)
# Predict not supported when affinity="precomputed".
S = np.dot(X, X.T)
af = AffinityPropagation(affinity="precomputed", random_state=57)
af.fit(S)
with pytest.raises(ValueError):
af.predict(X)
def test_affinity_propagation_fit_non_convergence():
# In case of non-convergence of affinity_propagation(), the cluster
# centers should be an empty array and training samples should be labelled
# as noise (-1)
X = np.array([[0, 0], [1, 1], [-2, -2]])
# Force non-convergence by allowing only a single iteration
af = AffinityPropagation(preference=-10, max_iter=1, random_state=82)
assert_warns(ConvergenceWarning, af.fit, X)
assert_array_equal(np.empty((0, 2)), af.cluster_centers_)
assert_array_equal(np.array([-1, -1, -1]), af.labels_)
def test_affinity_propagation_equal_mutual_similarities():
X = np.array([[-1, 1], [1, -1]])
S = -euclidean_distances(X, squared=True)
# setting preference > similarity
cluster_center_indices, labels = assert_warns_message(
UserWarning, "mutually equal", affinity_propagation, S, preference=0)
# expect every sample to become an exemplar
assert_array_equal([0, 1], cluster_center_indices)
assert_array_equal([0, 1], labels)
# setting preference < similarity
cluster_center_indices, labels = assert_warns_message(
UserWarning, "mutually equal", affinity_propagation, S, preference=-10)
# expect one cluster, with arbitrary (first) sample as exemplar
assert_array_equal([0], cluster_center_indices)
assert_array_equal([0, 0], labels)
# setting different preferences
cluster_center_indices, labels = assert_no_warnings(
affinity_propagation, S, preference=[-20, -10], random_state=37)
# expect one cluster, with highest-preference sample as exemplar
assert_array_equal([1], cluster_center_indices)
assert_array_equal([0, 0], labels)
def test_affinity_propagation_predict_non_convergence():
# In case of non-convergence of affinity_propagation(), the cluster
# centers should be an empty array
X = np.array([[0, 0], [1, 1], [-2, -2]])
# Force non-convergence by allowing only a single iteration
af = assert_warns(ConvergenceWarning,
AffinityPropagation(preference=-10,
max_iter=1, random_state=75).fit, X)
# At prediction time, consider new samples as noise since there are no
# clusters
to_predict = np.array([[2, 2], [3, 3], [4, 4]])
y = assert_warns(ConvergenceWarning, af.predict, to_predict)
assert_array_equal(np.array([-1, -1, -1]), y)
def test_affinity_propagation_non_convergence_regressiontest():
X = np.array([[1, 0, 0, 0, 0, 0],
[0, 1, 1, 1, 0, 0],
[0, 0, 1, 0, 0, 1]])
af = AffinityPropagation(affinity='euclidean',
max_iter=2, random_state=34).fit(X)
assert_array_equal(np.array([-1, -1, -1]), af.labels_)
def test_equal_similarities_and_preferences():
# Unequal distances
X = np.array([[0, 0], [1, 1], [-2, -2]])
S = -euclidean_distances(X, squared=True)
assert not _equal_similarities_and_preferences(S, np.array(0))
assert not _equal_similarities_and_preferences(S, np.array([0, 0]))
assert not _equal_similarities_and_preferences(S, np.array([0, 1]))
# Equal distances
X = np.array([[0, 0], [1, 1]])
S = -euclidean_distances(X, squared=True)
# Different preferences
assert not _equal_similarities_and_preferences(S, np.array([0, 1]))
# Same preferences
assert _equal_similarities_and_preferences(S, np.array([0, 0]))
assert _equal_similarities_and_preferences(S, np.array(0))
def test_affinity_propagation_random_state():
# Significance of random_state parameter
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300, centers=centers,
cluster_std=0.5, random_state=0)
# random_state = 0
ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0)
ap.fit(X)
centers0 = ap.cluster_centers_
# random_state = 76
ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76)
ap.fit(X)
centers76 = ap.cluster_centers_
assert np.mean((centers0 - centers76) ** 2) > 1
# FIXME: to be removed in 0.25
def test_affinity_propagation_random_state_warning():
# test that a warning is raised when random_state is not defined.
X = np.array([[0, 0], [1, 1], [-2, -2]])
match = ("'random_state' has been introduced in 0.23. "
"It will be set to None starting from 0.25 which "
"means that results will differ at every function "
"call. Set 'random_state' to None to silence this "
"warning, or to 0 to keep the behavior of versions "
"<0.23.")
with pytest.warns(FutureWarning, match=match):
AffinityPropagation().fit(X)
@pytest.mark.parametrize('centers', [csr_matrix(np.zeros((1, 10))),
np.zeros((1, 10))])
def test_affinity_propagation_convergence_warning_dense_sparse(centers):
"""Non-regression, see #13334"""
rng = np.random.RandomState(42)
X = rng.rand(40, 10)
y = (4 * rng.rand(40)).astype(int)
ap = AffinityPropagation(random_state=46)
ap.fit(X, y)
ap.cluster_centers_ = centers
with pytest.warns(None) as record:
assert_array_equal(ap.predict(X),
np.zeros(X.shape[0], dtype=int))
assert len(record) == 0
def test_affinity_propagation_float32():
# Test to fix incorrect clusters due to dtype change
# (non-regression test for issue #10832)
X = np.array([[1, 0, 0, 0],
[0, 1, 1, 0],
[0, 1, 1, 0],
[0, 0, 0, 1]], dtype='float32')
afp = AffinityPropagation(preference=1, affinity='precomputed',
random_state=0).fit(X)
expected = np.array([0, 1, 1, 2])
assert_array_equal(afp.labels_, expected)

View file

@ -0,0 +1,277 @@
"""Testing for Spectral Biclustering methods"""
import numpy as np
import pytest
from scipy.sparse import csr_matrix, issparse
from sklearn.model_selection import ParameterGrid
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.base import BaseEstimator, BiclusterMixin
from sklearn.cluster import SpectralCoclustering
from sklearn.cluster import SpectralBiclustering
from sklearn.cluster._bicluster import _scale_normalize
from sklearn.cluster._bicluster import _bistochastic_normalize
from sklearn.cluster._bicluster import _log_normalize
from sklearn.metrics import (consensus_score, v_measure_score)
from sklearn.datasets import make_biclusters, make_checkerboard
class MockBiclustering(BiclusterMixin, BaseEstimator):
# Mock object for testing get_submatrix.
def __init__(self):
pass
def get_indices(self, i):
# Overridden to reproduce old get_submatrix test.
return (np.where([True, True, False, False, True])[0],
np.where([False, False, True, True])[0])
def test_get_submatrix():
data = np.arange(20).reshape(5, 4)
model = MockBiclustering()
for X in (data, csr_matrix(data), data.tolist()):
submatrix = model.get_submatrix(0, X)
if issparse(submatrix):
submatrix = submatrix.toarray()
assert_array_equal(submatrix, [[2, 3],
[6, 7],
[18, 19]])
submatrix[:] = -1
if issparse(X):
X = X.toarray()
assert np.all(X != -1)
def _test_shape_indices(model):
# Test get_shape and get_indices on fitted model.
for i in range(model.n_clusters):
m, n = model.get_shape(i)
i_ind, j_ind = model.get_indices(i)
assert len(i_ind) == m
assert len(j_ind) == n
def test_spectral_coclustering():
# Test Dhillon's Spectral CoClustering on a simple problem.
param_grid = {'svd_method': ['randomized', 'arpack'],
'n_svd_vecs': [None, 20],
'mini_batch': [False, True],
'init': ['k-means++'],
'n_init': [10]}
random_state = 0
S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
random_state=random_state)
S -= S.min() # needs to be nonnegative before making it sparse
S = np.where(S < 1, 0, S) # threshold some values
for mat in (S, csr_matrix(S)):
for kwargs in ParameterGrid(param_grid):
model = SpectralCoclustering(n_clusters=3,
random_state=random_state,
**kwargs)
model.fit(mat)
assert model.rows_.shape == (3, 30)
assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
_test_shape_indices(model)
def test_spectral_biclustering():
# Test Kluger methods on a checkerboard dataset.
S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
random_state=0)
non_default_params = {'method': ['scale', 'log'],
'svd_method': ['arpack'],
'n_svd_vecs': [20],
'mini_batch': [True]}
for mat in (S, csr_matrix(S)):
for param_name, param_values in non_default_params.items():
for param_value in param_values:
model = SpectralBiclustering(
n_clusters=3,
n_init=3,
init='k-means++',
random_state=0,
)
model.set_params(**{param_name: param_value})
if issparse(mat) and model.get_params().get('method') == 'log':
# cannot take log of sparse matrix
with pytest.raises(ValueError):
model.fit(mat)
continue
else:
model.fit(mat)
assert model.rows_.shape == (9, 30)
assert model.columns_.shape == (9, 30)
assert_array_equal(model.rows_.sum(axis=0),
np.repeat(3, 30))
assert_array_equal(model.columns_.sum(axis=0),
np.repeat(3, 30))
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
_test_shape_indices(model)
def _do_scale_test(scaled):
"""Check that rows sum to one constant, and columns to another."""
row_sum = scaled.sum(axis=1)
col_sum = scaled.sum(axis=0)
if issparse(scaled):
row_sum = np.asarray(row_sum).squeeze()
col_sum = np.asarray(col_sum).squeeze()
assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100),
decimal=1)
assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100),
decimal=1)
def _do_bistochastic_test(scaled):
"""Check that rows and columns sum to the same constant."""
_do_scale_test(scaled)
assert_almost_equal(scaled.sum(axis=0).mean(),
scaled.sum(axis=1).mean(),
decimal=1)
def test_scale_normalize():
generator = np.random.RandomState(0)
X = generator.rand(100, 100)
for mat in (X, csr_matrix(X)):
scaled, _, _ = _scale_normalize(mat)
_do_scale_test(scaled)
if issparse(mat):
assert issparse(scaled)
def test_bistochastic_normalize():
generator = np.random.RandomState(0)
X = generator.rand(100, 100)
for mat in (X, csr_matrix(X)):
scaled = _bistochastic_normalize(mat)
_do_bistochastic_test(scaled)
if issparse(mat):
assert issparse(scaled)
def test_log_normalize():
# adding any constant to a log-scaled matrix should make it
# bistochastic
generator = np.random.RandomState(0)
mat = generator.rand(100, 100)
scaled = _log_normalize(mat) + 1
_do_bistochastic_test(scaled)
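# NOTE (illustrative addition, not part of this file): a hedged numeric check
# of the property the helpers above verify, using the private
# _bistochastic_normalize imported at the top of this module: after
# normalization every row sum and every column sum is close to one constant.
import numpy as np
from sklearn.cluster._bicluster import _bistochastic_normalize

rng = np.random.RandomState(0)
scaled_demo = _bistochastic_normalize(rng.rand(50, 50))
row_sums = scaled_demo.sum(axis=1)
col_sums = scaled_demo.sum(axis=0)
print(np.allclose(row_sums, row_sums.mean(), atol=0.1),
      np.allclose(col_sums, row_sums.mean(), atol=0.1))  # True True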
def test_fit_best_piecewise():
model = SpectralBiclustering(random_state=0)
vectors = np.array([[0, 0, 0, 1, 1, 1],
[2, 2, 2, 3, 3, 3],
[0, 1, 2, 3, 4, 5]])
best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
assert_array_equal(best, vectors[:2])
def test_project_and_cluster():
model = SpectralBiclustering(random_state=0)
data = np.array([[1, 1, 1],
[1, 1, 1],
[3, 6, 3],
[3, 6, 3]])
vectors = np.array([[1, 0],
[0, 1],
[0, 0]])
for mat in (data, csr_matrix(data)):
labels = model._project_and_cluster(mat, vectors,
n_clusters=2)
assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0)
def test_perfect_checkerboard():
# XXX Previously failed on build bot (not reproducible)
model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
S, rows, cols = make_checkerboard((30, 30), 3, noise=0,
random_state=0)
model.fit(S)
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
S, rows, cols = make_checkerboard((40, 30), 3, noise=0,
random_state=0)
model.fit(S)
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
S, rows, cols = make_checkerboard((30, 40), 3, noise=0,
random_state=0)
model.fit(S)
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
@pytest.mark.parametrize(
"args",
[{'n_clusters': (3, 3, 3)},
{'n_clusters': 'abc'},
{'n_clusters': (3, 'abc')},
{'method': 'unknown'},
{'n_components': 0},
{'n_best': 0},
{'svd_method': 'unknown'},
{'n_components': 3, 'n_best': 4}]
)
def test_errors(args):
data = np.arange(25).reshape((5, 5))
model = SpectralBiclustering(**args)
with pytest.raises(ValueError):
model.fit(data)
def test_wrong_shape():
model = SpectralBiclustering()
data = np.arange(27).reshape((3, 3, 3))
with pytest.raises(ValueError):
model.fit(data)
@pytest.mark.parametrize('est',
(SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):
X, _, _ = make_biclusters((3, 3), 3, random_state=0)
assert not hasattr(est, 'n_features_in_')
est.fit(X)
assert est.n_features_in_ == 3
@pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering])
@pytest.mark.parametrize("n_jobs", [None, 1])
def test_n_jobs_deprecated(klass, n_jobs):
# FIXME: remove in 0.25
depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed "
"in 0.25.")
S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0)
est = klass(random_state=0, n_jobs=n_jobs)
with pytest.warns(FutureWarning, match=depr_msg):
est.fit(S)

View file

@ -0,0 +1,169 @@
"""
Tests for the birch clustering algorithm.
"""
from scipy import sparse
import numpy as np
import pytest
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNet
from sklearn.metrics import pairwise_distances_argmin, v_measure_score
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
def test_n_samples_leaves_roots():
# Sanity check for the number of samples in leaves and roots
X, y = make_blobs(n_samples=10)
brc = Birch()
brc.fit(X)
n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
for sc in leaf.subclusters_])
assert n_samples_leaves == X.shape[0]
assert n_samples_root == X.shape[0]
def test_partial_fit():
# Test that fit is equivalent to calling partial_fit multiple times
X, y = make_blobs(n_samples=100)
brc = Birch(n_clusters=3)
brc.fit(X)
brc_partial = Birch(n_clusters=None)
brc_partial.partial_fit(X[:50])
brc_partial.partial_fit(X[50:])
assert_array_almost_equal(brc_partial.subcluster_centers_,
brc.subcluster_centers_)
# Test that same global labels are obtained after calling partial_fit
# with None
brc_partial.set_params(n_clusters=3)
brc_partial.partial_fit(None)
assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
def test_birch_predict():
# Test the predict method predicts the nearest centroid.
rng = np.random.RandomState(0)
X = generate_clustered_data(n_clusters=3, n_features=3,
n_samples_per_cluster=10)
# n_clusters * n_samples_per_cluster
shuffle_indices = np.arange(30)
rng.shuffle(shuffle_indices)
X_shuffle = X[shuffle_indices, :]
brc = Birch(n_clusters=4, threshold=1.)
brc.fit(X_shuffle)
centroids = brc.subcluster_centers_
assert_array_equal(brc.labels_, brc.predict(X_shuffle))
nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
def test_n_clusters():
# Test that n_clusters param works properly
X, y = make_blobs(n_samples=100, centers=10)
brc1 = Birch(n_clusters=10)
brc1.fit(X)
assert len(brc1.subcluster_centers_) > 10
assert len(np.unique(brc1.labels_)) == 10
# Test that passing an AgglomerativeClustering instance as n_clusters
# gives the same results.
gc = AgglomerativeClustering(n_clusters=10)
brc2 = Birch(n_clusters=gc)
brc2.fit(X)
assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
assert_array_equal(brc1.labels_, brc2.labels_)
# Test that the wrong global clustering step raises an Error.
clf = ElasticNet()
brc3 = Birch(n_clusters=clf)
with pytest.raises(ValueError):
brc3.fit(X)
# Test that a small number of clusters raises a warning.
brc4 = Birch(threshold=10000.)
assert_warns(ConvergenceWarning, brc4.fit, X)
def test_sparse_X():
# Test that sparse and dense data give same results
X, y = make_blobs(n_samples=100, centers=10)
brc = Birch(n_clusters=10)
brc.fit(X)
csr = sparse.csr_matrix(X)
brc_sparse = Birch(n_clusters=10)
brc_sparse.fit(csr)
assert_array_equal(brc.labels_, brc_sparse.labels_)
assert_array_almost_equal(brc.subcluster_centers_,
brc_sparse.subcluster_centers_)
def check_branching_factor(node, branching_factor):
subclusters = node.subclusters_
assert branching_factor >= len(subclusters)
for cluster in subclusters:
if cluster.child_:
check_branching_factor(cluster.child_, branching_factor)
def test_branching_factor():
# Test that nodes have at max branching_factor number of subclusters
X, y = make_blobs()
branching_factor = 9
# Purposefully set a low threshold to maximize the subclusters.
brc = Birch(n_clusters=None, branching_factor=branching_factor,
threshold=0.01)
brc.fit(X)
check_branching_factor(brc.root_, branching_factor)
brc = Birch(n_clusters=3, branching_factor=branching_factor,
threshold=0.01)
brc.fit(X)
check_branching_factor(brc.root_, branching_factor)
# Raises error when branching_factor is set to one.
brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
with pytest.raises(ValueError):
brc.fit(X)
def check_threshold(birch_instance, threshold):
"""Use the leaf linked list for traversal"""
current_leaf = birch_instance.dummy_leaf_.next_leaf_
while current_leaf:
subclusters = current_leaf.subclusters_
for sc in subclusters:
assert threshold >= sc.radius
current_leaf = current_leaf.next_leaf_
def test_threshold():
# Test that the leaf subclusters have a radius no greater than the threshold
X, y = make_blobs(n_samples=80, centers=4)
brc = Birch(threshold=0.5, n_clusters=None)
brc.fit(X)
check_threshold(brc, 0.5)
brc = Birch(threshold=5.0, n_clusters=None)
brc.fit(X)
check_threshold(brc, 5.)
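# NOTE (illustrative addition, not part of this file): a hedged sketch of the
# leaf-linked-list traversal used by check_threshold above. It relies on
# Birch's private CF-tree attributes (dummy_leaf_, next_leaf_, subclusters_);
# the total number of leaf subclusters should match subcluster_centers_.
import numpy as np
from sklearn.cluster import Birch

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 2)
brc_demo = Birch(threshold=0.5, n_clusters=None).fit(X_demo)

n_leaf_subclusters = 0
leaf = brc_demo.dummy_leaf_.next_leaf_
while leaf is not None:
    n_leaf_subclusters += len(leaf.subclusters_)
    leaf = leaf.next_leaf_
print(n_leaf_subclusters, len(brc_demo.subcluster_centers_))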
def test_birch_n_clusters_long_int():
# Check that birch supports n_clusters with np.int64 dtype, for instance
# coming from np.arange. #16484
X, _ = make_blobs(random_state=0)
n_clusters = np.int64(5)
Birch(n_clusters=n_clusters).fit(X)

View file

@ -0,0 +1,395 @@
"""
Tests for DBSCAN clustering algorithm
"""
import pickle
import numpy as np
from scipy.spatial import distance
from scipy import sparse
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
def test_dbscan_similarity():
# Tests the DBSCAN algorithm with a similarity array.
# Parameters chosen specifically for this task.
eps = 0.15
min_samples = 10
# Compute similarities
D = distance.squareform(distance.pdist(X))
D /= np.max(D)
# Compute DBSCAN
core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
min_samples=min_samples)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
assert n_clusters_1 == n_clusters
db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
labels = db.fit(D).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
def test_dbscan_feature():
# Tests the DBSCAN algorithm with a feature vector array.
# Parameters chosen specifically for this task.
# Different eps from the other test, because the distance is not normalised.
eps = 0.8
min_samples = 10
metric = 'euclidean'
# Compute DBSCAN
# parameters chosen for task
core_samples, labels = dbscan(X, metric=metric, eps=eps,
min_samples=min_samples)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters
db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
labels = db.fit(X).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
def test_dbscan_sparse():
core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
min_samples=10)
core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
assert_array_equal(core_dense, core_sparse)
assert_array_equal(labels_dense, labels_sparse)
@pytest.mark.parametrize('include_self', [False, True])
def test_dbscan_sparse_precomputed(include_self):
D = pairwise_distances(X)
nn = NearestNeighbors(radius=.9).fit(X)
X_ = X if include_self else None
D_sparse = nn.radius_neighbors_graph(X=X_, mode='distance')
# Ensure it is sparse not merely on diagonals:
assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
core_sparse, labels_sparse = dbscan(D_sparse,
eps=.8,
min_samples=10,
metric='precomputed')
core_dense, labels_dense = dbscan(D, eps=.8, min_samples=10,
metric='precomputed')
assert_array_equal(core_dense, core_sparse)
assert_array_equal(labels_dense, labels_sparse)
def test_dbscan_sparse_precomputed_different_eps():
# test that precomputed neighbors graph is filtered if computed with
# a radius larger than DBSCAN's eps.
lower_eps = 0.2
nn = NearestNeighbors(radius=lower_eps).fit(X)
D_sparse = nn.radius_neighbors_graph(X, mode='distance')
dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')
higher_eps = lower_eps + 0.7
nn = NearestNeighbors(radius=higher_eps).fit(X)
D_sparse = nn.radius_neighbors_graph(X, mode='distance')
dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')
assert_array_equal(dbscan_lower[0], dbscan_higher[0])
assert_array_equal(dbscan_lower[1], dbscan_higher[1])
@pytest.mark.parametrize('use_sparse', [True, False])
@pytest.mark.parametrize('metric', ['precomputed', 'minkowski'])
def test_dbscan_input_not_modified(use_sparse, metric):
# test that the input is not modified by dbscan
X = np.random.RandomState(0).rand(10, 10)
X = sparse.csr_matrix(X) if use_sparse else X
X_copy = X.copy()
dbscan(X, metric=metric)
if use_sparse:
assert_array_equal(X.toarray(), X_copy.toarray())
else:
assert_array_equal(X, X_copy)
def test_dbscan_no_core_samples():
rng = np.random.RandomState(0)
X = rng.rand(40, 10)
X[X < .8] = 0
for X_ in [X, sparse.csr_matrix(X)]:
db = DBSCAN(min_samples=6).fit(X_)
assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
assert_array_equal(db.labels_, -1)
assert db.core_sample_indices_.shape == (0,)
def test_dbscan_callable():
# Tests the DBSCAN algorithm with a callable metric.
# Parameters chosen specifically for this task.
# Different eps from the other test, because the distance is not normalised.
eps = 0.8
min_samples = 10
# metric is the function reference, not the string key.
metric = distance.euclidean
# Compute DBSCAN
# parameters chosen for task
core_samples, labels = dbscan(X, metric=metric, eps=eps,
min_samples=min_samples,
algorithm='ball_tree')
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters
db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples,
algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
def test_dbscan_metric_params():
# Tests that DBSCAN works with the metrics_params argument.
eps = 0.8
min_samples = 10
p = 1
# Compute DBSCAN with metric_params arg
db = DBSCAN(metric='minkowski', metric_params={'p': p}, eps=eps,
min_samples=min_samples, algorithm='ball_tree').fit(X)
core_sample_1, labels_1 = db.core_sample_indices_, db.labels_
# Test that sample labels are the same as passing Minkowski 'p' directly
db = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples,
algorithm='ball_tree', p=p).fit(X)
core_sample_2, labels_2 = db.core_sample_indices_, db.labels_
assert_array_equal(core_sample_1, core_sample_2)
assert_array_equal(labels_1, labels_2)
# Minkowski with p=1 should be equivalent to Manhattan distance
db = DBSCAN(metric='manhattan', eps=eps, min_samples=min_samples,
algorithm='ball_tree').fit(X)
core_sample_3, labels_3 = db.core_sample_indices_, db.labels_
assert_array_equal(core_sample_1, core_sample_3)
assert_array_equal(labels_1, labels_3)
def test_dbscan_balltree():
# Tests the DBSCAN algorithm with balltree for neighbor calculation.
eps = 0.8
min_samples = 10
D = pairwise_distances(X)
core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
min_samples=min_samples)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters
db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
labels = db.fit(X).labels_
n_clusters_3 = len(set(labels)) - int(-1 in labels)
assert n_clusters_3 == n_clusters
db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_4 = len(set(labels)) - int(-1 in labels)
assert n_clusters_4 == n_clusters
db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples,
algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_5 = len(set(labels)) - int(-1 in labels)
assert n_clusters_5 == n_clusters
def test_input_validation():
# DBSCAN.fit should accept a list of lists.
X = [[1., 2.], [3., 4.]]
DBSCAN().fit(X) # must not raise exception
@pytest.mark.parametrize(
"args",
[{'eps': -1.0}, {'algorithm': 'blah'}, {'metric': 'blah'},
{'leaf_size': -1}, {'p': -1}]
)
def test_dbscan_badargs(args):
# Test bad argument values: these should all raise ValueErrors
with pytest.raises(ValueError):
dbscan(X, **args)
def test_pickle():
obj = DBSCAN()
s = pickle.dumps(obj)
assert type(pickle.loads(s)) == obj.__class__
def test_boundaries():
# ensure min_samples is inclusive of core point
core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
assert 0 in core
# ensure eps is inclusive of circumference
core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
assert 0 in core
core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
assert 0 not in core
def test_weighted_dbscan():
# ensure sample_weight is validated
with pytest.raises(ValueError):
dbscan([[0], [1]], sample_weight=[2])
with pytest.raises(ValueError):
dbscan([[0], [1]], sample_weight=[2, 3, 4])
# ensure sample_weight has an effect
assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
min_samples=6)[0])
assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
min_samples=6)[0])
assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
min_samples=6)[0])
# points within eps of each other:
assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
sample_weight=[5, 1], min_samples=6)[0])
# and effect of non-positive and non-integer sample_weight:
assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
eps=1.5, min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
eps=1.5, min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
eps=1.5, min_samples=6)[0])
assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
eps=1.5, min_samples=6)[0])
# for non-negative sample_weight, cores should be identical to repetition
rng = np.random.RandomState(42)
sample_weight = rng.randint(0, 5, X.shape[0])
core1, label1 = dbscan(X, sample_weight=sample_weight)
assert len(label1) == len(X)
X_repeated = np.repeat(X, sample_weight, axis=0)
core_repeated, label_repeated = dbscan(X_repeated)
core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
core_repeated_mask[core_repeated] = True
core_mask = np.zeros(X.shape[0], dtype=bool)
core_mask[core1] = True
assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)
# sample_weight should work with precomputed distance matrix
D = pairwise_distances(X)
core3, label3 = dbscan(D, sample_weight=sample_weight,
metric='precomputed')
assert_array_equal(core1, core3)
assert_array_equal(label1, label3)
# sample_weight should work with estimator
est = DBSCAN().fit(X, sample_weight=sample_weight)
core4 = est.core_sample_indices_
label4 = est.labels_
assert_array_equal(core1, core4)
assert_array_equal(label1, label4)
est = DBSCAN()
label5 = est.fit_predict(X, sample_weight=sample_weight)
core5 = est.core_sample_indices_
assert_array_equal(core1, core5)
assert_array_equal(label1, label5)
assert_array_equal(label1, est.labels_)
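# NOTE (illustrative addition, not part of this file): a hedged mini-sketch of
# the equivalence checked above: with non-negative integer sample_weight, the
# core-sample mask matches the one obtained by repeating each sample
# sample_weight times. The points and weights below are illustrative
# assumptions.
import numpy as np
from sklearn.cluster import dbscan

pts = np.array([[0.], [1.]])
weights = np.array([5, 1])
core_w, _ = dbscan(pts, sample_weight=weights, eps=1.5, min_samples=6)
core_rep, _ = dbscan(np.repeat(pts, weights, axis=0), eps=1.5, min_samples=6)

mask_w = np.zeros(len(pts), dtype=bool)
mask_w[core_w] = True
mask_rep = np.zeros(weights.sum(), dtype=bool)
mask_rep[core_rep] = True
print(np.array_equal(np.repeat(mask_w, weights), mask_rep))  # True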
@pytest.mark.parametrize('algorithm', ['brute', 'kd_tree', 'ball_tree'])
def test_dbscan_core_samples_toy(algorithm):
X = [[0], [2], [3], [4], [6], [8], [10]]
n_samples = len(X)
# Degenerate case: every sample is a core sample, either with its own
# cluster or including other close core samples.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=1)
assert_array_equal(core_samples, np.arange(n_samples))
assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])
# With eps=1 and min_samples=2 only the 3 samples from the denser area
# are core samples. All other points are isolated and considered noise.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=2)
assert_array_equal(core_samples, [1, 2, 3])
assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
# Only the sample in the middle of the dense area is core. Its two
# neighbors are edge samples. Remaining samples are noise.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=3)
assert_array_equal(core_samples, [2])
assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
# It's no longer possible to extract core samples with eps=1:
# everything is noise.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=4)
assert_array_equal(core_samples, [])
assert_array_equal(labels, np.full(n_samples, -1.))
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
# see https://github.com/scikit-learn/scikit-learn/issues/4641 for
# more details
X = np.eye(10)
labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_
assert len(set(labels)) == 1
X = np.zeros((10, 10))
labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_
assert len(set(labels)) == 1
def test_dbscan_precomputed_metric_with_initial_rows_zero():
# sample matrix with initial two row all zero
ar = np.array([
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
[0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
[0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0]
])
matrix = sparse.csr_matrix(ar)
labels = DBSCAN(eps=0.2, metric='precomputed',
min_samples=2).fit(matrix).labels_
assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1])

View file

@ -0,0 +1,43 @@
"""
Tests for sklearn.cluster._feature_agglomeration
"""
# Authors: Sergul Aydore 2017
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.utils._testing import assert_no_warnings
from sklearn.utils._testing import assert_array_almost_equal
def test_feature_agglomeration():
n_clusters = 1
X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features)
agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
pooling_func=np.mean)
agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
pooling_func=np.median)
assert_no_warnings(agglo_mean.fit, X)
assert_no_warnings(agglo_median.fit, X)
assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
assert np.size(np.unique(agglo_median.labels_)) == n_clusters
assert np.size(agglo_mean.labels_) == X.shape[1]
assert np.size(agglo_median.labels_) == X.shape[1]
# Test transform
Xt_mean = agglo_mean.transform(X)
Xt_median = agglo_median.transform(X)
assert Xt_mean.shape[1] == n_clusters
assert Xt_median.shape[1] == n_clusters
assert Xt_mean == np.array([1 / 3.])
assert Xt_median == np.array([0.])
# Test inverse transform
X_full_mean = agglo_mean.inverse_transform(Xt_mean)
X_full_median = agglo_median.inverse_transform(Xt_median)
assert np.unique(X_full_mean[0]).size == n_clusters
assert np.unique(X_full_median[0]).size == n_clusters
assert_array_almost_equal(agglo_mean.transform(X_full_mean),
Xt_mean)
assert_array_almost_equal(agglo_median.transform(X_full_median),
Xt_median)
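# Sketch of the pooling that transform performs (hypothetical helper): the
# columns of X are grouped by labels_ and pooling_func is applied within each
# group, which is why one cluster over [0, 0, 1] gives 1/3 with np.mean and
# 0 with np.median.
def _pool_features(X, labels, pooling_func=np.mean):
    return np.column_stack([pooling_func(X[:, labels == k], axis=1)
                            for k in np.unique(labels)])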

View file

@ -0,0 +1,765 @@
"""
Several basic tests for hierarchical clustering procedures
"""
# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
# Matteo Visconti di Oleggio Castello 2014
# License: BSD 3 clause
from tempfile import mkdtemp
import shutil
import pytest
from functools import partial
import numpy as np
from scipy import sparse
from scipy.cluster import hierarchy
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import ignore_warnings
from sklearn.cluster import ward_tree
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.cluster._agglomerative import (_hc_cut, _TREE_BUILDERS,
linkage_tree,
_fix_connectivity)
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\
manhattan_distances, pairwise_distances
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster._hierarchical_fast import average_merge, max_merge
from sklearn.utils._fast_dict import IntFloatDict
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns
from sklearn.datasets import make_moons, make_circles
def test_linkage_misc():
# Misc tests on linkage
rng = np.random.RandomState(42)
X = rng.normal(size=(5, 5))
with pytest.raises(ValueError):
AgglomerativeClustering(linkage='foo').fit(X)
with pytest.raises(ValueError):
linkage_tree(X, linkage='foo')
with pytest.raises(ValueError):
linkage_tree(X, connectivity=np.ones((4, 4)))
# Smoke test FeatureAgglomeration
FeatureAgglomeration().fit(X)
# test hierarchical clustering on a precomputed distances matrix
dis = cosine_distances(X)
res = linkage_tree(dis, affinity="precomputed")
assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
# test hierarchical clustering on a precomputed distances matrix
res = linkage_tree(X, affinity=manhattan_distances)
assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_structured_linkage_tree():
# Check that we obtain the correct solution for structured linkage trees.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
# Avoiding a mask with only 'True' entries
mask[4:7, 4:7] = 0
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
for tree_builder in _TREE_BUILDERS.values():
children, n_components, n_leaves, parent = \
tree_builder(X.T, connectivity=connectivity)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
# Check that ward_tree raises a ValueError with a connectivity matrix
# of the wrong shape
with pytest.raises(ValueError):
tree_builder(X.T, connectivity=np.ones((4, 4)))
# Check that fitting with no samples raises an error
with pytest.raises(ValueError):
tree_builder(X.T[:0], connectivity=connectivity)
def test_unstructured_linkage_tree():
# Check that we obtain the correct solution for unstructured linkage trees.
rng = np.random.RandomState(0)
X = rng.randn(50, 100)
for this_X in (X, X[0]):
# With a specified number of clusters, just for the sake of
# raising a warning and testing the warning code
with ignore_warnings():
children, n_nodes, n_leaves, parent = assert_warns(
UserWarning, ward_tree, this_X.T, n_clusters=10)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
for tree_builder in _TREE_BUILDERS.values():
for this_X in (X, X[0]):
with ignore_warnings():
children, n_nodes, n_leaves, parent = assert_warns(
UserWarning, tree_builder, this_X.T, n_clusters=10)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
def test_height_linkage_tree():
# Check that the height of the results of linkage tree is sorted.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
for linkage_func in _TREE_BUILDERS.values():
children, n_nodes, n_leaves, parent = linkage_func(
X.T, connectivity=connectivity)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
def test_agglomerative_clustering_wrong_arg_memory():
# Test that an error is raised when memory is neither a str
# nor a joblib.Memory instance
rng = np.random.RandomState(0)
n_samples = 100
X = rng.randn(n_samples, 50)
memory = 5
clustering = AgglomerativeClustering(memory=memory)
with pytest.raises(ValueError):
clustering.fit(X)
def test_zero_cosine_linkage_tree():
# Check that zero vectors in X produce an error when
# 'cosine' affinity is used
X = np.array([[0, 1],
[0, 0]])
msg = 'Cosine affinity cannot be used when X contains zero vectors'
assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine')
def test_agglomerative_clustering():
# Check that we obtain the correct number of clusters with
# agglomerative clustering.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
n_samples = 100
X = rng.randn(n_samples, 50)
connectivity = grid_to_graph(*mask.shape)
for linkage in ("ward", "complete", "average", "single"):
clustering = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
linkage=linkage)
clustering.fit(X)
# test caching
try:
tempdir = mkdtemp()
clustering = AgglomerativeClustering(
n_clusters=10, connectivity=connectivity,
memory=tempdir,
linkage=linkage)
clustering.fit(X)
labels = clustering.labels_
assert np.size(np.unique(labels)) == 10
finally:
shutil.rmtree(tempdir)
# Turn caching off now
clustering = AgglomerativeClustering(
n_clusters=10, connectivity=connectivity, linkage=linkage)
# Check that we obtain the same solution with early-stopping of the
# tree building
clustering.compute_full_tree = False
clustering.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
labels), 1)
clustering.connectivity = None
clustering.fit(X)
assert np.size(np.unique(clustering.labels_)) == 10
# Check that we raise a TypeError on dense matrices
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=sparse.lil_matrix(
connectivity.toarray()[:10, :10]),
linkage=linkage)
with pytest.raises(ValueError):
clustering.fit(X)
# Test that using ward with another metric than euclidean raises an
# exception
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=connectivity.toarray(),
affinity="manhattan",
linkage="ward")
with pytest.raises(ValueError):
clustering.fit(X)
# Test using another metric than euclidean works with linkage complete
for affinity in PAIRED_DISTANCES.keys():
# Compare our (structured) implementation to scipy
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=np.ones((n_samples, n_samples)),
affinity=affinity,
linkage="complete")
clustering.fit(X)
clustering2 = AgglomerativeClustering(
n_clusters=10,
connectivity=None,
affinity=affinity,
linkage="complete")
clustering2.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
clustering.labels_),
1)
# Test that using a distance matrix (affinity = 'precomputed') has same
# results (with connectivity constraints)
clustering = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
linkage="complete")
clustering.fit(X)
X_dist = pairwise_distances(X)
clustering2 = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
affinity='precomputed',
linkage="complete")
clustering2.fit(X_dist)
assert_array_equal(clustering.labels_, clustering2.labels_)
def test_ward_agglomeration():
# Check that we obtain the correct solution in a simplistic case
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
agglo.fit(X)
assert np.size(np.unique(agglo.labels_)) == 5
X_red = agglo.transform(X)
assert X_red.shape[1] == 5
X_full = agglo.inverse_transform(X_red)
assert np.unique(X_full[0]).size == 5
assert_array_almost_equal(agglo.transform(X_full), X_red)
# Check that fitting with no samples raises a ValueError
with pytest.raises(ValueError):
agglo.fit(X[:0])
def test_single_linkage_clustering():
# Check that we get the correct result in two emblematic cases
moons, moon_labels = make_moons(noise=0.05, random_state=42)
clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
clustering.fit(moons)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
moon_labels), 1)
circles, circle_labels = make_circles(factor=0.5, noise=0.025,
random_state=42)
clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
clustering.fit(circles)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
circle_labels), 1)
def assess_same_labelling(cut1, cut2):
"""Util for comparison with scipy"""
co_clust = []
for cut in [cut1, cut2]:
n = len(cut)
k = cut.max() + 1
ecut = np.zeros((n, k))
ecut[np.arange(n), cut] = 1
co_clust.append(np.dot(ecut, ecut.T))
assert (co_clust[0] == co_clust[1]).all()
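# Informal note on the helper above: ecut is a one-hot encoding of the cut,
# so np.dot(ecut, ecut.T) is the co-membership matrix (entry (i, j) is 1 when
# samples i and j share a cluster). Two cuts agree up to a relabelling of the
# clusters exactly when these matrices are equal, which is what the assertion
# checks.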
def test_sparse_scikit_vs_scipy():
# Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
n, p, k = 10, 5, 3
rng = np.random.RandomState(0)
# Not using a lil_matrix here, just to check that non-sparse
# matrices are handled well
connectivity = np.ones((n, n))
for linkage in _TREE_BUILDERS.keys():
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out = hierarchy.linkage(X, method=linkage)
children_ = out[:, :2].astype(np.int, copy=False)
children, _, n_leaves, _ = _TREE_BUILDERS[linkage](
X, connectivity=connectivity)
# Sort the order of child nodes per row for consistency
children.sort(axis=1)
assert_array_equal(children, children_, 'linkage tree differs'
' from scipy impl for'
' linkage: ' + linkage)
cut = _hc_cut(k, children, n_leaves)
cut_ = _hc_cut(k, children_, n_leaves)
assess_same_labelling(cut, cut_)
# Test error management in _hc_cut
with pytest.raises(ValueError):
_hc_cut(n_leaves + 1, children, n_leaves)
# Make sure our custom mst_linkage_core gives
# the same results as scipy's builtin
@pytest.mark.parametrize('seed', range(5))
def test_vector_scikit_single_vs_scipy_single(seed):
n_samples, n_features, n_clusters = 10, 5, 3
rng = np.random.RandomState(seed)
X = .1 * rng.normal(size=(n_samples, n_features))
X -= 4. * np.arange(n_samples)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out = hierarchy.linkage(X, method='single')
children_scipy = out[:, :2].astype(np.int)
children, _, n_leaves, _ = _TREE_BUILDERS['single'](X)
# Sort the order of child nodes per row for consistency
children.sort(axis=1)
assert_array_equal(children, children_scipy,
'linkage tree differs'
' from scipy impl for'
' single linkage.')
cut = _hc_cut(n_clusters, children, n_leaves)
cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
assess_same_labelling(cut, cut_scipy)
def test_identical_points():
# Ensure identical points are handled correctly when using mst with
# a sparse connectivity matrix
X = np.array([[0, 0, 0], [0, 0, 0],
[1, 1, 1], [1, 1, 1],
[2, 2, 2], [2, 2, 2]])
true_labels = np.array([0, 0, 1, 1, 2, 2])
connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
connectivity, n_components = _fix_connectivity(X,
connectivity,
'euclidean')
for linkage in ('single', 'average', 'ward'):
clustering = AgglomerativeClustering(n_clusters=3,
linkage=linkage,
connectivity=connectivity)
clustering.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
true_labels), 1)
def test_connectivity_propagation():
# Check that connectivity in the ward tree is propagated correctly during
# merging.
X = np.array([(.014, .120), (.014, .099), (.014, .097),
(.017, .153), (.017, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .152), (.018, .149), (.018, .144)])
connectivity = kneighbors_graph(X, 10, include_self=False)
ward = AgglomerativeClustering(
n_clusters=4, connectivity=connectivity, linkage='ward')
# If changes are not propagated correctly, fit crashes with an
# IndexError
ward.fit(X)
def test_ward_tree_children_order():
# Check that children are ordered in the same way for both structured and
# unstructured versions of ward_tree.
# test on five random datasets
n, p = 10, 5
rng = np.random.RandomState(0)
connectivity = np.ones((n, n))
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out_unstructured = ward_tree(X)
out_structured = ward_tree(X, connectivity=connectivity)
assert_array_equal(out_unstructured[0], out_structured[0])
def test_ward_linkage_tree_return_distance():
# Test return_distance option on linkage and ward trees
# test that return_distance, when set to True, gives the same
# output for both structured and unstructured clustering.
n, p = 10, 5
rng = np.random.RandomState(0)
connectivity = np.ones((n, n))
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out_unstructured = ward_tree(X, return_distance=True)
out_structured = ward_tree(X, connectivity=connectivity,
return_distance=True)
# get children
children_unstructured = out_unstructured[0]
children_structured = out_structured[0]
# check if we got the same clusters
assert_array_equal(children_unstructured, children_structured)
# check if the distances are the same
dist_unstructured = out_unstructured[-1]
dist_structured = out_structured[-1]
assert_array_almost_equal(dist_unstructured, dist_structured)
for linkage in ['average', 'complete', 'single']:
structured_items = linkage_tree(
X, connectivity=connectivity, linkage=linkage,
return_distance=True)[-1]
unstructured_items = linkage_tree(
X, linkage=linkage, return_distance=True)[-1]
structured_dist = structured_items[-1]
unstructured_dist = unstructured_items[-1]
structured_children = structured_items[0]
unstructured_children = unstructured_items[0]
assert_array_almost_equal(structured_dist, unstructured_dist)
assert_array_almost_equal(
structured_children, unstructured_children)
# test on the following dataset where we know the truth
# taken from scipy/cluster/tests/hierarchy_test_data.py
X = np.array([[1.43054825, -7.5693489],
[6.95887839, 6.82293382],
[2.87137846, -9.68248579],
[7.87974764, -6.05485803],
[8.24018364, -6.09495602],
[7.39020262, 8.54004355]])
# truth
linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
[1., 5., 1.77045373, 2.],
[0., 2., 2.55760419, 2.],
[6., 8., 9.10208346, 4.],
[7., 9., 24.7784379, 6.]])
linkage_X_complete = np.array(
[[3., 4., 0.36265956, 2.],
[1., 5., 1.77045373, 2.],
[0., 2., 2.55760419, 2.],
[6., 8., 6.96742194, 4.],
[7., 9., 18.77445997, 6.]])
linkage_X_average = np.array(
[[3., 4., 0.36265956, 2.],
[1., 5., 1.77045373, 2.],
[0., 2., 2.55760419, 2.],
[6., 8., 6.55832839, 4.],
[7., 9., 15.44089605, 6.]])
n_samples, n_features = np.shape(X)
connectivity_X = np.ones((n_samples, n_samples))
out_X_unstructured = ward_tree(X, return_distance=True)
out_X_structured = ward_tree(X, connectivity=connectivity_X,
return_distance=True)
# check that the labels are the same
assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])
# check that the distances are correct
assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])
linkage_options = ['complete', 'average', 'single']
X_linkage_truth = [linkage_X_complete, linkage_X_average]
for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
out_X_unstructured = linkage_tree(
X, return_distance=True, linkage=linkage)
out_X_structured = linkage_tree(
X, connectivity=connectivity_X, linkage=linkage,
return_distance=True)
# check that the labels are the same
assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
assert_array_equal(X_truth[:, :2], out_X_structured[0])
# check that the distances are correct
assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
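# The truth arrays above follow scipy's linkage-matrix convention
# [idx_a, idx_b, distance, sample_count]; as a sketch, they can be
# regenerated (up to numerical precision) with:
#     from scipy.cluster import hierarchy
#     hierarchy.linkage(X, method='ward')      # ~ linkage_X_ward
#     hierarchy.linkage(X, method='complete')  # ~ linkage_X_complete
#     hierarchy.linkage(X, method='average')   # ~ linkage_X_average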
def test_connectivity_fixing_non_lil():
# Non-regression check for a bug when a connectivity matrix that does not
# support item assignment is provided with more than one component.
# create dummy data
x = np.array([[0, 0], [1, 1]])
# create a mask with several components to force connectivity fixing
m = np.array([[True, False], [False, True]])
c = grid_to_graph(n_x=2, n_y=2, mask=m)
w = AgglomerativeClustering(connectivity=c, linkage='ward')
assert_warns(UserWarning, w.fit, x)
def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
values = rng.rand(len(keys))
d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value
other_keys = np.arange(50, dtype=np.intp)[::2]
other_values = np.full(50, 0.5)[::2]
other = IntFloatDict(other_keys, other_values)
# Complete smoke test
max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
def test_connectivity_callable():
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
connectivity = kneighbors_graph(X, 3, include_self=False)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(
connectivity=partial(kneighbors_graph, n_neighbors=3,
include_self=False))
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_ignores_diagonal():
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
connectivity = kneighbors_graph(X, 3, include_self=False)
connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_compute_full_tree():
# Test that the full tree is computed if n_clusters is small
rng = np.random.RandomState(0)
X = rng.randn(10, 2)
connectivity = kneighbors_graph(X, 5, include_self=False)
# When n_clusters is small, the full tree should be built,
# that is, the number of merges should be n_samples - 1
agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
agc.fit(X)
n_samples = X.shape[0]
n_nodes = agc.children_.shape[0]
assert n_nodes == n_samples - 1
# When n_clusters is large (greater than max(100, 0.02 * n_samples)),
# we should stop once n_clusters clusters remain.
n_clusters = 101
X = rng.randn(200, 2)
connectivity = kneighbors_graph(X, 10, include_self=False)
agc = AgglomerativeClustering(n_clusters=n_clusters,
connectivity=connectivity)
agc.fit(X)
n_samples = X.shape[0]
n_nodes = agc.children_.shape[0]
assert n_nodes == n_samples - n_clusters
def test_n_components():
# Test the n_components returned by the ward and linkage tree builders
rng = np.random.RandomState(0)
X = rng.rand(5, 5)
# Connectivity matrix having five components.
connectivity = np.eye(5)
for linkage_func in _TREE_BUILDERS.values():
assert ignore_warnings(linkage_func)(X, connectivity)[1] == 5
def test_agg_n_clusters():
# Test that an error is raised when n_clusters <= 0
rng = np.random.RandomState(0)
X = rng.rand(20, 10)
for n_clus in [-1, 0]:
agc = AgglomerativeClustering(n_clusters=n_clus)
msg = ("n_clusters should be an integer greater than 0."
" %s was provided." % str(agc.n_clusters))
assert_raise_message(ValueError, msg, agc.fit, X)
def test_affinity_passed_to_fix_connectivity():
# Test that the affinity parameter is actually passed to the pairwise
# function
size = 2
rng = np.random.RandomState(0)
X = rng.randn(size, size)
mask = np.array([True, False, False, True])
connectivity = grid_to_graph(n_x=size, n_y=size,
mask=mask, return_as=np.ndarray)
class FakeAffinity:
def __init__(self):
self.counter = 0
def increment(self, *args, **kwargs):
self.counter += 1
return self.counter
fa = FakeAffinity()
linkage_tree(X, connectivity=connectivity, affinity=fa.increment)
assert fa.counter == 3
@pytest.mark.parametrize('linkage', ['ward', 'complete', 'average'])
def test_agglomerative_clustering_with_distance_threshold(linkage):
# Check that we obtain the correct number of clusters with
# agglomerative clustering with distance_threshold.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
n_samples = 100
X = rng.randn(n_samples, 50)
connectivity = grid_to_graph(*mask.shape)
# test when distance threshold is set to 10
distance_threshold = 10
for conn in [None, connectivity]:
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=distance_threshold,
connectivity=conn, linkage=linkage)
clustering.fit(X)
clusters_produced = clustering.labels_
num_clusters_produced = len(np.unique(clustering.labels_))
# test if the clusters produced match the point in the linkage tree
# where the distance exceeds the threshold
tree_builder = _TREE_BUILDERS[linkage]
children, n_components, n_leaves, parent, distances = \
tree_builder(X, connectivity=conn, n_clusters=None,
return_distance=True)
num_clusters_at_threshold = np.count_nonzero(
distances >= distance_threshold) + 1
# test number of clusters produced
assert num_clusters_at_threshold == num_clusters_produced
# test clusters produced
clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
children=children,
n_leaves=n_leaves)
assert np.array_equiv(clusters_produced,
clusters_at_threshold)
def test_small_distance_threshold():
rng = np.random.RandomState(0)
n_samples = 10
X = rng.randint(-300, 300, size=(n_samples, 3))
# this should result in all data in their own clusters, given that
# their pairwise distances are bigger than .1 (which may not be the case
# with a different random seed).
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=1.,
linkage="single").fit(X)
# check that the pairwise distances are indeed all larger than .1
all_distances = pairwise_distances(X, metric='minkowski', p=2)
np.fill_diagonal(all_distances, np.inf)
assert np.all(all_distances > .1)
assert clustering.n_clusters_ == n_samples
def test_cluster_distances_with_distance_threshold():
rng = np.random.RandomState(0)
n_samples = 100
X = rng.randint(-10, 10, size=(n_samples, 3))
# check the distances within the clusters and with other clusters
distance_threshold = 4
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=distance_threshold,
linkage="single").fit(X)
labels = clustering.labels_
D = pairwise_distances(X, metric="minkowski", p=2)
# to avoid taking the 0 diagonal in min()
np.fill_diagonal(D, np.inf)
for label in np.unique(labels):
in_cluster_mask = labels == label
max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
.min(axis=0).max())
min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
.min(axis=0).min())
# single data point clusters only have that inf diagonal here
if in_cluster_mask.sum() > 1:
assert max_in_cluster_distance < distance_threshold
assert min_out_cluster_distance >= distance_threshold
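# Informal note: with single linkage, a cluster cut at distance_threshold is
# a connected component of the graph whose edges are the pairs closer than
# the threshold, so every point in a multi-point cluster has some in-cluster
# neighbour closer than the threshold (the max of the row-wise minima above),
# while its nearest point outside the cluster is at least distance_threshold
# away.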
@pytest.mark.parametrize('linkage', ['ward', 'complete', 'average'])
@pytest.mark.parametrize(('threshold', 'y_true'),
[(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])])
def test_agglomerative_clustering_with_distance_threshold_edge_case(
linkage, threshold, y_true):
# test boundary case of distance_threshold matching the distance
X = [[0], [1]]
clusterer = AgglomerativeClustering(
n_clusters=None,
distance_threshold=threshold,
linkage=linkage)
y_pred = clusterer.fit_predict(X)
assert adjusted_rand_score(y_true, y_pred) == 1
def test_dist_threshold_invalid_parameters():
X = [[0], [1]]
with pytest.raises(ValueError, match="Exactly one of "):
AgglomerativeClustering(n_clusters=None,
distance_threshold=None).fit(X)
with pytest.raises(ValueError, match="Exactly one of "):
AgglomerativeClustering(n_clusters=2,
distance_threshold=1).fit(X)
X = [[0], [1]]
with pytest.raises(ValueError, match="compute_full_tree must be True if"):
AgglomerativeClustering(n_clusters=None,
distance_threshold=1,
compute_full_tree=False).fit(X)
def test_invalid_shape_precomputed_dist_matrix():
# Check that an error is raised when affinity='precomputed'
# and a non square matrix is passed (PR #16257).
rng = np.random.RandomState(0)
X = rng.rand(5, 3)
with pytest.raises(ValueError, match="Distance matrix should be square, "):
AgglomerativeClustering(affinity='precomputed',
linkage='complete').fit(X)

File diff suppressed because it is too large

View file

@ -0,0 +1,194 @@
"""
Testing for mean shift clustering methods
"""
import numpy as np
import warnings
import pytest
from scipy import sparse
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_allclose
from sklearn.cluster import MeanShift
from sklearn.cluster import mean_shift
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import get_bin_seeds
from sklearn.datasets import make_blobs
from sklearn.metrics import v_measure_score
n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=300, n_features=2, centers=centers,
cluster_std=0.4, shuffle=True, random_state=11)
def test_estimate_bandwidth():
# Test estimate_bandwidth
bandwidth = estimate_bandwidth(X, n_samples=200)
assert 0.9 <= bandwidth <= 1.5
def test_estimate_bandwidth_1sample():
# Test estimate_bandwidth when n_samples=1 and quantile<1, so that
# n_neighbors is set to 1.
bandwidth = estimate_bandwidth(X, n_samples=1, quantile=0.3)
assert bandwidth == pytest.approx(0., abs=1e-5)
@pytest.mark.parametrize("bandwidth, cluster_all, expected, "
"first_cluster_label",
[(1.2, True, 3, 0), (1.2, False, 4, -1)])
def test_mean_shift(bandwidth, cluster_all, expected, first_cluster_label):
# Test MeanShift algorithm
ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
labels = ms.fit(X).labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
assert n_clusters_ == expected
assert labels_unique[0] == first_cluster_label
cluster_centers, labels_mean_shift = mean_shift(X, cluster_all=cluster_all)
labels_mean_shift_unique = np.unique(labels_mean_shift)
n_clusters_mean_shift = len(labels_mean_shift_unique)
assert n_clusters_mean_shift == expected
assert labels_mean_shift_unique[0] == first_cluster_label
def test_mean_shift_negative_bandwidth():
bandwidth = -1
ms = MeanShift(bandwidth=bandwidth)
msg = (r"bandwidth needs to be greater than zero or None,"
r" got -1\.000000")
with pytest.raises(ValueError, match=msg):
ms.fit(X)
def test_estimate_bandwidth_with_sparse_matrix():
# Test estimate_bandwidth with sparse matrix
X = sparse.lil_matrix((1000, 1000))
msg = "A sparse matrix was passed, but dense data is required."
assert_raise_message(TypeError, msg, estimate_bandwidth, X)
def test_parallel():
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=50, n_features=2, centers=centers,
cluster_std=0.4, shuffle=True, random_state=11)
ms1 = MeanShift(n_jobs=2)
ms1.fit(X)
ms2 = MeanShift()
ms2.fit(X)
assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_)
assert_array_equal(ms1.labels_, ms2.labels_)
def test_meanshift_predict():
# Test MeanShift.predict
ms = MeanShift(bandwidth=1.2)
labels = ms.fit_predict(X)
labels2 = ms.predict(X)
assert_array_equal(labels, labels2)
def test_meanshift_all_orphans():
# init away from the data; fail with a sensible error message
ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]])
msg = "No point was within bandwidth=0.1"
assert_raise_message(ValueError, msg, ms.fit, X,)
def test_unfitted():
# Non-regression: before fit, there should be no fitted attributes.
ms = MeanShift()
assert not hasattr(ms, "cluster_centers_")
assert not hasattr(ms, "labels_")
def test_cluster_intensity_tie():
X = np.array([[1, 1], [2, 1], [1, 0],
[4, 7], [3, 5], [3, 6]])
c1 = MeanShift(bandwidth=2).fit(X)
X = np.array([[4, 7], [3, 5], [3, 6],
[1, 1], [2, 1], [1, 0]])
c2 = MeanShift(bandwidth=2).fit(X)
assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0])
assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1])
def test_bin_seeds():
# Test the bin seeding technique which can be used in the mean shift
# algorithm
# Data is just 6 points in the plane
X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
[2., 1.], [2.1, 1.1], [0., 0.]])
# With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
# found
ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
test_bins = get_bin_seeds(X, 1, 1)
test_result = set(tuple(p) for p in test_bins)
assert len(ground_truth.symmetric_difference(test_result)) == 0
# With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
# found
ground_truth = {(1., 1.), (2., 1.)}
test_bins = get_bin_seeds(X, 1, 2)
test_result = set(tuple(p) for p in test_bins)
assert len(ground_truth.symmetric_difference(test_result)) == 0
# With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found;
# in that case we bail out and use the whole dataset as seeds.
with warnings.catch_warnings(record=True):
test_bins = get_bin_seeds(X, 0.01, 1)
assert_array_almost_equal(test_bins, X)
# tight clusters around [0, 0] and [1, 1], only get two bins
X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
cluster_std=0.1, random_state=0)
test_bins = get_bin_seeds(X, 1)
assert_array_equal(test_bins, [[0, 0], [1, 1]])
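# Minimal sketch of the binning rule exercised above (hypothetical helper):
# points are snapped to a grid of spacing bin_size and the bins holding at
# least min_bin_freq points are returned as seeds.
def _bin_seeds_sketch(X, bin_size, min_bin_freq=1):
    bins, counts = np.unique(np.round(np.asarray(X) / bin_size), axis=0,
                             return_counts=True)
    return bins[counts >= min_bin_freq] * bin_size
# e.g. _bin_seeds_sketch(np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
#                                  [2., 1.], [2.1, 1.1], [0., 0.]]), 1, 2)
# yields the two seeds {(1, 1), (2, 1)} from the ground truth above.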
@pytest.mark.parametrize('max_iter', [1, 100])
def test_max_iter(max_iter):
clusters1, _ = mean_shift(X, max_iter=max_iter)
ms = MeanShift(max_iter=max_iter).fit(X)
clusters2 = ms.cluster_centers_
assert ms.n_iter_ <= ms.max_iter
assert len(clusters1) == len(clusters2)
for c1, c2 in zip(clusters1, clusters2):
assert np.allclose(c1, c2)
def test_mean_shift_zero_bandwidth():
# Check that mean shift works when the estimated bandwidth is 0.
X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1)
# estimate_bandwidth with default args returns 0 on this dataset
bandwidth = estimate_bandwidth(X)
assert bandwidth == 0
# get_bin_seeds with a 0 bin_size should return the dataset itself
assert get_bin_seeds(X, bin_size=bandwidth) is X
# MeanShift with binning and a 0 estimated bandwidth should be equivalent
# to no binning.
ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
ms_nobinning = MeanShift(bin_seeding=False).fit(X)
expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])
assert v_measure_score(ms_binning.labels_, expected_labels) == 1
assert v_measure_score(ms_nobinning.labels_, expected_labels) == 1
assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_)

View file

@ -0,0 +1,429 @@
# Authors: Shane Grigsby <refuge@rocktalus.com>
# Adrin Jalali <adrin.jalali@gmail.com>
# License: BSD 3 clause
import numpy as np
import pytest
from sklearn.datasets import make_blobs
from sklearn.cluster import OPTICS
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import DBSCAN
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_allclose
from sklearn.cluster.tests.common import generate_clustered_data
rng = np.random.RandomState(0)
n_points_per_cluster = 10
C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))
@pytest.mark.parametrize(
('r_plot', 'end'),
[[[10, 8.9, 8.8, 8.7, 7, 10], 3],
[[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
[[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
[[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
])
def test_extend_downward(r_plot, end):
r_plot = np.array(r_plot)
ratio = r_plot[:-1] / r_plot[1:]
steep_downward = ratio >= 1 / .9
upward = ratio < 1
e = _extend_region(steep_downward, upward, 0, 2)
assert e == end
@pytest.mark.parametrize(
('r_plot', 'end'),
[[[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6],
[[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0],
[[1, 2, 2.1, 2, np.inf], 0],
[[1, 2, 2.1, np.inf], 2],
])
def test_extend_upward(r_plot, end):
r_plot = np.array(r_plot)
ratio = r_plot[:-1] / r_plot[1:]
steep_upward = ratio <= .9
downward = ratio > 1
e = _extend_region(steep_upward, downward, 0, 2)
assert e == end
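# Note on the fixtures above (informal): these thresholds correspond to
# xi = 0.1. Writing ratio = r[i] / r[i + 1], "steep downward"
# (ratio >= 1 / 0.9) is equivalent to r[i + 1] <= (1 - xi) * r[i], and
# "steep upward" (ratio <= 0.9) to r[i] <= (1 - xi) * r[i + 1].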
@pytest.mark.parametrize(
('ordering', 'clusters', 'expected'),
[[[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]],
[[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]],
[[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]],
[[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]],
])
def test_the_extract_xi_labels(ordering, clusters, expected):
labels = _extract_xi_labels(ordering, clusters)
assert_array_equal(labels, expected)
def test_extract_xi():
# small and easy test (no clusters around other clusters)
# but with clear noise data.
rng = np.random.RandomState(0)
n_points_per_cluster = 5
C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
C5 = [3, -2] + .6 * rng.randn(n_points_per_cluster, 2)
C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5,
-1, [4] * 5]
X, expected_labels = shuffle(X, expected_labels, random_state=rng)
clust = OPTICS(min_samples=3, min_cluster_size=2,
max_eps=20, cluster_method='xi',
xi=0.4).fit(X)
assert_array_equal(clust.labels_, expected_labels)
# check float min_samples and min_cluster_size
clust = OPTICS(min_samples=0.1, min_cluster_size=0.08,
max_eps=20, cluster_method='xi',
xi=0.4).fit(X)
assert_array_equal(clust.labels_, expected_labels)
X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
-1, -1, [4] * 5]
X, expected_labels = shuffle(X, expected_labels, random_state=rng)
clust = OPTICS(min_samples=3, min_cluster_size=3,
max_eps=20, cluster_method='xi',
xi=0.3).fit(X)
# this may fail if the predecessor correction is not at work!
assert_array_equal(clust.labels_, expected_labels)
C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]]
C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
X = np.vstack((C1, C2, C3))
expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
X, expected_labels = shuffle(X, expected_labels, random_state=rng)
clust = OPTICS(min_samples=2, min_cluster_size=2,
max_eps=np.inf, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
def test_cluster_hierarchy_():
rng = np.random.RandomState(0)
n_points_per_cluster = 100
C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2))
X = shuffle(X, random_state=0)
clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_
assert clusters.shape == (2, 2)
diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
assert diff / len(X) < 0.05
def test_correct_number_of_clusters():
# in 'auto' mode
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
# Parameters chosen specifically for this task.
# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
clust.fit(X)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
assert n_clusters_1 == n_clusters
# check attribute types and sizes
assert clust.labels_.shape == (len(X),)
assert clust.labels_.dtype.kind == 'i'
assert clust.reachability_.shape == (len(X),)
assert clust.reachability_.dtype.kind == 'f'
assert clust.core_distances_.shape == (len(X),)
assert clust.core_distances_.dtype.kind == 'f'
assert clust.ordering_.shape == (len(X),)
assert clust.ordering_.dtype.kind == 'i'
assert set(clust.ordering_) == set(range(len(X)))
def test_minimum_number_of_sample_check():
# test that we check a minimum number of samples
msg = "min_samples must be no greater than"
# Compute OPTICS
X = [[1, 1]]
clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)
# Run the fit
assert_raise_message(ValueError, msg, clust.fit, X)
def test_bad_extract():
# Test an extraction of eps too close to original eps
msg = "Specify an epsilon smaller than 0.15. Got 0.3."
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 0.03,
cluster_method='dbscan',
eps=0.3, min_samples=10)
assert_raise_message(ValueError, msg, clust.fit, X)
def test_bad_reachability():
msg = "All reachability values are inf. Set a larger max_eps."
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
with pytest.warns(UserWarning, match=msg):
clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
clust.fit(X)
def test_close_extract():
# Test extract where extraction eps is close to scaled max_eps
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
# Compute OPTICS
clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
eps=0.3, min_samples=10).fit(X)
# Cluster labels start at 0, so a maximum label of 2 means 3 clusters
assert max(clust.labels_) == 2
@pytest.mark.parametrize('eps', [0.1, .3, .5])
@pytest.mark.parametrize('min_samples', [3, 10, 20])
def test_dbscan_optics_parity(eps, min_samples):
# Test that OPTICS clustering labels differ from DBSCAN labels by <= 5%
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
# calculate optics with dbscan extract at 0.3 epsilon
op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
eps=eps).fit(X)
# calculate dbscan labels
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
contingency = contingency_matrix(db.labels_, op.labels_)
agree = min(np.sum(np.max(contingency, axis=0)),
np.sum(np.max(contingency, axis=1)))
disagree = X.shape[0] - agree
percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
# verify label mismatch is <= 5% labels
assert percent_mismatch <= 0.05
def test_min_samples_edge_case():
C1 = [[0, 0], [0, 0.1], [0, -.1]]
C2 = [[10, 10], [10, 9], [10, 11]]
C3 = [[100, 100], [100, 96], [100, 106]]
X = np.vstack((C1, C2, C3))
expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
clust = OPTICS(min_samples=3,
max_eps=7, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
clust = OPTICS(min_samples=3,
max_eps=3, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
expected_labels = np.r_[[-1] * 9]
with pytest.warns(UserWarning, match="All reachability values"):
clust = OPTICS(min_samples=4,
max_eps=3, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
# try arbitrary minimum sizes
@pytest.mark.parametrize('min_cluster_size', range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size):
redX = X[::2] # reduce for speed
clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX)
cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
if cluster_sizes.size:
assert min(cluster_sizes) >= min_cluster_size
# check behaviour is the same when min_cluster_size is a fraction
clust_frac = OPTICS(min_samples=9,
min_cluster_size=min_cluster_size / redX.shape[0])
clust_frac.fit(redX)
assert_array_equal(clust.labels_, clust_frac.labels_)
@pytest.mark.parametrize('min_cluster_size', [0, -1, 1.1, 2.2])
def test_min_cluster_size_invalid(min_cluster_size):
clust = OPTICS(min_cluster_size=min_cluster_size)
with pytest.raises(ValueError, match="must be a positive integer or a "):
clust.fit(X)
def test_min_cluster_size_invalid2():
clust = OPTICS(min_cluster_size=len(X) + 1)
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(X)
def test_processing_order():
# Ensure that we consider all unprocessed points, not only direct
# neighbors, when picking the next point.
Y = [[0], [10], [-10], [25]]
clust = OPTICS(min_samples=3, max_eps=15).fit(Y)
assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
assert_array_equal(clust.ordering_, [0, 1, 2, 3])
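# Informal walk-through of the expected values above (Euclidean distances,
# min_samples=3, so the core distance is the distance to the 3rd-nearest
# neighbour, the point itself included): core(0) = 10 and core(10) = 15,
# while -10 and 25 have their 3rd neighbour beyond max_eps=15, hence inf.
# Expanding from 0 gives both 10 and -10 a reachability of
# max(core(0), distance) = 10; expanding from 10 then sets the reachability
# of 25 to max(core(10), 15) = 15, i.e. 25 is reached through 10 rather than
# through a direct neighbour of the start point.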
def test_compare_to_ELKI():
# Expected values, computed with (future) ELKI 0.7.5 using:
# java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
# -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
# where the FixedDBIDsFilter gives 0-indexed ids.
r1 = [np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
0.7290174038973836, 0.7290174038973836, 0.6861627576116127,
0.7587934993548423, 0.9280118450166668, 1.1748022534146194,
3.3355455741292257, 0.49618389254482587, 0.2552805046961355,
0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
0.24944622248445714, 0.2552805046961355, 0.2552805046961355,
0.3086779122185853, 4.163024452756142, 1.623152630340929,
0.45315840475822655, 0.25468325192031926, 0.2254004358159971,
0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
0.18765711877083036, 0.18765711877083036, 0.2240202988740153,
1.154337614548715, 1.342604473837069, 1.323308536402633,
0.8607514948648837, 0.27219111215810565, 0.13260875220533205,
0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
0.17575920159442388, 0.27219111215810565, 0.6101447895405373,
1.3189208094864302, 1.323308536402633, 2.2509184159764577,
2.4517810628594527, 3.675977064404973, 3.8264795626020365,
2.9130735341510614, 2.9130735341510614, 2.9130735341510614,
2.9130735341510614, 2.8459300127258036, 2.8459300127258036,
2.8459300127258036, 3.0321982337972537]
o1 = [0, 3, 6, 4, 7, 8, 2, 9, 5, 1, 31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
44, 21, 23, 24, 22, 25, 27, 29, 26, 28, 20, 40, 45, 46, 10, 15, 11,
13, 17, 19, 18, 12, 16, 14, 47, 49, 43, 48, 42, 41, 53, 57, 51, 52,
56, 59, 54, 55, 58, 50]
p1 = [-1, 0, 3, 6, 6, 6, 8, 3, 7, 5, 1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
36, 44, 21, 23, 24, 22, 25, 25, 22, 22, 22, 21, 40, 45, 46, 10, 15,
15, 13, 13, 15, 11, 19, 15, 10, 47, 12, 45, 14, 43, 42, 53, 57, 57,
57, 57, 59, 59, 59, 58]
# Tests against known extraction array
# Does NOT work with metric='euclidean', because sklearn euclidean has
# worse numeric precision. 'minkowski' is slower but more accurate.
clust1 = OPTICS(min_samples=5).fit(X)
assert_array_equal(clust1.ordering_, np.array(o1))
assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1))
# ELKI currently does not print the core distances (which are not used much
# in the literature), but we can at least ensure this consistency:
for i in clust1.ordering_[1:]:
assert (clust1.reachability_[i] >=
clust1.core_distances_[clust1.predecessor_[i]])
# Expected values, computed with (future) ELKI 0.7.5 using
r2 = [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
np.inf, np.inf, np.inf, 0.27219111215810565, 0.13260875220533205,
0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
0.17575920159442388, 0.27219111215810565, 0.4928068613197889,
np.inf, 0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
np.inf, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
np.inf, np.inf]
o2 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
47, 46, 20, 22, 25, 23, 27, 29, 24, 26, 28, 21, 30, 32, 34, 33, 38,
39, 35, 37, 36, 31, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59]
p2 = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 15, 15, 13, 13, 15,
11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1]
clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
assert_array_equal(clust2.ordering_, np.array(o2))
assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))
assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2))
index = np.where(clust1.core_distances_ <= 0.5)[0]
assert_allclose(clust1.core_distances_[index],
clust2.core_distances_[index])
def test_wrong_cluster_method():
clust = OPTICS(cluster_method='superfancy')
with pytest.raises(ValueError, match="cluster_method should be one of "):
clust.fit(X)
def test_extract_dbscan():
# testing an easy dbscan case. Not including clusters with different
# densities.
rng = np.random.RandomState(0)
n_points_per_cluster = 20
C1 = [-5, -2] + .2 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .2 * rng.randn(n_points_per_cluster, 2)
C3 = [1, 2] + .2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4))
clust = OPTICS(cluster_method='dbscan', eps=.5).fit(X)
assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])
def test_precomputed_dists():
redX = X[::2]
dists = pairwise_distances(redX, metric='euclidean')
clust1 = OPTICS(min_samples=10, algorithm='brute',
metric='precomputed').fit(dists)
clust2 = OPTICS(min_samples=10, algorithm='brute',
metric='euclidean').fit(redX)
assert_allclose(clust1.reachability_, clust2.reachability_)
assert_array_equal(clust1.labels_, clust2.labels_)

View file

@ -0,0 +1,250 @@
"""Testing for Spectral Clustering methods"""
import numpy as np
from scipy import sparse
import pytest
import pickle
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.cluster import SpectralClustering, spectral_clustering
from sklearn.cluster._spectral import discretize
from sklearn.feature_extraction import img_to_graph
from sklearn.metrics import pairwise_distances
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_blobs
try:
from pyamg import smoothed_aggregation_solver # noqa
amg_loaded = True
except ImportError:
amg_loaded = False
@pytest.mark.parametrize('eigen_solver', ('arpack', 'lobpcg'))
@pytest.mark.parametrize('assign_labels', ('kmeans', 'discretize'))
def test_spectral_clustering(eigen_solver, assign_labels):
S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
[0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]])
for mat in (S, sparse.csr_matrix(S)):
model = SpectralClustering(random_state=0, n_clusters=2,
affinity='precomputed',
eigen_solver=eigen_solver,
assign_labels=assign_labels
).fit(mat)
labels = model.labels_
if labels[0] == 0:
labels = 1 - labels
assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1
model_copy = pickle.loads(pickle.dumps(model))
assert model_copy.n_clusters == model.n_clusters
assert model_copy.eigen_solver == model.eigen_solver
assert_array_equal(model_copy.labels_, model.labels_)
def test_spectral_unknown_mode():
# Test that SpectralClustering fails with an unknown mode set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
with pytest.raises(ValueError):
spectral_clustering(S, n_clusters=2, random_state=0,
eigen_solver="<unknown>")
def test_spectral_unknown_assign_labels():
# Test that SpectralClustering fails with an unknown assign_labels set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
with pytest.raises(ValueError):
spectral_clustering(S, n_clusters=2, random_state=0,
assign_labels="<unknown>")
def test_spectral_clustering_sparse():
X, y = make_blobs(n_samples=20, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
S = rbf_kernel(X, gamma=1)
S = np.maximum(S - 1e-4, 0)
S = sparse.coo_matrix(S)
labels = SpectralClustering(random_state=0, n_clusters=2,
affinity='precomputed').fit(S).labels_
assert adjusted_rand_score(y, labels) == 1
def test_precomputed_nearest_neighbors_filtering():
# Test precomputed graph filtering when containing too many neighbors
X, y = make_blobs(n_samples=200, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
n_neighbors = 2
results = []
for additional_neighbors in [0, 10]:
nn = NearestNeighbors(
n_neighbors=n_neighbors + additional_neighbors).fit(X)
graph = nn.kneighbors_graph(X, mode='connectivity')
labels = SpectralClustering(random_state=0, n_clusters=2,
affinity='precomputed_nearest_neighbors',
n_neighbors=n_neighbors).fit(graph).labels_
results.append(labels)
assert_array_equal(results[0], results[1])
def test_affinities():
# Note: in the following, random_state has been selected to have
# a dataset that yields a stable eigen decomposition both when built
# on OSX and Linux
X, y = make_blobs(n_samples=20, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
# nearest neighbors affinity
sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
random_state=0)
assert_warns_message(UserWarning, 'not fully connected', sp.fit, X)
assert adjusted_rand_score(y, sp.labels_) == 1
sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
labels = sp.fit(X).labels_
assert adjusted_rand_score(y, labels) == 1
X = check_random_state(10).rand(10, 5) * 10
kernels_available = kernel_metrics()
for kern in kernels_available:
# Additive chi^2 gives a negative similarity matrix which
# doesn't make sense for spectral clustering
if kern != 'additive_chi2':
sp = SpectralClustering(n_clusters=2, affinity=kern,
random_state=0)
labels = sp.fit(X).labels_
assert (X.shape[0],) == labels.shape
sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1,
random_state=0)
labels = sp.fit(X).labels_
assert (X.shape[0],) == labels.shape
def histogram(x, y, **kwargs):
# Histogram kernel implemented as a callable.
assert kwargs == {} # no kernel_params that we didn't ask for
return np.minimum(x, y).sum()
sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
labels = sp.fit(X).labels_
assert (X.shape[0],) == labels.shape
# raise error on unknown affinity
sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
with pytest.raises(ValueError):
sp.fit(X)
@pytest.mark.parametrize('n_samples', [50, 100, 150, 500])
def test_discretize(n_samples):
# Test discretize using a noisy assignment matrix
random_state = np.random.RandomState(seed=8)
for n_class in range(2, 10):
# random class labels
y_true = random_state.randint(0, n_class + 1, n_samples)
y_true = np.array(y_true, np.float)
# noise class assignment matrix
y_indicator = sparse.coo_matrix((np.ones(n_samples),
(np.arange(n_samples),
y_true)),
shape=(n_samples,
n_class + 1))
y_true_noisy = (y_indicator.toarray()
+ 0.1 * random_state.randn(n_samples,
n_class + 1))
y_pred = discretize(y_true_noisy, random_state=random_state)
assert adjusted_rand_score(y_true, y_pred) > 0.8
# TODO: Remove when pyamg replaces the sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*")
def test_spectral_clustering_with_arpack_amg_solvers():
# Test that spectral_clustering is the same for arpack and amg solver
# Based on toy example from plot_segmentation_toy.py
# a small two coin image
x, y = np.indices((40, 40))
center1, center2 = (14, 12), (20, 25)
radius1, radius2 = 8, 7
circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2
circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2
circles = circle1 | circle2
mask = circles.copy()
img = circles.astype(float)
graph = img_to_graph(img, mask=mask)
graph.data = np.exp(-graph.data / graph.data.std())
labels_arpack = spectral_clustering(
graph, n_clusters=2, eigen_solver='arpack', random_state=0)
assert len(np.unique(labels_arpack)) == 2
if amg_loaded:
labels_amg = spectral_clustering(
graph, n_clusters=2, eigen_solver='amg', random_state=0)
assert adjusted_rand_score(labels_arpack, labels_amg) == 1
else:
with pytest.raises(ValueError):
spectral_clustering(graph, n_clusters=2, eigen_solver='amg',
random_state=0)
def test_n_components():
# Test that passing n_components changes the result, and that
# n_components = n_clusters by default
X, y = make_blobs(n_samples=20, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
sp = SpectralClustering(n_clusters=2, random_state=0)
labels = sp.fit(X).labels_
# set n_components = n_clusters and test that the result is the same
labels_same_ncomp = SpectralClustering(n_clusters=2, n_components=2,
random_state=0).fit(X).labels_
# test that n_components=n_clusters by default
assert_array_equal(labels, labels_same_ncomp)
# test that n_components affects the result
# (n_clusters=8 by default; here we set n_components=2)
labels_diff_ncomp = SpectralClustering(n_components=2,
random_state=0).fit(X).labels_
assert not np.array_equal(labels, labels_diff_ncomp)