Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@ -0,0 +1,28 @@
"""
Common utilities for testing clustering.
"""
import numpy as np
###############################################################################
# Generate sample data
def generate_clustered_data(seed=0, n_clusters=3, n_features=2,
n_samples_per_cluster=20, std=.4):
prng = np.random.RandomState(seed)
# the data is voluntary shifted away from zero to check clustering
# algorithm robustness with regards to non centered data
means = np.array([[1, 1, 1, 0],
[-1, -1, 0, 1],
[1, -1, 1, 1],
[-1, 1, 1, 0],
]) + 10
X = np.empty((0, n_features))
for i in range(n_clusters):
X = np.r_[X, means[i][:n_features]
+ std * prng.randn(n_samples_per_cluster, n_features)]
return X

View file

@ -0,0 +1,246 @@
"""
Testing for Clustering methods
"""
import numpy as np
import pytest
from scipy.sparse import csr_matrix
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import (
assert_array_equal, assert_warns,
assert_warns_message, assert_no_warnings)
from sklearn.cluster import AffinityPropagation
from sklearn.cluster._affinity_propagation import (
_equal_similarities_and_preferences
)
from sklearn.cluster import affinity_propagation
from sklearn.datasets import make_blobs
from sklearn.metrics import euclidean_distances
n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=60, n_features=2, centers=centers,
cluster_std=0.4, shuffle=True, random_state=0)
def test_affinity_propagation():
# Affinity Propagation algorithm
# Compute similarities
S = -euclidean_distances(X, squared=True)
preference = np.median(S) * 10
# Compute Affinity Propagation
cluster_centers_indices, labels = affinity_propagation(
S, preference=preference, random_state=39)
n_clusters_ = len(cluster_centers_indices)
assert n_clusters == n_clusters_
af = AffinityPropagation(preference=preference, affinity="precomputed",
random_state=28)
labels_precomputed = af.fit(S).labels_
af = AffinityPropagation(preference=preference, verbose=True,
random_state=37)
labels = af.fit(X).labels_
assert_array_equal(labels, labels_precomputed)
cluster_centers_indices = af.cluster_centers_indices_
n_clusters_ = len(cluster_centers_indices)
assert np.unique(labels).size == n_clusters_
assert n_clusters == n_clusters_
# Test also with no copy
_, labels_no_copy = affinity_propagation(S, preference=preference,
copy=False, random_state=74)
assert_array_equal(labels, labels_no_copy)
# Test input validation
with pytest.raises(ValueError):
affinity_propagation(S[:, :-1])
with pytest.raises(ValueError):
affinity_propagation(S, damping=0)
af = AffinityPropagation(affinity="unknown", random_state=78)
with pytest.raises(ValueError):
af.fit(X)
af_2 = AffinityPropagation(affinity='precomputed', random_state=21)
with pytest.raises(TypeError):
af_2.fit(csr_matrix((3, 3)))
def test_affinity_propagation_predict():
# Test AffinityPropagation.predict
af = AffinityPropagation(affinity="euclidean", random_state=63)
labels = af.fit_predict(X)
labels2 = af.predict(X)
assert_array_equal(labels, labels2)
def test_affinity_propagation_predict_error():
# Test exception in AffinityPropagation.predict
# Not fitted.
af = AffinityPropagation(affinity="euclidean")
with pytest.raises(ValueError):
af.predict(X)
# Predict not supported when affinity="precomputed".
S = np.dot(X, X.T)
af = AffinityPropagation(affinity="precomputed", random_state=57)
af.fit(S)
with pytest.raises(ValueError):
af.predict(X)
def test_affinity_propagation_fit_non_convergence():
# In case of non-convergence of affinity_propagation(), the cluster
# centers should be an empty array and training samples should be labelled
# as noise (-1)
X = np.array([[0, 0], [1, 1], [-2, -2]])
# Force non-convergence by allowing only a single iteration
af = AffinityPropagation(preference=-10, max_iter=1, random_state=82)
assert_warns(ConvergenceWarning, af.fit, X)
assert_array_equal(np.empty((0, 2)), af.cluster_centers_)
assert_array_equal(np.array([-1, -1, -1]), af.labels_)
def test_affinity_propagation_equal_mutual_similarities():
X = np.array([[-1, 1], [1, -1]])
S = -euclidean_distances(X, squared=True)
# setting preference > similarity
cluster_center_indices, labels = assert_warns_message(
UserWarning, "mutually equal", affinity_propagation, S, preference=0)
# expect every sample to become an exemplar
assert_array_equal([0, 1], cluster_center_indices)
assert_array_equal([0, 1], labels)
# setting preference < similarity
cluster_center_indices, labels = assert_warns_message(
UserWarning, "mutually equal", affinity_propagation, S, preference=-10)
# expect one cluster, with arbitrary (first) sample as exemplar
assert_array_equal([0], cluster_center_indices)
assert_array_equal([0, 0], labels)
# setting different preferences
cluster_center_indices, labels = assert_no_warnings(
affinity_propagation, S, preference=[-20, -10], random_state=37)
# expect one cluster, with highest-preference sample as exemplar
assert_array_equal([1], cluster_center_indices)
assert_array_equal([0, 0], labels)
def test_affinity_propagation_predict_non_convergence():
# In case of non-convergence of affinity_propagation(), the cluster
# centers should be an empty array
X = np.array([[0, 0], [1, 1], [-2, -2]])
# Force non-convergence by allowing only a single iteration
af = assert_warns(ConvergenceWarning,
AffinityPropagation(preference=-10,
max_iter=1, random_state=75).fit, X)
# At prediction time, consider new samples as noise since there are no
# clusters
to_predict = np.array([[2, 2], [3, 3], [4, 4]])
y = assert_warns(ConvergenceWarning, af.predict, to_predict)
assert_array_equal(np.array([-1, -1, -1]), y)
def test_affinity_propagation_non_convergence_regressiontest():
X = np.array([[1, 0, 0, 0, 0, 0],
[0, 1, 1, 1, 0, 0],
[0, 0, 1, 0, 0, 1]])
af = AffinityPropagation(affinity='euclidean',
max_iter=2, random_state=34).fit(X)
assert_array_equal(np.array([-1, -1, -1]), af.labels_)
def test_equal_similarities_and_preferences():
# Unequal distances
X = np.array([[0, 0], [1, 1], [-2, -2]])
S = -euclidean_distances(X, squared=True)
assert not _equal_similarities_and_preferences(S, np.array(0))
assert not _equal_similarities_and_preferences(S, np.array([0, 0]))
assert not _equal_similarities_and_preferences(S, np.array([0, 1]))
# Equal distances
X = np.array([[0, 0], [1, 1]])
S = -euclidean_distances(X, squared=True)
# Different preferences
assert not _equal_similarities_and_preferences(S, np.array([0, 1]))
# Same preferences
assert _equal_similarities_and_preferences(S, np.array([0, 0]))
assert _equal_similarities_and_preferences(S, np.array(0))
def test_affinity_propagation_random_state():
# Significance of random_state parameter
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=300, centers=centers,
cluster_std=0.5, random_state=0)
# random_state = 0
ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=0)
ap.fit(X)
centers0 = ap.cluster_centers_
# random_state = 76
ap = AffinityPropagation(convergence_iter=1, max_iter=2, random_state=76)
ap.fit(X)
centers76 = ap.cluster_centers_
assert np.mean((centers0 - centers76) ** 2) > 1
# FIXME: to be removed in 0.25
def test_affinity_propagation_random_state_warning():
# test that a warning is raised when random_state is not defined.
X = np.array([[0, 0], [1, 1], [-2, -2]])
match = ("'random_state' has been introduced in 0.23. "
"It will be set to None starting from 0.25 which "
"means that results will differ at every function "
"call. Set 'random_state' to None to silence this "
"warning, or to 0 to keep the behavior of versions "
"<0.23.")
with pytest.warns(FutureWarning, match=match):
AffinityPropagation().fit(X)
@pytest.mark.parametrize('centers', [csr_matrix(np.zeros((1, 10))),
np.zeros((1, 10))])
def test_affinity_propagation_convergence_warning_dense_sparse(centers):
"""Non-regression, see #13334"""
rng = np.random.RandomState(42)
X = rng.rand(40, 10)
y = (4 * rng.rand(40)).astype(np.int)
ap = AffinityPropagation(random_state=46)
ap.fit(X, y)
ap.cluster_centers_ = centers
with pytest.warns(None) as record:
assert_array_equal(ap.predict(X),
np.zeros(X.shape[0], dtype=int))
assert len(record) == 0
def test_affinity_propagation_float32():
# Test to fix incorrect clusters due to dtype change
# (non-regression test for issue #10832)
X = np.array([[1, 0, 0, 0],
[0, 1, 1, 0],
[0, 1, 1, 0],
[0, 0, 0, 1]], dtype='float32')
afp = AffinityPropagation(preference=1, affinity='precomputed',
random_state=0).fit(X)
expected = np.array([0, 1, 1, 2])
assert_array_equal(afp.labels_, expected)

View file

@ -0,0 +1,277 @@
"""Testing for Spectral Biclustering methods"""
import numpy as np
import pytest
from scipy.sparse import csr_matrix, issparse
from sklearn.model_selection import ParameterGrid
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.base import BaseEstimator, BiclusterMixin
from sklearn.cluster import SpectralCoclustering
from sklearn.cluster import SpectralBiclustering
from sklearn.cluster._bicluster import _scale_normalize
from sklearn.cluster._bicluster import _bistochastic_normalize
from sklearn.cluster._bicluster import _log_normalize
from sklearn.metrics import (consensus_score, v_measure_score)
from sklearn.datasets import make_biclusters, make_checkerboard
class MockBiclustering(BiclusterMixin, BaseEstimator):
# Mock object for testing get_submatrix.
def __init__(self):
pass
def get_indices(self, i):
# Overridden to reproduce old get_submatrix test.
return (np.where([True, True, False, False, True])[0],
np.where([False, False, True, True])[0])
def test_get_submatrix():
data = np.arange(20).reshape(5, 4)
model = MockBiclustering()
for X in (data, csr_matrix(data), data.tolist()):
submatrix = model.get_submatrix(0, X)
if issparse(submatrix):
submatrix = submatrix.toarray()
assert_array_equal(submatrix, [[2, 3],
[6, 7],
[18, 19]])
submatrix[:] = -1
if issparse(X):
X = X.toarray()
assert np.all(X != -1)
def _test_shape_indices(model):
# Test get_shape and get_indices on fitted model.
for i in range(model.n_clusters):
m, n = model.get_shape(i)
i_ind, j_ind = model.get_indices(i)
assert len(i_ind) == m
assert len(j_ind) == n
def test_spectral_coclustering():
# Test Dhillon's Spectral CoClustering on a simple problem.
param_grid = {'svd_method': ['randomized', 'arpack'],
'n_svd_vecs': [None, 20],
'mini_batch': [False, True],
'init': ['k-means++'],
'n_init': [10]}
random_state = 0
S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
random_state=random_state)
S -= S.min() # needs to be nonnegative before making it sparse
S = np.where(S < 1, 0, S) # threshold some values
for mat in (S, csr_matrix(S)):
for kwargs in ParameterGrid(param_grid):
model = SpectralCoclustering(n_clusters=3,
random_state=random_state,
**kwargs)
model.fit(mat)
assert model.rows_.shape == (3, 30)
assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
_test_shape_indices(model)
def test_spectral_biclustering():
# Test Kluger methods on a checkerboard dataset.
S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
random_state=0)
non_default_params = {'method': ['scale', 'log'],
'svd_method': ['arpack'],
'n_svd_vecs': [20],
'mini_batch': [True]}
for mat in (S, csr_matrix(S)):
for param_name, param_values in non_default_params.items():
for param_value in param_values:
model = SpectralBiclustering(
n_clusters=3,
n_init=3,
init='k-means++',
random_state=0,
)
model.set_params(**dict([(param_name, param_value)]))
if issparse(mat) and model.get_params().get('method') == 'log':
# cannot take log of sparse matrix
with pytest.raises(ValueError):
model.fit(mat)
continue
else:
model.fit(mat)
assert model.rows_.shape == (9, 30)
assert model.columns_.shape == (9, 30)
assert_array_equal(model.rows_.sum(axis=0),
np.repeat(3, 30))
assert_array_equal(model.columns_.sum(axis=0),
np.repeat(3, 30))
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
_test_shape_indices(model)
def _do_scale_test(scaled):
"""Check that rows sum to one constant, and columns to another."""
row_sum = scaled.sum(axis=1)
col_sum = scaled.sum(axis=0)
if issparse(scaled):
row_sum = np.asarray(row_sum).squeeze()
col_sum = np.asarray(col_sum).squeeze()
assert_array_almost_equal(row_sum, np.tile(row_sum.mean(), 100),
decimal=1)
assert_array_almost_equal(col_sum, np.tile(col_sum.mean(), 100),
decimal=1)
def _do_bistochastic_test(scaled):
"""Check that rows and columns sum to the same constant."""
_do_scale_test(scaled)
assert_almost_equal(scaled.sum(axis=0).mean(),
scaled.sum(axis=1).mean(),
decimal=1)
def test_scale_normalize():
generator = np.random.RandomState(0)
X = generator.rand(100, 100)
for mat in (X, csr_matrix(X)):
scaled, _, _ = _scale_normalize(mat)
_do_scale_test(scaled)
if issparse(mat):
assert issparse(scaled)
def test_bistochastic_normalize():
generator = np.random.RandomState(0)
X = generator.rand(100, 100)
for mat in (X, csr_matrix(X)):
scaled = _bistochastic_normalize(mat)
_do_bistochastic_test(scaled)
if issparse(mat):
assert issparse(scaled)
def test_log_normalize():
# adding any constant to a log-scaled matrix should make it
# bistochastic
generator = np.random.RandomState(0)
mat = generator.rand(100, 100)
scaled = _log_normalize(mat) + 1
_do_bistochastic_test(scaled)
def test_fit_best_piecewise():
model = SpectralBiclustering(random_state=0)
vectors = np.array([[0, 0, 0, 1, 1, 1],
[2, 2, 2, 3, 3, 3],
[0, 1, 2, 3, 4, 5]])
best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
assert_array_equal(best, vectors[:2])
def test_project_and_cluster():
model = SpectralBiclustering(random_state=0)
data = np.array([[1, 1, 1],
[1, 1, 1],
[3, 6, 3],
[3, 6, 3]])
vectors = np.array([[1, 0],
[0, 1],
[0, 0]])
for mat in (data, csr_matrix(data)):
labels = model._project_and_cluster(mat, vectors,
n_clusters=2)
assert_almost_equal(v_measure_score(labels, [0, 0, 1, 1]), 1.0)
def test_perfect_checkerboard():
# XXX Previously failed on build bot (not reproducible)
model = SpectralBiclustering(3, svd_method="arpack", random_state=0)
S, rows, cols = make_checkerboard((30, 30), 3, noise=0,
random_state=0)
model.fit(S)
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
S, rows, cols = make_checkerboard((40, 30), 3, noise=0,
random_state=0)
model.fit(S)
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
S, rows, cols = make_checkerboard((30, 40), 3, noise=0,
random_state=0)
model.fit(S)
assert consensus_score(model.biclusters_,
(rows, cols)) == 1
@pytest.mark.parametrize(
"args",
[{'n_clusters': (3, 3, 3)},
{'n_clusters': 'abc'},
{'n_clusters': (3, 'abc')},
{'method': 'unknown'},
{'n_components': 0},
{'n_best': 0},
{'svd_method': 'unknown'},
{'n_components': 3, 'n_best': 4}]
)
def test_errors(args):
data = np.arange(25).reshape((5, 5))
model = SpectralBiclustering(**args)
with pytest.raises(ValueError):
model.fit(data)
def test_wrong_shape():
model = SpectralBiclustering()
data = np.arange(27).reshape((3, 3, 3))
with pytest.raises(ValueError):
model.fit(data)
@pytest.mark.parametrize('est',
(SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):
X, _, _ = make_biclusters((3, 3), 3, random_state=0)
assert not hasattr(est, 'n_features_in_')
est.fit(X)
assert est.n_features_in_ == 3
@pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering])
@pytest.mark.parametrize("n_jobs", [None, 1])
def test_n_jobs_deprecated(klass, n_jobs):
# FIXME: remove in 0.25
depr_msg = ("'n_jobs' was deprecated in version 0.23 and will be removed "
"in 0.25.")
S, _, _ = make_biclusters((30, 30), 3, noise=0.5, random_state=0)
est = klass(random_state=0, n_jobs=n_jobs)
with pytest.warns(FutureWarning, match=depr_msg):
est.fit(S)

View file

@ -0,0 +1,169 @@
"""
Tests for the birch clustering algorithm.
"""
from scipy import sparse
import numpy as np
import pytest
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.cluster import Birch
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNet
from sklearn.metrics import pairwise_distances_argmin, v_measure_score
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
def test_n_samples_leaves_roots():
# Sanity check for the number of samples in leaves and roots
X, y = make_blobs(n_samples=10)
brc = Birch()
brc.fit(X)
n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
for sc in leaf.subclusters_])
assert n_samples_leaves == X.shape[0]
assert n_samples_root == X.shape[0]
def test_partial_fit():
# Test that fit is equivalent to calling partial_fit multiple times
X, y = make_blobs(n_samples=100)
brc = Birch(n_clusters=3)
brc.fit(X)
brc_partial = Birch(n_clusters=None)
brc_partial.partial_fit(X[:50])
brc_partial.partial_fit(X[50:])
assert_array_almost_equal(brc_partial.subcluster_centers_,
brc.subcluster_centers_)
# Test that same global labels are obtained after calling partial_fit
# with None
brc_partial.set_params(n_clusters=3)
brc_partial.partial_fit(None)
assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
def test_birch_predict():
# Test the predict method predicts the nearest centroid.
rng = np.random.RandomState(0)
X = generate_clustered_data(n_clusters=3, n_features=3,
n_samples_per_cluster=10)
# n_samples * n_samples_per_cluster
shuffle_indices = np.arange(30)
rng.shuffle(shuffle_indices)
X_shuffle = X[shuffle_indices, :]
brc = Birch(n_clusters=4, threshold=1.)
brc.fit(X_shuffle)
centroids = brc.subcluster_centers_
assert_array_equal(brc.labels_, brc.predict(X_shuffle))
nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
def test_n_clusters():
# Test that n_clusters param works properly
X, y = make_blobs(n_samples=100, centers=10)
brc1 = Birch(n_clusters=10)
brc1.fit(X)
assert len(brc1.subcluster_centers_) > 10
assert len(np.unique(brc1.labels_)) == 10
# Test that n_clusters = Agglomerative Clustering gives
# the same results.
gc = AgglomerativeClustering(n_clusters=10)
brc2 = Birch(n_clusters=gc)
brc2.fit(X)
assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
assert_array_equal(brc1.labels_, brc2.labels_)
# Test that the wrong global clustering step raises an Error.
clf = ElasticNet()
brc3 = Birch(n_clusters=clf)
with pytest.raises(ValueError):
brc3.fit(X)
# Test that a small number of clusters raises a warning.
brc4 = Birch(threshold=10000.)
assert_warns(ConvergenceWarning, brc4.fit, X)
def test_sparse_X():
# Test that sparse and dense data give same results
X, y = make_blobs(n_samples=100, centers=10)
brc = Birch(n_clusters=10)
brc.fit(X)
csr = sparse.csr_matrix(X)
brc_sparse = Birch(n_clusters=10)
brc_sparse.fit(csr)
assert_array_equal(brc.labels_, brc_sparse.labels_)
assert_array_almost_equal(brc.subcluster_centers_,
brc_sparse.subcluster_centers_)
def check_branching_factor(node, branching_factor):
subclusters = node.subclusters_
assert branching_factor >= len(subclusters)
for cluster in subclusters:
if cluster.child_:
check_branching_factor(cluster.child_, branching_factor)
def test_branching_factor():
# Test that nodes have at max branching_factor number of subclusters
X, y = make_blobs()
branching_factor = 9
# Purposefully set a low threshold to maximize the subclusters.
brc = Birch(n_clusters=None, branching_factor=branching_factor,
threshold=0.01)
brc.fit(X)
check_branching_factor(brc.root_, branching_factor)
brc = Birch(n_clusters=3, branching_factor=branching_factor,
threshold=0.01)
brc.fit(X)
check_branching_factor(brc.root_, branching_factor)
# Raises error when branching_factor is set to one.
brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
with pytest.raises(ValueError):
brc.fit(X)
def check_threshold(birch_instance, threshold):
"""Use the leaf linked list for traversal"""
current_leaf = birch_instance.dummy_leaf_.next_leaf_
while current_leaf:
subclusters = current_leaf.subclusters_
for sc in subclusters:
assert threshold >= sc.radius
current_leaf = current_leaf.next_leaf_
def test_threshold():
# Test that the leaf subclusters have a threshold lesser than radius
X, y = make_blobs(n_samples=80, centers=4)
brc = Birch(threshold=0.5, n_clusters=None)
brc.fit(X)
check_threshold(brc, 0.5)
brc = Birch(threshold=5.0, n_clusters=None)
brc.fit(X)
check_threshold(brc, 5.)
def test_birch_n_clusters_long_int():
# Check that birch supports n_clusters with np.int64 dtype, for instance
# coming from np.arange. #16484
X, _ = make_blobs(random_state=0)
n_clusters = np.int64(5)
Birch(n_clusters=n_clusters).fit(X)

View file

@ -0,0 +1,395 @@
"""
Tests for DBSCAN clustering algorithm
"""
import pickle
import numpy as np
from scipy.spatial import distance
from scipy import sparse
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
def test_dbscan_similarity():
# Tests the DBSCAN algorithm with a similarity array.
# Parameters chosen specifically for this task.
eps = 0.15
min_samples = 10
# Compute similarities
D = distance.squareform(distance.pdist(X))
D /= np.max(D)
# Compute DBSCAN
core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
min_samples=min_samples)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
assert n_clusters_1 == n_clusters
db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
labels = db.fit(D).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
def test_dbscan_feature():
# Tests the DBSCAN algorithm with a feature vector array.
# Parameters chosen specifically for this task.
# Different eps to other test, because distance is not normalised.
eps = 0.8
min_samples = 10
metric = 'euclidean'
# Compute DBSCAN
# parameters chosen for task
core_samples, labels = dbscan(X, metric=metric, eps=eps,
min_samples=min_samples)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters
db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
labels = db.fit(X).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
def test_dbscan_sparse():
core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X), eps=.8,
min_samples=10)
core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
assert_array_equal(core_dense, core_sparse)
assert_array_equal(labels_dense, labels_sparse)
@pytest.mark.parametrize('include_self', [False, True])
def test_dbscan_sparse_precomputed(include_self):
D = pairwise_distances(X)
nn = NearestNeighbors(radius=.9).fit(X)
X_ = X if include_self else None
D_sparse = nn.radius_neighbors_graph(X=X_, mode='distance')
# Ensure it is sparse not merely on diagonals:
assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
core_sparse, labels_sparse = dbscan(D_sparse,
eps=.8,
min_samples=10,
metric='precomputed')
core_dense, labels_dense = dbscan(D, eps=.8, min_samples=10,
metric='precomputed')
assert_array_equal(core_dense, core_sparse)
assert_array_equal(labels_dense, labels_sparse)
def test_dbscan_sparse_precomputed_different_eps():
# test that precomputed neighbors graph is filtered if computed with
# a radius larger than DBSCAN's eps.
lower_eps = 0.2
nn = NearestNeighbors(radius=lower_eps).fit(X)
D_sparse = nn.radius_neighbors_graph(X, mode='distance')
dbscan_lower = dbscan(D_sparse, eps=lower_eps, metric='precomputed')
higher_eps = lower_eps + 0.7
nn = NearestNeighbors(radius=higher_eps).fit(X)
D_sparse = nn.radius_neighbors_graph(X, mode='distance')
dbscan_higher = dbscan(D_sparse, eps=lower_eps, metric='precomputed')
assert_array_equal(dbscan_lower[0], dbscan_higher[0])
assert_array_equal(dbscan_lower[1], dbscan_higher[1])
@pytest.mark.parametrize('use_sparse', [True, False])
@pytest.mark.parametrize('metric', ['precomputed', 'minkowski'])
def test_dbscan_input_not_modified(use_sparse, metric):
# test that the input is not modified by dbscan
X = np.random.RandomState(0).rand(10, 10)
X = sparse.csr_matrix(X) if use_sparse else X
X_copy = X.copy()
dbscan(X, metric=metric)
if use_sparse:
assert_array_equal(X.toarray(), X_copy.toarray())
else:
assert_array_equal(X, X_copy)
def test_dbscan_no_core_samples():
rng = np.random.RandomState(0)
X = rng.rand(40, 10)
X[X < .8] = 0
for X_ in [X, sparse.csr_matrix(X)]:
db = DBSCAN(min_samples=6).fit(X_)
assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
assert_array_equal(db.labels_, -1)
assert db.core_sample_indices_.shape == (0,)
def test_dbscan_callable():
# Tests the DBSCAN algorithm with a callable metric.
# Parameters chosen specifically for this task.
# Different eps to other test, because distance is not normalised.
eps = 0.8
min_samples = 10
# metric is the function reference, not the string key.
metric = distance.euclidean
# Compute DBSCAN
# parameters chosen for task
core_samples, labels = dbscan(X, metric=metric, eps=eps,
min_samples=min_samples,
algorithm='ball_tree')
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters
db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples,
algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
def test_dbscan_metric_params():
# Tests that DBSCAN works with the metrics_params argument.
eps = 0.8
min_samples = 10
p = 1
# Compute DBSCAN with metric_params arg
db = DBSCAN(metric='minkowski', metric_params={'p': p}, eps=eps,
min_samples=min_samples, algorithm='ball_tree').fit(X)
core_sample_1, labels_1 = db.core_sample_indices_, db.labels_
# Test that sample labels are the same as passing Minkowski 'p' directly
db = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples,
algorithm='ball_tree', p=p).fit(X)
core_sample_2, labels_2 = db.core_sample_indices_, db.labels_
assert_array_equal(core_sample_1, core_sample_2)
assert_array_equal(labels_1, labels_2)
# Minkowski with p=1 should be equivalent to Manhattan distance
db = DBSCAN(metric='manhattan', eps=eps, min_samples=min_samples,
algorithm='ball_tree').fit(X)
core_sample_3, labels_3 = db.core_sample_indices_, db.labels_
assert_array_equal(core_sample_1, core_sample_3)
assert_array_equal(labels_1, labels_3)
def test_dbscan_balltree():
# Tests the DBSCAN algorithm with balltree for neighbor calculation.
eps = 0.8
min_samples = 10
D = pairwise_distances(X)
core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
min_samples=min_samples)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(labels)) - int(-1 in labels)
assert n_clusters_1 == n_clusters
db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert n_clusters_2 == n_clusters
db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
labels = db.fit(X).labels_
n_clusters_3 = len(set(labels)) - int(-1 in labels)
assert n_clusters_3 == n_clusters
db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_4 = len(set(labels)) - int(-1 in labels)
assert n_clusters_4 == n_clusters
db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples,
algorithm='ball_tree')
labels = db.fit(X).labels_
n_clusters_5 = len(set(labels)) - int(-1 in labels)
assert n_clusters_5 == n_clusters
def test_input_validation():
# DBSCAN.fit should accept a list of lists.
X = [[1., 2.], [3., 4.]]
DBSCAN().fit(X) # must not raise exception
@pytest.mark.parametrize(
"args",
[{'eps': -1.0}, {'algorithm': 'blah'}, {'metric': 'blah'},
{'leaf_size': -1}, {'p': -1}]
)
def test_dbscan_badargs(args):
# Test bad argument values: these should all raise ValueErrors
with pytest.raises(ValueError):
dbscan(X, **args)
def test_pickle():
obj = DBSCAN()
s = pickle.dumps(obj)
assert type(pickle.loads(s)) == obj.__class__
def test_boundaries():
# ensure min_samples is inclusive of core point
core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
assert 0 in core
# ensure eps is inclusive of circumference
core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
assert 0 in core
core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
assert 0 not in core
def test_weighted_dbscan():
# ensure sample_weight is validated
with pytest.raises(ValueError):
dbscan([[0], [1]], sample_weight=[2])
with pytest.raises(ValueError):
dbscan([[0], [1]], sample_weight=[2, 3, 4])
# ensure sample_weight has an effect
assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
min_samples=6)[0])
assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
min_samples=6)[0])
assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
min_samples=6)[0])
# points within eps of each other:
assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
sample_weight=[5, 1], min_samples=6)[0])
# and effect of non-positive and non-integer sample_weight:
assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
eps=1.5, min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
eps=1.5, min_samples=6)[0])
assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
eps=1.5, min_samples=6)[0])
assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
eps=1.5, min_samples=6)[0])
# for non-negative sample_weight, cores should be identical to repetition
rng = np.random.RandomState(42)
sample_weight = rng.randint(0, 5, X.shape[0])
core1, label1 = dbscan(X, sample_weight=sample_weight)
assert len(label1) == len(X)
X_repeated = np.repeat(X, sample_weight, axis=0)
core_repeated, label_repeated = dbscan(X_repeated)
core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
core_repeated_mask[core_repeated] = True
core_mask = np.zeros(X.shape[0], dtype=bool)
core_mask[core1] = True
assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)
# sample_weight should work with precomputed distance matrix
D = pairwise_distances(X)
core3, label3 = dbscan(D, sample_weight=sample_weight,
metric='precomputed')
assert_array_equal(core1, core3)
assert_array_equal(label1, label3)
# sample_weight should work with estimator
est = DBSCAN().fit(X, sample_weight=sample_weight)
core4 = est.core_sample_indices_
label4 = est.labels_
assert_array_equal(core1, core4)
assert_array_equal(label1, label4)
est = DBSCAN()
label5 = est.fit_predict(X, sample_weight=sample_weight)
core5 = est.core_sample_indices_
assert_array_equal(core1, core5)
assert_array_equal(label1, label5)
assert_array_equal(label1, est.labels_)
@pytest.mark.parametrize('algorithm', ['brute', 'kd_tree', 'ball_tree'])
def test_dbscan_core_samples_toy(algorithm):
X = [[0], [2], [3], [4], [6], [8], [10]]
n_samples = len(X)
# Degenerate case: every sample is a core sample, either with its own
# cluster or including other close core samples.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=1)
assert_array_equal(core_samples, np.arange(n_samples))
assert_array_equal(labels, [0, 1, 1, 1, 2, 3, 4])
# With eps=1 and min_samples=2 only the 3 samples from the denser area
# are core samples. All other points are isolated and considered noise.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=2)
assert_array_equal(core_samples, [1, 2, 3])
assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
# Only the sample in the middle of the dense area is core. Its two
# neighbors are edge samples. Remaining samples are noise.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=3)
assert_array_equal(core_samples, [2])
assert_array_equal(labels, [-1, 0, 0, 0, -1, -1, -1])
# It's no longer possible to extract core samples with eps=1:
# everything is noise.
core_samples, labels = dbscan(X, algorithm=algorithm, eps=1,
min_samples=4)
assert_array_equal(core_samples, [])
assert_array_equal(labels, np.full(n_samples, -1.))
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
# see https://github.com/scikit-learn/scikit-learn/issues/4641 for
# more details
X = np.eye(10)
labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_
assert len(set(labels)) == 1
X = np.zeros((10, 10))
labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_
assert len(set(labels)) == 1
def test_dbscan_precomputed_metric_with_initial_rows_zero():
# sample matrix with initial two row all zero
ar = np.array([
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
[0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
[0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0]
])
matrix = sparse.csr_matrix(ar)
labels = DBSCAN(eps=0.2, metric='precomputed',
min_samples=2).fit(matrix).labels_
assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1])

View file

@ -0,0 +1,43 @@
"""
Tests for sklearn.cluster._feature_agglomeration
"""
# Authors: Sergul Aydore 2017
import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.utils._testing import assert_no_warnings
from sklearn.utils._testing import assert_array_almost_equal
def test_feature_agglomeration():
n_clusters = 1
X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features)
agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
pooling_func=np.mean)
agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
pooling_func=np.median)
assert_no_warnings(agglo_mean.fit, X)
assert_no_warnings(agglo_median.fit, X)
assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
assert np.size(np.unique(agglo_median.labels_)) == n_clusters
assert np.size(agglo_mean.labels_) == X.shape[1]
assert np.size(agglo_median.labels_) == X.shape[1]
# Test transform
Xt_mean = agglo_mean.transform(X)
Xt_median = agglo_median.transform(X)
assert Xt_mean.shape[1] == n_clusters
assert Xt_median.shape[1] == n_clusters
assert Xt_mean == np.array([1 / 3.])
assert Xt_median == np.array([0.])
# Test inverse transform
X_full_mean = agglo_mean.inverse_transform(Xt_mean)
X_full_median = agglo_median.inverse_transform(Xt_median)
assert np.unique(X_full_mean[0]).size == n_clusters
assert np.unique(X_full_median[0]).size == n_clusters
assert_array_almost_equal(agglo_mean.transform(X_full_mean),
Xt_mean)
assert_array_almost_equal(agglo_median.transform(X_full_median),
Xt_median)

View file

@ -0,0 +1,765 @@
"""
Several basic tests for hierarchical clustering procedures
"""
# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
# Matteo Visconti di Oleggio Castello 2014
# License: BSD 3 clause
from tempfile import mkdtemp
import shutil
import pytest
from functools import partial
import numpy as np
from scipy import sparse
from scipy.cluster import hierarchy
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import ignore_warnings
from sklearn.cluster import ward_tree
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.cluster._agglomerative import (_hc_cut, _TREE_BUILDERS,
linkage_tree,
_fix_connectivity)
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\
manhattan_distances, pairwise_distances
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster._hierarchical_fast import average_merge, max_merge
from sklearn.utils._fast_dict import IntFloatDict
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns
from sklearn.datasets import make_moons, make_circles
def test_linkage_misc():
# Misc tests on linkage
rng = np.random.RandomState(42)
X = rng.normal(size=(5, 5))
with pytest.raises(ValueError):
AgglomerativeClustering(linkage='foo').fit(X)
with pytest.raises(ValueError):
linkage_tree(X, linkage='foo')
with pytest.raises(ValueError):
linkage_tree(X, connectivity=np.ones((4, 4)))
# Smoke test FeatureAgglomeration
FeatureAgglomeration().fit(X)
# test hierarchical clustering on a precomputed distances matrix
dis = cosine_distances(X)
res = linkage_tree(dis, affinity="precomputed")
assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
# test hierarchical clustering on a precomputed distances matrix
res = linkage_tree(X, affinity=manhattan_distances)
assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_structured_linkage_tree():
# Check that we obtain the correct solution for structured linkage trees.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
# Avoiding a mask with only 'True' entries
mask[4:7, 4:7] = 0
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
for tree_builder in _TREE_BUILDERS.values():
children, n_components, n_leaves, parent = \
tree_builder(X.T, connectivity=connectivity)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
# Check that ward_tree raises a ValueError with a connectivity matrix
# of the wrong shape
with pytest.raises(ValueError):
tree_builder(X.T, connectivity=np.ones((4, 4)))
# Check that fitting with no samples raises an error
with pytest.raises(ValueError):
tree_builder(X.T[:0], connectivity=connectivity)
def test_unstructured_linkage_tree():
# Check that we obtain the correct solution for unstructured linkage trees.
rng = np.random.RandomState(0)
X = rng.randn(50, 100)
for this_X in (X, X[0]):
# With specified a number of clusters just for the sake of
# raising a warning and testing the warning code
with ignore_warnings():
children, n_nodes, n_leaves, parent = assert_warns(
UserWarning, ward_tree, this_X.T, n_clusters=10)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
for tree_builder in _TREE_BUILDERS.values():
for this_X in (X, X[0]):
with ignore_warnings():
children, n_nodes, n_leaves, parent = assert_warns(
UserWarning, tree_builder, this_X.T, n_clusters=10)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
def test_height_linkage_tree():
# Check that the height of the results of linkage tree is sorted.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
for linkage_func in _TREE_BUILDERS.values():
children, n_nodes, n_leaves, parent = linkage_func(
X.T, connectivity=connectivity)
n_nodes = 2 * X.shape[1] - 1
assert len(children) + n_leaves == n_nodes
def test_agglomerative_clustering_wrong_arg_memory():
# Test either if an error is raised when memory is not
# either a str or a joblib.Memory instance
rng = np.random.RandomState(0)
n_samples = 100
X = rng.randn(n_samples, 50)
memory = 5
clustering = AgglomerativeClustering(memory=memory)
with pytest.raises(ValueError):
clustering.fit(X)
def test_zero_cosine_linkage_tree():
# Check that zero vectors in X produce an error when
# 'cosine' affinity is used
X = np.array([[0, 1],
[0, 0]])
msg = 'Cosine affinity cannot be used when X contains zero vectors'
assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine')
def test_agglomerative_clustering():
# Check that we obtain the correct number of clusters with
# agglomerative clustering.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
n_samples = 100
X = rng.randn(n_samples, 50)
connectivity = grid_to_graph(*mask.shape)
for linkage in ("ward", "complete", "average", "single"):
clustering = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
linkage=linkage)
clustering.fit(X)
# test caching
try:
tempdir = mkdtemp()
clustering = AgglomerativeClustering(
n_clusters=10, connectivity=connectivity,
memory=tempdir,
linkage=linkage)
clustering.fit(X)
labels = clustering.labels_
assert np.size(np.unique(labels)) == 10
finally:
shutil.rmtree(tempdir)
# Turn caching off now
clustering = AgglomerativeClustering(
n_clusters=10, connectivity=connectivity, linkage=linkage)
# Check that we obtain the same solution with early-stopping of the
# tree building
clustering.compute_full_tree = False
clustering.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
labels), 1)
clustering.connectivity = None
clustering.fit(X)
assert np.size(np.unique(clustering.labels_)) == 10
# Check that we raise a TypeError on dense matrices
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=sparse.lil_matrix(
connectivity.toarray()[:10, :10]),
linkage=linkage)
with pytest.raises(ValueError):
clustering.fit(X)
# Test that using ward with another metric than euclidean raises an
# exception
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=connectivity.toarray(),
affinity="manhattan",
linkage="ward")
with pytest.raises(ValueError):
clustering.fit(X)
# Test using another metric than euclidean works with linkage complete
for affinity in PAIRED_DISTANCES.keys():
# Compare our (structured) implementation to scipy
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=np.ones((n_samples, n_samples)),
affinity=affinity,
linkage="complete")
clustering.fit(X)
clustering2 = AgglomerativeClustering(
n_clusters=10,
connectivity=None,
affinity=affinity,
linkage="complete")
clustering2.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
clustering.labels_),
1)
# Test that using a distance matrix (affinity = 'precomputed') has same
# results (with connectivity constraints)
clustering = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
linkage="complete")
clustering.fit(X)
X_dist = pairwise_distances(X)
clustering2 = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
affinity='precomputed',
linkage="complete")
clustering2.fit(X_dist)
assert_array_equal(clustering.labels_, clustering2.labels_)
def test_ward_agglomeration():
# Check that we obtain the correct solution in a simplistic case
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
agglo.fit(X)
assert np.size(np.unique(agglo.labels_)) == 5
X_red = agglo.transform(X)
assert X_red.shape[1] == 5
X_full = agglo.inverse_transform(X_red)
assert np.unique(X_full[0]).size == 5
assert_array_almost_equal(agglo.transform(X_full), X_red)
# Check that fitting with no samples raises a ValueError
with pytest.raises(ValueError):
agglo.fit(X[:0])
def test_single_linkage_clustering():
# Check that we get the correct result in two emblematic cases
moons, moon_labels = make_moons(noise=0.05, random_state=42)
clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
clustering.fit(moons)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
moon_labels), 1)
circles, circle_labels = make_circles(factor=0.5, noise=0.025,
random_state=42)
clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
clustering.fit(circles)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
circle_labels), 1)
def assess_same_labelling(cut1, cut2):
"""Util for comparison with scipy"""
co_clust = []
for cut in [cut1, cut2]:
n = len(cut)
k = cut.max() + 1
ecut = np.zeros((n, k))
ecut[np.arange(n), cut] = 1
co_clust.append(np.dot(ecut, ecut.T))
assert (co_clust[0] == co_clust[1]).all()
def test_sparse_scikit_vs_scipy():
# Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
n, p, k = 10, 5, 3
rng = np.random.RandomState(0)
# Not using a lil_matrix here, just to check that non sparse
# matrices are well handled
connectivity = np.ones((n, n))
for linkage in _TREE_BUILDERS.keys():
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out = hierarchy.linkage(X, method=linkage)
children_ = out[:, :2].astype(np.int, copy=False)
children, _, n_leaves, _ = _TREE_BUILDERS[linkage](
X, connectivity=connectivity)
# Sort the order of child nodes per row for consistency
children.sort(axis=1)
assert_array_equal(children, children_, 'linkage tree differs'
' from scipy impl for'
' linkage: ' + linkage)
cut = _hc_cut(k, children, n_leaves)
cut_ = _hc_cut(k, children_, n_leaves)
assess_same_labelling(cut, cut_)
# Test error management in _hc_cut
with pytest.raises(ValueError):
_hc_cut(n_leaves + 1, children, n_leaves)
# Make sure our custom mst_linkage_core gives
# the same results as scipy's builtin
@pytest.mark.parametrize('seed', range(5))
def test_vector_scikit_single_vs_scipy_single(seed):
n_samples, n_features, n_clusters = 10, 5, 3
rng = np.random.RandomState(seed)
X = .1 * rng.normal(size=(n_samples, n_features))
X -= 4. * np.arange(n_samples)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out = hierarchy.linkage(X, method='single')
children_scipy = out[:, :2].astype(np.int)
children, _, n_leaves, _ = _TREE_BUILDERS['single'](X)
# Sort the order of child nodes per row for consistency
children.sort(axis=1)
assert_array_equal(children, children_scipy,
'linkage tree differs'
' from scipy impl for'
' single linkage.')
cut = _hc_cut(n_clusters, children, n_leaves)
cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
assess_same_labelling(cut, cut_scipy)
def test_identical_points():
# Ensure identical points are handled correctly when using mst with
# a sparse connectivity matrix
X = np.array([[0, 0, 0], [0, 0, 0],
[1, 1, 1], [1, 1, 1],
[2, 2, 2], [2, 2, 2]])
true_labels = np.array([0, 0, 1, 1, 2, 2])
connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
connectivity, n_components = _fix_connectivity(X,
connectivity,
'euclidean')
for linkage in ('single', 'average', 'average', 'ward'):
clustering = AgglomerativeClustering(n_clusters=3,
linkage=linkage,
connectivity=connectivity)
clustering.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
true_labels), 1)
def test_connectivity_propagation():
# Check that connectivity in the ward tree is propagated correctly during
# merging.
X = np.array([(.014, .120), (.014, .099), (.014, .097),
(.017, .153), (.017, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .152), (.018, .149), (.018, .144)])
connectivity = kneighbors_graph(X, 10, include_self=False)
ward = AgglomerativeClustering(
n_clusters=4, connectivity=connectivity, linkage='ward')
# If changes are not propagated correctly, fit crashes with an
# IndexError
ward.fit(X)
def test_ward_tree_children_order():
# Check that children are ordered in the same way for both structured and
# unstructured versions of ward_tree.
# test on five random datasets
n, p = 10, 5
rng = np.random.RandomState(0)
connectivity = np.ones((n, n))
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out_unstructured = ward_tree(X)
out_structured = ward_tree(X, connectivity=connectivity)
assert_array_equal(out_unstructured[0], out_structured[0])
def test_ward_linkage_tree_return_distance():
# Test return_distance option on linkage and ward trees
# test that return_distance when set true, gives same
# output on both structured and unstructured clustering.
n, p = 10, 5
rng = np.random.RandomState(0)
connectivity = np.ones((n, n))
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out_unstructured = ward_tree(X, return_distance=True)
out_structured = ward_tree(X, connectivity=connectivity,
return_distance=True)
# get children
children_unstructured = out_unstructured[0]
children_structured = out_structured[0]
# check if we got the same clusters
assert_array_equal(children_unstructured, children_structured)
# check if the distances are the same
dist_unstructured = out_unstructured[-1]
dist_structured = out_structured[-1]
assert_array_almost_equal(dist_unstructured, dist_structured)
for linkage in ['average', 'complete', 'single']:
structured_items = linkage_tree(
X, connectivity=connectivity, linkage=linkage,
return_distance=True)[-1]
unstructured_items = linkage_tree(
X, linkage=linkage, return_distance=True)[-1]
structured_dist = structured_items[-1]
unstructured_dist = unstructured_items[-1]
structured_children = structured_items[0]
unstructured_children = unstructured_items[0]
assert_array_almost_equal(structured_dist, unstructured_dist)
assert_array_almost_equal(
structured_children, unstructured_children)
# test on the following dataset where we know the truth
# taken from scipy/cluster/tests/hierarchy_test_data.py
X = np.array([[1.43054825, -7.5693489],
[6.95887839, 6.82293382],
[2.87137846, -9.68248579],
[7.87974764, -6.05485803],
[8.24018364, -6.09495602],
[7.39020262, 8.54004355]])
# truth
linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
[1., 5., 1.77045373, 2.],
[0., 2., 2.55760419, 2.],
[6., 8., 9.10208346, 4.],
[7., 9., 24.7784379, 6.]])
linkage_X_complete = np.array(
[[3., 4., 0.36265956, 2.],
[1., 5., 1.77045373, 2.],
[0., 2., 2.55760419, 2.],
[6., 8., 6.96742194, 4.],
[7., 9., 18.77445997, 6.]])
linkage_X_average = np.array(
[[3., 4., 0.36265956, 2.],
[1., 5., 1.77045373, 2.],
[0., 2., 2.55760419, 2.],
[6., 8., 6.55832839, 4.],
[7., 9., 15.44089605, 6.]])
n_samples, n_features = np.shape(X)
connectivity_X = np.ones((n_samples, n_samples))
out_X_unstructured = ward_tree(X, return_distance=True)
out_X_structured = ward_tree(X, connectivity=connectivity_X,
return_distance=True)
# check that the labels are the same
assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])
# check that the distances are correct
assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])
linkage_options = ['complete', 'average', 'single']
X_linkage_truth = [linkage_X_complete, linkage_X_average]
for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
out_X_unstructured = linkage_tree(
X, return_distance=True, linkage=linkage)
out_X_structured = linkage_tree(
X, connectivity=connectivity_X, linkage=linkage,
return_distance=True)
# check that the labels are the same
assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
assert_array_equal(X_truth[:, :2], out_X_structured[0])
# check that the distances are correct
assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
def test_connectivity_fixing_non_lil():
# Check non regression of a bug if a non item assignable connectivity is
# provided with more than one component.
# create dummy data
x = np.array([[0, 0], [1, 1]])
# create a mask with several components to force connectivity fixing
m = np.array([[True, False], [False, True]])
c = grid_to_graph(n_x=2, n_y=2, mask=m)
w = AgglomerativeClustering(connectivity=c, linkage='ward')
assert_warns(UserWarning, w.fit, x)
def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
values = rng.rand(len(keys))
d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value
other_keys = np.arange(50, dtype=np.intp)[::2]
other_values = np.full(50, 0.5)[::2]
other = IntFloatDict(other_keys, other_values)
# Complete smoke test
max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
def test_connectivity_callable():
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
connectivity = kneighbors_graph(X, 3, include_self=False)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(
connectivity=partial(kneighbors_graph, n_neighbors=3,
include_self=False))
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_ignores_diagonal():
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
connectivity = kneighbors_graph(X, 3, include_self=False)
connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
aglc1 = AgglomerativeClustering(connectivity=connectivity)
aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
aglc1.fit(X)
aglc2.fit(X)
assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_compute_full_tree():
# Test that the full tree is computed if n_clusters is small
rng = np.random.RandomState(0)
X = rng.randn(10, 2)
connectivity = kneighbors_graph(X, 5, include_self=False)
# When n_clusters is less, the full tree should be built
# that is the number of merges should be n_samples - 1
agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
agc.fit(X)
n_samples = X.shape[0]
n_nodes = agc.children_.shape[0]
assert n_nodes == n_samples - 1
# When n_clusters is large, greater than max of 100 and 0.02 * n_samples.
# we should stop when there are n_clusters.
n_clusters = 101
X = rng.randn(200, 2)
connectivity = kneighbors_graph(X, 10, include_self=False)
agc = AgglomerativeClustering(n_clusters=n_clusters,
connectivity=connectivity)
agc.fit(X)
n_samples = X.shape[0]
n_nodes = agc.children_.shape[0]
assert n_nodes == n_samples - n_clusters
def test_n_components():
# Test n_components returned by linkage, average and ward tree
rng = np.random.RandomState(0)
X = rng.rand(5, 5)
# Connectivity matrix having five components.
connectivity = np.eye(5)
for linkage_func in _TREE_BUILDERS.values():
assert ignore_warnings(linkage_func)(X, connectivity)[1] == 5
def test_agg_n_clusters():
# Test that an error is raised when n_clusters <= 0
rng = np.random.RandomState(0)
X = rng.rand(20, 10)
for n_clus in [-1, 0]:
agc = AgglomerativeClustering(n_clusters=n_clus)
msg = ("n_clusters should be an integer greater than 0."
" %s was provided." % str(agc.n_clusters))
assert_raise_message(ValueError, msg, agc.fit, X)
def test_affinity_passed_to_fix_connectivity():
# Test that the affinity parameter is actually passed to the pairwise
# function
size = 2
rng = np.random.RandomState(0)
X = rng.randn(size, size)
mask = np.array([True, False, False, True])
connectivity = grid_to_graph(n_x=size, n_y=size,
mask=mask, return_as=np.ndarray)
class FakeAffinity:
def __init__(self):
self.counter = 0
def increment(self, *args, **kwargs):
self.counter += 1
return self.counter
fa = FakeAffinity()
linkage_tree(X, connectivity=connectivity, affinity=fa.increment)
assert fa.counter == 3
@pytest.mark.parametrize('linkage', ['ward', 'complete', 'average'])
def test_agglomerative_clustering_with_distance_threshold(linkage):
# Check that we obtain the correct number of clusters with
# agglomerative clustering with distance_threshold.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
n_samples = 100
X = rng.randn(n_samples, 50)
connectivity = grid_to_graph(*mask.shape)
# test when distance threshold is set to 10
distance_threshold = 10
for conn in [None, connectivity]:
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=distance_threshold,
connectivity=conn, linkage=linkage)
clustering.fit(X)
clusters_produced = clustering.labels_
num_clusters_produced = len(np.unique(clustering.labels_))
# test if the clusters produced match the point in the linkage tree
# where the distance exceeds the threshold
tree_builder = _TREE_BUILDERS[linkage]
children, n_components, n_leaves, parent, distances = \
tree_builder(X, connectivity=conn, n_clusters=None,
return_distance=True)
num_clusters_at_threshold = np.count_nonzero(
distances >= distance_threshold) + 1
# test number of clusters produced
assert num_clusters_at_threshold == num_clusters_produced
# test clusters produced
clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
children=children,
n_leaves=n_leaves)
assert np.array_equiv(clusters_produced,
clusters_at_threshold)
def test_small_distance_threshold():
rng = np.random.RandomState(0)
n_samples = 10
X = rng.randint(-300, 300, size=(n_samples, 3))
# this should result in all data in their own clusters, given that
# their pairwise distances are bigger than .1 (which may not be the case
# with a different random seed).
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=1.,
linkage="single").fit(X)
# check that the pairwise distances are indeed all larger than .1
all_distances = pairwise_distances(X, metric='minkowski', p=2)
np.fill_diagonal(all_distances, np.inf)
assert np.all(all_distances > .1)
assert clustering.n_clusters_ == n_samples
def test_cluster_distances_with_distance_threshold():
rng = np.random.RandomState(0)
n_samples = 100
X = rng.randint(-10, 10, size=(n_samples, 3))
# check the distances within the clusters and with other clusters
distance_threshold = 4
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=distance_threshold,
linkage="single").fit(X)
labels = clustering.labels_
D = pairwise_distances(X, metric="minkowski", p=2)
# to avoid taking the 0 diagonal in min()
np.fill_diagonal(D, np.inf)
for label in np.unique(labels):
in_cluster_mask = labels == label
max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask]
.min(axis=0).max())
min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask]
.min(axis=0).min())
# single data point clusters only have that inf diagonal here
if in_cluster_mask.sum() > 1:
assert max_in_cluster_distance < distance_threshold
assert min_out_cluster_distance >= distance_threshold
@pytest.mark.parametrize('linkage', ['ward', 'complete', 'average'])
@pytest.mark.parametrize(('threshold', 'y_true'),
[(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])])
def test_agglomerative_clustering_with_distance_threshold_edge_case(
linkage, threshold, y_true):
# test boundary case of distance_threshold matching the distance
X = [[0], [1]]
clusterer = AgglomerativeClustering(
n_clusters=None,
distance_threshold=threshold,
linkage=linkage)
y_pred = clusterer.fit_predict(X)
assert adjusted_rand_score(y_true, y_pred) == 1
def test_dist_threshold_invalid_parameters():
X = [[0], [1]]
with pytest.raises(ValueError, match="Exactly one of "):
AgglomerativeClustering(n_clusters=None,
distance_threshold=None).fit(X)
with pytest.raises(ValueError, match="Exactly one of "):
AgglomerativeClustering(n_clusters=2,
distance_threshold=1).fit(X)
X = [[0], [1]]
with pytest.raises(ValueError, match="compute_full_tree must be True if"):
AgglomerativeClustering(n_clusters=None,
distance_threshold=1,
compute_full_tree=False).fit(X)
def test_invalid_shape_precomputed_dist_matrix():
# Check that an error is raised when affinity='precomputed'
# and a non square matrix is passed (PR #16257).
rng = np.random.RandomState(0)
X = rng.rand(5, 3)
with pytest.raises(ValueError, match="Distance matrix should be square, "):
AgglomerativeClustering(affinity='precomputed',
linkage='complete').fit(X)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,194 @@
"""
Testing for mean shift clustering methods
"""
import numpy as np
import warnings
import pytest
from scipy import sparse
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_allclose
from sklearn.cluster import MeanShift
from sklearn.cluster import mean_shift
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import get_bin_seeds
from sklearn.datasets import make_blobs
from sklearn.metrics import v_measure_score
n_clusters = 3
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=300, n_features=2, centers=centers,
cluster_std=0.4, shuffle=True, random_state=11)
def test_estimate_bandwidth():
# Test estimate_bandwidth
bandwidth = estimate_bandwidth(X, n_samples=200)
assert 0.9 <= bandwidth <= 1.5
def test_estimate_bandwidth_1sample():
# Test estimate_bandwidth when n_samples=1 and quantile<1, so that
# n_neighbors is set to 1.
bandwidth = estimate_bandwidth(X, n_samples=1, quantile=0.3)
assert bandwidth == pytest.approx(0., abs=1e-5)
@pytest.mark.parametrize("bandwidth, cluster_all, expected, "
"first_cluster_label",
[(1.2, True, 3, 0), (1.2, False, 4, -1)])
def test_mean_shift(bandwidth, cluster_all, expected, first_cluster_label):
# Test MeanShift algorithm
ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
labels = ms.fit(X).labels_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
assert n_clusters_ == expected
assert labels_unique[0] == first_cluster_label
cluster_centers, labels_mean_shift = mean_shift(X, cluster_all=cluster_all)
labels_mean_shift_unique = np.unique(labels_mean_shift)
n_clusters_mean_shift = len(labels_mean_shift_unique)
assert n_clusters_mean_shift == expected
assert labels_mean_shift_unique[0] == first_cluster_label
def test_mean_shift_negative_bandwidth():
bandwidth = -1
ms = MeanShift(bandwidth=bandwidth)
msg = (r"bandwidth needs to be greater than zero or None,"
r" got -1\.000000")
with pytest.raises(ValueError, match=msg):
ms.fit(X)
def test_estimate_bandwidth_with_sparse_matrix():
# Test estimate_bandwidth with sparse matrix
X = sparse.lil_matrix((1000, 1000))
msg = "A sparse matrix was passed, but dense data is required."
assert_raise_message(TypeError, msg, estimate_bandwidth, X)
def test_parallel():
centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=50, n_features=2, centers=centers,
cluster_std=0.4, shuffle=True, random_state=11)
ms1 = MeanShift(n_jobs=2)
ms1.fit(X)
ms2 = MeanShift()
ms2.fit(X)
assert_array_almost_equal(ms1.cluster_centers_, ms2.cluster_centers_)
assert_array_equal(ms1.labels_, ms2.labels_)
def test_meanshift_predict():
# Test MeanShift.predict
ms = MeanShift(bandwidth=1.2)
labels = ms.fit_predict(X)
labels2 = ms.predict(X)
assert_array_equal(labels, labels2)
def test_meanshift_all_orphans():
# init away from the data, crash with a sensible warning
ms = MeanShift(bandwidth=0.1, seeds=[[-9, -9], [-10, -10]])
msg = "No point was within bandwidth=0.1"
assert_raise_message(ValueError, msg, ms.fit, X,)
def test_unfitted():
# Non-regression: before fit, there should be not fitted attributes.
ms = MeanShift()
assert not hasattr(ms, "cluster_centers_")
assert not hasattr(ms, "labels_")
def test_cluster_intensity_tie():
X = np.array([[1, 1], [2, 1], [1, 0],
[4, 7], [3, 5], [3, 6]])
c1 = MeanShift(bandwidth=2).fit(X)
X = np.array([[4, 7], [3, 5], [3, 6],
[1, 1], [2, 1], [1, 0]])
c2 = MeanShift(bandwidth=2).fit(X)
assert_array_equal(c1.labels_, [1, 1, 1, 0, 0, 0])
assert_array_equal(c2.labels_, [0, 0, 0, 1, 1, 1])
def test_bin_seeds():
# Test the bin seeding technique which can be used in the mean shift
# algorithm
# Data is just 6 points in the plane
X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
[2., 1.], [2.1, 1.1], [0., 0.]])
# With a bin coarseness of 1.0 and min_bin_freq of 1, 3 bins should be
# found
ground_truth = {(1., 1.), (2., 1.), (0., 0.)}
test_bins = get_bin_seeds(X, 1, 1)
test_result = set(tuple(p) for p in test_bins)
assert len(ground_truth.symmetric_difference(test_result)) == 0
# With a bin coarseness of 1.0 and min_bin_freq of 2, 2 bins should be
# found
ground_truth = {(1., 1.), (2., 1.)}
test_bins = get_bin_seeds(X, 1, 2)
test_result = set(tuple(p) for p in test_bins)
assert len(ground_truth.symmetric_difference(test_result)) == 0
# With a bin size of 0.01 and min_bin_freq of 1, 6 bins should be found
# we bail and use the whole data here.
with warnings.catch_warnings(record=True):
test_bins = get_bin_seeds(X, 0.01, 1)
assert_array_almost_equal(test_bins, X)
# tight clusters around [0, 0] and [1, 1], only get two bins
X, _ = make_blobs(n_samples=100, n_features=2, centers=[[0, 0], [1, 1]],
cluster_std=0.1, random_state=0)
test_bins = get_bin_seeds(X, 1)
assert_array_equal(test_bins, [[0, 0], [1, 1]])
@pytest.mark.parametrize('max_iter', [1, 100])
def test_max_iter(max_iter):
clusters1, _ = mean_shift(X, max_iter=max_iter)
ms = MeanShift(max_iter=max_iter).fit(X)
clusters2 = ms.cluster_centers_
assert ms.n_iter_ <= ms.max_iter
assert len(clusters1) == len(clusters2)
for c1, c2 in zip(clusters1, clusters2):
assert np.allclose(c1, c2)
def test_mean_shift_zero_bandwidth():
# Check that mean shift works when the estimated bandwidth is 0.
X = np.array([1, 1, 1, 2, 2, 2, 3, 3]).reshape(-1, 1)
# estimate_bandwidth with default args returns 0 on this dataset
bandwidth = estimate_bandwidth(X)
assert bandwidth == 0
# get_bin_seeds with a 0 bin_size should return the dataset itself
assert get_bin_seeds(X, bin_size=bandwidth) is X
# MeanShift with binning and a 0 estimated bandwidth should be equivalent
# to no binning.
ms_binning = MeanShift(bin_seeding=True, bandwidth=None).fit(X)
ms_nobinning = MeanShift(bin_seeding=False).fit(X)
expected_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2])
assert v_measure_score(ms_binning.labels_, expected_labels) == 1
assert v_measure_score(ms_nobinning.labels_, expected_labels) == 1
assert_allclose(ms_binning.cluster_centers_, ms_nobinning.cluster_centers_)

View file

@ -0,0 +1,429 @@
# Authors: Shane Grigsby <refuge@rocktalus.com>
# Adrin Jalali <adrin.jalali@gmail.com>
# License: BSD 3 clause
import numpy as np
import pytest
from sklearn.datasets import make_blobs
from sklearn.cluster import OPTICS
from sklearn.cluster._optics import _extend_region, _extract_xi_labels
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import DBSCAN
from sklearn.utils import shuffle
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_allclose
from sklearn.cluster.tests.common import generate_clustered_data
rng = np.random.RandomState(0)
n_points_per_cluster = 10
C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * rng.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))
@pytest.mark.parametrize(
('r_plot', 'end'),
[[[10, 8.9, 8.8, 8.7, 7, 10], 3],
[[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0],
[[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
[[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4],
])
def test_extend_downward(r_plot, end):
r_plot = np.array(r_plot)
ratio = r_plot[:-1] / r_plot[1:]
steep_downward = ratio >= 1 / .9
upward = ratio < 1
e = _extend_region(steep_downward, upward, 0, 2)
assert e == end
@pytest.mark.parametrize(
('r_plot', 'end'),
[[[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6],
[[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0],
[[1, 2, 2.1, 2, np.inf], 0],
[[1, 2, 2.1, np.inf], 2],
])
def test_extend_upward(r_plot, end):
r_plot = np.array(r_plot)
ratio = r_plot[:-1] / r_plot[1:]
steep_upward = ratio <= .9
downward = ratio > 1
e = _extend_region(steep_upward, downward, 0, 2)
assert e == end
@pytest.mark.parametrize(
('ordering', 'clusters', 'expected'),
[[[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]],
[[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]],
[[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]],
[[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]],
])
def test_the_extract_xi_labels(ordering, clusters, expected):
labels = _extract_xi_labels(ordering, clusters)
assert_array_equal(labels, expected)
def test_extract_xi():
# small and easy test (no clusters around other clusters)
# but with a clear noise data.
rng = np.random.RandomState(0)
n_points_per_cluster = 5
C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
C5 = [3, -2] + .6 * rng.randn(n_points_per_cluster, 2)
C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5,
-1, [4] * 5]
X, expected_labels = shuffle(X, expected_labels, random_state=rng)
clust = OPTICS(min_samples=3, min_cluster_size=2,
max_eps=20, cluster_method='xi',
xi=0.4).fit(X)
assert_array_equal(clust.labels_, expected_labels)
# check float min_samples and min_cluster_size
clust = OPTICS(min_samples=0.1, min_cluster_size=0.08,
max_eps=20, cluster_method='xi',
xi=0.4).fit(X)
assert_array_equal(clust.labels_, expected_labels)
X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
-1, -1, [4] * 5]
X, expected_labels = shuffle(X, expected_labels, random_state=rng)
clust = OPTICS(min_samples=3, min_cluster_size=3,
max_eps=20, cluster_method='xi',
xi=0.3).fit(X)
# this may fail if the predecessor correction is not at work!
assert_array_equal(clust.labels_, expected_labels)
C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]]
C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
X = np.vstack((C1, C2, C3))
expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
X, expected_labels = shuffle(X, expected_labels, random_state=rng)
clust = OPTICS(min_samples=2, min_cluster_size=2,
max_eps=np.inf, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
def test_cluster_hierarchy_():
rng = np.random.RandomState(0)
n_points_per_cluster = 100
C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2))
X = shuffle(X, random_state=0)
clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_
assert clusters.shape == (2, 2)
diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
assert diff / len(X) < 0.05
def test_correct_number_of_clusters():
# in 'auto' mode
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)
# Parameters chosen specifically for this task.
# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
clust.fit(X)
# number of clusters, ignoring noise if present
n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
assert n_clusters_1 == n_clusters
# check attribute types and sizes
assert clust.labels_.shape == (len(X),)
assert clust.labels_.dtype.kind == 'i'
assert clust.reachability_.shape == (len(X),)
assert clust.reachability_.dtype.kind == 'f'
assert clust.core_distances_.shape == (len(X),)
assert clust.core_distances_.dtype.kind == 'f'
assert clust.ordering_.shape == (len(X),)
assert clust.ordering_.dtype.kind == 'i'
assert set(clust.ordering_) == set(range(len(X)))
def test_minimum_number_of_sample_check():
# test that we check a minimum number of samples
msg = "min_samples must be no greater than"
# Compute OPTICS
X = [[1, 1]]
clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)
# Run the fit
assert_raise_message(ValueError, msg, clust.fit, X)
def test_bad_extract():
# Test an extraction of eps too close to original eps
msg = "Specify an epsilon smaller than 0.15. Got 0.3."
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
# Compute OPTICS
clust = OPTICS(max_eps=5.0 * 0.03,
cluster_method='dbscan',
eps=0.3, min_samples=10)
assert_raise_message(ValueError, msg, clust.fit, X)
def test_bad_reachability():
msg = "All reachability values are inf. Set a larger max_eps."
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
with pytest.warns(UserWarning, match=msg):
clust = OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015)
clust.fit(X)
def test_close_extract():
# Test extract where extraction eps is close to scaled max_eps
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
# Compute OPTICS
clust = OPTICS(max_eps=1.0, cluster_method='dbscan',
eps=0.3, min_samples=10).fit(X)
# Cluster ordering starts at 0; max cluster label = 2 is 3 clusters
assert max(clust.labels_) == 2
@pytest.mark.parametrize('eps', [0.1, .3, .5])
@pytest.mark.parametrize('min_samples', [3, 10, 20])
def test_dbscan_optics_parity(eps, min_samples):
# Test that OPTICS clustering labels are <= 5% difference of DBSCAN
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers,
cluster_std=0.4, random_state=0)
# calculate optics with dbscan extract at 0.3 epsilon
op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
eps=eps).fit(X)
# calculate dbscan labels
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
contingency = contingency_matrix(db.labels_, op.labels_)
agree = min(np.sum(np.max(contingency, axis=0)),
np.sum(np.max(contingency, axis=1)))
disagree = X.shape[0] - agree
percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)
# verify label mismatch is <= 5% labels
assert percent_mismatch <= 0.05
def test_min_samples_edge_case():
C1 = [[0, 0], [0, 0.1], [0, -.1]]
C2 = [[10, 10], [10, 9], [10, 11]]
C3 = [[100, 100], [100, 96], [100, 106]]
X = np.vstack((C1, C2, C3))
expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3]
clust = OPTICS(min_samples=3,
max_eps=7, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3]
clust = OPTICS(min_samples=3,
max_eps=3, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
expected_labels = np.r_[[-1] * 9]
with pytest.warns(UserWarning, match="All reachability values"):
clust = OPTICS(min_samples=4,
max_eps=3, cluster_method='xi',
xi=0.04).fit(X)
assert_array_equal(clust.labels_, expected_labels)
# try arbitrary minimum sizes
@pytest.mark.parametrize('min_cluster_size', range(2, X.shape[0] // 10, 23))
def test_min_cluster_size(min_cluster_size):
redX = X[::2] # reduce for speed
clust = OPTICS(min_samples=9, min_cluster_size=min_cluster_size).fit(redX)
cluster_sizes = np.bincount(clust.labels_[clust.labels_ != -1])
if cluster_sizes.size:
assert min(cluster_sizes) >= min_cluster_size
# check behaviour is the same when min_cluster_size is a fraction
clust_frac = OPTICS(min_samples=9,
min_cluster_size=min_cluster_size / redX.shape[0])
clust_frac.fit(redX)
assert_array_equal(clust.labels_, clust_frac.labels_)
@pytest.mark.parametrize('min_cluster_size', [0, -1, 1.1, 2.2])
def test_min_cluster_size_invalid(min_cluster_size):
clust = OPTICS(min_cluster_size=min_cluster_size)
with pytest.raises(ValueError, match="must be a positive integer or a "):
clust.fit(X)
def test_min_cluster_size_invalid2():
clust = OPTICS(min_cluster_size=len(X) + 1)
with pytest.raises(ValueError, match="must be no greater than the "):
clust.fit(X)
def test_processing_order():
# Ensure that we consider all unprocessed points,
# not only direct neighbors. when picking the next point.
Y = [[0], [10], [-10], [25]]
clust = OPTICS(min_samples=3, max_eps=15).fit(Y)
assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
assert_array_equal(clust.ordering_, [0, 1, 2, 3])
def test_compare_to_ELKI():
# Expected values, computed with (future) ELKI 0.7.5 using:
# java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
# -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
# where the FixedDBIDsFilter gives 0-indexed ids.
r1 = [np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
0.7290174038973836, 0.7290174038973836, 0.6861627576116127,
0.7587934993548423, 0.9280118450166668, 1.1748022534146194,
3.3355455741292257, 0.49618389254482587, 0.2552805046961355,
0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
0.24944622248445714, 0.2552805046961355, 0.2552805046961355,
0.3086779122185853, 4.163024452756142, 1.623152630340929,
0.45315840475822655, 0.25468325192031926, 0.2254004358159971,
0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
0.18765711877083036, 0.18765711877083036, 0.2240202988740153,
1.154337614548715, 1.342604473837069, 1.323308536402633,
0.8607514948648837, 0.27219111215810565, 0.13260875220533205,
0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
0.17575920159442388, 0.27219111215810565, 0.6101447895405373,
1.3189208094864302, 1.323308536402633, 2.2509184159764577,
2.4517810628594527, 3.675977064404973, 3.8264795626020365,
2.9130735341510614, 2.9130735341510614, 2.9130735341510614,
2.9130735341510614, 2.8459300127258036, 2.8459300127258036,
2.8459300127258036, 3.0321982337972537]
o1 = [0, 3, 6, 4, 7, 8, 2, 9, 5, 1, 31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
44, 21, 23, 24, 22, 25, 27, 29, 26, 28, 20, 40, 45, 46, 10, 15, 11,
13, 17, 19, 18, 12, 16, 14, 47, 49, 43, 48, 42, 41, 53, 57, 51, 52,
56, 59, 54, 55, 58, 50]
p1 = [-1, 0, 3, 6, 6, 6, 8, 3, 7, 5, 1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
36, 44, 21, 23, 24, 22, 25, 25, 22, 22, 22, 21, 40, 45, 46, 10, 15,
15, 13, 13, 15, 11, 19, 15, 10, 47, 12, 45, 14, 43, 42, 53, 57, 57,
57, 57, 59, 59, 59, 58]
# Tests against known extraction array
# Does NOT work with metric='euclidean', because sklearn euclidean has
# worse numeric precision. 'minkowski' is slower but more accurate.
clust1 = OPTICS(min_samples=5).fit(X)
assert_array_equal(clust1.ordering_, np.array(o1))
assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1))
# ELKI currently does not print the core distances (which are not used much
# in literature, but we can at least ensure to have this consistency:
for i in clust1.ordering_[1:]:
assert (clust1.reachability_[i] >=
clust1.core_distances_[clust1.predecessor_[i]])
# Expected values, computed with (future) ELKI 0.7.5 using
r2 = [np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
np.inf, np.inf, np.inf, 0.27219111215810565, 0.13260875220533205,
0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
0.17575920159442388, 0.27219111215810565, 0.4928068613197889,
np.inf, 0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
np.inf, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
np.inf, np.inf]
o2 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
47, 46, 20, 22, 25, 23, 27, 29, 24, 26, 28, 21, 30, 32, 34, 33, 38,
39, 35, 37, 36, 31, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53,
54, 55, 56, 57, 58, 59]
p2 = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 15, 15, 13, 13, 15,
11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1]
clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)
assert_array_equal(clust2.ordering_, np.array(o2))
assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))
assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2))
index = np.where(clust1.core_distances_ <= 0.5)[0]
assert_allclose(clust1.core_distances_[index],
clust2.core_distances_[index])
def test_wrong_cluster_method():
clust = OPTICS(cluster_method='superfancy')
with pytest.raises(ValueError, match="cluster_method should be one of "):
clust.fit(X)
def test_extract_dbscan():
# testing an easy dbscan case. Not including clusters with different
# densities.
rng = np.random.RandomState(0)
n_points_per_cluster = 20
C1 = [-5, -2] + .2 * rng.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .2 * rng.randn(n_points_per_cluster, 2)
C3 = [1, 2] + .2 * rng.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .2 * rng.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4))
clust = OPTICS(cluster_method='dbscan', eps=.5).fit(X)
assert_array_equal(np.sort(np.unique(clust.labels_)), [0, 1, 2, 3])
def test_precomputed_dists():
redX = X[::2]
dists = pairwise_distances(redX, metric='euclidean')
clust1 = OPTICS(min_samples=10, algorithm='brute',
metric='precomputed').fit(dists)
clust2 = OPTICS(min_samples=10, algorithm='brute',
metric='euclidean').fit(redX)
assert_allclose(clust1.reachability_, clust2.reachability_)
assert_array_equal(clust1.labels_, clust2.labels_)

View file

@ -0,0 +1,250 @@
"""Testing for Spectral Clustering methods"""
import numpy as np
from scipy import sparse
import pytest
import pickle
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.cluster import SpectralClustering, spectral_clustering
from sklearn.cluster._spectral import discretize
from sklearn.feature_extraction import img_to_graph
from sklearn.metrics import pairwise_distances
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_blobs
try:
from pyamg import smoothed_aggregation_solver # noqa
amg_loaded = True
except ImportError:
amg_loaded = False
@pytest.mark.parametrize('eigen_solver', ('arpack', 'lobpcg'))
@pytest.mark.parametrize('assign_labels', ('kmeans', 'discretize'))
def test_spectral_clustering(eigen_solver, assign_labels):
S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
[0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]])
for mat in (S, sparse.csr_matrix(S)):
model = SpectralClustering(random_state=0, n_clusters=2,
affinity='precomputed',
eigen_solver=eigen_solver,
assign_labels=assign_labels
).fit(mat)
labels = model.labels_
if labels[0] == 0:
labels = 1 - labels
assert adjusted_rand_score(labels, [1, 1, 1, 0, 0, 0, 0]) == 1
model_copy = pickle.loads(pickle.dumps(model))
assert model_copy.n_clusters == model.n_clusters
assert model_copy.eigen_solver == model.eigen_solver
assert_array_equal(model_copy.labels_, model.labels_)
def test_spectral_unknown_mode():
# Test that SpectralClustering fails with an unknown mode set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
with pytest.raises(ValueError):
spectral_clustering(S, n_clusters=2, random_state=0,
eigen_solver="<unknown>")
def test_spectral_unknown_assign_labels():
# Test that SpectralClustering fails with an unknown assign_labels set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
with pytest.raises(ValueError):
spectral_clustering(S, n_clusters=2, random_state=0,
assign_labels="<unknown>")
def test_spectral_clustering_sparse():
X, y = make_blobs(n_samples=20, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
S = rbf_kernel(X, gamma=1)
S = np.maximum(S - 1e-4, 0)
S = sparse.coo_matrix(S)
labels = SpectralClustering(random_state=0, n_clusters=2,
affinity='precomputed').fit(S).labels_
assert adjusted_rand_score(y, labels) == 1
def test_precomputed_nearest_neighbors_filtering():
# Test precomputed graph filtering when containing too many neighbors
X, y = make_blobs(n_samples=200, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
n_neighbors = 2
results = []
for additional_neighbors in [0, 10]:
nn = NearestNeighbors(
n_neighbors=n_neighbors + additional_neighbors).fit(X)
graph = nn.kneighbors_graph(X, mode='connectivity')
labels = SpectralClustering(random_state=0, n_clusters=2,
affinity='precomputed_nearest_neighbors',
n_neighbors=n_neighbors).fit(graph).labels_
results.append(labels)
assert_array_equal(results[0], results[1])
def test_affinities():
# Note: in the following, random_state has been selected to have
# a dataset that yields a stable eigen decomposition both when built
# on OSX and Linux
X, y = make_blobs(n_samples=20, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
# nearest neighbors affinity
sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
random_state=0)
assert_warns_message(UserWarning, 'not fully connected', sp.fit, X)
assert adjusted_rand_score(y, sp.labels_) == 1
sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
labels = sp.fit(X).labels_
assert adjusted_rand_score(y, labels) == 1
X = check_random_state(10).rand(10, 5) * 10
kernels_available = kernel_metrics()
for kern in kernels_available:
# Additive chi^2 gives a negative similarity matrix which
# doesn't make sense for spectral clustering
if kern != 'additive_chi2':
sp = SpectralClustering(n_clusters=2, affinity=kern,
random_state=0)
labels = sp.fit(X).labels_
assert (X.shape[0],) == labels.shape
sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1,
random_state=0)
labels = sp.fit(X).labels_
assert (X.shape[0],) == labels.shape
def histogram(x, y, **kwargs):
# Histogram kernel implemented as a callable.
assert kwargs == {} # no kernel_params that we didn't ask for
return np.minimum(x, y).sum()
sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
labels = sp.fit(X).labels_
assert (X.shape[0],) == labels.shape
# raise error on unknown affinity
sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
with pytest.raises(ValueError):
sp.fit(X)
@pytest.mark.parametrize('n_samples', [50, 100, 150, 500])
def test_discretize(n_samples):
# Test the discretize using a noise assignment matrix
random_state = np.random.RandomState(seed=8)
for n_class in range(2, 10):
# random class labels
y_true = random_state.randint(0, n_class + 1, n_samples)
y_true = np.array(y_true, np.float)
# noise class assignment matrix
y_indicator = sparse.coo_matrix((np.ones(n_samples),
(np.arange(n_samples),
y_true)),
shape=(n_samples,
n_class + 1))
y_true_noisy = (y_indicator.toarray()
+ 0.1 * random_state.randn(n_samples,
n_class + 1))
y_pred = discretize(y_true_noisy, random_state=random_state)
assert adjusted_rand_score(y_true, y_pred) > 0.8
# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*")
def test_spectral_clustering_with_arpack_amg_solvers():
# Test that spectral_clustering is the same for arpack and amg solver
# Based on toy example from plot_segmentation_toy.py
# a small two coin image
x, y = np.indices((40, 40))
center1, center2 = (14, 12), (20, 25)
radius1, radius2 = 8, 7
circle1 = (x - center1[0]) ** 2 + (y - center1[1]) ** 2 < radius1 ** 2
circle2 = (x - center2[0]) ** 2 + (y - center2[1]) ** 2 < radius2 ** 2
circles = circle1 | circle2
mask = circles.copy()
img = circles.astype(float)
graph = img_to_graph(img, mask=mask)
graph.data = np.exp(-graph.data / graph.data.std())
labels_arpack = spectral_clustering(
graph, n_clusters=2, eigen_solver='arpack', random_state=0)
assert len(np.unique(labels_arpack)) == 2
if amg_loaded:
labels_amg = spectral_clustering(
graph, n_clusters=2, eigen_solver='amg', random_state=0)
assert adjusted_rand_score(labels_arpack, labels_amg) == 1
else:
with pytest.raises(ValueError):
spectral_clustering(graph, n_clusters=2, eigen_solver='amg',
random_state=0)
def test_n_components():
# Test that after adding n_components, result is different and
# n_components = n_clusters by default
X, y = make_blobs(n_samples=20, random_state=0,
centers=[[1, 1], [-1, -1]], cluster_std=0.01)
sp = SpectralClustering(n_clusters=2, random_state=0)
labels = sp.fit(X).labels_
# set n_components = n_cluster and test if result is the same
labels_same_ncomp = SpectralClustering(n_clusters=2, n_components=2,
random_state=0).fit(X).labels_
# test that n_components=n_clusters by default
assert_array_equal(labels, labels_same_ncomp)
# test that n_components affect result
# n_clusters=8 by default, and set n_components=2
labels_diff_ncomp = SpectralClustering(n_components=2,
random_state=0).fit(X).labels_
assert not np.array_equal(labels, labels_diff_ncomp)