Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
188
venv/Lib/site-packages/sklearn/manifold/tests/test_isomap.py
Normal file
@@ -0,0 +1,188 @@
from itertools import product

import numpy as np
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest

from sklearn import datasets
from sklearn import manifold
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing

from scipy.sparse import rand as sparse_rand

eigen_solvers = ['auto', 'dense', 'arpack']
path_methods = ['auto', 'FW', 'D']


def test_isomap_simple_grid():
    # Isomap should preserve distances when all neighbors are used
    N_per_side = 5
    Npts = N_per_side ** 2
    n_neighbors = Npts - 1

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(N_per_side), repeat=2)))

    # distances from each point to all others
    G = neighbors.kneighbors_graph(X, n_neighbors,
                                   mode='distance').toarray()

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)

            G_iso = neighbors.kneighbors_graph(clf.embedding_,
                                               n_neighbors,
                                               mode='distance').toarray()
            assert_array_almost_equal(G, G_iso)


def test_isomap_reconstruction_error():
    # Same setup as in test_isomap_simple_grid, with an added dimension
    N_per_side = 5
    Npts = N_per_side ** 2
    n_neighbors = Npts - 1

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(N_per_side), repeat=2)))

    # add noise in a third dimension
    rng = np.random.RandomState(0)
    noise = 0.1 * rng.randn(Npts, 1)
    X = np.concatenate((X, noise), 1)

    # compute input kernel
    G = neighbors.kneighbors_graph(X, n_neighbors,
                                   mode='distance').toarray()

    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G ** 2)

    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)

            # compute output kernel
            G_iso = neighbors.kneighbors_graph(clf.embedding_,
                                               n_neighbors,
                                               mode='distance').toarray()

            K_iso = centerer.fit_transform(-0.5 * G_iso ** 2)

            # make sure error agrees
            reconstruction_error = np.linalg.norm(K - K_iso) / Npts
            assert_almost_equal(reconstruction_error,
                                clf.reconstruction_error())


def test_transform():
    n_samples = 200
    n_components = 10
    noise_scale = 0.01

    # Create S-curve dataset
    X, y = datasets.make_s_curve(n_samples, random_state=0)

    # Compute isomap embedding
    iso = manifold.Isomap(n_components=n_components, n_neighbors=2)
    X_iso = iso.fit_transform(X)

    # Re-embed a noisy version of the points
    rng = np.random.RandomState(0)
    noise = noise_scale * rng.randn(*X.shape)
    X_iso2 = iso.transform(X + noise)

    # Make sure the rms error on re-embedding is comparable to noise_scale
    assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale


def test_pipeline():
    # check that Isomap works fine as a transformer in a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('isomap', manifold.Isomap()),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert .9 < clf.score(X, y)


def test_pipeline_with_nearest_neighbors_transformer():
    # Test chaining NearestNeighborsTransformer and Isomap with
    # neighbors_algorithm='precomputed'
    algorithm = 'auto'
    n_neighbors = 10

    X, _ = datasets.make_blobs(random_state=0)
    X2, _ = datasets.make_blobs(random_state=1)

    # compare the chained version and the compact version
    est_chain = pipeline.make_pipeline(
        neighbors.KNeighborsTransformer(
            n_neighbors=n_neighbors, algorithm=algorithm, mode='distance'),
        manifold.Isomap(n_neighbors=n_neighbors, metric='precomputed'))
    est_compact = manifold.Isomap(n_neighbors=n_neighbors,
                                  neighbors_algorithm=algorithm)

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_array_almost_equal(Xt_chain, Xt_compact)


def test_different_metric():
    # Test that the metric parameters work correctly, and default to euclidean
    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))

    # metric, p, is_euclidean
    metrics = [('euclidean', 2, True),
               ('manhattan', 1, False),
               ('minkowski', 1, False),
               ('minkowski', 2, True),
               (custom_metric, 2, False)]

    X, _ = datasets.make_blobs(random_state=0)
    reference = manifold.Isomap().fit_transform(X)

    for metric, p, is_euclidean in metrics:
        embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)

        if is_euclidean:
            assert_array_almost_equal(embedding, reference)
        else:
            with pytest.raises(AssertionError, match='not almost equal'):
                assert_array_almost_equal(embedding, reference)


def test_isomap_clone_bug():
    # regression test for bug reported in #6062
    model = manifold.Isomap()
    for n_neighbors in [10, 15, 20]:
        model.set_params(n_neighbors=n_neighbors)
        model.fit(np.random.rand(50, 2))
        assert (model.nbrs_.n_neighbors ==
                n_neighbors)


def test_sparse_input():
    X = sparse_rand(100, 3, density=0.1, format='csr')

    # Should not error
    for eigen_solver in eigen_solvers:
        for path_method in path_methods:
            clf = manifold.Isomap(n_components=2,
                                  eigen_solver=eigen_solver,
                                  path_method=path_method)
            clf.fit(X)
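For context, a minimal sketch (not part of this commit) of the manifold.Isomap API these tests exercise; the dataset and parameter values below are illustrative assumptions only.

# Hypothetical usage example, mirroring what test_transform() above checks.
from sklearn import datasets, manifold

X, _ = datasets.make_s_curve(200, random_state=0)
iso = manifold.Isomap(n_neighbors=10, n_components=2)
X_2d = iso.fit_transform(X)            # embed the 3-D S-curve into 2-D
print(X_2d.shape)                      # (200, 2)
print(iso.reconstruction_error())      # scalar error, as asserted above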
146
venv/Lib/site-packages/sklearn/manifold/tests/test_locally_linear.py
Normal file
@@ -0,0 +1,146 @@
from itertools import product

import numpy as np
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from scipy import linalg
import pytest

from sklearn import neighbors, manifold
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import assert_raise_message

eigen_solvers = ['dense', 'arpack']


# ----------------------------------------------------------------------
# Test utility routines
def test_barycenter_kneighbors_graph():
    X = np.array([[0, 1], [1.01, 1.], [2, 0]])

    A = barycenter_kneighbors_graph(X, 1)
    assert_array_almost_equal(
        A.toarray(),
        [[0., 1., 0.],
         [1., 0., 0.],
         [0., 1., 0.]])

    A = barycenter_kneighbors_graph(X, 2)
    # check that columns sum to one
    assert_array_almost_equal(np.sum(A.toarray(), 1), np.ones(3))
    pred = np.dot(A.toarray(), X)
    assert linalg.norm(pred - X) / X.shape[0] < 1


# ----------------------------------------------------------------------
# Test LLE by computing the reconstruction error on some manifolds.

def test_lle_simple_grid():
    # note: ARPACK is numerically unstable, so this test will fail for
    # some random seeds. We choose 2 because the tests pass.
    rng = np.random.RandomState(2)

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(5), repeat=2)))
    X = X + 1e-10 * rng.uniform(size=X.shape)
    n_components = 2
    clf = manifold.LocallyLinearEmbedding(n_neighbors=5,
                                          n_components=n_components,
                                          random_state=rng)
    tol = 0.1

    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    reconstruction_error = linalg.norm(np.dot(N, X) - X, 'fro')
    assert reconstruction_error < tol

    for solver in eigen_solvers:
        clf.set_params(eigen_solver=solver)
        clf.fit(X)
        assert clf.embedding_.shape[1] == n_components
        reconstruction_error = linalg.norm(
            np.dot(N, clf.embedding_) - clf.embedding_, 'fro') ** 2

        assert reconstruction_error < tol
        assert_almost_equal(clf.reconstruction_error_,
                            reconstruction_error, decimal=1)

    # re-embed a noisy version of X using the transform method
    noise = rng.randn(*X.shape) / 100
    X_reembedded = clf.transform(X + noise)
    assert linalg.norm(X_reembedded - clf.embedding_) < tol


def test_lle_manifold():
    rng = np.random.RandomState(0)
    # similar test on a slightly more complex manifold
    X = np.array(list(product(np.arange(18), repeat=2)))
    X = np.c_[X, X[:, 0] ** 2 / 18]
    X = X + 1e-10 * rng.uniform(size=X.shape)
    n_components = 2
    for method in ["standard", "hessian", "modified", "ltsa"]:
        clf = manifold.LocallyLinearEmbedding(n_neighbors=6,
                                              n_components=n_components,
                                              method=method, random_state=0)
        tol = 1.5 if method == "standard" else 3

        N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
        reconstruction_error = linalg.norm(np.dot(N, X) - X)
        assert reconstruction_error < tol

        for solver in eigen_solvers:
            clf.set_params(eigen_solver=solver)
            clf.fit(X)
            assert clf.embedding_.shape[1] == n_components
            reconstruction_error = linalg.norm(
                np.dot(N, clf.embedding_) - clf.embedding_, 'fro') ** 2
            details = ("solver: %s, method: %s" % (solver, method))
            assert reconstruction_error < tol, details
            assert (np.abs(clf.reconstruction_error_ -
                           reconstruction_error) <
                    tol * reconstruction_error), details


# Test the error raised when a parameter passed to LLE is invalid
def test_lle_init_parameters():
    X = np.random.rand(5, 3)

    clf = manifold.LocallyLinearEmbedding(eigen_solver="error")
    msg = "unrecognized eigen_solver 'error'"
    assert_raise_message(ValueError, msg, clf.fit, X)

    clf = manifold.LocallyLinearEmbedding(method="error")
    msg = "unrecognized method 'error'"
    assert_raise_message(ValueError, msg, clf.fit, X)


def test_pipeline():
    # check that LocallyLinearEmbedding works fine as a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    from sklearn import pipeline, datasets
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('filter', manifold.LocallyLinearEmbedding(random_state=0)),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert .9 < clf.score(X, y)


# Test the error raised when the weight matrix is singular
def test_singular_matrix():
    M = np.ones((10, 3))
    f = ignore_warnings
    with pytest.raises(ValueError):
        f(manifold.locally_linear_embedding(M, n_neighbors=2, n_components=1,
                                            method='standard',
                                            eigen_solver='arpack'))


# regression test for #6033
def test_integer_input():
    rand = np.random.RandomState(0)
    X = rand.randint(0, 100, size=(20, 3))

    for method in ["standard", "hessian", "modified", "ltsa"]:
        clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
        clf.fit(X)  # this previously raised a TypeError
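For context, a minimal sketch (not part of this commit) of the LocallyLinearEmbedding API under test; sizes and parameter values are illustrative assumptions.

# Hypothetical usage example; any of the four methods tested above can be
# selected via ``method=``.
import numpy as np
from sklearn import manifold

X = np.random.RandomState(0).rand(100, 3)
lle = manifold.LocallyLinearEmbedding(n_neighbors=10, n_components=2,
                                      method='standard', random_state=0)
X_2d = lle.fit_transform(X)
print(X_2d.shape, lle.reconstruction_error_)   # attributes asserted above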
64
venv/Lib/site-packages/sklearn/manifold/tests/test_mds.py
Normal file
@@ -0,0 +1,64 @@
import numpy as np
from numpy.testing import assert_array_almost_equal
import pytest

from sklearn.manifold import _mds as mds


def test_smacof():
    # test metric smacof using the data of "Modern Multidimensional Scaling",
    # Borg & Groenen, p 154
    sim = np.array([[0, 5, 3, 4],
                    [5, 0, 2, 2],
                    [3, 2, 0, 1],
                    [4, 2, 1, 0]])
    Z = np.array([[-.266, -.539],
                  [.451, .252],
                  [.016, -.238],
                  [-.200, .524]])
    X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
    X_true = np.array([[-1.415, -2.471],
                       [1.633, 1.107],
                       [.249, -.067],
                       [-.468, 1.431]])
    assert_array_almost_equal(X, X_true, decimal=3)


def test_smacof_error():
    # Not symmetric similarity matrix:
    sim = np.array([[0, 5, 9, 4],
                    [5, 0, 2, 2],
                    [3, 2, 0, 1],
                    [4, 2, 1, 0]])

    with pytest.raises(ValueError):
        mds.smacof(sim)

    # Not square similarity matrix:
    sim = np.array([[0, 5, 9, 4],
                    [5, 0, 2, 2],
                    [4, 2, 1, 0]])

    with pytest.raises(ValueError):
        mds.smacof(sim)

    # init not None and not correct format:
    sim = np.array([[0, 5, 3, 4],
                    [5, 0, 2, 2],
                    [3, 2, 0, 1],
                    [4, 2, 1, 0]])

    Z = np.array([[-.266, -.539],
                  [.016, -.238],
                  [-.200, .524]])
    with pytest.raises(ValueError):
        mds.smacof(sim, init=Z, n_init=1)


def test_MDS():
    sim = np.array([[0, 5, 3, 4],
                    [5, 0, 2, 2],
                    [3, 2, 0, 1],
                    [4, 2, 1, 0]])
    mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
    mds_clf.fit(sim)
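For context, a minimal sketch (not part of this commit) of the smacof/MDS API these tests exercise, reusing the Borg & Groenen dissimilarity matrix from test_smacof(); parameter values are illustrative assumptions.

import numpy as np
from sklearn.manifold import MDS, smacof

sim = np.array([[0, 5, 3, 4],
                [5, 0, 2, 2],
                [3, 2, 0, 1],
                [4, 2, 1, 0]], dtype=float)
X, stress = smacof(sim, n_components=2, random_state=0)  # functional form
print(X.shape, stress)

mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
print(mds.fit_transform(sim).shape)                      # estimator form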
347
venv/Lib/site-packages/sklearn/manifold/tests/test_spectral_embedding.py
Normal file
@@ -0,0 +1,347 @@
import pytest

import numpy as np

from scipy import sparse
from scipy.sparse import csgraph
from scipy.linalg import eigh

from sklearn.manifold import SpectralEmbedding
from sklearn.manifold._spectral_embedding import _graph_is_connected
from sklearn.manifold._spectral_embedding import _graph_connected_component
from sklearn.manifold import spectral_embedding
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import normalized_mutual_info_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal


# non centered, sparse centers to check the
centers = np.array([
    [0.0, 5.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 4.0, 0.0, 0.0],
    [1.0, 0.0, 0.0, 5.0, 1.0],
])
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                            cluster_std=1., random_state=42)


def _assert_equal_with_sign_flipping(A, B, tol=0.0):
    """ Check array A and B are equal with possible sign flipping on
    each column"""
    tol_squared = tol ** 2
    for A_col, B_col in zip(A.T, B.T):
        assert (np.max((A_col - B_col) ** 2) <= tol_squared or
                np.max((A_col + B_col) ** 2) <= tol_squared)


def test_sparse_graph_connected_component():
    rng = np.random.RandomState(42)
    n_samples = 300
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    connections = []

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        for i in range(len(group) - 1):
            connections.append((group[i], group[i + 1]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        n_random_connections = 1000
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(.1, 42, size=len(connections))
    affinity = sparse.coo_matrix((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        component_1 = _graph_connected_component(affinity, p[start])
        component_size = stop - start
        assert component_1.sum() == component_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        component_2 = _graph_connected_component(affinity, p[stop - 1])
        assert component_2.sum() == component_size
        assert_array_equal(component_1, component_2)


def test_spectral_embedding_two_components(seed=36):
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample,
             0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample::,
             n_sample::] = np.abs(random_state.randn(n_sample, n_sample)) + 2

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[::2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(n_components=1, affinity="precomputed",
                                   random_state=np.random.RandomState(seed))
    embedded_coordinate = se_precomp.fit_transform(affinity)
    # Some numpy versions are touchy with types
    embedded_coordinate = \
        se_precomp.fit_transform(affinity.astype(np.float32))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert normalized_mutual_info_score(
        true_label, label_) == pytest.approx(1.0)


@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)],
                         ids=["dense", "sparse"])
def test_spectral_embedding_precomputed_affinity(X, seed=36):
    # Test spectral embedding with precomputed kernel
    gamma = 1.0
    se_precomp = SpectralEmbedding(n_components=2, affinity="precomputed",
                                   random_state=np.random.RandomState(seed))
    se_rbf = SpectralEmbedding(n_components=2, affinity="rbf",
                               gamma=gamma,
                               random_state=np.random.RandomState(seed))
    embed_precomp = se_precomp.fit_transform(rbf_kernel(X, gamma=gamma))
    embed_rbf = se_rbf.fit_transform(X)
    assert_array_almost_equal(
        se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)


def test_precomputed_nearest_neighbors_filtering():
    # Test precomputed graph filtering when containing too many neighbors
    n_neighbors = 2
    results = []
    for additional_neighbors in [0, 10]:
        nn = NearestNeighbors(
            n_neighbors=n_neighbors + additional_neighbors).fit(S)
        graph = nn.kneighbors_graph(S, mode='connectivity')
        embedding = SpectralEmbedding(random_state=0, n_components=2,
                                      affinity='precomputed_nearest_neighbors',
                                      n_neighbors=n_neighbors
                                      ).fit(graph).embedding_
        results.append(embedding)

    assert_array_equal(results[0], results[1])


@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)],
                         ids=["dense", "sparse"])
def test_spectral_embedding_callable_affinity(X, seed=36):
    # Test spectral embedding with callable affinity
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)
    se_callable = SpectralEmbedding(n_components=2,
                                    affinity=(
                                        lambda x: rbf_kernel(x, gamma=gamma)),
                                    gamma=gamma,
                                    random_state=np.random.RandomState(seed))
    se_rbf = SpectralEmbedding(n_components=2, affinity="rbf",
                               gamma=gamma,
                               random_state=np.random.RandomState(seed))
    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)
    assert_array_almost_equal(
        se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)


# TODO: Remove when pyamg replaces sp.rand calls with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*")
def test_spectral_embedding_amg_solver(seed=36):
    # Test spectral embedding with amg solver
    pytest.importorskip('pyamg')

    se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors",
                               eigen_solver="amg", n_neighbors=5,
                               random_state=np.random.RandomState(seed))
    se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors",
                                  eigen_solver="arpack", n_neighbors=5,
                                  random_state=np.random.RandomState(seed))
    embed_amg = se_amg.fit_transform(S)
    embed_arpack = se_arpack.fit_transform(S)
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes
    row = [0, 0, 1, 2, 3, 3, 4]
    col = [1, 2, 2, 3, 4, 5, 5]
    val = [100, 100, 100, 1, 100, 100, 100]

    affinity = sparse.coo_matrix((val + val, (row + col, col + row)),
                                 shape=(6, 6)).toarray()
    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity)
    embed_arpack = se_arpack.fit_transform(affinity)
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)


# TODO: Remove filterwarnings when pyamg replaces sp.rand calls with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*")
def test_spectral_embedding_amg_solver_failure():
    # Non-regression test for amg solver failure (issue #13393 on github)
    pytest.importorskip('pyamg')
    seed = 36
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(sym_matrix,
                                   n_components=10,
                                   eigen_solver='amg',
                                   random_state=0)

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(sym_matrix,
                                           n_components=10,
                                           eigen_solver='amg',
                                           random_state=i + 1)
        _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)


@pytest.mark.filterwarnings("ignore:the behavior of nmi will "
                            "change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(n_components=n_clusters,
                               affinity="rbf",
                               random_state=random_state)
    se_knn = SpectralEmbedding(n_components=n_clusters,
                               affinity="nearest_neighbors",
                               n_neighbors=5,
                               random_state=random_state)
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(
                km.labels_,
                true_labels), 1.0, 2)


def test_spectral_embedding_unknown_eigensolver(seed=36):
    # Test that SpectralClustering fails with an unknown eigensolver
    se = SpectralEmbedding(n_components=1, affinity="precomputed",
                           random_state=np.random.RandomState(seed),
                           eigen_solver="<unknown>")
    with pytest.raises(ValueError):
        se.fit(S)


def test_spectral_embedding_unknown_affinity(seed=36):
    # Test that SpectralClustering fails with an unknown affinity type
    se = SpectralEmbedding(n_components=1, affinity="<unknown>",
                           random_state=np.random.RandomState(seed))
    with pytest.raises(ValueError):
        se.fit(S)


def test_connectivity(seed=36):
    # Test that graph connectivity test works as expected
    graph = np.array([[1, 0, 0, 0, 0],
                      [0, 1, 1, 0, 0],
                      [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1],
                      [0, 0, 0, 1, 1]])
    assert not _graph_is_connected(graph)
    assert not _graph_is_connected(sparse.csr_matrix(graph))
    assert not _graph_is_connected(sparse.csc_matrix(graph))
    graph = np.array([[1, 1, 0, 0, 0],
                      [1, 1, 1, 0, 0],
                      [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1],
                      [0, 0, 0, 1, 1]])
    assert _graph_is_connected(graph)
    assert _graph_is_connected(sparse.csr_matrix(graph))
    assert _graph_is_connected(sparse.csc_matrix(graph))


def test_spectral_embedding_deterministic():
    # Test that Spectral Embedding is deterministic
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    embedding_1 = spectral_embedding(sims)
    embedding_2 = spectral_embedding(sims)
    assert_array_almost_equal(embedding_1, embedding_2)


def test_spectral_embedding_unnormalized():
    # Test that spectral_embedding is also processing unnormalized laplacian
    # correctly
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 8
    embedding_1 = spectral_embedding(sims,
                                     norm_laplacian=False,
                                     n_components=n_components,
                                     drop_first=False)

    # Verify using manual computation with dense eigh
    laplacian, dd = csgraph.laplacian(sims, normed=False,
                                      return_diag=True)
    _, diffusion_map = eigh(laplacian)
    embedding_2 = diffusion_map.T[:n_components]
    embedding_2 = _deterministic_vector_sign_flip(embedding_2).T

    assert_array_almost_equal(embedding_1, embedding_2)


def test_spectral_embedding_first_eigen_vector():
    # Test that the first eigenvector of spectral_embedding
    # is constant and that the second is not (for a connected graph)
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 2

    for seed in range(10):
        embedding = spectral_embedding(sims,
                                       norm_laplacian=False,
                                       n_components=n_components,
                                       drop_first=False,
                                       random_state=seed)

        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3
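For context, a minimal sketch (not part of this commit) of the SpectralEmbedding and spectral_embedding APIs under test; the data and affinity choice are illustrative assumptions.

import numpy as np
from sklearn.manifold import SpectralEmbedding, spectral_embedding
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.RandomState(0).randn(60, 5)
emb = SpectralEmbedding(n_components=2, affinity="rbf").fit_transform(X)
print(emb.shape)  # (60, 2)

# The functional form takes a precomputed symmetric affinity matrix.
emb_fn = spectral_embedding(rbf_kernel(X), n_components=2, random_state=0)
print(emb_fn.shape)  # (60, 2)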
893
venv/Lib/site-packages/sklearn/manifold/tests/test_t_sne.py
Normal file
|
@ -0,0 +1,893 @@
|
|||
import sys
|
||||
from io import StringIO
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
import scipy.sparse as sp
|
||||
import pytest
|
||||
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.neighbors import kneighbors_graph
|
||||
from sklearn.exceptions import EfficiencyWarning
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import skip_if_32bit
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.manifold._t_sne import _joint_probabilities
|
||||
from sklearn.manifold._t_sne import _joint_probabilities_nn
|
||||
from sklearn.manifold._t_sne import _kl_divergence
|
||||
from sklearn.manifold._t_sne import _kl_divergence_bh
|
||||
from sklearn.manifold._t_sne import _gradient_descent
|
||||
from sklearn.manifold._t_sne import trustworthiness
|
||||
from sklearn.manifold import TSNE
|
||||
# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne'
|
||||
from sklearn.manifold import _barnes_hut_tsne # type: ignore
|
||||
from sklearn.manifold._utils import _binary_search_perplexity
|
||||
from sklearn.datasets import make_blobs
|
||||
from scipy.optimize import check_grad
|
||||
from scipy.spatial.distance import pdist
|
||||
from scipy.spatial.distance import squareform
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.metrics.pairwise import manhattan_distances
|
||||
from sklearn.metrics.pairwise import cosine_distances
|
||||
|
||||
|
||||
x = np.linspace(0, 1, 10)
|
||||
xx, yy = np.meshgrid(x, x)
|
||||
X_2d_grid = np.hstack([
|
||||
xx.ravel().reshape(-1, 1),
|
||||
yy.ravel().reshape(-1, 1),
|
||||
])
|
||||
|
||||
|
||||
def test_gradient_descent_stops():
|
||||
# Test stopping conditions of gradient descent.
|
||||
class ObjectiveSmallGradient:
|
||||
def __init__(self):
|
||||
self.it = -1
|
||||
|
||||
def __call__(self, _, compute_error=True):
|
||||
self.it += 1
|
||||
return (10 - self.it) / 10.0, np.array([1e-5])
|
||||
|
||||
def flat_function(_, compute_error=True):
|
||||
return 0.0, np.ones(1)
|
||||
|
||||
# Gradient norm
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = StringIO()
|
||||
try:
|
||||
_, error, it = _gradient_descent(
|
||||
ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=100,
|
||||
n_iter_without_progress=100, momentum=0.0, learning_rate=0.0,
|
||||
min_gain=0.0, min_grad_norm=1e-5, verbose=2)
|
||||
finally:
|
||||
out = sys.stdout.getvalue()
|
||||
sys.stdout.close()
|
||||
sys.stdout = old_stdout
|
||||
assert error == 1.0
|
||||
assert it == 0
|
||||
assert("gradient norm" in out)
|
||||
|
||||
# Maximum number of iterations without improvement
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = StringIO()
|
||||
try:
|
||||
_, error, it = _gradient_descent(
|
||||
flat_function, np.zeros(1), 0, n_iter=100,
|
||||
n_iter_without_progress=10, momentum=0.0, learning_rate=0.0,
|
||||
min_gain=0.0, min_grad_norm=0.0, verbose=2)
|
||||
finally:
|
||||
out = sys.stdout.getvalue()
|
||||
sys.stdout.close()
|
||||
sys.stdout = old_stdout
|
||||
assert error == 0.0
|
||||
assert it == 11
|
||||
assert("did not make any progress" in out)
|
||||
|
||||
# Maximum number of iterations
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = StringIO()
|
||||
try:
|
||||
_, error, it = _gradient_descent(
|
||||
ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=11,
|
||||
n_iter_without_progress=100, momentum=0.0, learning_rate=0.0,
|
||||
min_gain=0.0, min_grad_norm=0.0, verbose=2)
|
||||
finally:
|
||||
out = sys.stdout.getvalue()
|
||||
sys.stdout.close()
|
||||
sys.stdout = old_stdout
|
||||
assert error == 0.0
|
||||
assert it == 10
|
||||
assert("Iteration 10" in out)
|
||||
|
||||
|
||||
def test_binary_search():
|
||||
# Test if the binary search finds Gaussians with desired perplexity.
|
||||
random_state = check_random_state(0)
|
||||
data = random_state.randn(50, 5)
|
||||
distances = pairwise_distances(data).astype(np.float32)
|
||||
desired_perplexity = 25.0
|
||||
P = _binary_search_perplexity(distances, desired_perplexity, verbose=0)
|
||||
P = np.maximum(P, np.finfo(np.double).eps)
|
||||
mean_perplexity = np.mean([np.exp(-np.sum(P[i] * np.log(P[i])))
|
||||
for i in range(P.shape[0])])
|
||||
assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3)
|
||||
|
||||
|
||||
def test_binary_search_neighbors():
|
||||
# Binary perplexity search approximation.
|
||||
# Should be approximately equal to the slow method when we use
|
||||
# all points as neighbors.
|
||||
n_samples = 200
|
||||
desired_perplexity = 25.0
|
||||
random_state = check_random_state(0)
|
||||
data = random_state.randn(n_samples, 2).astype(np.float32, copy=False)
|
||||
distances = pairwise_distances(data)
|
||||
P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0)
|
||||
|
||||
# Test that when we use all the neighbors the results are identical
|
||||
n_neighbors = n_samples - 1
|
||||
nn = NearestNeighbors().fit(data)
|
||||
distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
|
||||
mode='distance')
|
||||
distances_nn = distance_graph.data.astype(np.float32, copy=False)
|
||||
distances_nn = distances_nn.reshape(n_samples, n_neighbors)
|
||||
P2 = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0)
|
||||
|
||||
indptr = distance_graph.indptr
|
||||
P1_nn = np.array([P1[k, distance_graph.indices[indptr[k]:indptr[k + 1]]]
|
||||
for k in range(n_samples)])
|
||||
assert_array_almost_equal(P1_nn, P2, decimal=4)
|
||||
|
||||
# Test that the highest P_ij are the same when fewer neighbors are used
|
||||
for k in np.linspace(150, n_samples - 1, 5):
|
||||
k = int(k)
|
||||
topn = k * 10 # check the top 10 * k entries out of k * k entries
|
||||
distance_graph = nn.kneighbors_graph(n_neighbors=k, mode='distance')
|
||||
distances_nn = distance_graph.data.astype(np.float32, copy=False)
|
||||
distances_nn = distances_nn.reshape(n_samples, k)
|
||||
P2k = _binary_search_perplexity(distances_nn, desired_perplexity,
|
||||
verbose=0)
|
||||
assert_array_almost_equal(P1_nn, P2, decimal=2)
|
||||
idx = np.argsort(P1.ravel())[::-1]
|
||||
P1top = P1.ravel()[idx][:topn]
|
||||
idx = np.argsort(P2k.ravel())[::-1]
|
||||
P2top = P2k.ravel()[idx][:topn]
|
||||
assert_array_almost_equal(P1top, P2top, decimal=2)
|
||||
|
||||
|
||||
def test_binary_perplexity_stability():
|
||||
# Binary perplexity search should be stable.
|
||||
# The binary_search_perplexity had a bug wherein the P array
|
||||
# was uninitialized, leading to sporadically failing tests.
|
||||
n_neighbors = 10
|
||||
n_samples = 100
|
||||
random_state = check_random_state(0)
|
||||
data = random_state.randn(n_samples, 5)
|
||||
nn = NearestNeighbors().fit(data)
|
||||
distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
|
||||
mode='distance')
|
||||
distances = distance_graph.data.astype(np.float32, copy=False)
|
||||
distances = distances.reshape(n_samples, n_neighbors)
|
||||
last_P = None
|
||||
desired_perplexity = 3
|
||||
for _ in range(100):
|
||||
P = _binary_search_perplexity(distances.copy(), desired_perplexity,
|
||||
verbose=0)
|
||||
P1 = _joint_probabilities_nn(distance_graph, desired_perplexity,
|
||||
verbose=0)
|
||||
# Convert the sparse matrix to a dense one for testing
|
||||
P1 = P1.toarray()
|
||||
if last_P is None:
|
||||
last_P = P
|
||||
last_P1 = P1
|
||||
else:
|
||||
assert_array_almost_equal(P, last_P, decimal=4)
|
||||
assert_array_almost_equal(P1, last_P1, decimal=4)
|
||||
|
||||
|
||||
def test_gradient():
|
||||
# Test gradient of Kullback-Leibler divergence.
|
||||
random_state = check_random_state(0)
|
||||
|
||||
n_samples = 50
|
||||
n_features = 2
|
||||
n_components = 2
|
||||
alpha = 1.0
|
||||
|
||||
distances = random_state.randn(n_samples, n_features).astype(np.float32)
|
||||
distances = np.abs(distances.dot(distances.T))
|
||||
np.fill_diagonal(distances, 0.0)
|
||||
X_embedded = random_state.randn(n_samples, n_components).astype(np.float32)
|
||||
|
||||
P = _joint_probabilities(distances, desired_perplexity=25.0,
|
||||
verbose=0)
|
||||
|
||||
def fun(params):
|
||||
return _kl_divergence(params, P, alpha, n_samples, n_components)[0]
|
||||
|
||||
def grad(params):
|
||||
return _kl_divergence(params, P, alpha, n_samples, n_components)[1]
|
||||
|
||||
assert_almost_equal(check_grad(fun, grad, X_embedded.ravel()), 0.0,
|
||||
decimal=5)
|
||||
|
||||
|
||||
def test_trustworthiness():
|
||||
# Test trustworthiness score.
|
||||
random_state = check_random_state(0)
|
||||
|
||||
# Affine transformation
|
||||
X = random_state.randn(100, 2)
|
||||
assert trustworthiness(X, 5.0 + X / 10.0) == 1.0
|
||||
|
||||
# Randomly shuffled
|
||||
X = np.arange(100).reshape(-1, 1)
|
||||
X_embedded = X.copy()
|
||||
random_state.shuffle(X_embedded)
|
||||
assert trustworthiness(X, X_embedded) < 0.6
|
||||
|
||||
# Completely different
|
||||
X = np.arange(5).reshape(-1, 1)
|
||||
X_embedded = np.array([[0], [2], [4], [1], [3]])
|
||||
assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ['exact', 'barnes_hut'])
|
||||
@pytest.mark.parametrize("init", ('random', 'pca'))
|
||||
def test_preserve_trustworthiness_approximately(method, init):
|
||||
# Nearest neighbors should be preserved approximately.
|
||||
random_state = check_random_state(0)
|
||||
n_components = 2
|
||||
X = random_state.randn(50, n_components).astype(np.float32)
|
||||
tsne = TSNE(n_components=n_components, init=init, random_state=0,
|
||||
method=method, n_iter=700)
|
||||
X_embedded = tsne.fit_transform(X)
|
||||
t = trustworthiness(X, X_embedded, n_neighbors=1)
|
||||
assert t > 0.85
|
||||
|
||||
|
||||
def test_optimization_minimizes_kl_divergence():
|
||||
"""t-SNE should give a lower KL divergence with more iterations."""
|
||||
random_state = check_random_state(0)
|
||||
X, _ = make_blobs(n_features=3, random_state=random_state)
|
||||
kl_divergences = []
|
||||
for n_iter in [250, 300, 350]:
|
||||
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
|
||||
n_iter=n_iter, random_state=0)
|
||||
tsne.fit_transform(X)
|
||||
kl_divergences.append(tsne.kl_divergence_)
|
||||
assert kl_divergences[1] <= kl_divergences[0]
|
||||
assert kl_divergences[2] <= kl_divergences[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
|
||||
def test_fit_csr_matrix(method):
|
||||
# X can be a sparse matrix.
|
||||
rng = check_random_state(0)
|
||||
X = rng.randn(50, 2)
|
||||
X[(rng.randint(0, 50, 25), rng.randint(0, 2, 25))] = 0.0
|
||||
X_csr = sp.csr_matrix(X)
|
||||
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
|
||||
random_state=0, method=method, n_iter=750)
|
||||
X_embedded = tsne.fit_transform(X_csr)
|
||||
assert_allclose(trustworthiness(X_csr, X_embedded, n_neighbors=1),
|
||||
1.0, rtol=1.1e-1)
|
||||
|
||||
|
||||
def test_preserve_trustworthiness_approximately_with_precomputed_distances():
|
||||
# Nearest neighbors should be preserved approximately.
|
||||
random_state = check_random_state(0)
|
||||
for i in range(3):
|
||||
X = random_state.randn(80, 2)
|
||||
D = squareform(pdist(X), "sqeuclidean")
|
||||
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
|
||||
early_exaggeration=2.0, metric="precomputed",
|
||||
random_state=i, verbose=0, n_iter=500)
|
||||
X_embedded = tsne.fit_transform(D)
|
||||
t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed")
|
||||
assert t > .95
|
||||
|
||||
|
||||
def test_trustworthiness_not_euclidean_metric():
|
||||
# Test trustworthiness with a metric different from 'euclidean' and
|
||||
# 'precomputed'
|
||||
random_state = check_random_state(0)
|
||||
X = random_state.randn(100, 2)
|
||||
assert (trustworthiness(X, X, metric='cosine') ==
|
||||
trustworthiness(pairwise_distances(X, metric='cosine'), X,
|
||||
metric='precomputed'))
|
||||
|
||||
|
||||
def test_early_exaggeration_too_small():
|
||||
# Early exaggeration factor must be >= 1.
|
||||
tsne = TSNE(early_exaggeration=0.99)
|
||||
with pytest.raises(ValueError, match="early_exaggeration .*"):
|
||||
tsne.fit_transform(np.array([[0.0], [0.0]]))
|
||||
|
||||
|
||||
def test_too_few_iterations():
|
||||
# Number of gradient descent iterations must be at least 200.
|
||||
tsne = TSNE(n_iter=199)
|
||||
with pytest.raises(ValueError, match="n_iter .*"):
|
||||
tsne.fit_transform(np.array([[0.0], [0.0]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('method, retype', [
|
||||
('exact', np.asarray),
|
||||
('barnes_hut', np.asarray),
|
||||
('barnes_hut', sp.csr_matrix),
|
||||
])
|
||||
@pytest.mark.parametrize('D, message_regex', [
|
||||
([[0.0], [1.0]], ".* square distance matrix"),
|
||||
([[0., -1.], [1., 0.]], ".* positive.*"),
|
||||
])
|
||||
def test_bad_precomputed_distances(method, D, retype, message_regex):
|
||||
tsne = TSNE(metric="precomputed", method=method)
|
||||
with pytest.raises(ValueError, match=message_regex):
|
||||
tsne.fit_transform(retype(D))
|
||||
|
||||
|
||||
def test_exact_no_precomputed_sparse():
|
||||
tsne = TSNE(metric='precomputed', method='exact')
|
||||
with pytest.raises(TypeError, match='sparse'):
|
||||
tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]]))
|
||||
|
||||
|
||||
def test_high_perplexity_precomputed_sparse_distances():
|
||||
# Perplexity should be less than 50
|
||||
dist = np.array([[1., 0., 0.], [0., 1., 0.], [1., 0., 0.]])
|
||||
bad_dist = sp.csr_matrix(dist)
|
||||
tsne = TSNE(metric="precomputed")
|
||||
msg = "3 neighbors per samples are required, but some samples have only 1"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
tsne.fit_transform(bad_dist)
|
||||
|
||||
|
||||
@ignore_warnings(category=EfficiencyWarning)
|
||||
def test_sparse_precomputed_distance():
|
||||
"""Make sure that TSNE works identically for sparse and dense matrix"""
|
||||
random_state = check_random_state(0)
|
||||
X = random_state.randn(100, 2)
|
||||
|
||||
D_sparse = kneighbors_graph(X, n_neighbors=100, mode='distance',
|
||||
include_self=True)
|
||||
D = pairwise_distances(X)
|
||||
assert sp.issparse(D_sparse)
|
||||
assert_almost_equal(D_sparse.A, D)
|
||||
|
||||
tsne = TSNE(metric="precomputed", random_state=0)
|
||||
Xt_dense = tsne.fit_transform(D)
|
||||
|
||||
for fmt in ['csr', 'lil']:
|
||||
Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt))
|
||||
assert_almost_equal(Xt_dense, Xt_sparse)
|
||||
|
||||
|
||||
def test_non_positive_computed_distances():
|
||||
# Computed distance matrices must be positive.
|
||||
def metric(x, y):
|
||||
return -1
|
||||
|
||||
tsne = TSNE(metric=metric, method='exact')
|
||||
X = np.array([[0.0, 0.0], [1.0, 1.0]])
|
||||
with pytest.raises(ValueError, match="All distances .*metric given.*"):
|
||||
tsne.fit_transform(X)
|
||||
|
||||
|
||||
def test_init_not_available():
|
||||
# 'init' must be 'pca', 'random', or numpy array.
|
||||
tsne = TSNE(init="not available")
|
||||
m = "'init' must be 'pca', 'random', or a numpy array"
|
||||
with pytest.raises(ValueError, match=m):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
|
||||
def test_init_ndarray():
|
||||
# Initialize TSNE with ndarray and test fit
|
||||
tsne = TSNE(init=np.zeros((100, 2)))
|
||||
X_embedded = tsne.fit_transform(np.ones((100, 5)))
|
||||
assert_array_equal(np.zeros((100, 2)), X_embedded)
|
||||
|
||||
|
||||
def test_init_ndarray_precomputed():
|
||||
# Initialize TSNE with ndarray and metric 'precomputed'
|
||||
# Make sure no FutureWarning is thrown from _fit
|
||||
tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed")
|
||||
tsne.fit(np.zeros((100, 100)))
|
||||
|
||||
|
||||
def test_distance_not_available():
|
||||
# 'metric' must be valid.
|
||||
tsne = TSNE(metric="not available", method='exact')
|
||||
with pytest.raises(ValueError, match="Unknown metric not available.*"):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
tsne = TSNE(metric="not available", method='barnes_hut')
|
||||
with pytest.raises(ValueError, match="Metric 'not available' not valid.*"):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
|
||||
def test_method_not_available():
|
||||
# 'nethod' must be 'barnes_hut' or 'exact'
|
||||
tsne = TSNE(method='not available')
|
||||
with pytest.raises(ValueError, match="'method' must be 'barnes_hut' or "):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
|
||||
def test_angle_out_of_range_checks():
|
||||
# check the angle parameter range
|
||||
for angle in [-1, -1e-6, 1 + 1e-6, 2]:
|
||||
tsne = TSNE(angle=angle)
|
||||
with pytest.raises(ValueError, match="'angle' must be between "
|
||||
"0.0 - 1.0"):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
|
||||
def test_pca_initialization_not_compatible_with_precomputed_kernel():
|
||||
# Precomputed distance matrices must be square matrices.
|
||||
tsne = TSNE(metric="precomputed", init="pca")
|
||||
with pytest.raises(ValueError, match="The parameter init=\"pca\" cannot"
|
||||
" be used with"
|
||||
" metric=\"precomputed\"."):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
|
||||
def test_n_components_range():
|
||||
# barnes_hut method should only be used with n_components <= 3
|
||||
tsne = TSNE(n_components=4, method="barnes_hut")
|
||||
with pytest.raises(ValueError, match="'n_components' should be .*"):
|
||||
tsne.fit_transform(np.array([[0.0], [1.0]]))
|
||||
|
||||
|
||||
def test_early_exaggeration_used():
|
||||
# check that the ``early_exaggeration`` parameter has an effect
|
||||
random_state = check_random_state(0)
|
||||
n_components = 2
|
||||
methods = ['exact', 'barnes_hut']
|
||||
X = random_state.randn(25, n_components).astype(np.float32)
|
||||
for method in methods:
|
||||
tsne = TSNE(n_components=n_components, perplexity=1,
|
||||
learning_rate=100.0, init="pca", random_state=0,
|
||||
method=method, early_exaggeration=1.0, n_iter=250)
|
||||
X_embedded1 = tsne.fit_transform(X)
|
||||
tsne = TSNE(n_components=n_components, perplexity=1,
|
||||
learning_rate=100.0, init="pca", random_state=0,
|
||||
method=method, early_exaggeration=10.0, n_iter=250)
|
||||
X_embedded2 = tsne.fit_transform(X)
|
||||
|
||||
assert not np.allclose(X_embedded1, X_embedded2)
|
||||
|
||||
|
||||
def test_n_iter_used():
|
||||
# check that the ``n_iter`` parameter has an effect
|
||||
random_state = check_random_state(0)
|
||||
n_components = 2
|
||||
methods = ['exact', 'barnes_hut']
|
||||
X = random_state.randn(25, n_components).astype(np.float32)
|
||||
for method in methods:
|
||||
for n_iter in [251, 500]:
|
||||
tsne = TSNE(n_components=n_components, perplexity=1,
|
||||
learning_rate=0.5, init="random", random_state=0,
|
||||
method=method, early_exaggeration=1.0, n_iter=n_iter)
|
||||
tsne.fit_transform(X)
|
||||
|
||||
assert tsne.n_iter_ == n_iter - 1
|
||||
|
||||
|
||||
def test_answer_gradient_two_points():
|
||||
# Test the tree with only a single set of children.
|
||||
#
|
||||
# These tests & answers have been checked against the reference
|
||||
# implementation by LvdM.
|
||||
pos_input = np.array([[1.0, 0.0], [0.0, 1.0]])
|
||||
pos_output = np.array([[-4.961291e-05, -1.072243e-04],
|
||||
[9.259460e-05, 2.702024e-04]])
|
||||
neighbors = np.array([[1],
|
||||
[0]])
|
||||
grad_output = np.array([[-2.37012478e-05, -6.29044398e-05],
|
||||
[2.37012478e-05, 6.29044398e-05]])
|
||||
_run_answer_test(pos_input, pos_output, neighbors, grad_output)
|
||||
|
||||
|
||||
def test_answer_gradient_four_points():
|
||||
# Four points tests the tree with multiple levels of children.
|
||||
#
|
||||
# These tests & answers have been checked against the reference
|
||||
# implementation by LvdM.
|
||||
pos_input = np.array([[1.0, 0.0], [0.0, 1.0],
|
||||
[5.0, 2.0], [7.3, 2.2]])
|
||||
pos_output = np.array([[6.080564e-05, -7.120823e-05],
|
||||
[-1.718945e-04, -4.000536e-05],
|
||||
[-2.271720e-04, 8.663310e-05],
|
||||
[-1.032577e-04, -3.582033e-05]])
|
||||
neighbors = np.array([[1, 2, 3],
|
||||
[0, 2, 3],
|
||||
[1, 0, 3],
|
||||
[1, 2, 0]])
|
||||
grad_output = np.array([[5.81128448e-05, -7.78033454e-06],
|
||||
[-5.81526851e-05, 7.80976444e-06],
|
||||
[4.24275173e-08, -3.69569698e-08],
|
||||
[-2.58720939e-09, 7.52706374e-09]])
|
||||
_run_answer_test(pos_input, pos_output, neighbors, grad_output)
|
||||
|
||||
|
||||
def test_skip_num_points_gradient():
|
||||
# Test the kwargs option skip_num_points.
|
||||
#
|
||||
# Skip num points should make it such that the Barnes_hut gradient
|
||||
# is not calculated for indices below skip_num_point.
|
||||
# Aside from skip_num_points=2 and the first two gradient rows
|
||||
# being set to zero, these data points are the same as in
|
||||
# test_answer_gradient_four_points()
|
||||
pos_input = np.array([[1.0, 0.0], [0.0, 1.0],
|
||||
[5.0, 2.0], [7.3, 2.2]])
|
||||
pos_output = np.array([[6.080564e-05, -7.120823e-05],
|
||||
[-1.718945e-04, -4.000536e-05],
|
||||
[-2.271720e-04, 8.663310e-05],
|
||||
[-1.032577e-04, -3.582033e-05]])
|
||||
neighbors = np.array([[1, 2, 3],
|
||||
[0, 2, 3],
|
||||
[1, 0, 3],
|
||||
[1, 2, 0]])
|
||||
grad_output = np.array([[0.0, 0.0],
|
||||
[0.0, 0.0],
|
||||
[4.24275173e-08, -3.69569698e-08],
|
||||
[-2.58720939e-09, 7.52706374e-09]])
|
||||
_run_answer_test(pos_input, pos_output, neighbors, grad_output,
|
||||
False, 0.1, 2)
|
||||
|
||||
|
||||
def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
|
||||
verbose=False, perplexity=0.1, skip_num_points=0):
|
||||
distances = pairwise_distances(pos_input).astype(np.float32)
|
||||
args = distances, perplexity, verbose
|
||||
pos_output = pos_output.astype(np.float32)
|
||||
neighbors = neighbors.astype(np.int64, copy=False)
|
||||
pij_input = _joint_probabilities(*args)
|
||||
pij_input = squareform(pij_input).astype(np.float32)
|
||||
grad_bh = np.zeros(pos_output.shape, dtype=np.float32)
|
||||
|
||||
from scipy.sparse import csr_matrix
|
||||
P = csr_matrix(pij_input)
|
||||
|
||||
neighbors = P.indices.astype(np.int64)
|
||||
indptr = P.indptr.astype(np.int64)
|
||||
|
||||
_barnes_hut_tsne.gradient(P.data, pos_output, neighbors, indptr,
|
||||
grad_bh, 0.5, 2, 1, skip_num_points=0)
|
||||
assert_array_almost_equal(grad_bh, grad_output, decimal=4)
|
||||
|
||||
|
||||
def test_verbose():
|
||||
# Verbose options write to stdout.
|
||||
random_state = check_random_state(0)
|
||||
tsne = TSNE(verbose=2)
|
||||
X = random_state.randn(5, 2)
|
||||
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = StringIO()
|
||||
try:
|
||||
tsne.fit_transform(X)
|
||||
finally:
|
||||
out = sys.stdout.getvalue()
|
||||
sys.stdout.close()
|
||||
sys.stdout = old_stdout
|
||||
|
||||
assert("[t-SNE]" in out)
|
||||
assert("nearest neighbors..." in out)
|
||||
assert("Computed conditional probabilities" in out)
|
||||
assert("Mean sigma" in out)
|
||||
assert("early exaggeration" in out)
|
||||
|
||||
|
||||
def test_chebyshev_metric():
|
||||
# t-SNE should allow metrics that cannot be squared (issue #3526).
|
||||
random_state = check_random_state(0)
|
||||
tsne = TSNE(metric="chebyshev")
|
||||
X = random_state.randn(5, 2)
|
||||
tsne.fit_transform(X)
|
||||
|
||||
|
||||
def test_reduction_to_one_component():
|
||||
# t-SNE should allow reduction to one component (issue #4154).
|
||||
random_state = check_random_state(0)
|
||||
tsne = TSNE(n_components=1)
|
||||
X = random_state.randn(5, 2)
|
||||
X_embedded = tsne.fit(X).embedding_
|
||||
assert(np.all(np.isfinite(X_embedded)))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
|
||||
@pytest.mark.parametrize('dt', [np.float32, np.float64])
|
||||
def test_64bit(method, dt):
|
||||
# Ensure 64bit arrays are handled correctly.
|
||||
random_state = check_random_state(0)
|
||||
|
||||
X = random_state.randn(10, 2).astype(dt, copy=False)
|
||||
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
|
||||
random_state=0, method=method, verbose=0,
|
||||
n_iter=300)
|
||||
X_embedded = tsne.fit_transform(X)
|
||||
effective_type = X_embedded.dtype
|
||||
|
||||
# tsne cython code is only single precision, so the output will
|
||||
# always be single precision, irrespectively of the input dtype
|
||||
assert effective_type == np.float32
|
||||
|
||||
|
||||
@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
|
||||
def test_kl_divergence_not_nan(method):
|
||||
# Ensure kl_divergence_ is computed at last iteration
|
||||
# even though n_iter % n_iter_check != 0, i.e. 1003 % 50 != 0
|
||||
random_state = check_random_state(0)
|
||||
|
||||
X = random_state.randn(50, 2)
|
||||
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
|
||||
random_state=0, method=method, verbose=0, n_iter=503)
|
||||
tsne.fit_transform(X)
|
||||
|
||||
assert not np.isnan(tsne.kl_divergence_)
|
||||
|
||||
|
||||
def test_barnes_hut_angle():
|
||||
# When Barnes-Hut's angle=0 this corresponds to the exact method.
|
||||
angle = 0.0
|
||||
perplexity = 10
|
||||
n_samples = 100
|
||||
for n_components in [2, 3]:
|
||||
n_features = 5
|
||||
degrees_of_freedom = float(n_components - 1.0)
|
||||
|
||||
random_state = check_random_state(0)
|
||||
data = random_state.randn(n_samples, n_features)
|
||||
distances = pairwise_distances(data)
|
||||
params = random_state.randn(n_samples, n_components)
|
||||
P = _joint_probabilities(distances, perplexity, verbose=0)
|
||||
kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
|
||||
n_samples, n_components)
|
||||
|
||||
n_neighbors = n_samples - 1
|
||||
distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
|
||||
n_neighbors=n_neighbors, mode='distance')
|
||||
P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
|
||||
kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
|
||||
n_samples, n_components,
|
||||
angle=angle, skip_num_points=0,
|
||||
verbose=0)
|
||||
|
||||
P = squareform(P)
|
||||
P_bh = P_bh.toarray()
|
||||
assert_array_almost_equal(P_bh, P, decimal=5)
|
||||
assert_almost_equal(kl_exact, kl_bh, decimal=3)
|
||||
|
||||
|
||||
@skip_if_32bit
|
||||
def test_n_iter_without_progress():
|
||||
# Use a dummy negative n_iter_without_progress and check output on stdout
|
||||
random_state = check_random_state(0)
|
||||
X = random_state.randn(100, 10)
|
||||
for method in ["barnes_hut", "exact"]:
|
||||
tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
|
||||
random_state=0, method=method, n_iter=351, init="random")
|
||||
tsne._N_ITER_CHECK = 1
|
||||
tsne._EXPLORATION_N_ITER = 0
|
||||
|
||||
old_stdout = sys.stdout
|
||||
sys.stdout = StringIO()
|
||||
try:
|
||||
tsne.fit_transform(X)
|
||||
finally:
|
||||
out = sys.stdout.getvalue()
|
||||
sys.stdout.close()
|
||||
sys.stdout = old_stdout
|
||||
|
||||
# The output needs to contain the value of n_iter_without_progress
|
||||
assert ("did not make any progress during the "
|
||||
"last -1 episodes. Finished." in out)


def test_min_grad_norm():
    # Make sure that the parameter min_grad_norm is used correctly
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)
    min_grad_norm = 0.002
    tsne = TSNE(min_grad_norm=min_grad_norm, verbose=2,
                random_state=0, method='exact')

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    lines_out = out.split('\n')

    # extract the gradient norm from the verbose output
    gradient_norm_values = []
    for line in lines_out:
        # Once the computation is finished, only an old gradient norm value
        # is repeated, which we do not need to store
        if 'Finished' in line:
            break

        start_grad_norm = line.find('gradient norm')
        if start_grad_norm >= 0:
            line = line[start_grad_norm:]
            line = line.replace('gradient norm = ', '').split(' ')[0]
            gradient_norm_values.append(float(line))

    # Compute how often the gradient norm is smaller than min_grad_norm
    gradient_norm_values = np.array(gradient_norm_values)
    n_smaller_gradient_norms = len(
        gradient_norm_values[gradient_norm_values <= min_grad_norm])

    # The gradient norm can be smaller than min_grad_norm at most once,
    # because the optimization stops as soon as it drops below that value
    assert n_smaller_gradient_norms <= 1
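

# Illustrative sketch (not part of the original suite): the line-by-line
# parsing above can be condensed into a single regular expression. The
# ``gradient norm = <float>`` format is assumed from the parsing code above;
# unlike the loop, this variant does not stop at the 'Finished' line, so it
# would also pick up the repeated final value.
def _demo_extract_gradient_norms(out):
    import re
    return [float(v) for v in re.findall(r'gradient norm = (\S+)', out)]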


def test_accessible_kl_divergence():
    # Ensures that the accessible kl_divergence matches the computed value
    random_state = check_random_state(0)
    X = random_state.randn(50, 2)
    tsne = TSNE(n_iter_without_progress=2, verbose=2,
                random_state=0, method='exact',
                n_iter=500)

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # The output needs to contain the accessible kl_divergence as the error
    # at the last iteration
    for line in out.split('\n')[::-1]:
        if 'Iteration' in line:
            _, _, error = line.partition('error = ')
            if error:
                error, _, _ = error.partition(',')
                break
    assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5)
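

# Illustrative sketch (not part of the original suite): how the
# ``str.partition`` calls above pull the error value out of a verbose line.
# The line below is a hypothetical example of the format assumed by that
# parsing code.
def _demo_parse_error_from_line():
    line = "[t-SNE] Iteration 500: error = 0.8372247, gradient norm = ..."
    _, _, error = line.partition('error = ')
    error, _, _ = error.partition(',')
    assert float(error) == pytest.approx(0.8372247)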


@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
def test_uniform_grid(method):
    """Make sure that TSNE can approximately recover a uniform 2D grid.

    Due to ties in distances between points in X_2d_grid, this test is
    platform dependent for ``method='barnes_hut'`` because of numerical
    imprecision.

    Also, t-SNE is not assured to converge to the right solution because bad
    initialization can lead to convergence to a bad local minimum (the
    optimization problem is non-convex). To avoid breaking the test too often,
    we re-run t-SNE from the final point when the convergence is not good
    enough.
    """
    seeds = range(3)
    n_iter = 500
    for seed in seeds:
        tsne = TSNE(n_components=2, init='random', random_state=seed,
                    perplexity=50, n_iter=n_iter, method=method)
        Y = tsne.fit_transform(X_2d_grid)

        try_name = "{}_{}".format(method, seed)
        try:
            assert_uniform_grid(Y, try_name)
        except AssertionError:
            # If the test fails a first time, re-run with init=Y to see if
            # this was caused by a bad initialization. Note that this will
            # also run an early_exaggeration step.
            try_name += ":rerun"
            tsne.init = Y
            Y = tsne.fit_transform(X_2d_grid)
            assert_uniform_grid(Y, try_name)


def assert_uniform_grid(Y, try_name=None):
    # Ensure that the resulting embedding leads to approximately
    # uniformly spaced points: the distance to the closest neighbors
    # should be non-zero and approximately constant.
    nn = NearestNeighbors(n_neighbors=1).fit(Y)
    dist_to_nn = nn.kneighbors(return_distance=True)[0].ravel()
    assert dist_to_nn.min() > 0.1

    smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn)
    largest_to_mean = dist_to_nn.max() / np.mean(dist_to_nn)

    assert smallest_to_mean > .5, try_name
    assert largest_to_mean < 2, try_name


def test_bh_match_exact():
    # check that the ``barnes_hut`` method matches the exact one when
    # ``angle = 0`` and ``perplexity > n_samples / 3``
    random_state = check_random_state(0)
    n_features = 10
    X = random_state.randn(30, n_features).astype(np.float32)
    X_embeddings = {}
    n_iter = {}
    for method in ['exact', 'barnes_hut']:
        tsne = TSNE(n_components=2, method=method, learning_rate=1.0,
                    init="random", random_state=0, n_iter=251,
                    perplexity=30.0, angle=0)
        # Kill the early_exaggeration
        tsne._EXPLORATION_N_ITER = 0
        X_embeddings[method] = tsne.fit_transform(X)
        n_iter[method] = tsne.n_iter_

    assert n_iter['exact'] == n_iter['barnes_hut']
    assert_allclose(X_embeddings['exact'], X_embeddings['barnes_hut'],
                    rtol=1e-4)
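

# Illustrative sketch (not part of the original suite): Barnes-Hut summarizes
# a whole quad-tree cell by its center of mass whenever the cell looks small
# from the query point, i.e. when cell_size / distance < angle. With
# ``angle=0`` no cell is ever summarized and every pairwise interaction is
# computed, which is why the two methods are expected to agree above.
# Hypothetical helper name; the real criterion lives in the Cython code.
def _demo_bh_summarize_cell(cell_size, distance, angle):
    # True -> approximate the cell by a single point; False -> recurse
    return cell_size / distance < angle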


def test_gradient_bh_multithread_match_sequential():
    # check that the bh gradient with different num_threads gives the same
    # results

    n_features = 10
    n_samples = 30
    n_components = 2
    degrees_of_freedom = 1

    angle = 3
    perplexity = 5

    random_state = check_random_state(0)
    data = random_state.randn(n_samples, n_features).astype(np.float32)
    params = random_state.randn(n_samples, n_components)

    n_neighbors = n_samples - 1
    distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
        n_neighbors=n_neighbors, mode='distance')
    P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
    kl_sequential, grad_sequential = _kl_divergence_bh(
        params, P_bh, degrees_of_freedom, n_samples, n_components,
        angle=angle, skip_num_points=0, verbose=0, num_threads=1)
    for num_threads in [2, 4]:
        kl_multithread, grad_multithread = _kl_divergence_bh(
            params, P_bh, degrees_of_freedom, n_samples, n_components,
            angle=angle, skip_num_points=0, verbose=0,
            num_threads=num_threads)

        assert_allclose(kl_multithread, kl_sequential, rtol=1e-6)
        assert_allclose(grad_multithread, grad_sequential, rtol=1e-6)


def test_tsne_with_different_distance_metrics():
    """Make sure that TSNE works for different distance metrics"""
    random_state = check_random_state(0)
    n_components_original = 3
    n_components_embedding = 2
    X = random_state.randn(50, n_components_original).astype(np.float32)
    metrics = ['manhattan', 'cosine']
    dist_funcs = [manhattan_distances, cosine_distances]
    for metric, dist_func in zip(metrics, dist_funcs):
        X_transformed_tsne = TSNE(
            metric=metric, n_components=n_components_embedding,
            random_state=0, n_iter=300).fit_transform(X)
        X_transformed_tsne_precomputed = TSNE(
            metric='precomputed', n_components=n_components_embedding,
            random_state=0, n_iter=300).fit_transform(dist_func(X))
        assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
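

# Illustrative sketch (not part of the original suite): with
# ``metric='precomputed'`` the estimator expects a square
# (n_samples, n_samples) distance matrix, which is exactly what the pairwise
# helpers used above return.
def _demo_precomputed_distance_input():
    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    D = manhattan_distances(X)
    assert D.shape == (50, 50)
    # a proper distance matrix has a zero diagonal
    assert np.allclose(np.diag(D), 0.0)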


@pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
def test_tsne_n_jobs(method):
    """Make sure that the n_jobs parameter doesn't impact the output"""
    random_state = check_random_state(0)
    n_features = 10
    X = random_state.randn(30, n_features)
    X_tr_ref = TSNE(n_components=2, method=method, perplexity=30.0,
                    angle=0, n_jobs=1, random_state=0).fit_transform(X)
    X_tr = TSNE(n_components=2, method=method, perplexity=30.0,
                angle=0, n_jobs=2, random_state=0).fit_transform(X)

    assert_allclose(X_tr_ref, X_tr)