Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions


@@ -0,0 +1,13 @@
"""
The :mod:`sklearn.manifold` module implements data embedding techniques.
"""
from ._locally_linear import locally_linear_embedding, LocallyLinearEmbedding
from ._isomap import Isomap
from ._mds import MDS, smacof
from ._spectral_embedding import SpectralEmbedding, spectral_embedding
from ._t_sne import TSNE, trustworthiness
__all__ = ['locally_linear_embedding', 'LocallyLinearEmbedding', 'Isomap',
'MDS', 'smacof', 'SpectralEmbedding', 'spectral_embedding', "TSNE",
'trustworthiness']


@@ -0,0 +1,282 @@
"""Isomap for manifold learning"""
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) 2011
import numpy as np
from ..base import BaseEstimator, TransformerMixin
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..utils.deprecation import deprecated
from ..utils.validation import check_is_fitted
from ..utils.validation import _deprecate_positional_args
from ..utils.graph import graph_shortest_path
from ..decomposition import KernelPCA
from ..preprocessing import KernelCenterer
class Isomap(TransformerMixin, BaseEstimator):
"""Isomap Embedding
Non-linear dimensionality reduction through Isometric Mapping
Read more in the :ref:`User Guide <isomap>`.
Parameters
----------
n_neighbors : integer
number of neighbors to consider for each point.
n_components : integer
number of coordinates for the manifold
eigen_solver : ['auto'|'arpack'|'dense']
'auto' : Attempt to choose the most efficient solver
for the given problem.
'arpack' : Use Arnoldi decomposition to find the eigenvalues
and eigenvectors.
'dense' : Use a direct solver (i.e. LAPACK)
for the eigenvalue decomposition.
tol : float
Convergence tolerance passed to arpack or lobpcg.
not used if eigen_solver == 'dense'.
max_iter : integer
Maximum number of iterations for the arpack solver.
not used if eigen_solver == 'dense'.
path_method : string ['auto'|'FW'|'D']
Method to use in finding shortest path.
'auto' : attempt to choose the best algorithm automatically.
'FW' : Floyd-Warshall algorithm.
'D' : Dijkstra's algorithm.
neighbors_algorithm : string ['auto'|'brute'|'kd_tree'|'ball_tree']
Algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance.
n_jobs : int or None, default=None
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
metric : string, or callable, default="minkowski"
The metric to use when calculating distance between instances in a
feature array. If metric is a string or callable, it must be one of
the options allowed by :func:`sklearn.metrics.pairwise_distances` for
its metric parameter.
If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`sparse graph <sparse graph>`.
.. versionadded:: 0.22
p : int, default=2
Parameter for the Minkowski metric from
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
.. versionadded:: 0.22
metric_params : dict, default=None
Additional keyword arguments for the metric function.
.. versionadded:: 0.22
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the embedding vectors.
kernel_pca_ : object
:class:`~sklearn.decomposition.KernelPCA` object used to implement the
embedding.
nbrs_ : sklearn.neighbors.NearestNeighbors instance
        Stores nearest neighbors instance, including BallTree or KDTree
if applicable.
dist_matrix_ : array-like, shape (n_samples, n_samples)
Stores the geodesic distance matrix of training data.
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import Isomap
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = Isomap(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
References
----------
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
framework for nonlinear dimensionality reduction. Science 290 (5500)
"""
@_deprecate_positional_args
def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto',
tol=0, max_iter=None, path_method='auto',
neighbors_algorithm='auto', n_jobs=None, metric='minkowski',
p=2, metric_params=None):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.path_method = path_method
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
self.metric = metric
self.p = p
self.metric_params = metric_params
def _fit_transform(self, X):
self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
algorithm=self.neighbors_algorithm,
metric=self.metric, p=self.p,
metric_params=self.metric_params,
n_jobs=self.n_jobs)
self.nbrs_.fit(X)
self.n_features_in_ = self.nbrs_.n_features_in_
self.kernel_pca_ = KernelPCA(n_components=self.n_components,
kernel="precomputed",
eigen_solver=self.eigen_solver,
tol=self.tol, max_iter=self.max_iter,
n_jobs=self.n_jobs)
kng = kneighbors_graph(self.nbrs_, self.n_neighbors,
metric=self.metric, p=self.p,
metric_params=self.metric_params,
mode='distance', n_jobs=self.n_jobs)
self.dist_matrix_ = graph_shortest_path(kng,
method=self.path_method,
directed=False)
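        # Convert geodesic distances to the isomap kernel K = -0.5 * D ** 2;
        # KernelPCA with kernel="precomputed" then centers K and projects
        # onto its leading eigenvectors to produce the embedding.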
G = self.dist_matrix_ ** 2
G *= -0.5
self.embedding_ = self.kernel_pca_.fit_transform(G)
# mypy error: Decorated property not supported
@deprecated( # type: ignore
"Attribute `training_data_` was deprecated in version 0.22 and"
" will be removed in 0.24."
)
@property
def training_data_(self):
check_is_fitted(self)
return self.nbrs_._fit_X
def reconstruction_error(self):
"""Compute the reconstruction error for the embedding.
Returns
-------
reconstruction_error : float
Notes
-----
The cost function of an isomap embedding is
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
Where D is the matrix of distances for the input data X,
D_fit is the matrix of distances for the output embedding X_fit,
and K is the isomap kernel:
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
"""
G = -0.5 * self.dist_matrix_ ** 2
G_center = KernelCenterer().fit_transform(G)
evals = self.kernel_pca_.lambdas_
return np.sqrt(np.sum(G_center ** 2) - np.sum(evals ** 2)) / G.shape[0]
def fit(self, X, y=None):
"""Compute the embedding vectors for data X
Parameters
----------
X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse graph, precomputed tree, or NearestNeighbors
object.
y : Ignored
Returns
-------
self : returns an instance of self.
"""
self._fit_transform(X)
return self
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse graph, BallTree, KDTree}
            Training vector, where n_samples is the number of samples
and n_features is the number of features.
y : Ignored
Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""Transform X.
This is implemented by linking the points X into the graph of geodesic
distances of the training data. First the `n_neighbors` nearest
neighbors of X are found in the training data, and from these the
shortest geodesic distances from each point in X to each point in
the training data are computed in order to construct the kernel.
The embedding of X is the projection of this kernel onto the
embedding vectors of the training set.
Parameters
----------
X : array-like, shape (n_queries, n_features)
            If metric='precomputed', X is assumed to be a
distance matrix or a sparse graph of shape
(n_queries, n_samples_fit).
Returns
-------
X_new : array-like, shape (n_queries, n_components)
"""
check_is_fitted(self)
distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
# Create the graph of shortest distances from X to
# training data via the nearest neighbors of X.
# This can be done as a single array operation, but it potentially
# takes a lot of memory. To avoid that, use a loop:
n_samples_fit = self.nbrs_.n_samples_fit_
n_queries = distances.shape[0]
G_X = np.zeros((n_queries, n_samples_fit))
for i in range(n_queries):
G_X[i] = np.min(self.dist_matrix_[indices[i]] +
distances[i][:, None], 0)
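        # Apply the same isomap kernel transform (-0.5 * D ** 2) to the
        # out-of-sample geodesic distances before projecting them onto the
        # training eigenvectors through the fitted KernelPCA.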
G_X **= 2
G_X *= -0.5
return self.kernel_pca_.transform(G_X)


@@ -0,0 +1,729 @@
"""Locally Linear Embedding"""
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
# Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) INRIA 2011
import numpy as np
from scipy.linalg import eigh, svd, qr, solve
from scipy.sparse import eye, csr_matrix
from scipy.sparse.linalg import eigsh
from ..base import BaseEstimator, TransformerMixin, _UnstableArchMixin
from ..utils import check_random_state, check_array
from ..utils.extmath import stable_cumsum
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _deprecate_positional_args
from ..neighbors import NearestNeighbors
def barycenter_weights(X, Z, reg=1e-3):
"""Compute barycenter weights of X from Y along the first axis
We estimate the weights to assign to each point in Y[i] to recover
the point X[i]. The barycenter weights sum to 1.
Parameters
----------
X : array-like, shape (n_samples, n_dim)
Z : array-like, shape (n_samples, n_neighbors, n_dim)
reg : float, optional
amount of regularization to add for the problem to be
well-posed in the case of n_neighbors > n_dim
Returns
-------
B : array-like, shape (n_samples, n_neighbors)
Notes
-----
See developers note for more information.
"""
X = check_array(X, dtype=FLOAT_DTYPES)
Z = check_array(Z, dtype=FLOAT_DTYPES, allow_nd=True)
n_samples, n_neighbors = X.shape[0], Z.shape[1]
B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
v = np.ones(n_neighbors, dtype=X.dtype)
# this might raise a LinalgError if G is singular and has trace
# zero
for i, A in enumerate(Z.transpose(0, 2, 1)):
C = A.T - X[i] # broadcasting
G = np.dot(C, C.T)
trace = np.trace(G)
if trace > 0:
R = reg * trace
else:
R = reg
G.flat[::Z.shape[1] + 1] += R
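        # Solve G w = 1 on the regularized neighborhood Gram matrix, then
        # renormalize so the barycentric weights sum to one.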
w = solve(G, v, sym_pos=True)
B[i, :] = w / np.sum(w)
return B
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
"""Computes the barycenter weighted graph of k-Neighbors for points in X
Parameters
----------
X : {array-like, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array or a NearestNeighbors object.
n_neighbors : int
Number of neighbors for each sample.
    reg : float, optional
        Amount of regularization when solving the least-squares
        barycenter problem.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
A : sparse matrix in CSR format, shape = [n_samples, n_samples]
        A[i, j] is assigned the weight of the edge that connects i to j.
See also
--------
sklearn.neighbors.kneighbors_graph
sklearn.neighbors.radius_neighbors_graph
"""
knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X)
X = knn._fit_X
n_samples = knn.n_samples_fit_
ind = knn.kneighbors(X, return_distance=False)[:, 1:]
data = barycenter_weights(X, X[ind], reg=reg)
indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
return csr_matrix((data.ravel(), ind.ravel(), indptr),
shape=(n_samples, n_samples))
def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100,
random_state=None):
"""
Find the null space of a matrix M.
Parameters
----------
M : {array, matrix, sparse matrix, LinearOperator}
Input covariance matrix: should be symmetric positive semi-definite
k : integer
Number of eigenvalues/vectors to return
k_skip : integer, optional
Number of low eigenvalues to skip.
eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, optional
Tolerance for 'arpack' method.
Not used if eigen_solver=='dense'.
max_iter : int
Maximum number of iterations for 'arpack' method.
Not used if eigen_solver=='dense'
random_state : int, RandomState instance, default=None
        Determines the random number generator when
        ``eigen_solver`` == 'arpack'. Pass an int for reproducible results
        across multiple function calls. See :term:`Glossary <random_state>`.
"""
if eigen_solver == 'auto':
if M.shape[0] > 200 and k + k_skip < 10:
eigen_solver = 'arpack'
else:
eigen_solver = 'dense'
if eigen_solver == 'arpack':
random_state = check_random_state(random_state)
# initialize with [-1,1] as in ARPACK
v0 = random_state.uniform(-1, 1, M.shape[0])
try:
eigen_values, eigen_vectors = eigsh(M, k + k_skip, sigma=0.0,
tol=tol, maxiter=max_iter,
v0=v0)
except RuntimeError as msg:
raise ValueError("Error in determining null-space with ARPACK. "
"Error message: '%s'. "
"Note that method='arpack' can fail when the "
"weight matrix is singular or otherwise "
"ill-behaved. method='dense' is recommended. "
"See online documentation for more information."
% msg)
return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])
elif eigen_solver == 'dense':
if hasattr(M, 'toarray'):
M = M.toarray()
eigen_values, eigen_vectors = eigh(
M, eigvals=(k_skip, k + k_skip - 1), overwrite_a=True)
index = np.argsort(np.abs(eigen_values))
return eigen_vectors[:, index], np.sum(eigen_values)
else:
raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
@_deprecate_positional_args
def locally_linear_embedding(
X, *, n_neighbors, n_components, reg=1e-3, eigen_solver='auto',
tol=1e-6, max_iter=100, method='standard', hessian_tol=1E-4,
modified_tol=1E-12, random_state=None, n_jobs=None):
"""Perform a Locally Linear Embedding analysis on the data.
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
X : {array-like, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array or a NearestNeighbors object.
n_neighbors : integer
number of neighbors to consider for each point.
n_components : integer
number of coordinates for the manifold.
reg : float
regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, optional
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : integer
maximum number of iterations for the arpack solver.
method : {'standard', 'hessian', 'modified', 'ltsa'}
standard : use the standard locally linear embedding algorithm.
see reference [1]_
hessian : use the Hessian eigenmap method. This method requires
            n_neighbors > n_components * (1 + (n_components + 1) / 2).
see reference [2]_
modified : use the modified locally linear embedding algorithm.
see reference [3]_
ltsa : use local tangent space alignment algorithm
see reference [4]_
hessian_tol : float, optional
Tolerance for Hessian eigenmapping method.
Only used if method == 'hessian'
modified_tol : float, optional
Tolerance for modified LLE method.
Only used if method == 'modified'
random_state : int, RandomState instance, default=None
        Determines the random number generator when
        ``eigen_solver`` == 'arpack'. Pass an int for reproducible results
        across multiple function calls. See :term:`Glossary <random_state>`.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Returns
-------
Y : array-like, shape [n_samples, n_components]
Embedding vectors.
squared_error : float
Reconstruction error for the embedding vectors. Equivalent to
``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
"""
if eigen_solver not in ('auto', 'arpack', 'dense'):
raise ValueError("unrecognized eigen_solver '%s'" % eigen_solver)
if method not in ('standard', 'hessian', 'modified', 'ltsa'):
raise ValueError("unrecognized method '%s'" % method)
nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
nbrs.fit(X)
X = nbrs._fit_X
N, d_in = X.shape
if n_components > d_in:
raise ValueError("output dimension must be less than or equal "
"to input dimension")
if n_neighbors >= N:
raise ValueError(
"Expected n_neighbors <= n_samples, "
" but n_samples = %d, n_neighbors = %d" %
(N, n_neighbors)
)
if n_neighbors <= 0:
raise ValueError("n_neighbors must be positive")
M_sparse = (eigen_solver != 'dense')
if method == 'standard':
W = barycenter_kneighbors_graph(
nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs)
# we'll compute M = (I-W)'(I-W)
# depending on the solver, we'll do this differently
if M_sparse:
M = eye(*W.shape, format=W.format) - W
M = (M.T * M).tocsr()
else:
M = (W.T * W - W.T - W).toarray()
            # add the identity to complete M = (I - W)' (I - W)
            M.flat[::M.shape[0] + 1] += 1
elif method == 'hessian':
dp = n_components * (n_components + 1) // 2
if n_neighbors <= n_components + dp:
raise ValueError("for method='hessian', n_neighbors must be "
"greater than "
"[n_components * (n_components + 3) / 2]")
neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1,
return_distance=False)
neighbors = neighbors[:, 1:]
Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
Yi[:, 0] = 1
M = np.zeros((N, N), dtype=np.float64)
use_svd = (n_neighbors > d_in)
for i in range(N):
Gi = X[neighbors[i]]
Gi -= Gi.mean(0)
# build Hessian estimator
if use_svd:
U = svd(Gi, full_matrices=0)[0]
else:
Ci = np.dot(Gi, Gi.T)
U = eigh(Ci)[1][:, ::-1]
Yi[:, 1:1 + n_components] = U[:, :n_components]
j = 1 + n_components
for k in range(n_components):
Yi[:, j:j + n_components - k] = (U[:, k:k + 1] *
U[:, k:n_components])
j += n_components - k
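            # Columns of Q beyond the constant and tangent directions form the
            # local Hessian estimator; their outer product is accumulated into
            # the alignment matrix M below.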
Q, R = qr(Yi)
w = Q[:, n_components + 1:]
S = w.sum(0)
S[np.where(abs(S) < hessian_tol)] = 1
w /= S
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(w, w.T)
if M_sparse:
M = csr_matrix(M)
elif method == 'modified':
if n_neighbors < n_components:
raise ValueError("modified LLE requires "
"n_neighbors >= n_components")
neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1,
return_distance=False)
neighbors = neighbors[:, 1:]
# find the eigenvectors and eigenvalues of each local covariance
# matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
# where the columns are eigenvectors
V = np.zeros((N, n_neighbors, n_neighbors))
nev = min(d_in, n_neighbors)
evals = np.zeros([N, nev])
# choose the most efficient way to find the eigenvectors
use_svd = (n_neighbors > d_in)
if use_svd:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
V[i], evals[i], _ = svd(X_nbrs,
full_matrices=True)
evals **= 2
else:
for i in range(N):
X_nbrs = X[neighbors[i]] - X[i]
C_nbrs = np.dot(X_nbrs, X_nbrs.T)
evi, vi = eigh(C_nbrs)
evals[i] = evi[::-1]
V[i] = vi[:, ::-1]
# find regularized weights: this is like normal LLE.
# because we've already computed the SVD of each covariance matrix,
# it's faster to use this rather than np.linalg.solve
reg = 1E-3 * evals.sum(1)
tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
tmp[:, :nev] /= evals + reg[:, None]
tmp[:, nev:] /= reg[:, None]
w_reg = np.zeros((N, n_neighbors))
for i in range(N):
w_reg[i] = np.dot(V[i], tmp[i])
w_reg /= w_reg.sum(1)[:, None]
# calculate eta: the median of the ratio of small to large eigenvalues
# across the points. This is used to determine s_i, below
rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
eta = np.median(rho)
# find s_i, the size of the "almost null space" for each point:
# this is the size of the largest set of eigenvalues
# such that Sum[v; v in set]/Sum[v; v not in set] < eta
s_range = np.zeros(N, dtype=int)
evals_cumsum = stable_cumsum(evals, 1)
eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
for i in range(N):
s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
s_range += n_neighbors - nev # number of zero eigenvalues
# Now calculate M.
# This is the [N x N] matrix whose null space is the desired embedding
M = np.zeros((N, N), dtype=np.float64)
for i in range(N):
s_i = s_range[i]
# select bottom s_i eigenvectors and calculate alpha
Vi = V[i, :, n_neighbors - s_i:]
alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)
# compute Householder matrix which satisfies
# Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
# using prescription from paper
h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))
norm_h = np.linalg.norm(h)
if norm_h < modified_tol:
h *= 0
else:
h /= norm_h
# Householder matrix is
# >> Hi = np.identity(s_i) - 2*np.outer(h,h)
# Then the weight matrix is
# >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
# We do this much more efficiently:
Wi = (Vi - 2 * np.outer(np.dot(Vi, h), h) +
(1 - alpha_i) * w_reg[i, :, None])
# Update M as follows:
# >> W_hat = np.zeros( (N,s_i) )
# >> W_hat[neighbors[i],:] = Wi
# >> W_hat[i] -= 1
# >> M += np.dot(W_hat,W_hat.T)
# We can do this much more efficiently:
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
Wi_sum1 = Wi.sum(1)
M[i, neighbors[i]] -= Wi_sum1
M[neighbors[i], i] -= Wi_sum1
M[i, i] += s_i
if M_sparse:
M = csr_matrix(M)
elif method == 'ltsa':
neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1,
return_distance=False)
neighbors = neighbors[:, 1:]
M = np.zeros((N, N))
use_svd = (n_neighbors > d_in)
for i in range(N):
Xi = X[neighbors[i]]
Xi -= Xi.mean(0)
# compute n_components largest eigenvalues of Xi * Xi^T
if use_svd:
v = svd(Xi, full_matrices=True)[0]
else:
Ci = np.dot(Xi, Xi.T)
v = eigh(Ci)[1][:, ::-1]
Gi = np.zeros((n_neighbors, n_components + 1))
Gi[:, 1:] = v[:, :n_components]
Gi[:, 0] = 1. / np.sqrt(n_neighbors)
GiGiT = np.dot(Gi, Gi.T)
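            # Accumulate the alignment matrix: subtract the projection onto
            # the local tangent basis (plus the constant direction) and add
            # the identity on this neighborhood; the null space of M gives
            # the embedding.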
nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
M[nbrs_x, nbrs_y] -= GiGiT
M[neighbors[i], neighbors[i]] += 1
return null_space(M, n_components, k_skip=1, eigen_solver=eigen_solver,
tol=tol, max_iter=max_iter, random_state=random_state)
class LocallyLinearEmbedding(TransformerMixin,
_UnstableArchMixin, BaseEstimator):
"""Locally Linear Embedding
Read more in the :ref:`User Guide <locally_linear_embedding>`.
Parameters
----------
n_neighbors : integer
number of neighbors to consider for each point.
n_components : integer
number of coordinates for the manifold
reg : float
regularization constant, multiplies the trace of the local covariance
matrix of the distances.
eigen_solver : string, {'auto', 'arpack', 'dense'}
auto : algorithm will attempt to choose the best method for input data
arpack : use arnoldi iteration in shift-invert mode.
For this method, M may be a dense matrix, sparse matrix,
or general linear operator.
Warning: ARPACK can be unstable for some problems. It is
best to try several random seeds in order to check results.
dense : use standard dense matrix operations for the eigenvalue
decomposition. For this method, M must be an array
or matrix type. This method should be avoided for
large problems.
tol : float, optional
Tolerance for 'arpack' method
Not used if eigen_solver=='dense'.
max_iter : integer
maximum number of iterations for the arpack solver.
Not used if eigen_solver=='dense'.
method : string ('standard', 'hessian', 'modified' or 'ltsa')
standard : use the standard locally linear embedding algorithm. see
reference [1]
hessian : use the Hessian eigenmap method. This method requires
            ``n_neighbors > n_components * (1 + (n_components + 1) / 2)``
see reference [2]
modified : use the modified locally linear embedding algorithm.
see reference [3]
ltsa : use local tangent space alignment algorithm
see reference [4]
hessian_tol : float, optional
Tolerance for Hessian eigenmapping method.
Only used if ``method == 'hessian'``
modified_tol : float, optional
Tolerance for modified LLE method.
Only used if ``method == 'modified'``
neighbors_algorithm : string ['auto'|'brute'|'kd_tree'|'ball_tree']
algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance
random_state : int, RandomState instance, default=None
Determines the random number generator when
``eigen_solver`` == 'arpack'. Pass an int for reproducible results
        across multiple function calls. See :term:`Glossary <random_state>`.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : array-like, shape [n_samples, n_components]
Stores the embedding vectors
reconstruction_error_ : float
Reconstruction error associated with `embedding_`
nbrs_ : NearestNeighbors object
        Stores nearest neighbors instance, including BallTree or KDTree
if applicable.
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import LocallyLinearEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = LocallyLinearEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
References
----------
.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
by locally linear embedding. Science 290:2323 (2000).
.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
linear embedding techniques for high-dimensional data.
Proc Natl Acad Sci U S A. 100:5591 (2003).
.. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
Embedding Using Multiple Weights.
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382
.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
dimensionality reduction via tangent space alignment.
Journal of Shanghai Univ. 8:406 (2004)
"""
@_deprecate_positional_args
def __init__(self, *, n_neighbors=5, n_components=2, reg=1E-3,
eigen_solver='auto', tol=1E-6, max_iter=100,
method='standard', hessian_tol=1E-4, modified_tol=1E-12,
neighbors_algorithm='auto', random_state=None, n_jobs=None):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.reg = reg
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.method = method
self.hessian_tol = hessian_tol
self.modified_tol = modified_tol
self.random_state = random_state
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
def _fit_transform(self, X):
self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
algorithm=self.neighbors_algorithm,
n_jobs=self.n_jobs)
random_state = check_random_state(self.random_state)
X = self._validate_data(X, dtype=float)
self.nbrs_.fit(X)
self.embedding_, self.reconstruction_error_ = \
locally_linear_embedding(
X=self.nbrs_, n_neighbors=self.n_neighbors,
n_components=self.n_components,
eigen_solver=self.eigen_solver, tol=self.tol,
max_iter=self.max_iter, method=self.method,
hessian_tol=self.hessian_tol, modified_tol=self.modified_tol,
random_state=random_state, reg=self.reg, n_jobs=self.n_jobs)
def fit(self, X, y=None):
"""Compute the embedding vectors for data X
Parameters
----------
X : array-like of shape [n_samples, n_features]
training set.
y : Ignored
Returns
-------
self : returns an instance of self.
"""
self._fit_transform(X)
return self
def fit_transform(self, X, y=None):
"""Compute the embedding vectors for data X and transform X.
Parameters
----------
X : array-like of shape [n_samples, n_features]
training set.
y : Ignored
Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""
Transform new points into embedding space.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Returns
-------
X_new : array, shape = [n_samples, n_components]
Notes
-----
Because of scaling performed by this method, it is discouraged to use
it together with methods that are not scale-invariant (like SVMs)
"""
check_is_fitted(self)
X = check_array(X)
ind = self.nbrs_.kneighbors(X, n_neighbors=self.n_neighbors,
return_distance=False)
weights = barycenter_weights(X, self.nbrs_._fit_X[ind],
reg=self.reg)
X_new = np.empty((X.shape[0], self.n_components))
for i in range(X.shape[0]):
X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
return X_new


@@ -0,0 +1,439 @@
"""
Multi-dimensional Scaling (MDS)
"""
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
# License: BSD
import numpy as np
from joblib import Parallel, delayed, effective_n_jobs
import warnings
from ..base import BaseEstimator
from ..metrics import euclidean_distances
from ..utils import check_random_state, check_array, check_symmetric
from ..isotonic import IsotonicRegression
from ..utils.validation import _deprecate_positional_args
def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
max_iter=300, verbose=0, eps=1e-3, random_state=None):
"""Computes multidimensional scaling using SMACOF algorithm
Parameters
----------
dissimilarities : ndarray, shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Must be symmetric.
metric : boolean, optional, default: True
Compute metric or nonmetric SMACOF algorithm.
n_components : int, optional, default: 2
Number of dimensions in which to immerse the dissimilarities. If an
``init`` array is provided, this option is overridden and the shape of
``init`` is used to determine the dimensionality of the embedding
space.
init : ndarray, shape (n_samples, n_components), optional, default: None
Starting configuration of the embedding to initialize the algorithm. By
default, the algorithm is initialized with a randomly chosen array.
max_iter : int, optional, default: 300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, optional, default: 0
Level of verbosity.
eps : float, optional, default: 1e-3
Relative tolerance with respect to stress at which to declare
convergence.
random_state : int, RandomState instance, default=None
        Determines the random number generator used to initialize the
        embedding. Pass an int for reproducible results across multiple
        function calls. See :term:`Glossary <random_state>`.
Returns
-------
X : ndarray, shape (n_samples, n_components)
Coordinates of the points in a ``n_components``-space.
stress : float
        The final value of the stress (sum of squared differences between
        the disparities and the distances over all constrained points).
n_iter : int
The number of iterations corresponding to the best stress.
"""
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
n_samples = dissimilarities.shape[0]
random_state = check_random_state(random_state)
sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
sim_flat_w = sim_flat[sim_flat != 0]
if init is None:
# Randomly choose initial configuration
X = random_state.rand(n_samples * n_components)
X = X.reshape((n_samples, n_components))
else:
        # overrides the n_components parameter
n_components = init.shape[1]
if n_samples != init.shape[0]:
raise ValueError("init matrix should be of shape (%d, %d)" %
(n_samples, n_components))
X = init
old_stress = None
ir = IsotonicRegression()
for it in range(max_iter):
# Compute distance and monotonic regression
dis = euclidean_distances(X)
if metric:
disparities = dissimilarities
else:
dis_flat = dis.ravel()
# dissimilarities with 0 are considered as missing values
dis_flat_w = dis_flat[sim_flat != 0]
# Compute the disparities using a monotonic regression
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
disparities = dis_flat.copy()
disparities[sim_flat != 0] = disparities_flat
disparities = disparities.reshape((n_samples, n_samples))
disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
(disparities ** 2).sum())
# Compute stress
stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
# Update X using the Guttman transform
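        # B holds -disparities[i, j] / dis[i, j] off the diagonal, with each
        # row's positive sum placed on the diagonal; the majorizing update is
        # X <- (1 / n_samples) * B @ X.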
dis[dis == 0] = 1e-5
ratio = disparities / dis
B = - ratio
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
X = 1. / n_samples * np.dot(B, X)
dis = np.sqrt((X ** 2).sum(axis=1)).sum()
if verbose >= 2:
print('it: %d, stress %s' % (it, stress))
if old_stress is not None:
            if (old_stress - stress / dis) < eps:
if verbose:
print('breaking at iteration %d with stress %s' % (it,
stress))
break
old_stress = stress / dis
return X, stress, it + 1
@_deprecate_positional_args
def smacof(dissimilarities, *, metric=True, n_components=2, init=None,
n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3,
random_state=None, return_n_iter=False):
"""Computes multidimensional scaling using the SMACOF algorithm.
The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
multidimensional scaling algorithm which minimizes an objective function
(the *stress*) using a majorization technique. Stress majorization, also
known as the Guttman Transform, guarantees a monotone convergence of
stress, and is more powerful than traditional techniques such as gradient
descent.
    The SMACOF algorithm for metric MDS can be summarized by the following steps:
1. Set an initial start configuration, randomly or not.
2. Compute the stress
3. Compute the Guttman Transform
4. Iterate 2 and 3 until convergence.
The nonmetric algorithm adds a monotonic regression step before computing
the stress.
Parameters
----------
dissimilarities : ndarray, shape (n_samples, n_samples)
Pairwise dissimilarities between the points. Must be symmetric.
metric : boolean, optional, default: True
Compute metric or nonmetric SMACOF algorithm.
n_components : int, optional, default: 2
Number of dimensions in which to immerse the dissimilarities. If an
``init`` array is provided, this option is overridden and the shape of
``init`` is used to determine the dimensionality of the embedding
space.
init : ndarray, shape (n_samples, n_components), optional, default: None
Starting configuration of the embedding to initialize the algorithm. By
default, the algorithm is initialized with a randomly chosen array.
n_init : int, optional, default: 8
Number of times the SMACOF algorithm will be run with different
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress. If ``init`` is
provided, this option is overridden and a single run is performed.
n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
max_iter : int, optional, default: 300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, optional, default: 0
Level of verbosity.
eps : float, optional, default: 1e-3
Relative tolerance with respect to stress at which to declare
convergence.
random_state : int, RandomState instance, default=None
        Determines the random number generator used to initialize the
        embedding. Pass an int for reproducible results across multiple
        function calls. See :term:`Glossary <random_state>`.
return_n_iter : bool, optional, default: False
Whether or not to return the number of iterations.
Returns
-------
X : ndarray, shape (n_samples, n_components)
Coordinates of the points in a ``n_components``-space.
stress : float
        The final value of the stress (sum of squared differences between
        the disparities and the distances over all constrained points).
n_iter : int
The number of iterations corresponding to the best stress. Returned
only if ``return_n_iter`` is set to ``True``.
Notes
-----
"Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
"Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
"Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
"""
dissimilarities = check_array(dissimilarities)
random_state = check_random_state(random_state)
if hasattr(init, '__array__'):
init = np.asarray(init).copy()
if not n_init == 1:
warnings.warn(
'Explicit initial positions passed: '
'performing only one init of the MDS instead of %d'
% n_init)
n_init = 1
best_pos, best_stress = None, None
if effective_n_jobs(n_jobs) == 1:
for it in range(n_init):
pos, stress, n_iter_ = _smacof_single(
dissimilarities, metric=metric,
n_components=n_components, init=init,
max_iter=max_iter, verbose=verbose,
eps=eps, random_state=random_state)
if best_stress is None or stress < best_stress:
best_stress = stress
best_pos = pos.copy()
best_iter = n_iter_
else:
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
delayed(_smacof_single)(
dissimilarities, metric=metric, n_components=n_components,
init=init, max_iter=max_iter, verbose=verbose, eps=eps,
random_state=seed)
for seed in seeds)
positions, stress, n_iters = zip(*results)
best = np.argmin(stress)
best_stress = stress[best]
best_pos = positions[best]
best_iter = n_iters[best]
if return_n_iter:
return best_pos, best_stress, best_iter
else:
return best_pos, best_stress
class MDS(BaseEstimator):
"""Multidimensional scaling
Read more in the :ref:`User Guide <multidimensional_scaling>`.
Parameters
----------
n_components : int, optional, default: 2
Number of dimensions in which to immerse the dissimilarities.
metric : boolean, optional, default: True
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
n_init : int, optional, default: 4
Number of times the SMACOF algorithm will be run with different
initializations. The final results will be the best output of the runs,
determined by the run with the smallest final stress.
max_iter : int, optional, default: 300
Maximum number of iterations of the SMACOF algorithm for a single run.
verbose : int, optional, default: 0
Level of verbosity.
eps : float, optional, default: 1e-3
Relative tolerance with respect to stress at which to declare
convergence.
n_jobs : int or None, optional (default=None)
The number of jobs to use for the computation. If multiple
initializations are used (``n_init``), each run of the algorithm is
computed in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
random_state : int, RandomState instance, default=None
        Determines the random number generator used to initialize the
        embedding. Pass an int for reproducible results across multiple
        function calls. See :term:`Glossary <random_state>`.
dissimilarity : 'euclidean' | 'precomputed', optional, default: 'euclidean'
Dissimilarity measure to use:
- 'euclidean':
Pairwise Euclidean distances between points in the dataset.
- 'precomputed':
Pre-computed dissimilarities are passed directly to ``fit`` and
``fit_transform``.
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the position of the dataset in the embedding space.
stress_ : float
        The final value of the stress (sum of squared differences between
        the disparities and the distances over all constrained points).
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import MDS
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = MDS(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
References
----------
"Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
Groenen P. Springer Series in Statistics (1997)
"Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
Psychometrika, 29 (1964)
"Multidimensional scaling by optimizing goodness of fit to a nonmetric
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
"""
@_deprecate_positional_args
def __init__(self, n_components=2, *, metric=True, n_init=4,
max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
random_state=None, dissimilarity="euclidean"):
self.n_components = n_components
self.dissimilarity = dissimilarity
self.metric = metric
self.n_init = n_init
self.max_iter = max_iter
self.eps = eps
self.verbose = verbose
self.n_jobs = n_jobs
self.random_state = random_state
@property
def _pairwise(self):
        return self.dissimilarity == "precomputed"
def fit(self, X, y=None, init=None):
"""
Computes the position of the points in the embedding space
Parameters
----------
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
Input data. If ``dissimilarity=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
        init : ndarray, shape (n_samples, n_components), optional, default: None
Starting configuration of the embedding to initialize the SMACOF
algorithm. By default, the algorithm is initialized with a randomly
chosen array.
"""
self.fit_transform(X, init=init)
return self
def fit_transform(self, X, y=None, init=None):
"""
        Fit the data from X and return the embedded coordinates.
Parameters
----------
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
Input data. If ``dissimilarity=='precomputed'``, the input should
be the dissimilarity matrix.
y : Ignored
        init : ndarray, shape (n_samples, n_components), optional, default: None
Starting configuration of the embedding to initialize the SMACOF
algorithm. By default, the algorithm is initialized with a randomly
chosen array.
"""
X = self._validate_data(X)
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
warnings.warn("The MDS API has changed. ``fit`` now constructs an"
" dissimilarity matrix from data. To use a custom "
"dissimilarity matrix, set "
"``dissimilarity='precomputed'``.")
if self.dissimilarity == "precomputed":
self.dissimilarity_matrix_ = X
elif self.dissimilarity == "euclidean":
self.dissimilarity_matrix_ = euclidean_distances(X)
else:
raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
" Got %s instead" % str(self.dissimilarity))
self.embedding_, self.stress_, self.n_iter_ = smacof(
self.dissimilarity_matrix_, metric=self.metric,
n_components=self.n_components, init=init, n_init=self.n_init,
n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
eps=self.eps, random_state=self.random_state,
return_n_iter=True)
return self.embedding_


@@ -0,0 +1,577 @@
"""Spectral Embedding"""
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Wei LI <kuantkid@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
from scipy import sparse
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.csgraph import laplacian as csgraph_laplacian
from ..base import BaseEstimator
from ..utils import check_random_state, check_array, check_symmetric
from ..utils.extmath import _deterministic_vector_sign_flip
from ..utils.fixes import lobpcg
from ..metrics.pairwise import rbf_kernel
from ..neighbors import kneighbors_graph, NearestNeighbors
from ..utils.validation import _deprecate_positional_args
def _graph_connected_component(graph, node_id):
"""Find the largest graph connected components that contains one
given node
Parameters
----------
graph : array-like, shape: (n_samples, n_samples)
adjacency matrix of the graph, non-zero weight means an edge
between the nodes
node_id : int
The index of the query node of the graph
Returns
-------
connected_components_matrix : array-like, shape: (n_samples,)
        An array of boolean values indicating the indexes of the nodes
        belonging to the largest connected component containing the given
        query node
"""
n_node = graph.shape[0]
if sparse.issparse(graph):
# speed up row-wise access to boolean connection mask
graph = graph.tocsr()
    connected_nodes = np.zeros(n_node, dtype=bool)
    nodes_to_explore = np.zeros(n_node, dtype=bool)
nodes_to_explore[node_id] = True
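    # Breadth-first expansion: repeatedly OR the neighbors of the current
    # frontier into the set of connected nodes until it stops growing.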
for _ in range(n_node):
last_num_component = connected_nodes.sum()
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
if last_num_component >= connected_nodes.sum():
break
indices = np.where(nodes_to_explore)[0]
nodes_to_explore.fill(False)
for i in indices:
if sparse.issparse(graph):
neighbors = graph[i].toarray().ravel()
else:
neighbors = graph[i]
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
return connected_nodes
def _graph_is_connected(graph):
""" Return whether the graph is connected (True) or Not (False)
Parameters
----------
graph : array-like or sparse matrix, shape: (n_samples, n_samples)
adjacency matrix of the graph, non-zero weight means an edge
between the nodes
Returns
-------
is_connected : bool
True means the graph is fully connected and False means not
"""
if sparse.isspmatrix(graph):
# sparse graph, find all the connected components
n_connected_components, _ = connected_components(graph)
return n_connected_components == 1
else:
# dense graph, find all connected components start from node 0
return _graph_connected_component(graph, 0).sum() == graph.shape[0]
def _set_diag(laplacian, value, norm_laplacian):
"""Set the diagonal of the laplacian matrix and convert it to a
sparse format well suited for eigenvalue decomposition
Parameters
----------
laplacian : array or sparse matrix
The graph laplacian
value : float
The value of the diagonal
norm_laplacian : bool
Whether the value of the diagonal should be changed or not
Returns
-------
laplacian : array or sparse matrix
        An array or sparse matrix in a form that is well suited to fast
        eigenvalue decomposition, depending on the bandwidth of the
        matrix.
"""
n_nodes = laplacian.shape[0]
    # We need all entries in the diagonal to be set to `value`
if not sparse.isspmatrix(laplacian):
if norm_laplacian:
laplacian.flat[::n_nodes + 1] = value
else:
laplacian = laplacian.tocoo()
if norm_laplacian:
diag_idx = (laplacian.row == laplacian.col)
laplacian.data[diag_idx] = value
# If the matrix has a small number of diagonals (as in the
# case of structured matrices coming from images), the
# dia format might be best suited for matvec products:
n_diags = np.unique(laplacian.row - laplacian.col).size
if n_diags <= 7:
# 3 or less outer diagonals on each side
laplacian = laplacian.todia()
else:
# csr has the fastest matvec and is thus best suited to
# arpack
laplacian = laplacian.tocsr()
return laplacian
@_deprecate_positional_args
def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None,
random_state=None, eigen_tol=0.0,
norm_laplacian=True, drop_first=True):
"""Project the sample on the first eigenvectors of the graph Laplacian.
The adjacency matrix is used to compute a normalized graph Laplacian
whose spectrum (especially the eigenvectors associated to the
smallest eigenvalues) has an interpretation in terms of minimal
number of cuts necessary to split the graph into comparably sized
components.
This embedding can also 'work' even if the ``adjacency`` variable is
not strictly the adjacency matrix of a graph but more generally
an affinity or similarity matrix between samples (for instance the
heat kernel of a euclidean distance matrix or a k-NN matrix).
    However, care must be taken to always make the affinity matrix symmetric
so that the eigenvector decomposition works as expected.
Note : Laplacian Eigenmaps is the actual algorithm implemented here.
Read more in the :ref:`User Guide <spectral_embedding>`.
Parameters
----------
adjacency : array-like or sparse graph, shape: (n_samples, n_samples)
The adjacency matrix of the graph to embed.
n_components : integer, optional, default 8
The dimension of the projection subspace.
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}, default None
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems,
but may also lead to instabilities.
random_state : int, RandomState instance, default=None
Determines the random number generator used for the initialization of
        the lobpcg eigenvectors decomposition when ``eigen_solver`` == 'amg'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.
eigen_tol : float, optional, default=0.0
Stopping criterion for eigendecomposition of the Laplacian matrix
when using arpack eigen_solver.
norm_laplacian : bool, optional, default=True
If True, then compute normalized Laplacian.
drop_first : bool, optional, default=True
Whether to drop the first eigenvector. For spectral embedding, this
should be True as the first eigenvector should be constant vector for
connected graph, but for spectral clustering, this should be kept as
False to retain the first eigenvector.
Returns
-------
embedding : array, shape=(n_samples, n_components)
The reduced samples.
Notes
-----
Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
few eigenvectors will simply uncover the connected components of the graph.
References
----------
* https://en.wikipedia.org/wiki/LOBPCG
* Toward the Optimal Preconditioned Eigensolver: Locally Optimal
Block Preconditioned Conjugate Gradient Method
Andrew V. Knyazev
https://doi.org/10.1137%2FS1064827500366124
"""
adjacency = check_symmetric(adjacency)
try:
from pyamg import smoothed_aggregation_solver
except ImportError:
if eigen_solver == "amg":
raise ValueError("The eigen_solver was set to 'amg', but pyamg is "
"not available.")
if eigen_solver is None:
eigen_solver = 'arpack'
elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):
raise ValueError("Unknown value for eigen_solver: '%s'."
"Should be 'amg', 'arpack', or 'lobpcg'"
% eigen_solver)
random_state = check_random_state(random_state)
n_nodes = adjacency.shape[0]
# Whether to drop the first eigenvector
if drop_first:
n_components = n_components + 1
if not _graph_is_connected(adjacency):
warnings.warn("Graph is not fully connected, spectral embedding"
" may not work as expected.")
laplacian, dd = csgraph_laplacian(adjacency, normed=norm_laplacian,
return_diag=True)
if (eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and
(not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components)):
# lobpcg used with eigen_solver='amg' has bugs for low number of nodes
# for details see the source code in scipy:
# https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
# /lobpcg/lobpcg.py#L237
# or matlab:
# https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# Here we'll use shift-invert mode for fast eigenvalues
# (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
# for a short explanation of what this means)
# Because the normalized Laplacian has eigenvalues between 0 and 2,
# I - L has eigenvalues between -1 and 1. ARPACK is most efficient
# when finding eigenvalues of largest magnitude (keyword which='LM')
# and when these eigenvalues are very large compared to the rest.
# For very large, very sparse graphs, I - L can have many, many
# eigenvalues very near 1.0. This leads to slow convergence. So
# instead, we'll use ARPACK's shift-invert mode, asking for the
# eigenvalues near 1.0. This effectively spreads-out the spectrum
# near 1.0 and leads to much faster convergence: potentially an
# orders-of-magnitude speedup over simply using keyword which='LA'
# in standard mode.
try:
# We are computing the opposite of the laplacian inplace so as
# to spare a memory allocation of a possibly very large array
laplacian *= -1
v0 = random_state.uniform(-1, 1, laplacian.shape[0])
_, diffusion_map = eigsh(
laplacian, k=n_components, sigma=1.0, which='LM',
tol=eigen_tol, v0=v0)
embedding = diffusion_map.T[n_components::-1]
if norm_laplacian:
embedding = embedding / dd
except RuntimeError:
# When submatrices are exactly singular, an LU decomposition
# in arpack fails. We fallback to lobpcg
eigen_solver = "lobpcg"
# Revert the laplacian to its opposite to have lobpcg work
laplacian *= -1
elif eigen_solver == 'amg':
# Use AMG to get a preconditioner and speed up the eigenvalue
# problem.
if not sparse.issparse(laplacian):
warnings.warn("AMG works better for sparse matrices")
# lobpcg needs double precision floats
laplacian = check_array(laplacian, dtype=np.float64,
accept_sparse=True)
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# The Laplacian matrix is always singular, having at least one zero
# eigenvalue, corresponding to the trivial eigenvector, which is a
# constant. Using a singular matrix for preconditioning may result in
# random failures in LOBPCG and is not supported by the existing
# theory:
# see https://doi.org/10.1007/s10208-015-9297-1
        # Shift the Laplacian so its diagonal is not all ones. The shift
# does change the eigenpairs however, so we'll feed the shifted
# matrix to the solver and afterward set it back to the original.
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
laplacian += diag_shift
ml = smoothed_aggregation_solver(check_array(laplacian,
accept_sparse='csr'))
laplacian -= diag_shift
M = ml.aspreconditioner()
X = random_state.rand(laplacian.shape[0], n_components + 1)
X[:, 0] = dd.ravel()
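        # Seed the first block vector with dd, which for the normalized
        # Laplacian is proportional to the trivial eigenvector, to help
        # LOBPCG converge.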
_, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.e-5,
largest=False)
embedding = diffusion_map.T
if norm_laplacian:
embedding = embedding / dd
if embedding.shape[0] == 1:
raise ValueError
if eigen_solver == "lobpcg":
# lobpcg needs double precision floats
laplacian = check_array(laplacian, dtype=np.float64,
accept_sparse=True)
if n_nodes < 5 * n_components + 1:
# see note above under arpack why lobpcg has problems with small
# number of nodes
# lobpcg will fallback to eigh, so we short circuit it
if sparse.isspmatrix(laplacian):
laplacian = laplacian.toarray()
_, diffusion_map = eigh(laplacian)
embedding = diffusion_map.T[:n_components]
if norm_laplacian:
embedding = embedding / dd
else:
laplacian = _set_diag(laplacian, 1, norm_laplacian)
# We increase the number of eigenvectors requested, as lobpcg
# doesn't behave well in low dimension
X = random_state.rand(laplacian.shape[0], n_components + 1)
X[:, 0] = dd.ravel()
_, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
largest=False, maxiter=2000)
embedding = diffusion_map.T[:n_components]
if norm_laplacian:
embedding = embedding / dd
if embedding.shape[0] == 1:
raise ValueError
embedding = _deterministic_vector_sign_flip(embedding)
if drop_first:
return embedding[1:n_components].T
else:
return embedding[:n_components].T
class SpectralEmbedding(BaseEstimator):
"""Spectral embedding for non-linear dimensionality reduction.
Forms an affinity matrix given by the specified function and
applies spectral decomposition to the corresponding graph laplacian.
The resulting transformation is given by the value of the
eigenvectors for each data point.
Note : Laplacian Eigenmaps is the actual algorithm implemented here.
Read more in the :ref:`User Guide <spectral_embedding>`.
Parameters
----------
n_components : integer, default: 2
The dimension of the projected subspace.
affinity : string or callable, default : "nearest_neighbors"
How to construct the affinity matrix.
- 'nearest_neighbors' : construct the affinity matrix by computing a
graph of nearest neighbors.
- 'rbf' : construct the affinity matrix by computing a radial basis
function (RBF) kernel.
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
- 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
of precomputed nearest neighbors, and constructs the affinity matrix
by selecting the ``n_neighbors`` nearest neighbors.
- callable : use the passed in function as affinity;
the function takes in a data matrix (n_samples, n_features)
and returns an affinity matrix (n_samples, n_samples).
gamma : float, optional, default : 1/n_features
Kernel coefficient for rbf kernel.
random_state : int, RandomState instance, default=None
Determines the random number generator used for the initialization of
the lobpcg eigenvectors when ``solver`` == 'amg'. Pass an int for
reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
The eigenvalue decomposition strategy to use. AMG requires pyamg
to be installed. It can be faster on very large, sparse problems.
n_neighbors : int, default : max(n_samples/10 , 1)
Number of nearest neighbors for nearest_neighbors graph building.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
embedding_ : array, shape = (n_samples, n_components)
Spectral embedding of the training matrix.
affinity_matrix_ : array, shape = (n_samples, n_samples)
Affinity matrix constructed from samples or precomputed.
n_neighbors_ : int
Number of nearest neighbors effectively used.
Examples
--------
>>> from sklearn.datasets import load_digits
>>> from sklearn.manifold import SpectralEmbedding
>>> X, _ = load_digits(return_X_y=True)
>>> X.shape
(1797, 64)
>>> embedding = SpectralEmbedding(n_components=2)
>>> X_transformed = embedding.fit_transform(X[:100])
>>> X_transformed.shape
(100, 2)
References
----------
- A Tutorial on Spectral Clustering, 2007
Ulrike von Luxburg
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
- On Spectral Clustering: Analysis and an algorithm, 2001
Andrew Y. Ng, Michael I. Jordan, Yair Weiss
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100
- Normalized cuts and image segmentation, 2000
Jianbo Shi, Jitendra Malik
http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
"""
@_deprecate_positional_args
def __init__(self, n_components=2, *, affinity="nearest_neighbors",
gamma=None, random_state=None, eigen_solver=None,
n_neighbors=None, n_jobs=None):
self.n_components = n_components
self.affinity = affinity
self.gamma = gamma
self.random_state = random_state
self.eigen_solver = eigen_solver
self.n_neighbors = n_neighbors
self.n_jobs = n_jobs
@property
def _pairwise(self):
return self.affinity in ["precomputed",
"precomputed_nearest_neighbors"]
def _get_affinity_matrix(self, X, Y=None):
"""Calculate the affinity matrix from data
Parameters
----------
X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples
and n_features is the number of features.
If affinity is "precomputed"
X : array-like, shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
Y: Ignored
Returns
-------
affinity_matrix, shape (n_samples, n_samples)
"""
if self.affinity == 'precomputed':
self.affinity_matrix_ = X
return self.affinity_matrix_
if self.affinity == 'precomputed_nearest_neighbors':
estimator = NearestNeighbors(n_neighbors=self.n_neighbors,
n_jobs=self.n_jobs,
metric="precomputed").fit(X)
connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
return self.affinity_matrix_
if self.affinity == 'nearest_neighbors':
if sparse.issparse(X):
warnings.warn("Nearest neighbors affinity currently does "
"not support sparse input, falling back to "
"rbf affinity")
self.affinity = "rbf"
else:
self.n_neighbors_ = (self.n_neighbors
if self.n_neighbors is not None
else max(int(X.shape[0] / 10), 1))
self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_,
include_self=True,
n_jobs=self.n_jobs)
# currently only symmetric affinity_matrix supported
self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ +
self.affinity_matrix_.T)
return self.affinity_matrix_
if self.affinity == 'rbf':
self.gamma_ = (self.gamma
if self.gamma is not None else 1.0 / X.shape[1])
self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
return self.affinity_matrix_
self.affinity_matrix_ = self.affinity(X)
return self.affinity_matrix_
def fit(self, X, y=None):
"""Fit the model from data in X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples
and n_features is the number of features.
If affinity is "precomputed"
X : {array-like, sparse matrix}, shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
Returns
-------
self : object
Returns the instance itself.
"""
X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2,
estimator=self)
random_state = check_random_state(self.random_state)
if isinstance(self.affinity, str):
if self.affinity not in {"nearest_neighbors", "rbf", "precomputed",
"precomputed_nearest_neighbors"}:
raise ValueError(("%s is not a valid affinity. Expected "
"'precomputed', 'rbf', 'nearest_neighbors' "
"or a callable.") % self.affinity)
elif not callable(self.affinity):
raise ValueError(("'affinity' is expected to be an affinity "
"name or a callable. Got: %s") % self.affinity)
affinity_matrix = self._get_affinity_matrix(X)
self.embedding_ = spectral_embedding(affinity_matrix,
n_components=self.n_components,
eigen_solver=self.eigen_solver,
random_state=random_state)
return self
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples
and n_features is the number of features.
If affinity is "precomputed"
X : {array-like, sparse matrix}, shape (n_samples, n_samples),
Interpret X as precomputed adjacency graph computed from
samples.
Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
self.fit(X)
return self.embedding_
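# Illustrative sketch (not part of the library API): the
# 'precomputed_nearest_neighbors' affinity expects a sparse k-nearest-neighbors
# graph rather than raw features. The digits dataset and the parameter values
# below are arbitrary choices for the example.
def _demo_precomputed_nearest_neighbors_affinity():
    from sklearn.datasets import load_digits
    X, _ = load_digits(return_X_y=True)
    nn = NearestNeighbors(n_neighbors=10).fit(X)
    graph = nn.kneighbors_graph(X, mode='connectivity')
    se = SpectralEmbedding(n_components=2,
                           affinity='precomputed_nearest_neighbors',
                           n_neighbors=5, random_state=0)
    X_embedded = se.fit_transform(graph)
    print(X_embedded.shape)  # (1797, 2)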

View file

@ -0,0 +1,910 @@
# Author: Alexander Fabisch -- <afabisch@informatik.uni-bremen.de>
# Author: Christopher Moody <chrisemoody@gmail.com>
# Author: Nick Travers <nickt@squareup.com>
# License: BSD 3 clause (C) 2014
# This is the exact and Barnes-Hut t-SNE implementation. There are other
# modifications of the algorithm:
# * Fast Optimization for t-SNE:
# https://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf
from time import time
import numpy as np
from scipy import linalg
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.sparse import csr_matrix, issparse
from ..neighbors import NearestNeighbors
from ..base import BaseEstimator
from ..utils import check_random_state
from ..utils._openmp_helpers import _openmp_effective_n_threads
from ..utils.validation import check_non_negative
from ..utils.validation import _deprecate_positional_args
from ..decomposition import PCA
from ..metrics.pairwise import pairwise_distances
from . import _utils
# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne'
from . import _barnes_hut_tsne # type: ignore
MACHINE_EPSILON = np.finfo(np.double).eps
def _joint_probabilities(distances, desired_perplexity, verbose):
"""Compute joint probabilities p_ij from distances.
Parameters
----------
distances : array, shape (n_samples * (n_samples-1) / 2,)
Distances of samples are stored as condensed matrices, i.e.
we omit the diagonal and duplicate entries and store everything
in a one-dimensional array.
desired_perplexity : float
Desired perplexity of the joint probability distributions.
verbose : int
Verbosity level.
Returns
-------
P : array, shape (n_samples * (n_samples-1) / 2,)
Condensed joint probability matrix.
"""
# Compute conditional probabilities such that they approximately match
# the desired perplexity
distances = distances.astype(np.float32, copy=False)
conditional_P = _utils._binary_search_perplexity(
distances, desired_perplexity, verbose)
P = conditional_P + conditional_P.T
sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)
P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)
return P
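# Illustrative sketch (not part of the library API): the `distances` argument
# is a condensed pairwise distance matrix, i.e. exactly what
# scipy.spatial.distance.pdist returns. Sizes and the perplexity value are
# arbitrary; the call assumes the compiled `_utils` extension is built.
def _demo_joint_probabilities():
    rng = np.random.RandomState(0)
    X = rng.randn(20, 5)
    distances = pdist(X, "sqeuclidean")    # shape (20 * 19 / 2,)
    P = _joint_probabilities(distances, desired_perplexity=5.0, verbose=0)
    print(P.shape, P.sum())                # condensed vector, sums to ~1.0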
def _joint_probabilities_nn(distances, desired_perplexity, verbose):
"""Compute joint probabilities p_ij from distances using just nearest
neighbors.
This method is approximately equal to _joint_probabilities. The latter
is O(N^2), but limiting the joint probability to nearest neighbors improves
this substantially to O(uN), where u is the number of nearest neighbors.
Parameters
----------
distances : CSR sparse matrix, shape (n_samples, n_samples)
Distances of each sample to its n_neighbors nearest neighbors. All other
distances are left at zero (and are not materialized in memory).
desired_perplexity : float
Desired perplexity of the joint probability distributions.
verbose : int
Verbosity level.
Returns
-------
P : csr sparse matrix, shape (n_samples, n_samples)
Sparse joint probability matrix with only nearest neighbors.
"""
t0 = time()
# Compute conditional probabilities such that they approximately match
# the desired perplexity
distances.sort_indices()
n_samples = distances.shape[0]
distances_data = distances.data.reshape(n_samples, -1)
distances_data = distances_data.astype(np.float32, copy=False)
conditional_P = _utils._binary_search_perplexity(
distances_data, desired_perplexity, verbose)
assert np.all(np.isfinite(conditional_P)), \
"All probabilities should be finite"
# Symmetrize the joint probability distribution using sparse operations
P = csr_matrix((conditional_P.ravel(), distances.indices,
distances.indptr),
shape=(n_samples, n_samples))
P = P + P.T
# Normalize the joint probability distribution
sum_P = np.maximum(P.sum(), MACHINE_EPSILON)
P /= sum_P
assert np.all(np.abs(P.data) <= 1.0)
if verbose >= 2:
duration = time() - t0
print("[t-SNE] Computed conditional probabilities in {:.3f}s"
.format(duration))
return P
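# Illustrative sketch (not part of the library API): the input is a CSR matrix
# of distances to the k nearest neighbors, as produced by
# NearestNeighbors.kneighbors_graph(mode='distance'). Sizes, the number of
# neighbors and the perplexity are arbitrary choices for the example.
def _demo_joint_probabilities_nn():
    rng = np.random.RandomState(0)
    X = rng.randn(50, 4)
    nn = NearestNeighbors(n_neighbors=15).fit(X)
    distances_nn = nn.kneighbors_graph(mode='distance')
    P = _joint_probabilities_nn(distances_nn, desired_perplexity=5.0,
                                verbose=0)
    print(P.shape, P.sum())                # (50, 50) sparse, sums to ~1.0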
def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components,
skip_num_points=0, compute_error=True):
"""t-SNE objective function: gradient of the KL divergence
of p_ijs and q_ijs and the absolute error.
Parameters
----------
params : array, shape (n_params,)
Unraveled embedding.
P : array, shape (n_samples * (n_samples-1) / 2,)
Condensed joint probability matrix.
degrees_of_freedom : int
Degrees of freedom of the Student's-t distribution.
n_samples : int
Number of samples.
n_components : int
Dimension of the embedded space.
skip_num_points : int (optional, default:0)
This does not compute the gradient for points with indices below
`skip_num_points`. This is useful when computing transforms of new
data where you'd like to keep the old data fixed.
compute_error: bool (optional, default:True)
If False, the kl_divergence is not computed and returns NaN.
Returns
-------
kl_divergence : float
Kullback-Leibler divergence of p_ij and q_ij.
grad : array, shape (n_params,)
Unraveled gradient of the Kullback-Leibler divergence with respect to
the embedding.
"""
X_embedded = params.reshape(n_samples, n_components)
# Q is a heavy-tailed distribution: Student's t-distribution
dist = pdist(X_embedded, "sqeuclidean")
dist /= degrees_of_freedom
dist += 1.
dist **= (degrees_of_freedom + 1.0) / -2.0
Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON)
# Optimization trick below: np.dot(x, y) is faster than
# np.sum(x * y) because it calls BLAS
# Objective: C (Kullback-Leibler divergence of P and Q)
if compute_error:
kl_divergence = 2.0 * np.dot(
P, np.log(np.maximum(P, MACHINE_EPSILON) / Q))
else:
kl_divergence = np.nan
# Gradient: dC/dY
# pdist always returns double precision distances. Thus we need to take
# care to allocate the gradient with the dtype of the (possibly float32)
# embedding parameters.
grad = np.ndarray((n_samples, n_components), dtype=params.dtype)
PQd = squareform((P - Q) * dist)
for i in range(skip_num_points, n_samples):
grad[i] = np.dot(np.ravel(PQd[i], order='K'),
X_embedded[i] - X_embedded)
grad = grad.ravel()
c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
grad *= c
return kl_divergence, grad
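# Illustrative sketch (not part of the library API): checking the analytic
# gradient returned by _kl_divergence against scipy's finite-difference
# approximation on a tiny problem. The toy condensed P is just a normalized
# random vector; degrees_of_freedom=1 matches a 2-D embedding.
def _demo_check_kl_divergence_gradient():
    from scipy.optimize import check_grad
    rng = np.random.RandomState(0)
    n_samples, n_components = 6, 2
    P = rng.rand(n_samples * (n_samples - 1) // 2)
    P /= P.sum()
    x0 = rng.randn(n_samples * n_components)
    err = check_grad(
        lambda p: _kl_divergence(p, P, 1, n_samples, n_components)[0],
        lambda p: _kl_divergence(p, P, 1, n_samples, n_components)[1],
        x0)
    print("finite-difference gradient error: %.2e" % err)  # should be small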
def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components,
angle=0.5, skip_num_points=0, verbose=False,
compute_error=True, num_threads=1):
"""t-SNE objective function: KL divergence of p_ijs and q_ijs.
Uses Barnes-Hut tree methods to calculate the gradient that
runs in O(NlogN) instead of O(N^2)
Parameters
----------
params : array, shape (n_params,)
Unraveled embedding.
P : csr sparse matrix, shape (n_samples, n_samples)
Sparse approximate joint probability matrix, computed only for the
k nearest-neighbors and symmetrized.
degrees_of_freedom : int
Degrees of freedom of the Student's-t distribution.
n_samples : int
Number of samples.
n_components : int
Dimension of the embedded space.
angle : float (default: 0.5)
This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
'angle' is the angular size (referred to as theta in [3]) of a distant
node as measured from a point. If this size is below 'angle' then it is
used as a summary node of all points contained within it.
This method is not very sensitive to changes in this parameter
in the range of 0.2 - 0.8. An angle less than 0.2 quickly increases
computation time and an angle greater than 0.8 quickly increases error.
skip_num_points : int (optional, default:0)
This does not compute the gradient for points with indices below
`skip_num_points`. This is useful when computing transforms of new
data where you'd like to keep the old data fixed.
verbose : int
Verbosity level.
compute_error: bool (optional, default:True)
If False, the kl_divergence is not computed and returns NaN.
num_threads : int (optional, default:1)
Number of threads used to compute the gradient. This is set here to
avoid calling _openmp_effective_n_threads for each gradient step.
Returns
-------
kl_divergence : float
Kullback-Leibler divergence of p_ij and q_ij.
grad : array, shape (n_params,)
Unraveled gradient of the Kullback-Leibler divergence with respect to
the embedding.
"""
params = params.astype(np.float32, copy=False)
X_embedded = params.reshape(n_samples, n_components)
val_P = P.data.astype(np.float32, copy=False)
neighbors = P.indices.astype(np.int64, copy=False)
indptr = P.indptr.astype(np.int64, copy=False)
grad = np.zeros(X_embedded.shape, dtype=np.float32)
error = _barnes_hut_tsne.gradient(val_P, X_embedded, neighbors, indptr,
grad, angle, n_components, verbose,
dof=degrees_of_freedom,
compute_error=compute_error,
num_threads=num_threads)
c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
grad = grad.ravel()
grad *= c
return error, grad
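# Illustrative sketch (not part of the library API): a single evaluation of the
# Barnes-Hut objective on a sparse joint-probability matrix built from a
# k-nearest-neighbors graph. Sizes, the number of neighbors and the perplexity
# are arbitrary; the compiled `_barnes_hut_tsne` extension is assumed built.
def _demo_kl_divergence_bh():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    nn = NearestNeighbors(n_neighbors=30).fit(X)
    distances_nn = nn.kneighbors_graph(mode='distance')
    P = _joint_probabilities_nn(distances_nn, desired_perplexity=10.0,
                                verbose=0)
    params = 1e-4 * rng.randn(100, 2).astype(np.float32).ravel()
    error, grad = _kl_divergence_bh(params, P, degrees_of_freedom=1,
                                    n_samples=100, n_components=2)
    print(error, grad.shape)               # scalar KL estimate, (200,)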
def _gradient_descent(objective, p0, it, n_iter,
n_iter_check=1, n_iter_without_progress=300,
momentum=0.8, learning_rate=200.0, min_gain=0.01,
min_grad_norm=1e-7, verbose=0, args=None, kwargs=None):
"""Batch gradient descent with momentum and individual gains.
Parameters
----------
objective : function or callable
Should return a tuple of cost and gradient for a given parameter
vector. When expensive to compute, the cost can optionally
be None and can be computed every n_iter_check steps using
the objective_error function.
p0 : array-like, shape (n_params,)
Initial parameter vector.
it : int
Current number of iterations (this function will be called more than
once during the optimization).
n_iter : int
Maximum number of gradient descent iterations.
n_iter_check : int
Number of iterations before evaluating the global error. If the error
is sufficiently low, we abort the optimization.
n_iter_without_progress : int, optional (default: 300)
Maximum number of iterations without progress before we abort the
optimization.
momentum : float, within (0.0, 1.0), optional (default: 0.8)
The momentum generates a weight for previous gradients that decays
exponentially.
learning_rate : float, optional (default: 200.0)
The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
the learning rate is too high, the data may look like a 'ball' with any
point approximately equidistant from its nearest neighbours. If the
learning rate is too low, most points may look compressed in a dense
cloud with few outliers.
min_gain : float, optional (default: 0.01)
Minimum individual gain for each parameter.
min_grad_norm : float, optional (default: 1e-7)
If the gradient norm is below this threshold, the optimization will
be aborted.
verbose : int, optional (default: 0)
Verbosity level.
args : sequence
Arguments to pass to objective function.
kwargs : dict
Keyword arguments to pass to objective function.
Returns
-------
p : array, shape (n_params,)
Optimum parameters.
error : float
Optimum.
i : int
Last iteration.
"""
if args is None:
args = []
if kwargs is None:
kwargs = {}
p = p0.copy().ravel()
update = np.zeros_like(p)
gains = np.ones_like(p)
error = np.finfo(np.float).max
best_error = np.finfo(np.float).max
best_iter = i = it
tic = time()
for i in range(it, n_iter):
check_convergence = (i + 1) % n_iter_check == 0
# only compute the error when needed
kwargs['compute_error'] = check_convergence or i == n_iter - 1
error, grad = objective(p, *args, **kwargs)
grad_norm = linalg.norm(grad)
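        # Adaptive per-parameter gains: the gain grows when the current
        # gradient has the opposite sign of the previous update (the parameter
        # keeps moving in the same direction) and shrinks otherwise, bounded
        # below by min_gain.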
inc = update * grad < 0.0
dec = np.invert(inc)
gains[inc] += 0.2
gains[dec] *= 0.8
np.clip(gains, min_gain, np.inf, out=gains)
grad *= gains
update = momentum * update - learning_rate * grad
p += update
if check_convergence:
toc = time()
duration = toc - tic
tic = toc
if verbose >= 2:
print("[t-SNE] Iteration %d: error = %.7f,"
" gradient norm = %.7f"
" (%s iterations in %0.3fs)"
% (i + 1, error, grad_norm, n_iter_check, duration))
if error < best_error:
best_error = error
best_iter = i
elif i - best_iter > n_iter_without_progress:
if verbose >= 2:
print("[t-SNE] Iteration %d: did not make any progress "
"during the last %d episodes. Finished."
% (i + 1, n_iter_without_progress))
break
if grad_norm <= min_grad_norm:
if verbose >= 2:
print("[t-SNE] Iteration %d: gradient norm %f. Finished."
% (i + 1, grad_norm))
break
return p, error, i
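# Illustrative sketch (not part of the library API): the optimizer expects an
# objective returning (error, gradient) that also accepts a `compute_error`
# keyword, which _gradient_descent toggles between progress checks. The
# quadratic below and its learning rate are arbitrary choices.
def _demo_gradient_descent_on_quadratic():
    def quadratic(p, compute_error=True):
        error = 0.5 * np.dot(p, p) if compute_error else np.nan
        return error, p                    # gradient of 0.5 * ||p||^2 is p
    p0 = np.array([5.0, -3.0])
    p, error, it = _gradient_descent(quadratic, p0, it=0, n_iter=100,
                                     learning_rate=0.5, verbose=0)
    print(p, error, it)                    # p approaches the origin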
@_deprecate_positional_args
def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'):
r"""Expresses to what extent the local structure is retained.
The trustworthiness is within [0, 1]. It is defined as
.. math::
T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
\sum_{j \in \mathcal{N}_{i}^{k}} \max(0, (r(i, j) - k))
where for each sample i, :math:`\mathcal{N}_{i}^{k}` are its k nearest
neighbors in the output space, and every sample j is its :math:`r(i, j)`-th
nearest neighbor in the input space. In other words, any unexpected nearest
neighbors in the output space are penalised in proportion to their rank in
the input space.
* "Neighborhood Preservation in Nonlinear Projection Methods: An
Experimental Study"
J. Venna, S. Kaski
* "Learning a Parametric Embedding by Preserving Local Structure"
L.J.P. van der Maaten
Parameters
----------
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
If the metric is 'precomputed' X must be a square distance
matrix. Otherwise it contains a sample per row.
X_embedded : array, shape (n_samples, n_components)
Embedding of the training data in low-dimensional space.
n_neighbors : int, optional (default: 5)
Number of neighbors k that will be considered.
metric : string, or callable, optional, default 'euclidean'
Which metric to use for computing pairwise distances between samples
from the original input space. If metric is 'precomputed', X must be a
matrix of pairwise distances or squared distances. Otherwise, see the
documentation of the metric argument in
sklearn.metrics.pairwise.pairwise_distances
for a list of available metrics.
.. versionadded:: 0.20
Returns
-------
trustworthiness : float
Trustworthiness of the low-dimensional embedding.
"""
dist_X = pairwise_distances(X, metric=metric)
if metric == 'precomputed':
dist_X = dist_X.copy()
# we set the diagonal to np.inf to exclude the points themselves from
# their own neighborhood
np.fill_diagonal(dist_X, np.inf)
ind_X = np.argsort(dist_X, axis=1)
# `ind_X[i]` is the index of sorted distances between i and other samples
ind_X_embedded = NearestNeighbors(n_neighbors=n_neighbors).fit(
X_embedded).kneighbors(return_distance=False)
# We build an inverted index of neighbors in the input space: For sample i,
# we define `inverted_index[i]` as the inverted index of sorted distances:
# inverted_index[i][ind_X[i]] = np.arange(1, n_samples + 1)
n_samples = X.shape[0]
inverted_index = np.zeros((n_samples, n_samples), dtype=int)
ordered_indices = np.arange(n_samples + 1)
inverted_index[ordered_indices[:-1, np.newaxis],
ind_X] = ordered_indices[1:]
ranks = inverted_index[ordered_indices[:-1, np.newaxis],
ind_X_embedded] - n_neighbors
t = np.sum(ranks[ranks > 0])
t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
(2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
return t
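# Illustrative sketch (not part of the library API): trustworthiness of a 2-D
# PCA projection of the iris dataset. Values close to 1.0 indicate that local
# neighborhoods are well preserved; the dataset and n_neighbors are arbitrary
# choices for the example.
def _demo_trustworthiness():
    from sklearn.datasets import load_iris
    X = load_iris().data
    X_2d = PCA(n_components=2).fit_transform(X)
    print(trustworthiness(X, X_2d, n_neighbors=5))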
class TSNE(BaseEstimator):
"""t-distributed Stochastic Neighbor Embedding.
t-SNE [1] is a tool to visualize high-dimensional data. It converts
similarities between data points to joint probabilities and tries
to minimize the Kullback-Leibler divergence between the joint
probabilities of the low-dimensional embedding and the
high-dimensional data. t-SNE has a cost function that is not convex,
i.e. with different initializations we can get different results.
It is highly recommended to use another dimensionality reduction
method (e.g. PCA for dense data or TruncatedSVD for sparse data)
to reduce the number of dimensions to a reasonable amount (e.g. 50)
if the number of features is very high. This will suppress some
noise and speed up the computation of pairwise distances between
samples. For more tips see Laurens van der Maaten's FAQ [2].
Read more in the :ref:`User Guide <t_sne>`.
Parameters
----------
n_components : int, optional (default: 2)
Dimension of the embedded space.
perplexity : float, optional (default: 30)
The perplexity is related to the number of nearest neighbors that
is used in other manifold learning algorithms. Larger datasets
usually require a larger perplexity. Consider selecting a value
between 5 and 50. Different values can result in significantly
different results.
early_exaggeration : float, optional (default: 12.0)
Controls how tight natural clusters in the original space are in
the embedded space and how much space will be between them. For
larger values, the space between natural clusters will be larger
in the embedded space. Again, the choice of this parameter is not
very critical. If the cost function increases during initial
optimization, the early exaggeration factor or the learning rate
might be too high.
learning_rate : float, optional (default: 200.0)
The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
the learning rate is too high, the data may look like a 'ball' with any
point approximately equidistant from its nearest neighbours. If the
learning rate is too low, most points may look compressed in a dense
cloud with few outliers. If the cost function gets stuck in a bad local
minimum increasing the learning rate may help.
n_iter : int, optional (default: 1000)
Maximum number of iterations for the optimization. Should be at
least 250.
n_iter_without_progress : int, optional (default: 300)
Maximum number of iterations without progress before we abort the
optimization, used after 250 initial iterations with early
exaggeration. Note that progress is only checked every 50 iterations so
this value is rounded to the next multiple of 50.
.. versionadded:: 0.17
parameter *n_iter_without_progress* to control stopping criteria.
min_grad_norm : float, optional (default: 1e-7)
If the gradient norm is below this threshold, the optimization will
be stopped.
metric : string or callable, optional
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by scipy.spatial.distance.pdist for its metric parameter, or
a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
If metric is "precomputed", X is assumed to be a distance matrix.
Alternatively, if metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays from X as input and return a value indicating
the distance between them. The default is "euclidean" which is
interpreted as squared euclidean distance.
init : string or numpy array, optional (default: "random")
Initialization of embedding. Possible options are 'random', 'pca',
and a numpy array of shape (n_samples, n_components).
PCA initialization cannot be used with precomputed distances and is
usually more globally stable than random initialization.
verbose : int, optional (default: 0)
Verbosity level.
random_state : int, RandomState instance, default=None
Determines the random number generator. Pass an int for reproducible
results across multiple function calls. Note that different
initializations might result in different local minima of the cost
function. See :term:`Glossary <random_state>`.
method : string (default: 'barnes_hut')
By default the gradient calculation algorithm uses Barnes-Hut
approximation running in O(NlogN) time. method='exact'
will run on the slower, but exact, algorithm in O(N^2) time. The
exact algorithm should be used when nearest-neighbor errors need
to be better than 3%. However, the exact method cannot scale to
millions of examples.
.. versionadded:: 0.17
Approximate optimization *method* via the Barnes-Hut.
angle : float (default: 0.5)
Only used if method='barnes_hut'
This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
'angle' is the angular size (referred to as theta in [3]) of a distant
node as measured from a point. If this size is below 'angle' then it is
used as a summary node of all points contained within it.
This method is not very sensitive to changes in this parameter
in the range of 0.2 - 0.8. An angle less than 0.2 quickly increases
computation time and an angle greater than 0.8 quickly increases error.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run for neighbors search. This parameter
has no impact when ``metric="precomputed"`` or
(``metric="euclidean"`` and ``method="exact"``).
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. versionadded:: 0.22
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the embedding vectors.
kl_divergence_ : float
Kullback-Leibler divergence after optimization.
n_iter_ : int
Number of iterations run.
Examples
--------
>>> import numpy as np
>>> from sklearn.manifold import TSNE
>>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
>>> X_embedded = TSNE(n_components=2).fit_transform(X)
>>> X_embedded.shape
(4, 2)
References
----------
[1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data
Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.
[2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding
https://lvdmaaten.github.io/tsne/
[3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
"""
# Control the number of exploration iterations with early_exaggeration on
_EXPLORATION_N_ITER = 250
# Control the number of iterations between progress checks
_N_ITER_CHECK = 50
@_deprecate_positional_args
def __init__(self, n_components=2, *, perplexity=30.0,
early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
n_iter_without_progress=300, min_grad_norm=1e-7,
metric="euclidean", init="random", verbose=0,
random_state=None, method='barnes_hut', angle=0.5,
n_jobs=None):
self.n_components = n_components
self.perplexity = perplexity
self.early_exaggeration = early_exaggeration
self.learning_rate = learning_rate
self.n_iter = n_iter
self.n_iter_without_progress = n_iter_without_progress
self.min_grad_norm = min_grad_norm
self.metric = metric
self.init = init
self.verbose = verbose
self.random_state = random_state
self.method = method
self.angle = angle
self.n_jobs = n_jobs
def _fit(self, X, skip_num_points=0):
"""Private function to fit the model using X as training data."""
if self.method not in ['barnes_hut', 'exact']:
raise ValueError("'method' must be 'barnes_hut' or 'exact'")
if self.angle < 0.0 or self.angle > 1.0:
raise ValueError("'angle' must be between 0.0 - 1.0")
if self.method == 'barnes_hut':
X = self._validate_data(X, accept_sparse=['csr'],
ensure_min_samples=2,
dtype=[np.float32, np.float64])
else:
X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=[np.float32, np.float64])
if self.metric == "precomputed":
if isinstance(self.init, str) and self.init == 'pca':
raise ValueError("The parameter init=\"pca\" cannot be "
"used with metric=\"precomputed\".")
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square distance matrix")
check_non_negative(X, "TSNE.fit(). With metric='precomputed', X "
"should contain positive distances.")
if self.method == "exact" and issparse(X):
raise TypeError(
'TSNE with method="exact" does not accept sparse '
'precomputed distance matrix. Use method="barnes_hut" '
'or provide the dense distance matrix.')
if self.method == 'barnes_hut' and self.n_components > 3:
raise ValueError("'n_components' should be inferior to 4 for the "
"barnes_hut algorithm as it relies on "
"quad-tree or oct-tree.")
random_state = check_random_state(self.random_state)
if self.early_exaggeration < 1.0:
raise ValueError("early_exaggeration must be at least 1, but is {}"
.format(self.early_exaggeration))
if self.n_iter < 250:
raise ValueError("n_iter should be at least 250")
n_samples = X.shape[0]
neighbors_nn = None
if self.method == "exact":
# Retrieve the distance matrix, either using the precomputed one or
# computing it.
if self.metric == "precomputed":
distances = X
else:
if self.verbose:
print("[t-SNE] Computing pairwise distances...")
if self.metric == "euclidean":
distances = pairwise_distances(X, metric=self.metric,
squared=True)
else:
distances = pairwise_distances(X, metric=self.metric,
n_jobs=self.n_jobs)
if np.any(distances < 0):
raise ValueError("All distances should be positive, the "
"metric given is not correct")
# compute the joint probability distribution for the input space
P = _joint_probabilities(distances, self.perplexity, self.verbose)
assert np.all(np.isfinite(P)), "All probabilities should be finite"
assert np.all(P >= 0), "All probabilities should be non-negative"
assert np.all(P <= 1), ("All probabilities should be less "
"or then equal to one")
else:
# Compute the number of nearest neighbors to find.
# LvdM uses 3 * perplexity as the number of neighbors.
# In the event that we have a very small number of points,
# set the number of neighbors to n - 1.
n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1))
if self.verbose:
print("[t-SNE] Computing {} nearest neighbors..."
.format(n_neighbors))
# Find the nearest neighbors for every point
knn = NearestNeighbors(algorithm='auto',
n_jobs=self.n_jobs,
n_neighbors=n_neighbors,
metric=self.metric)
t0 = time()
knn.fit(X)
duration = time() - t0
if self.verbose:
print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
n_samples, duration))
t0 = time()
distances_nn = knn.kneighbors_graph(mode='distance')
duration = time() - t0
if self.verbose:
print("[t-SNE] Computed neighbors for {} samples "
"in {:.3f}s...".format(n_samples, duration))
# Free the memory used by the ball_tree
del knn
if self.metric == "euclidean":
# knn return the euclidean distance but we need it squared
# to be consistent with the 'exact' method. Note that the
# method was derived using the Euclidean metric in the input
# space. Not sure of the implication of using a different
# metric.
distances_nn.data **= 2
# compute the joint probability distribution for the input space
P = _joint_probabilities_nn(distances_nn, self.perplexity,
self.verbose)
if isinstance(self.init, np.ndarray):
X_embedded = self.init
elif self.init == 'pca':
pca = PCA(n_components=self.n_components, svd_solver='randomized',
random_state=random_state)
X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
elif self.init == 'random':
# The embedding is initialized with iid samples from Gaussians with
# standard deviation 1e-4.
X_embedded = 1e-4 * random_state.randn(
n_samples, self.n_components).astype(np.float32)
else:
raise ValueError("'init' must be 'pca', 'random', or "
"a numpy array")
# Degrees of freedom of the Student's t-distribution. The suggestion
# degrees_of_freedom = n_components - 1 comes from
# "Learning a Parametric Embedding by Preserving Local Structure"
# Laurens van der Maaten, 2009.
degrees_of_freedom = max(self.n_components - 1, 1)
return self._tsne(P, degrees_of_freedom, n_samples,
X_embedded=X_embedded,
neighbors=neighbors_nn,
skip_num_points=skip_num_points)
def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded,
neighbors=None, skip_num_points=0):
"""Runs t-SNE."""
# t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
# and the Student's t-distributions Q. The optimization algorithm that
# we use is batch gradient descent with two stages:
# * initial optimization with early exaggeration and momentum at 0.5
# * final optimization with momentum at 0.8
params = X_embedded.ravel()
opt_args = {
"it": 0,
"n_iter_check": self._N_ITER_CHECK,
"min_grad_norm": self.min_grad_norm,
"learning_rate": self.learning_rate,
"verbose": self.verbose,
"kwargs": dict(skip_num_points=skip_num_points),
"args": [P, degrees_of_freedom, n_samples, self.n_components],
"n_iter_without_progress": self._EXPLORATION_N_ITER,
"n_iter": self._EXPLORATION_N_ITER,
"momentum": 0.5,
}
if self.method == 'barnes_hut':
obj_func = _kl_divergence_bh
opt_args['kwargs']['angle'] = self.angle
# Repeat verbose argument for _kl_divergence_bh
opt_args['kwargs']['verbose'] = self.verbose
# Get the number of threads for gradient computation here to
# avoid recomputing it at each iteration.
opt_args['kwargs']['num_threads'] = _openmp_effective_n_threads()
else:
obj_func = _kl_divergence
# Learning schedule (part 1): do 250 iteration with lower momentum but
# higher learning rate controlled via the early exaggeration parameter
P *= self.early_exaggeration
params, kl_divergence, it = _gradient_descent(obj_func, params,
**opt_args)
if self.verbose:
print("[t-SNE] KL divergence after %d iterations with early "
"exaggeration: %f" % (it + 1, kl_divergence))
# Learning schedule (part 2): disable early exaggeration and finish
# optimization with a higher momentum at 0.8
P /= self.early_exaggeration
remaining = self.n_iter - self._EXPLORATION_N_ITER
if it < self._EXPLORATION_N_ITER or remaining > 0:
opt_args['n_iter'] = self.n_iter
opt_args['it'] = it + 1
opt_args['momentum'] = 0.8
opt_args['n_iter_without_progress'] = self.n_iter_without_progress
params, kl_divergence, it = _gradient_descent(obj_func, params,
**opt_args)
# Save the final number of iterations
self.n_iter_ = it
if self.verbose:
print("[t-SNE] KL divergence after %d iterations: %f"
% (it + 1, kl_divergence))
X_embedded = params.reshape(n_samples, self.n_components)
self.kl_divergence_ = kl_divergence
return X_embedded
def fit_transform(self, X, y=None):
"""Fit X into an embedded space and return that transformed
output.
Parameters
----------
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
If the metric is 'precomputed' X must be a square distance
matrix. Otherwise it contains a sample per row. If the method
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
or 'coo'. If the method is 'barnes_hut' and the metric is
'precomputed', X may be a precomputed sparse graph.
y : Ignored
Returns
-------
X_new : array, shape (n_samples, n_components)
Embedding of the training data in low-dimensional space.
"""
embedding = self._fit(X)
self.embedding_ = embedding
return self.embedding_
def fit(self, X, y=None):
"""Fit X into an embedded space.
Parameters
----------
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
If the metric is 'precomputed' X must be a square distance
matrix. Otherwise it contains a sample per row. If the method
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
or 'coo'. If the method is 'barnes_hut' and the metric is
'precomputed', X may be a precomputed sparse graph.
y : Ignored
"""
self.fit_transform(X)
return self
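# Illustrative sketch (not part of the library API): the PCA-then-t-SNE
# workflow recommended in the class docstring for high-dimensional dense data.
# The digits dataset and the parameter values are arbitrary choices.
def _demo_pca_then_tsne():
    from sklearn.datasets import load_digits
    X, _ = load_digits(return_X_y=True)
    X_reduced = PCA(n_components=50, random_state=0).fit_transform(X)
    X_2d = TSNE(n_components=2, init='pca',
                random_state=0).fit_transform(X_reduced)
    print(X_2d.shape)                      # (1797, 2)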

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _isomap # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.manifold.isomap'
correct_import_path = 'sklearn.manifold'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_isomap, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _locally_linear # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.manifold.locally_linear'
correct_import_path = 'sklearn.manifold'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_locally_linear, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _mds # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.manifold.mds'
correct_import_path = 'sklearn.manifold'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_mds, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,34 @@
import os
import numpy
def configuration(parent_package="", top_path=None):
from numpy.distutils.misc_util import Configuration
config = Configuration("manifold", parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
config.add_extension("_utils",
sources=["_utils.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries,
extra_compile_args=["-O3"])
config.add_extension("_barnes_hut_tsne",
sources=["_barnes_hut_tsne.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries,
extra_compile_args=['-O3'])
config.add_subpackage('tests')
return config
if __name__ == "__main__":
from numpy.distutils.core import setup
setup(**configuration().todict())

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _spectral_embedding # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.manifold.spectral_embedding_'
correct_import_path = 'sklearn.manifold'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_spectral_embedding, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _t_sne # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.manifold.t_sne'
correct_import_path = 'sklearn.manifold'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_t_sne, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,188 @@
from itertools import product
import numpy as np
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from sklearn import datasets
from sklearn import manifold
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing
from scipy.sparse import rand as sparse_rand
eigen_solvers = ['auto', 'dense', 'arpack']
path_methods = ['auto', 'FW', 'D']
def test_isomap_simple_grid():
# Isomap should preserve distances when all neighbors are used
N_per_side = 5
Npts = N_per_side ** 2
n_neighbors = Npts - 1
# grid of equidistant points in 2D, n_components = n_dim
X = np.array(list(product(range(N_per_side), repeat=2)))
# distances from each point to all others
G = neighbors.kneighbors_graph(X, n_neighbors,
mode='distance').toarray()
for eigen_solver in eigen_solvers:
for path_method in path_methods:
clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
eigen_solver=eigen_solver,
path_method=path_method)
clf.fit(X)
G_iso = neighbors.kneighbors_graph(clf.embedding_,
n_neighbors,
mode='distance').toarray()
assert_array_almost_equal(G, G_iso)
def test_isomap_reconstruction_error():
# Same setup as in test_isomap_simple_grid, with an added dimension
N_per_side = 5
Npts = N_per_side ** 2
n_neighbors = Npts - 1
# grid of equidistant points in 2D, n_components = n_dim
X = np.array(list(product(range(N_per_side), repeat=2)))
# add noise in a third dimension
rng = np.random.RandomState(0)
noise = 0.1 * rng.randn(Npts, 1)
X = np.concatenate((X, noise), 1)
# compute input kernel
G = neighbors.kneighbors_graph(X, n_neighbors,
mode='distance').toarray()
centerer = preprocessing.KernelCenterer()
K = centerer.fit_transform(-0.5 * G ** 2)
for eigen_solver in eigen_solvers:
for path_method in path_methods:
clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2,
eigen_solver=eigen_solver,
path_method=path_method)
clf.fit(X)
# compute output kernel
G_iso = neighbors.kneighbors_graph(clf.embedding_,
n_neighbors,
mode='distance').toarray()
K_iso = centerer.fit_transform(-0.5 * G_iso ** 2)
# make sure error agrees
reconstruction_error = np.linalg.norm(K - K_iso) / Npts
assert_almost_equal(reconstruction_error,
clf.reconstruction_error())
def test_transform():
n_samples = 200
n_components = 10
noise_scale = 0.01
# Create S-curve dataset
X, y = datasets.make_s_curve(n_samples, random_state=0)
# Compute isomap embedding
iso = manifold.Isomap(n_components=n_components, n_neighbors=2)
X_iso = iso.fit_transform(X)
# Re-embed a noisy version of the points
rng = np.random.RandomState(0)
noise = noise_scale * rng.randn(*X.shape)
X_iso2 = iso.transform(X + noise)
# Make sure the rms error on re-embedding is comparable to noise_scale
assert np.sqrt(np.mean((X_iso - X_iso2) ** 2)) < 2 * noise_scale
def test_pipeline():
# check that Isomap works fine as a transformer in a Pipeline
# only checks that no error is raised.
# TODO check that it actually does something useful
X, y = datasets.make_blobs(random_state=0)
clf = pipeline.Pipeline(
[('isomap', manifold.Isomap()),
('clf', neighbors.KNeighborsClassifier())])
clf.fit(X, y)
assert .9 < clf.score(X, y)
def test_pipeline_with_nearest_neighbors_transformer():
# Test chaining NearestNeighborsTransformer and Isomap with
# neighbors_algorithm='precomputed'
algorithm = 'auto'
n_neighbors = 10
X, _ = datasets.make_blobs(random_state=0)
X2, _ = datasets.make_blobs(random_state=1)
# compare the chained version and the compact version
est_chain = pipeline.make_pipeline(
neighbors.KNeighborsTransformer(
n_neighbors=n_neighbors, algorithm=algorithm, mode='distance'),
manifold.Isomap(n_neighbors=n_neighbors, metric='precomputed'))
est_compact = manifold.Isomap(n_neighbors=n_neighbors,
neighbors_algorithm=algorithm)
Xt_chain = est_chain.fit_transform(X)
Xt_compact = est_compact.fit_transform(X)
assert_array_almost_equal(Xt_chain, Xt_compact)
Xt_chain = est_chain.transform(X2)
Xt_compact = est_compact.transform(X2)
assert_array_almost_equal(Xt_chain, Xt_compact)
def test_different_metric():
# Test that the metric parameters work correctly, and default to euclidean
def custom_metric(x1, x2):
return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))
# metric, p, is_euclidean
metrics = [('euclidean', 2, True),
('manhattan', 1, False),
('minkowski', 1, False),
('minkowski', 2, True),
(custom_metric, 2, False)]
X, _ = datasets.make_blobs(random_state=0)
reference = manifold.Isomap().fit_transform(X)
for metric, p, is_euclidean in metrics:
embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)
if is_euclidean:
assert_array_almost_equal(embedding, reference)
else:
with pytest.raises(AssertionError, match='not almost equal'):
assert_array_almost_equal(embedding, reference)
def test_isomap_clone_bug():
# regression test for bug reported in #6062
model = manifold.Isomap()
for n_neighbors in [10, 15, 20]:
model.set_params(n_neighbors=n_neighbors)
model.fit(np.random.rand(50, 2))
assert (model.nbrs_.n_neighbors ==
n_neighbors)
def test_sparse_input():
X = sparse_rand(100, 3, density=0.1, format='csr')
# Should not error
for eigen_solver in eigen_solvers:
for path_method in path_methods:
clf = manifold.Isomap(n_components=2,
eigen_solver=eigen_solver,
path_method=path_method)
clf.fit(X)

View file

@ -0,0 +1,146 @@
from itertools import product
import numpy as np
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from scipy import linalg
import pytest
from sklearn import neighbors, manifold
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import assert_raise_message
eigen_solvers = ['dense', 'arpack']
# ----------------------------------------------------------------------
# Test utility routines
def test_barycenter_kneighbors_graph():
X = np.array([[0, 1], [1.01, 1.], [2, 0]])
A = barycenter_kneighbors_graph(X, 1)
assert_array_almost_equal(
A.toarray(),
[[0., 1., 0.],
[1., 0., 0.],
[0., 1., 0.]])
A = barycenter_kneighbors_graph(X, 2)
# check that columns sum to one
assert_array_almost_equal(np.sum(A.toarray(), 1), np.ones(3))
pred = np.dot(A.toarray(), X)
assert linalg.norm(pred - X) / X.shape[0] < 1
# ----------------------------------------------------------------------
# Test LLE by computing the reconstruction error on some manifolds.
def test_lle_simple_grid():
# note: ARPACK is numerically unstable, so this test will fail for
# some random seeds. We choose 2 because the tests pass.
rng = np.random.RandomState(2)
# grid of equidistant points in 2D, n_components = n_dim
X = np.array(list(product(range(5), repeat=2)))
X = X + 1e-10 * rng.uniform(size=X.shape)
n_components = 2
clf = manifold.LocallyLinearEmbedding(n_neighbors=5,
n_components=n_components,
random_state=rng)
tol = 0.1
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
reconstruction_error = linalg.norm(np.dot(N, X) - X, 'fro')
assert reconstruction_error < tol
for solver in eigen_solvers:
clf.set_params(eigen_solver=solver)
clf.fit(X)
assert clf.embedding_.shape[1] == n_components
reconstruction_error = linalg.norm(
np.dot(N, clf.embedding_) - clf.embedding_, 'fro') ** 2
assert reconstruction_error < tol
assert_almost_equal(clf.reconstruction_error_,
reconstruction_error, decimal=1)
# re-embed a noisy version of X using the transform method
noise = rng.randn(*X.shape) / 100
X_reembedded = clf.transform(X + noise)
assert linalg.norm(X_reembedded - clf.embedding_) < tol
def test_lle_manifold():
rng = np.random.RandomState(0)
# similar test on a slightly more complex manifold
X = np.array(list(product(np.arange(18), repeat=2)))
X = np.c_[X, X[:, 0] ** 2 / 18]
X = X + 1e-10 * rng.uniform(size=X.shape)
n_components = 2
for method in ["standard", "hessian", "modified", "ltsa"]:
clf = manifold.LocallyLinearEmbedding(n_neighbors=6,
n_components=n_components,
method=method, random_state=0)
tol = 1.5 if method == "standard" else 3
N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
reconstruction_error = linalg.norm(np.dot(N, X) - X)
assert reconstruction_error < tol
for solver in eigen_solvers:
clf.set_params(eigen_solver=solver)
clf.fit(X)
assert clf.embedding_.shape[1] == n_components
reconstruction_error = linalg.norm(
np.dot(N, clf.embedding_) - clf.embedding_, 'fro') ** 2
details = ("solver: %s, method: %s" % (solver, method))
assert reconstruction_error < tol, details
assert (np.abs(clf.reconstruction_error_ -
reconstruction_error) <
tol * reconstruction_error), details
# Test the error raised when parameter passed to lle is invalid
def test_lle_init_parameters():
X = np.random.rand(5, 3)
clf = manifold.LocallyLinearEmbedding(eigen_solver="error")
msg = "unrecognized eigen_solver 'error'"
assert_raise_message(ValueError, msg, clf.fit, X)
clf = manifold.LocallyLinearEmbedding(method="error")
msg = "unrecognized method 'error'"
assert_raise_message(ValueError, msg, clf.fit, X)
def test_pipeline():
# check that LocallyLinearEmbedding works fine as a Pipeline
# only checks that no error is raised.
# TODO check that it actually does something useful
from sklearn import pipeline, datasets
X, y = datasets.make_blobs(random_state=0)
clf = pipeline.Pipeline(
[('filter', manifold.LocallyLinearEmbedding(random_state=0)),
('clf', neighbors.KNeighborsClassifier())])
clf.fit(X, y)
assert .9 < clf.score(X, y)
# Test the error raised when the weight matrix is singular
def test_singular_matrix():
M = np.ones((10, 3))
f = ignore_warnings
with pytest.raises(ValueError):
f(manifold.locally_linear_embedding(M, n_neighbors=2, n_components=1,
method='standard',
eigen_solver='arpack'))
# regression test for #6033
def test_integer_input():
rand = np.random.RandomState(0)
X = rand.randint(0, 100, size=(20, 3))
for method in ["standard", "hessian", "modified", "ltsa"]:
clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
clf.fit(X) # this previously raised a TypeError

View file

@ -0,0 +1,64 @@
import numpy as np
from numpy.testing import assert_array_almost_equal
import pytest
from sklearn.manifold import _mds as mds
def test_smacof():
# test metric smacof using the data of "Modern Multidimensional Scaling",
# Borg & Groenen, p 154
sim = np.array([[0, 5, 3, 4],
[5, 0, 2, 2],
[3, 2, 0, 1],
[4, 2, 1, 0]])
Z = np.array([[-.266, -.539],
[.451, .252],
[.016, -.238],
[-.200, .524]])
X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
X_true = np.array([[-1.415, -2.471],
[1.633, 1.107],
[.249, -.067],
[-.468, 1.431]])
assert_array_almost_equal(X, X_true, decimal=3)
def test_smacof_error():
# Not symmetric similarity matrix:
sim = np.array([[0, 5, 9, 4],
[5, 0, 2, 2],
[3, 2, 0, 1],
[4, 2, 1, 0]])
with pytest.raises(ValueError):
mds.smacof(sim)
# Not squared similarity matrix:
sim = np.array([[0, 5, 9, 4],
[5, 0, 2, 2],
[4, 2, 1, 0]])
with pytest.raises(ValueError):
mds.smacof(sim)
# init not None and not correct format:
sim = np.array([[0, 5, 3, 4],
[5, 0, 2, 2],
[3, 2, 0, 1],
[4, 2, 1, 0]])
Z = np.array([[-.266, -.539],
[.016, -.238],
[-.200, .524]])
with pytest.raises(ValueError):
mds.smacof(sim, init=Z, n_init=1)
def test_MDS():
sim = np.array([[0, 5, 3, 4],
[5, 0, 2, 2],
[3, 2, 0, 1],
[4, 2, 1, 0]])
mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
mds_clf.fit(sim)

View file

@ -0,0 +1,347 @@
import pytest
import numpy as np
from scipy import sparse
from scipy.sparse import csgraph
from scipy.linalg import eigh
from sklearn.manifold import SpectralEmbedding
from sklearn.manifold._spectral_embedding import _graph_is_connected
from sklearn.manifold._spectral_embedding import _graph_connected_component
from sklearn.manifold import spectral_embedding
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import normalized_mutual_info_score
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
# non centered, sparse centers to check the
centers = np.array([
[0.0, 5.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 4.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 5.0, 1.0],
])
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
cluster_std=1., random_state=42)
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
""" Check array A and B are equal with possible sign flipping on
each columns"""
tol_squared = tol ** 2
for A_col, B_col in zip(A.T, B.T):
assert (np.max((A_col - B_col) ** 2) <= tol_squared or
np.max((A_col + B_col) ** 2) <= tol_squared)
def test_sparse_graph_connected_component():
rng = np.random.RandomState(42)
n_samples = 300
boundaries = [0, 42, 121, 200, n_samples]
p = rng.permutation(n_samples)
connections = []
for start, stop in zip(boundaries[:-1], boundaries[1:]):
group = p[start:stop]
# Connect all elements within the group at least once via an
# arbitrary path that spans the group.
for i in range(len(group) - 1):
connections.append((group[i], group[i + 1]))
# Add some more random connections within the group
min_idx, max_idx = 0, len(group) - 1
n_random_connections = 1000
source = rng.randint(min_idx, max_idx, size=n_random_connections)
target = rng.randint(min_idx, max_idx, size=n_random_connections)
connections.extend(zip(group[source], group[target]))
# Build a symmetric affinity matrix
row_idx, column_idx = tuple(np.array(connections).T)
data = rng.uniform(.1, 42, size=len(connections))
affinity = sparse.coo_matrix((data, (row_idx, column_idx)))
affinity = 0.5 * (affinity + affinity.T)
for start, stop in zip(boundaries[:-1], boundaries[1:]):
component_1 = _graph_connected_component(affinity, p[start])
component_size = stop - start
assert component_1.sum() == component_size
# We should retrieve the same component mask by starting by both ends
# of the group
component_2 = _graph_connected_component(affinity, p[stop - 1])
assert component_2.sum() == component_size
assert_array_equal(component_1, component_2)
def test_spectral_embedding_two_components(seed=36):
# Test spectral embedding with two components
random_state = np.random.RandomState(seed)
n_sample = 100
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
# first component
affinity[0:n_sample,
0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
# second component
affinity[n_sample::,
n_sample::] = np.abs(random_state.randn(n_sample, n_sample)) + 2
# Test of internal _graph_connected_component before connection
component = _graph_connected_component(affinity, 0)
assert component[:n_sample].all()
assert not component[n_sample:].any()
component = _graph_connected_component(affinity, -1)
assert not component[:n_sample].any()
assert component[n_sample:].all()
# connection
affinity[0, n_sample + 1] = 1
affinity[n_sample + 1, 0] = 1
affinity.flat[::2 * n_sample + 1] = 0
affinity = 0.5 * (affinity + affinity.T)
true_label = np.zeros(shape=2 * n_sample)
true_label[0:n_sample] = 1
se_precomp = SpectralEmbedding(n_components=1, affinity="precomputed",
random_state=np.random.RandomState(seed))
embedded_coordinate = se_precomp.fit_transform(affinity)
# Some numpy versions are touchy with types
embedded_coordinate = \
se_precomp.fit_transform(affinity.astype(np.float32))
# thresholding on the first components using 0.
label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
assert normalized_mutual_info_score(
true_label, label_) == pytest.approx(1.0)
@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)],
ids=["dense", "sparse"])
def test_spectral_embedding_precomputed_affinity(X, seed=36):
# Test spectral embedding with precomputed kernel
gamma = 1.0
se_precomp = SpectralEmbedding(n_components=2, affinity="precomputed",
random_state=np.random.RandomState(seed))
se_rbf = SpectralEmbedding(n_components=2, affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed))
embed_precomp = se_precomp.fit_transform(rbf_kernel(X, gamma=gamma))
embed_rbf = se_rbf.fit_transform(X)
assert_array_almost_equal(
se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
_assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
def test_precomputed_nearest_neighbors_filtering():
# Test precomputed graph filtering when containing too many neighbors
n_neighbors = 2
results = []
for additional_neighbors in [0, 10]:
nn = NearestNeighbors(
n_neighbors=n_neighbors + additional_neighbors).fit(S)
graph = nn.kneighbors_graph(S, mode='connectivity')
embedding = SpectralEmbedding(random_state=0, n_components=2,
affinity='precomputed_nearest_neighbors',
n_neighbors=n_neighbors
).fit(graph).embedding_
results.append(embedding)
assert_array_equal(results[0], results[1])
@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)],
ids=["dense", "sparse"])
def test_spectral_embedding_callable_affinity(X, seed=36):
# Test spectral embedding with callable affinity
gamma = 0.9
kern = rbf_kernel(S, gamma=gamma)
se_callable = SpectralEmbedding(n_components=2,
affinity=(
lambda x: rbf_kernel(x, gamma=gamma)),
gamma=gamma,
random_state=np.random.RandomState(seed))
se_rbf = SpectralEmbedding(n_components=2, affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed))
embed_rbf = se_rbf.fit_transform(X)
embed_callable = se_callable.fit_transform(X)
assert_array_almost_equal(
se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
_assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
# TODO: Remove when pyamg replaces sp.rand calls with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*")
def test_spectral_embedding_amg_solver(seed=36):
# Test spectral embedding with amg solver
pytest.importorskip('pyamg')
se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors",
eigen_solver="amg", n_neighbors=5,
random_state=np.random.RandomState(seed))
se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors",
eigen_solver="arpack", n_neighbors=5,
random_state=np.random.RandomState(seed))
embed_amg = se_amg.fit_transform(S)
embed_arpack = se_arpack.fit_transform(S)
_assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
# same with special case in which amg is not actually used
# regression test for #10715
# affinity between nodes
row = [0, 0, 1, 2, 3, 3, 4]
col = [1, 2, 2, 3, 4, 5, 5]
val = [100, 100, 100, 1, 100, 100, 100]
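# concatenating (row, col) with (col, row) stores every edge in both
# directions, so the resulting affinity matrix is symmetric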
affinity = sparse.coo_matrix((val + val, (row + col, col + row)),
shape=(6, 6)).toarray()
se_amg.affinity = "precomputed"
se_arpack.affinity = "precomputed"
embed_amg = se_amg.fit_transform(affinity)
embed_arpack = se_arpack.fit_transform(affinity)
_assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)
# TODO: Remove filterwarnings when pyamg replaces sp.rand calls with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
"ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*")
def test_spectral_embedding_amg_solver_failure():
# Non-regression test for amg solver failure (issue #13393 on github)
pytest.importorskip('pyamg')
seed = 36
num_nodes = 100
X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
upper = sparse.triu(X) - sparse.diags(X.diagonal())
sym_matrix = upper + upper.T
embedding = spectral_embedding(sym_matrix,
n_components=10,
eigen_solver='amg',
random_state=0)
# Check that the learned embedding is stable w.r.t. random solver init:
for i in range(3):
new_embedding = spectral_embedding(sym_matrix,
n_components=10,
eigen_solver='amg',
random_state=i + 1)
_assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)
@pytest.mark.filterwarnings("ignore:the behavior of nmi will "
"change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
# Test using pipeline to do spectral clustering
random_state = np.random.RandomState(seed)
se_rbf = SpectralEmbedding(n_components=n_clusters,
affinity="rbf",
random_state=random_state)
se_knn = SpectralEmbedding(n_components=n_clusters,
affinity="nearest_neighbors",
n_neighbors=5,
random_state=random_state)
for se in [se_rbf, se_knn]:
km = KMeans(n_clusters=n_clusters, random_state=random_state)
km.fit(se.fit_transform(S))
assert_array_almost_equal(
normalized_mutual_info_score(
km.labels_,
true_labels), 1.0, 2)
def test_spectral_embedding_unknown_eigensolver(seed=36):
# Test that SpectralEmbedding fails with an unknown eigen_solver
se = SpectralEmbedding(n_components=1, affinity="precomputed",
random_state=np.random.RandomState(seed),
eigen_solver="<unknown>")
with pytest.raises(ValueError):
se.fit(S)
def test_spectral_embedding_unknown_affinity(seed=36):
# Test that SpectralEmbedding fails with an unknown affinity type
se = SpectralEmbedding(n_components=1, affinity="<unknown>",
random_state=np.random.RandomState(seed))
with pytest.raises(ValueError):
se.fit(S)
def test_connectivity(seed=36):
# Test that graph connectivity test works as expected
graph = np.array([[1, 0, 0, 0, 0],
[0, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1]])
assert not _graph_is_connected(graph)
assert not _graph_is_connected(sparse.csr_matrix(graph))
assert not _graph_is_connected(sparse.csc_matrix(graph))
graph = np.array([[1, 1, 0, 0, 0],
[1, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1]])
assert _graph_is_connected(graph)
assert _graph_is_connected(sparse.csr_matrix(graph))
assert _graph_is_connected(sparse.csc_matrix(graph))
def test_spectral_embedding_deterministic():
# Test that Spectral Embedding is deterministic
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
embedding_1 = spectral_embedding(sims)
embedding_2 = spectral_embedding(sims)
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_unnormalized():
# Test that spectral_embedding also handles the unnormalized Laplacian
# correctly
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 8
embedding_1 = spectral_embedding(sims,
norm_laplacian=False,
n_components=n_components,
drop_first=False)
# Verify using manual computation with dense eigh
laplacian, dd = csgraph.laplacian(sims, normed=False,
return_diag=True)
_, diffusion_map = eigh(laplacian)
embedding_2 = diffusion_map.T[:n_components]
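# spectral_embedding applies the same deterministic sign convention
# internally, so flip the signs here before comparing the two embeddings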
embedding_2 = _deterministic_vector_sign_flip(embedding_2).T
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_first_eigen_vector():
# Test that the first eigenvector of spectral_embedding
# is constant and that the second is not (for a connected graph)
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 2
for seed in range(10):
embedding = spectral_embedding(sims,
norm_laplacian=False,
n_components=n_components,
drop_first=False,
random_state=seed)
assert np.std(embedding[:, 0]) == pytest.approx(0)
assert np.std(embedding[:, 1]) > 1e-3

View file

@@ -0,0 +1,893 @@
import sys
from io import StringIO
import numpy as np
from numpy.testing import assert_allclose
import scipy.sparse as sp
import pytest
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph
from sklearn.exceptions import EfficiencyWarning
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils import check_random_state
from sklearn.manifold._t_sne import _joint_probabilities
from sklearn.manifold._t_sne import _joint_probabilities_nn
from sklearn.manifold._t_sne import _kl_divergence
from sklearn.manifold._t_sne import _kl_divergence_bh
from sklearn.manifold._t_sne import _gradient_descent
from sklearn.manifold._t_sne import trustworthiness
from sklearn.manifold import TSNE
# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne'
from sklearn.manifold import _barnes_hut_tsne # type: ignore
from sklearn.manifold._utils import _binary_search_perplexity
from sklearn.datasets import make_blobs
from scipy.optimize import check_grad
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import cosine_distances
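# 10 x 10 regular grid of 2D points (100 samples) shared by the
# uniform-grid t-SNE tests below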
x = np.linspace(0, 1, 10)
xx, yy = np.meshgrid(x, x)
X_2d_grid = np.hstack([
xx.ravel().reshape(-1, 1),
yy.ravel().reshape(-1, 1),
])
def test_gradient_descent_stops():
# Test stopping conditions of gradient descent.
class ObjectiveSmallGradient:
def __init__(self):
self.it = -1
def __call__(self, _, compute_error=True):
self.it += 1
return (10 - self.it) / 10.0, np.array([1e-5])
def flat_function(_, compute_error=True):
return 0.0, np.ones(1)
# Gradient norm
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
_, error, it = _gradient_descent(
ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=100,
n_iter_without_progress=100, momentum=0.0, learning_rate=0.0,
min_gain=0.0, min_grad_norm=1e-5, verbose=2)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
assert error == 1.0
assert it == 0
assert("gradient norm" in out)
# Maximum number of iterations without improvement
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
_, error, it = _gradient_descent(
flat_function, np.zeros(1), 0, n_iter=100,
n_iter_without_progress=10, momentum=0.0, learning_rate=0.0,
min_gain=0.0, min_grad_norm=0.0, verbose=2)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
assert error == 0.0
assert it == 11
assert("did not make any progress" in out)
# Maximum number of iterations
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
_, error, it = _gradient_descent(
ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=11,
n_iter_without_progress=100, momentum=0.0, learning_rate=0.0,
min_gain=0.0, min_grad_norm=0.0, verbose=2)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
assert error == 0.0
assert it == 10
assert("Iteration 10" in out)
def test_binary_search():
# Test if the binary search finds Gaussians with desired perplexity.
random_state = check_random_state(0)
data = random_state.randn(50, 5)
distances = pairwise_distances(data).astype(np.float32)
desired_perplexity = 25.0
P = _binary_search_perplexity(distances, desired_perplexity, verbose=0)
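# clamp the probabilities away from zero so that the log in the
# perplexity computation below stays finite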
P = np.maximum(P, np.finfo(np.double).eps)
mean_perplexity = np.mean([np.exp(-np.sum(P[i] * np.log(P[i])))
for i in range(P.shape[0])])
assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3)
def test_binary_search_neighbors():
# Binary perplexity search approximation.
# Should be approximately equal to the slow method when we use
# all points as neighbors.
n_samples = 200
desired_perplexity = 25.0
random_state = check_random_state(0)
data = random_state.randn(n_samples, 2).astype(np.float32, copy=False)
distances = pairwise_distances(data)
P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0)
# Test that when we use all the neighbors the results are identical
n_neighbors = n_samples - 1
nn = NearestNeighbors().fit(data)
distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
mode='distance')
distances_nn = distance_graph.data.astype(np.float32, copy=False)
distances_nn = distances_nn.reshape(n_samples, n_neighbors)
P2 = _binary_search_perplexity(distances_nn, desired_perplexity, verbose=0)
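# restrict the dense-matrix probabilities P1 to the neighbor indices of the
# sparse graph so they can be compared entry-wise with P2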
indptr = distance_graph.indptr
P1_nn = np.array([P1[k, distance_graph.indices[indptr[k]:indptr[k + 1]]]
for k in range(n_samples)])
assert_array_almost_equal(P1_nn, P2, decimal=4)
# Test that the highest P_ij are the same when fewer neighbors are used
for k in np.linspace(150, n_samples - 1, 5):
k = int(k)
topn = k * 10 # check the top 10 * k entries out of k * k entries
distance_graph = nn.kneighbors_graph(n_neighbors=k, mode='distance')
distances_nn = distance_graph.data.astype(np.float32, copy=False)
distances_nn = distances_nn.reshape(n_samples, k)
P2k = _binary_search_perplexity(distances_nn, desired_perplexity,
verbose=0)
assert_array_almost_equal(P1_nn, P2, decimal=2)
idx = np.argsort(P1.ravel())[::-1]
P1top = P1.ravel()[idx][:topn]
idx = np.argsort(P2k.ravel())[::-1]
P2top = P2k.ravel()[idx][:topn]
assert_array_almost_equal(P1top, P2top, decimal=2)
def test_binary_perplexity_stability():
# Binary perplexity search should be stable.
# The binary_search_perplexity had a bug wherein the P array
# was uninitialized, leading to sporadically failing tests.
n_neighbors = 10
n_samples = 100
random_state = check_random_state(0)
data = random_state.randn(n_samples, 5)
nn = NearestNeighbors().fit(data)
distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
mode='distance')
distances = distance_graph.data.astype(np.float32, copy=False)
distances = distances.reshape(n_samples, n_neighbors)
last_P = None
desired_perplexity = 3
for _ in range(100):
P = _binary_search_perplexity(distances.copy(), desired_perplexity,
verbose=0)
P1 = _joint_probabilities_nn(distance_graph, desired_perplexity,
verbose=0)
# Convert the sparse matrix to a dense one for testing
P1 = P1.toarray()
if last_P is None:
last_P = P
last_P1 = P1
else:
assert_array_almost_equal(P, last_P, decimal=4)
assert_array_almost_equal(P1, last_P1, decimal=4)
def test_gradient():
# Test gradient of Kullback-Leibler divergence.
random_state = check_random_state(0)
n_samples = 50
n_features = 2
n_components = 2
alpha = 1.0
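# build a random symmetric, non-negative matrix with a zero diagonal to
# play the role of a distance matrix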
distances = random_state.randn(n_samples, n_features).astype(np.float32)
distances = np.abs(distances.dot(distances.T))
np.fill_diagonal(distances, 0.0)
X_embedded = random_state.randn(n_samples, n_components).astype(np.float32)
P = _joint_probabilities(distances, desired_perplexity=25.0,
verbose=0)
def fun(params):
return _kl_divergence(params, P, alpha, n_samples, n_components)[0]
def grad(params):
return _kl_divergence(params, P, alpha, n_samples, n_components)[1]
assert_almost_equal(check_grad(fun, grad, X_embedded.ravel()), 0.0,
decimal=5)
def test_trustworthiness():
# Test trustworthiness score.
random_state = check_random_state(0)
# Affine transformation
X = random_state.randn(100, 2)
assert trustworthiness(X, 5.0 + X / 10.0) == 1.0
# Randomly shuffled
X = np.arange(100).reshape(-1, 1)
X_embedded = X.copy()
random_state.shuffle(X_embedded)
assert trustworthiness(X, X_embedded) < 0.6
# Completely different
X = np.arange(5).reshape(-1, 1)
X_embedded = np.array([[0], [2], [4], [1], [3]])
assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2)
@pytest.mark.parametrize("method", ['exact', 'barnes_hut'])
@pytest.mark.parametrize("init", ('random', 'pca'))
def test_preserve_trustworthiness_approximately(method, init):
# Nearest neighbors should be preserved approximately.
random_state = check_random_state(0)
n_components = 2
X = random_state.randn(50, n_components).astype(np.float32)
tsne = TSNE(n_components=n_components, init=init, random_state=0,
method=method, n_iter=700)
X_embedded = tsne.fit_transform(X)
t = trustworthiness(X, X_embedded, n_neighbors=1)
assert t > 0.85
def test_optimization_minimizes_kl_divergence():
"""t-SNE should give a lower KL divergence with more iterations."""
random_state = check_random_state(0)
X, _ = make_blobs(n_features=3, random_state=random_state)
kl_divergences = []
for n_iter in [250, 300, 350]:
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
n_iter=n_iter, random_state=0)
tsne.fit_transform(X)
kl_divergences.append(tsne.kl_divergence_)
assert kl_divergences[1] <= kl_divergences[0]
assert kl_divergences[2] <= kl_divergences[1]
@pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
def test_fit_csr_matrix(method):
# X can be a sparse matrix.
rng = check_random_state(0)
X = rng.randn(50, 2)
X[(rng.randint(0, 50, 25), rng.randint(0, 2, 25))] = 0.0
X_csr = sp.csr_matrix(X)
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
random_state=0, method=method, n_iter=750)
X_embedded = tsne.fit_transform(X_csr)
assert_allclose(trustworthiness(X_csr, X_embedded, n_neighbors=1),
1.0, rtol=1.1e-1)
def test_preserve_trustworthiness_approximately_with_precomputed_distances():
# Nearest neighbors should be preserved approximately.
random_state = check_random_state(0)
for i in range(3):
X = random_state.randn(80, 2)
D = squareform(pdist(X), "sqeuclidean")
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
early_exaggeration=2.0, metric="precomputed",
random_state=i, verbose=0, n_iter=500)
X_embedded = tsne.fit_transform(D)
t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed")
assert t > .95
def test_trustworthiness_not_euclidean_metric():
# Test trustworthiness with a metric different from 'euclidean' and
# 'precomputed'
random_state = check_random_state(0)
X = random_state.randn(100, 2)
assert (trustworthiness(X, X, metric='cosine') ==
trustworthiness(pairwise_distances(X, metric='cosine'), X,
metric='precomputed'))
def test_early_exaggeration_too_small():
# Early exaggeration factor must be >= 1.
tsne = TSNE(early_exaggeration=0.99)
with pytest.raises(ValueError, match="early_exaggeration .*"):
tsne.fit_transform(np.array([[0.0], [0.0]]))
def test_too_few_iterations():
# Number of gradient descent iterations must be at least 200.
tsne = TSNE(n_iter=199)
with pytest.raises(ValueError, match="n_iter .*"):
tsne.fit_transform(np.array([[0.0], [0.0]]))
@pytest.mark.parametrize('method, retype', [
('exact', np.asarray),
('barnes_hut', np.asarray),
('barnes_hut', sp.csr_matrix),
])
@pytest.mark.parametrize('D, message_regex', [
([[0.0], [1.0]], ".* square distance matrix"),
([[0., -1.], [1., 0.]], ".* positive.*"),
])
def test_bad_precomputed_distances(method, D, retype, message_regex):
tsne = TSNE(metric="precomputed", method=method)
with pytest.raises(ValueError, match=message_regex):
tsne.fit_transform(retype(D))
def test_exact_no_precomputed_sparse():
tsne = TSNE(metric='precomputed', method='exact')
with pytest.raises(TypeError, match='sparse'):
tsne.fit_transform(sp.csr_matrix([[0, 5], [5, 0]]))
def test_high_perplexity_precomputed_sparse_distances():
# Perplexity should be less than 50
dist = np.array([[1., 0., 0.], [0., 1., 0.], [1., 0., 0.]])
bad_dist = sp.csr_matrix(dist)
tsne = TSNE(metric="precomputed")
msg = "3 neighbors per samples are required, but some samples have only 1"
with pytest.raises(ValueError, match=msg):
tsne.fit_transform(bad_dist)
@ignore_warnings(category=EfficiencyWarning)
def test_sparse_precomputed_distance():
"""Make sure that TSNE works identically for sparse and dense matrix"""
random_state = check_random_state(0)
X = random_state.randn(100, 2)
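# with n_neighbors equal to n_samples and include_self=True, the sparse
# graph contains every pairwise distance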
D_sparse = kneighbors_graph(X, n_neighbors=100, mode='distance',
include_self=True)
D = pairwise_distances(X)
assert sp.issparse(D_sparse)
assert_almost_equal(D_sparse.A, D)
tsne = TSNE(metric="precomputed", random_state=0)
Xt_dense = tsne.fit_transform(D)
for fmt in ['csr', 'lil']:
Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt))
assert_almost_equal(Xt_dense, Xt_sparse)
def test_non_positive_computed_distances():
# Computed distance matrices must be positive.
def metric(x, y):
return -1
tsne = TSNE(metric=metric, method='exact')
X = np.array([[0.0, 0.0], [1.0, 1.0]])
with pytest.raises(ValueError, match="All distances .*metric given.*"):
tsne.fit_transform(X)
def test_init_not_available():
# 'init' must be 'pca', 'random', or numpy array.
tsne = TSNE(init="not available")
m = "'init' must be 'pca', 'random', or a numpy array"
with pytest.raises(ValueError, match=m):
tsne.fit_transform(np.array([[0.0], [1.0]]))
def test_init_ndarray():
# Initialize TSNE with ndarray and test fit
tsne = TSNE(init=np.zeros((100, 2)))
X_embedded = tsne.fit_transform(np.ones((100, 5)))
assert_array_equal(np.zeros((100, 2)), X_embedded)
def test_init_ndarray_precomputed():
# Initialize TSNE with ndarray and metric 'precomputed'
# Make sure no FutureWarning is thrown from _fit
tsne = TSNE(init=np.zeros((100, 2)), metric="precomputed")
tsne.fit(np.zeros((100, 100)))
def test_distance_not_available():
# 'metric' must be valid.
tsne = TSNE(metric="not available", method='exact')
with pytest.raises(ValueError, match="Unknown metric not available.*"):
tsne.fit_transform(np.array([[0.0], [1.0]]))
tsne = TSNE(metric="not available", method='barnes_hut')
with pytest.raises(ValueError, match="Metric 'not available' not valid.*"):
tsne.fit_transform(np.array([[0.0], [1.0]]))
def test_method_not_available():
# 'method' must be 'barnes_hut' or 'exact'
tsne = TSNE(method='not available')
with pytest.raises(ValueError, match="'method' must be 'barnes_hut' or "):
tsne.fit_transform(np.array([[0.0], [1.0]]))
def test_angle_out_of_range_checks():
# check the angle parameter range
for angle in [-1, -1e-6, 1 + 1e-6, 2]:
tsne = TSNE(angle=angle)
with pytest.raises(ValueError, match="'angle' must be between "
"0.0 - 1.0"):
tsne.fit_transform(np.array([[0.0], [1.0]]))
def test_pca_initialization_not_compatible_with_precomputed_kernel():
# The "pca" initialization cannot be used with a precomputed distance matrix.
tsne = TSNE(metric="precomputed", init="pca")
with pytest.raises(ValueError, match="The parameter init=\"pca\" cannot"
" be used with"
" metric=\"precomputed\"."):
tsne.fit_transform(np.array([[0.0], [1.0]]))
def test_n_components_range():
# barnes_hut method should only be used with n_components <= 3
tsne = TSNE(n_components=4, method="barnes_hut")
with pytest.raises(ValueError, match="'n_components' should be .*"):
tsne.fit_transform(np.array([[0.0], [1.0]]))
def test_early_exaggeration_used():
# check that the ``early_exaggeration`` parameter has an effect
random_state = check_random_state(0)
n_components = 2
methods = ['exact', 'barnes_hut']
X = random_state.randn(25, n_components).astype(np.float32)
for method in methods:
tsne = TSNE(n_components=n_components, perplexity=1,
learning_rate=100.0, init="pca", random_state=0,
method=method, early_exaggeration=1.0, n_iter=250)
X_embedded1 = tsne.fit_transform(X)
tsne = TSNE(n_components=n_components, perplexity=1,
learning_rate=100.0, init="pca", random_state=0,
method=method, early_exaggeration=10.0, n_iter=250)
X_embedded2 = tsne.fit_transform(X)
assert not np.allclose(X_embedded1, X_embedded2)
def test_n_iter_used():
# check that the ``n_iter`` parameter has an effect
random_state = check_random_state(0)
n_components = 2
methods = ['exact', 'barnes_hut']
X = random_state.randn(25, n_components).astype(np.float32)
for method in methods:
for n_iter in [251, 500]:
tsne = TSNE(n_components=n_components, perplexity=1,
learning_rate=0.5, init="random", random_state=0,
method=method, early_exaggeration=1.0, n_iter=n_iter)
tsne.fit_transform(X)
assert tsne.n_iter_ == n_iter - 1
def test_answer_gradient_two_points():
# Test the tree with only a single set of children.
#
# These tests & answers have been checked against the reference
# implementation by LvdM.
pos_input = np.array([[1.0, 0.0], [0.0, 1.0]])
pos_output = np.array([[-4.961291e-05, -1.072243e-04],
[9.259460e-05, 2.702024e-04]])
neighbors = np.array([[1],
[0]])
grad_output = np.array([[-2.37012478e-05, -6.29044398e-05],
[2.37012478e-05, 6.29044398e-05]])
_run_answer_test(pos_input, pos_output, neighbors, grad_output)
def test_answer_gradient_four_points():
# Four points tests the tree with multiple levels of children.
#
# These tests & answers have been checked against the reference
# implementation by LvdM.
pos_input = np.array([[1.0, 0.0], [0.0, 1.0],
[5.0, 2.0], [7.3, 2.2]])
pos_output = np.array([[6.080564e-05, -7.120823e-05],
[-1.718945e-04, -4.000536e-05],
[-2.271720e-04, 8.663310e-05],
[-1.032577e-04, -3.582033e-05]])
neighbors = np.array([[1, 2, 3],
[0, 2, 3],
[1, 0, 3],
[1, 2, 0]])
grad_output = np.array([[5.81128448e-05, -7.78033454e-06],
[-5.81526851e-05, 7.80976444e-06],
[4.24275173e-08, -3.69569698e-08],
[-2.58720939e-09, 7.52706374e-09]])
_run_answer_test(pos_input, pos_output, neighbors, grad_output)
def test_skip_num_points_gradient():
# Test the kwargs option skip_num_points.
#
# skip_num_points should make it such that the Barnes-Hut gradient
# is not calculated for indices below skip_num_points.
# Aside from skip_num_points=2 and the first two gradient rows
# being set to zero, these data points are the same as in
# test_answer_gradient_four_points()
pos_input = np.array([[1.0, 0.0], [0.0, 1.0],
[5.0, 2.0], [7.3, 2.2]])
pos_output = np.array([[6.080564e-05, -7.120823e-05],
[-1.718945e-04, -4.000536e-05],
[-2.271720e-04, 8.663310e-05],
[-1.032577e-04, -3.582033e-05]])
neighbors = np.array([[1, 2, 3],
[0, 2, 3],
[1, 0, 3],
[1, 2, 0]])
grad_output = np.array([[0.0, 0.0],
[0.0, 0.0],
[4.24275173e-08, -3.69569698e-08],
[-2.58720939e-09, 7.52706374e-09]])
_run_answer_test(pos_input, pos_output, neighbors, grad_output,
False, 0.1, 2)
def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
verbose=False, perplexity=0.1, skip_num_points=0):
distances = pairwise_distances(pos_input).astype(np.float32)
args = distances, perplexity, verbose
pos_output = pos_output.astype(np.float32)
neighbors = neighbors.astype(np.int64, copy=False)
pij_input = _joint_probabilities(*args)
pij_input = squareform(pij_input).astype(np.float32)
grad_bh = np.zeros(pos_output.shape, dtype=np.float32)
from scipy.sparse import csr_matrix
P = csr_matrix(pij_input)
neighbors = P.indices.astype(np.int64)
indptr = P.indptr.astype(np.int64)
# forward skip_num_points so the gradient of the leading samples stays zero
_barnes_hut_tsne.gradient(P.data, pos_output, neighbors, indptr, grad_bh,
                          0.5, 2, 1, skip_num_points=skip_num_points)
assert_array_almost_equal(grad_bh, grad_output, decimal=4)
def test_verbose():
# Verbose options write to stdout.
random_state = check_random_state(0)
tsne = TSNE(verbose=2)
X = random_state.randn(5, 2)
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
tsne.fit_transform(X)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
assert("[t-SNE]" in out)
assert("nearest neighbors..." in out)
assert("Computed conditional probabilities" in out)
assert("Mean sigma" in out)
assert("early exaggeration" in out)
def test_chebyshev_metric():
# t-SNE should allow metrics that cannot be squared (issue #3526).
random_state = check_random_state(0)
tsne = TSNE(metric="chebyshev")
X = random_state.randn(5, 2)
tsne.fit_transform(X)
def test_reduction_to_one_component():
# t-SNE should allow reduction to one component (issue #4154).
random_state = check_random_state(0)
tsne = TSNE(n_components=1)
X = random_state.randn(5, 2)
X_embedded = tsne.fit(X).embedding_
assert(np.all(np.isfinite(X_embedded)))
@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
@pytest.mark.parametrize('dt', [np.float32, np.float64])
def test_64bit(method, dt):
# Ensure 64bit arrays are handled correctly.
random_state = check_random_state(0)
X = random_state.randn(10, 2).astype(dt, copy=False)
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
random_state=0, method=method, verbose=0,
n_iter=300)
X_embedded = tsne.fit_transform(X)
effective_type = X_embedded.dtype
# the t-SNE Cython code is single precision only, so the output will
# always be single precision, irrespective of the input dtype
assert effective_type == np.float32
@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
def test_kl_divergence_not_nan(method):
# Ensure kl_divergence_ is computed at the last iteration
# even though n_iter % n_iter_check != 0, i.e. 503 % 50 != 0
random_state = check_random_state(0)
X = random_state.randn(50, 2)
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
random_state=0, method=method, verbose=0, n_iter=503)
tsne.fit_transform(X)
assert not np.isnan(tsne.kl_divergence_)
def test_barnes_hut_angle():
# When Barnes-Hut's angle=0 this corresponds to the exact method.
angle = 0.0
perplexity = 10
n_samples = 100
for n_components in [2, 3]:
n_features = 5
degrees_of_freedom = float(n_components - 1.0)
random_state = check_random_state(0)
data = random_state.randn(n_samples, n_features)
distances = pairwise_distances(data)
params = random_state.randn(n_samples, n_components)
P = _joint_probabilities(distances, perplexity, verbose=0)
kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
n_samples, n_components)
n_neighbors = n_samples - 1
distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
n_neighbors=n_neighbors, mode='distance')
P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
n_samples, n_components,
angle=angle, skip_num_points=0,
verbose=0)
P = squareform(P)
P_bh = P_bh.toarray()
assert_array_almost_equal(P_bh, P, decimal=5)
assert_almost_equal(kl_exact, kl_bh, decimal=3)
@skip_if_32bit
def test_n_iter_without_progress():
# Use a dummy negative n_iter_without_progress and check output on stdout
random_state = check_random_state(0)
X = random_state.randn(100, 10)
for method in ["barnes_hut", "exact"]:
tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
random_state=0, method=method, n_iter=351, init="random")
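# private attributes: evaluate progress at every iteration and skip the
# early exaggeration phase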
tsne._N_ITER_CHECK = 1
tsne._EXPLORATION_N_ITER = 0
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
tsne.fit_transform(X)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
# The output needs to contain the value of n_iter_without_progress
assert ("did not make any progress during the "
"last -1 episodes. Finished." in out)
def test_min_grad_norm():
# Make sure that the parameter min_grad_norm is used correctly
random_state = check_random_state(0)
X = random_state.randn(100, 2)
min_grad_norm = 0.002
tsne = TSNE(min_grad_norm=min_grad_norm, verbose=2,
random_state=0, method='exact')
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
tsne.fit_transform(X)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
lines_out = out.split('\n')
# extract the gradient norm from the verbose output
gradient_norm_values = []
for line in lines_out:
# Once the computation is finished, only an old gradient norm value
# is repeated, which we do not need to store
if 'Finished' in line:
break
start_grad_norm = line.find('gradient norm')
if start_grad_norm >= 0:
line = line[start_grad_norm:]
line = line.replace('gradient norm = ', '').split(' ')[0]
gradient_norm_values.append(float(line))
# Compute how often the gradient norm is smaller than min_grad_norm
gradient_norm_values = np.array(gradient_norm_values)
n_smaller_gradient_norms = \
len(gradient_norm_values[gradient_norm_values <= min_grad_norm])
# The gradient norm can be smaller than min_grad_norm at most once,
# because the optimization stops as soon as it drops below min_grad_norm
assert n_smaller_gradient_norms <= 1
def test_accessible_kl_divergence():
# Ensures that the accessible kl_divergence matches the computed value
random_state = check_random_state(0)
X = random_state.randn(50, 2)
tsne = TSNE(n_iter_without_progress=2, verbose=2,
random_state=0, method='exact',
n_iter=500)
old_stdout = sys.stdout
sys.stdout = StringIO()
try:
tsne.fit_transform(X)
finally:
out = sys.stdout.getvalue()
sys.stdout.close()
sys.stdout = old_stdout
# The output needs to contain the accessible kl_divergence as the error at
# the last iteration
for line in out.split('\n')[::-1]:
if 'Iteration' in line:
_, _, error = line.partition('error = ')
if error:
error, _, _ = error.partition(',')
break
assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5)
@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
def test_uniform_grid(method):
"""Make sure that TSNE can approximately recover a uniform 2D grid
Due to ties in distances between points in X_2d_grid, this test is platform
dependent for ``method='barnes_hut'`` because of numerical imprecision.
Also, t-SNE is not guaranteed to converge to the right solution because a bad
initialization can lead to convergence to a bad local minimum (the
optimization problem is non-convex). To avoid breaking the test too often,
we re-run t-SNE from the final point when the convergence is not good
enough.
"""
seeds = range(3)
n_iter = 500
for seed in seeds:
tsne = TSNE(n_components=2, init='random', random_state=seed,
perplexity=50, n_iter=n_iter, method=method)
Y = tsne.fit_transform(X_2d_grid)
try_name = "{}_{}".format(method, seed)
try:
assert_uniform_grid(Y, try_name)
except AssertionError:
# If the test fails a first time, re-run with init=Y to see if
# this was caused by a bad initialization. Note that this will
# also run an early_exaggeration step.
try_name += ":rerun"
tsne.init = Y
Y = tsne.fit_transform(X_2d_grid)
assert_uniform_grid(Y, try_name)
def assert_uniform_grid(Y, try_name=None):
# Ensure that the resulting embedding leads to approximately
# uniformly spaced points: the distance to the closest neighbors
# should be non-zero and approximately constant.
nn = NearestNeighbors(n_neighbors=1).fit(Y)
dist_to_nn = nn.kneighbors(return_distance=True)[0].ravel()
assert dist_to_nn.min() > 0.1
smallest_to_mean = dist_to_nn.min() / np.mean(dist_to_nn)
largest_to_mean = dist_to_nn.max() / np.mean(dist_to_nn)
assert smallest_to_mean > .5, try_name
assert largest_to_mean < 2, try_name
def test_bh_match_exact():
# check that the ``barnes_hut`` method matches the exact one when
# ``angle = 0`` and ``perplexity > n_samples / 3``
random_state = check_random_state(0)
n_features = 10
X = random_state.randn(30, n_features).astype(np.float32)
X_embeddeds = {}
n_iter = {}
for method in ['exact', 'barnes_hut']:
tsne = TSNE(n_components=2, method=method, learning_rate=1.0,
init="random", random_state=0, n_iter=251,
perplexity=30.0, angle=0)
# Kill the early_exaggeration
tsne._EXPLORATION_N_ITER = 0
X_embeddeds[method] = tsne.fit_transform(X)
n_iter[method] = tsne.n_iter_
assert n_iter['exact'] == n_iter['barnes_hut']
assert_allclose(X_embeddeds['exact'], X_embeddeds['barnes_hut'], rtol=1e-4)
def test_gradient_bh_multithread_match_sequential():
# check that the bh gradient with different num_threads gives the same
# results
n_features = 10
n_samples = 30
n_components = 2
degrees_of_freedom = 1
angle = 3
perplexity = 5
random_state = check_random_state(0)
data = random_state.randn(n_samples, n_features).astype(np.float32)
params = random_state.randn(n_samples, n_components)
n_neighbors = n_samples - 1
distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
n_neighbors=n_neighbors, mode='distance')
P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
kl_sequential, grad_sequential = _kl_divergence_bh(
params, P_bh, degrees_of_freedom, n_samples, n_components,
angle=angle, skip_num_points=0, verbose=0, num_threads=1)
for num_threads in [2, 4]:
kl_multithread, grad_multithread = _kl_divergence_bh(
params, P_bh, degrees_of_freedom, n_samples, n_components,
angle=angle, skip_num_points=0, verbose=0, num_threads=num_threads)
assert_allclose(kl_multithread, kl_sequential, rtol=1e-6)
assert_allclose(grad_multithread, grad_sequential, rtol=1e-6)
def test_tsne_with_different_distance_metrics():
"""Make sure that TSNE works for different distance metrics"""
random_state = check_random_state(0)
n_components_original = 3
n_components_embedding = 2
X = random_state.randn(50, n_components_original).astype(np.float32)
metrics = ['manhattan', 'cosine']
dist_funcs = [manhattan_distances, cosine_distances]
for metric, dist_func in zip(metrics, dist_funcs):
X_transformed_tsne = TSNE(
metric=metric, n_components=n_components_embedding,
random_state=0, n_iter=300).fit_transform(X)
X_transformed_tsne_precomputed = TSNE(
metric='precomputed', n_components=n_components_embedding,
random_state=0, n_iter=300).fit_transform(dist_func(X))
assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
@pytest.mark.parametrize('method', ['exact', 'barnes_hut'])
def test_tsne_n_jobs(method):
"""Make sure that the n_jobs parameter doesn't impact the output"""
random_state = check_random_state(0)
n_features = 10
X = random_state.randn(30, n_features)
X_tr_ref = TSNE(n_components=2, method=method, perplexity=30.0,
angle=0, n_jobs=1, random_state=0).fit_transform(X)
X_tr = TSNE(n_components=2, method=method, perplexity=30.0,
angle=0, n_jobs=2, random_state=0).fit_transform(X)
assert_allclose(X_tr_ref, X_tr)