Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
439
venv/Lib/site-packages/sklearn/manifold/_mds.py
Normal file
439
venv/Lib/site-packages/sklearn/manifold/_mds.py
Normal file
|
@ -0,0 +1,439 @@
|
|||
"""
|
||||
Multi-dimensional Scaling (MDS)
|
||||
"""
|
||||
|
||||
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
|
||||
# License: BSD
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed, effective_n_jobs
|
||||
|
||||
import warnings
|
||||
|
||||
from ..base import BaseEstimator
|
||||
from ..metrics import euclidean_distances
|
||||
from ..utils import check_random_state, check_array, check_symmetric
|
||||
from ..isotonic import IsotonicRegression
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
|
||||
max_iter=300, verbose=0, eps=1e-3, random_state=None):
|
||||
"""Computes multidimensional scaling using SMACOF algorithm
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dissimilarities : ndarray, shape (n_samples, n_samples)
|
||||
Pairwise dissimilarities between the points. Must be symmetric.
|
||||
|
||||
metric : boolean, optional, default: True
|
||||
Compute metric or nonmetric SMACOF algorithm.
|
||||
|
||||
n_components : int, optional, default: 2
|
||||
Number of dimensions in which to immerse the dissimilarities. If an
|
||||
``init`` array is provided, this option is overridden and the shape of
|
||||
``init`` is used to determine the dimensionality of the embedding
|
||||
space.
|
||||
|
||||
init : ndarray, shape (n_samples, n_components), optional, default: None
|
||||
Starting configuration of the embedding to initialize the algorithm. By
|
||||
default, the algorithm is initialized with a randomly chosen array.
|
||||
|
||||
max_iter : int, optional, default: 300
|
||||
Maximum number of iterations of the SMACOF algorithm for a single run.
|
||||
|
||||
verbose : int, optional, default: 0
|
||||
Level of verbosity.
|
||||
|
||||
eps : float, optional, default: 1e-3
|
||||
Relative tolerance with respect to stress at which to declare
|
||||
convergence.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Determines the random number generator used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X : ndarray, shape (n_samples, n_components)
|
||||
Coordinates of the points in a ``n_components``-space.
|
||||
|
||||
stress : float
|
||||
The final value of the stress (sum of squared distance of the
|
||||
disparities and the distances for all constrained points).
|
||||
|
||||
n_iter : int
|
||||
The number of iterations corresponding to the best stress.
|
||||
"""
|
||||
dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
|
||||
|
||||
n_samples = dissimilarities.shape[0]
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
|
||||
sim_flat_w = sim_flat[sim_flat != 0]
|
||||
if init is None:
|
||||
# Randomly choose initial configuration
|
||||
X = random_state.rand(n_samples * n_components)
|
||||
X = X.reshape((n_samples, n_components))
|
||||
else:
|
||||
# overrides the parameter p
|
||||
n_components = init.shape[1]
|
||||
if n_samples != init.shape[0]:
|
||||
raise ValueError("init matrix should be of shape (%d, %d)" %
|
||||
(n_samples, n_components))
|
||||
X = init
|
||||
|
||||
old_stress = None
|
||||
ir = IsotonicRegression()
|
||||
for it in range(max_iter):
|
||||
# Compute distance and monotonic regression
|
||||
dis = euclidean_distances(X)
|
||||
|
||||
if metric:
|
||||
disparities = dissimilarities
|
||||
else:
|
||||
dis_flat = dis.ravel()
|
||||
# dissimilarities with 0 are considered as missing values
|
||||
dis_flat_w = dis_flat[sim_flat != 0]
|
||||
|
||||
# Compute the disparities using a monotonic regression
|
||||
disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
|
||||
disparities = dis_flat.copy()
|
||||
disparities[sim_flat != 0] = disparities_flat
|
||||
disparities = disparities.reshape((n_samples, n_samples))
|
||||
disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
|
||||
(disparities ** 2).sum())
|
||||
|
||||
# Compute stress
|
||||
stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
|
||||
|
||||
# Update X using the Guttman transform
|
||||
dis[dis == 0] = 1e-5
|
||||
ratio = disparities / dis
|
||||
B = - ratio
|
||||
B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
|
||||
X = 1. / n_samples * np.dot(B, X)
|
||||
|
||||
dis = np.sqrt((X ** 2).sum(axis=1)).sum()
|
||||
if verbose >= 2:
|
||||
print('it: %d, stress %s' % (it, stress))
|
||||
if old_stress is not None:
|
||||
if(old_stress - stress / dis) < eps:
|
||||
if verbose:
|
||||
print('breaking at iteration %d with stress %s' % (it,
|
||||
stress))
|
||||
break
|
||||
old_stress = stress / dis
|
||||
|
||||
return X, stress, it + 1
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def smacof(dissimilarities, *, metric=True, n_components=2, init=None,
|
||||
n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3,
|
||||
random_state=None, return_n_iter=False):
|
||||
"""Computes multidimensional scaling using the SMACOF algorithm.
|
||||
|
||||
The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
|
||||
multidimensional scaling algorithm which minimizes an objective function
|
||||
(the *stress*) using a majorization technique. Stress majorization, also
|
||||
known as the Guttman Transform, guarantees a monotone convergence of
|
||||
stress, and is more powerful than traditional techniques such as gradient
|
||||
descent.
|
||||
|
||||
The SMACOF algorithm for metric MDS can summarized by the following steps:
|
||||
|
||||
1. Set an initial start configuration, randomly or not.
|
||||
2. Compute the stress
|
||||
3. Compute the Guttman Transform
|
||||
4. Iterate 2 and 3 until convergence.
|
||||
|
||||
The nonmetric algorithm adds a monotonic regression step before computing
|
||||
the stress.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dissimilarities : ndarray, shape (n_samples, n_samples)
|
||||
Pairwise dissimilarities between the points. Must be symmetric.
|
||||
|
||||
metric : boolean, optional, default: True
|
||||
Compute metric or nonmetric SMACOF algorithm.
|
||||
|
||||
n_components : int, optional, default: 2
|
||||
Number of dimensions in which to immerse the dissimilarities. If an
|
||||
``init`` array is provided, this option is overridden and the shape of
|
||||
``init`` is used to determine the dimensionality of the embedding
|
||||
space.
|
||||
|
||||
init : ndarray, shape (n_samples, n_components), optional, default: None
|
||||
Starting configuration of the embedding to initialize the algorithm. By
|
||||
default, the algorithm is initialized with a randomly chosen array.
|
||||
|
||||
n_init : int, optional, default: 8
|
||||
Number of times the SMACOF algorithm will be run with different
|
||||
initializations. The final results will be the best output of the runs,
|
||||
determined by the run with the smallest final stress. If ``init`` is
|
||||
provided, this option is overridden and a single run is performed.
|
||||
|
||||
n_jobs : int or None, optional (default=None)
|
||||
The number of jobs to use for the computation. If multiple
|
||||
initializations are used (``n_init``), each run of the algorithm is
|
||||
computed in parallel.
|
||||
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
max_iter : int, optional, default: 300
|
||||
Maximum number of iterations of the SMACOF algorithm for a single run.
|
||||
|
||||
verbose : int, optional, default: 0
|
||||
Level of verbosity.
|
||||
|
||||
eps : float, optional, default: 1e-3
|
||||
Relative tolerance with respect to stress at which to declare
|
||||
convergence.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Determines the random number generator used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
return_n_iter : bool, optional, default: False
|
||||
Whether or not to return the number of iterations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X : ndarray, shape (n_samples, n_components)
|
||||
Coordinates of the points in a ``n_components``-space.
|
||||
|
||||
stress : float
|
||||
The final value of the stress (sum of squared distance of the
|
||||
disparities and the distances for all constrained points).
|
||||
|
||||
n_iter : int
|
||||
The number of iterations corresponding to the best stress. Returned
|
||||
only if ``return_n_iter`` is set to ``True``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
"Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
||||
Groenen P. Springer Series in Statistics (1997)
|
||||
|
||||
"Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
||||
Psychometrika, 29 (1964)
|
||||
|
||||
"Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
||||
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
||||
"""
|
||||
|
||||
dissimilarities = check_array(dissimilarities)
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
if hasattr(init, '__array__'):
|
||||
init = np.asarray(init).copy()
|
||||
if not n_init == 1:
|
||||
warnings.warn(
|
||||
'Explicit initial positions passed: '
|
||||
'performing only one init of the MDS instead of %d'
|
||||
% n_init)
|
||||
n_init = 1
|
||||
|
||||
best_pos, best_stress = None, None
|
||||
|
||||
if effective_n_jobs(n_jobs) == 1:
|
||||
for it in range(n_init):
|
||||
pos, stress, n_iter_ = _smacof_single(
|
||||
dissimilarities, metric=metric,
|
||||
n_components=n_components, init=init,
|
||||
max_iter=max_iter, verbose=verbose,
|
||||
eps=eps, random_state=random_state)
|
||||
if best_stress is None or stress < best_stress:
|
||||
best_stress = stress
|
||||
best_pos = pos.copy()
|
||||
best_iter = n_iter_
|
||||
else:
|
||||
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
|
||||
results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
|
||||
delayed(_smacof_single)(
|
||||
dissimilarities, metric=metric, n_components=n_components,
|
||||
init=init, max_iter=max_iter, verbose=verbose, eps=eps,
|
||||
random_state=seed)
|
||||
for seed in seeds)
|
||||
positions, stress, n_iters = zip(*results)
|
||||
best = np.argmin(stress)
|
||||
best_stress = stress[best]
|
||||
best_pos = positions[best]
|
||||
best_iter = n_iters[best]
|
||||
|
||||
if return_n_iter:
|
||||
return best_pos, best_stress, best_iter
|
||||
else:
|
||||
return best_pos, best_stress
|
||||
|
||||
|
||||
class MDS(BaseEstimator):
|
||||
"""Multidimensional scaling
|
||||
|
||||
Read more in the :ref:`User Guide <multidimensional_scaling>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int, optional, default: 2
|
||||
Number of dimensions in which to immerse the dissimilarities.
|
||||
|
||||
metric : boolean, optional, default: True
|
||||
If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
|
||||
|
||||
n_init : int, optional, default: 4
|
||||
Number of times the SMACOF algorithm will be run with different
|
||||
initializations. The final results will be the best output of the runs,
|
||||
determined by the run with the smallest final stress.
|
||||
|
||||
max_iter : int, optional, default: 300
|
||||
Maximum number of iterations of the SMACOF algorithm for a single run.
|
||||
|
||||
verbose : int, optional, default: 0
|
||||
Level of verbosity.
|
||||
|
||||
eps : float, optional, default: 1e-3
|
||||
Relative tolerance with respect to stress at which to declare
|
||||
convergence.
|
||||
|
||||
n_jobs : int or None, optional (default=None)
|
||||
The number of jobs to use for the computation. If multiple
|
||||
initializations are used (``n_init``), each run of the algorithm is
|
||||
computed in parallel.
|
||||
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Determines the random number generator used to initialize the centers.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
dissimilarity : 'euclidean' | 'precomputed', optional, default: 'euclidean'
|
||||
Dissimilarity measure to use:
|
||||
|
||||
- 'euclidean':
|
||||
Pairwise Euclidean distances between points in the dataset.
|
||||
|
||||
- 'precomputed':
|
||||
Pre-computed dissimilarities are passed directly to ``fit`` and
|
||||
``fit_transform``.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
embedding_ : array-like, shape (n_samples, n_components)
|
||||
Stores the position of the dataset in the embedding space.
|
||||
|
||||
stress_ : float
|
||||
The final value of the stress (sum of squared distance of the
|
||||
disparities and the distances for all constrained points).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.datasets import load_digits
|
||||
>>> from sklearn.manifold import MDS
|
||||
>>> X, _ = load_digits(return_X_y=True)
|
||||
>>> X.shape
|
||||
(1797, 64)
|
||||
>>> embedding = MDS(n_components=2)
|
||||
>>> X_transformed = embedding.fit_transform(X[:100])
|
||||
>>> X_transformed.shape
|
||||
(100, 2)
|
||||
|
||||
References
|
||||
----------
|
||||
"Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
|
||||
Groenen P. Springer Series in Statistics (1997)
|
||||
|
||||
"Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
|
||||
Psychometrika, 29 (1964)
|
||||
|
||||
"Multidimensional scaling by optimizing goodness of fit to a nonmetric
|
||||
hypothesis" Kruskal, J. Psychometrika, 29, (1964)
|
||||
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_components=2, *, metric=True, n_init=4,
|
||||
max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
|
||||
random_state=None, dissimilarity="euclidean"):
|
||||
self.n_components = n_components
|
||||
self.dissimilarity = dissimilarity
|
||||
self.metric = metric
|
||||
self.n_init = n_init
|
||||
self.max_iter = max_iter
|
||||
self.eps = eps
|
||||
self.verbose = verbose
|
||||
self.n_jobs = n_jobs
|
||||
self.random_state = random_state
|
||||
|
||||
@property
|
||||
def _pairwise(self):
|
||||
return self.kernel == "precomputed"
|
||||
|
||||
def fit(self, X, y=None, init=None):
|
||||
"""
|
||||
Computes the position of the points in the embedding space
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
|
||||
Input data. If ``dissimilarity=='precomputed'``, the input should
|
||||
be the dissimilarity matrix.
|
||||
|
||||
y : Ignored
|
||||
|
||||
init : ndarray, shape (n_samples,), optional, default: None
|
||||
Starting configuration of the embedding to initialize the SMACOF
|
||||
algorithm. By default, the algorithm is initialized with a randomly
|
||||
chosen array.
|
||||
"""
|
||||
self.fit_transform(X, init=init)
|
||||
return self
|
||||
|
||||
def fit_transform(self, X, y=None, init=None):
|
||||
"""
|
||||
Fit the data from X, and returns the embedded coordinates
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array, shape (n_samples, n_features) or (n_samples, n_samples)
|
||||
Input data. If ``dissimilarity=='precomputed'``, the input should
|
||||
be the dissimilarity matrix.
|
||||
|
||||
y : Ignored
|
||||
|
||||
init : ndarray, shape (n_samples,), optional, default: None
|
||||
Starting configuration of the embedding to initialize the SMACOF
|
||||
algorithm. By default, the algorithm is initialized with a randomly
|
||||
chosen array.
|
||||
"""
|
||||
X = self._validate_data(X)
|
||||
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
|
||||
warnings.warn("The MDS API has changed. ``fit`` now constructs an"
|
||||
" dissimilarity matrix from data. To use a custom "
|
||||
"dissimilarity matrix, set "
|
||||
"``dissimilarity='precomputed'``.")
|
||||
|
||||
if self.dissimilarity == "precomputed":
|
||||
self.dissimilarity_matrix_ = X
|
||||
elif self.dissimilarity == "euclidean":
|
||||
self.dissimilarity_matrix_ = euclidean_distances(X)
|
||||
else:
|
||||
raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
|
||||
" Got %s instead" % str(self.dissimilarity))
|
||||
|
||||
self.embedding_, self.stress_, self.n_iter_ = smacof(
|
||||
self.dissimilarity_matrix_, metric=self.metric,
|
||||
n_components=self.n_components, init=init, n_init=self.n_init,
|
||||
n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
|
||||
eps=self.eps, random_state=self.random_state,
|
||||
return_n_iter=True)
|
||||
|
||||
return self.embedding_
|
Loading…
Add table
Add a link
Reference in a new issue