Uploaded Test files
This commit is contained in:
		
							parent
							
								
									f584ad9d97
								
							
						
					
					
						commit
						2e81cb7d99
					
				
					 16627 changed files with 2065359 additions and 102444 deletions
				
			
		
							
								
								
									
										439
									
								
								venv/Lib/site-packages/sklearn/manifold/_mds.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										439
									
								
								venv/Lib/site-packages/sklearn/manifold/_mds.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,439 @@ | |||
| """ | ||||
| Multi-dimensional Scaling (MDS) | ||||
| """ | ||||
| 
 | ||||
| # author: Nelle Varoquaux <nelle.varoquaux@gmail.com> | ||||
| # License: BSD | ||||
| 
 | ||||
| import numpy as np | ||||
| from joblib import Parallel, delayed, effective_n_jobs | ||||
| 
 | ||||
| import warnings | ||||
| 
 | ||||
| from ..base import BaseEstimator | ||||
| from ..metrics import euclidean_distances | ||||
| from ..utils import check_random_state, check_array, check_symmetric | ||||
| from ..isotonic import IsotonicRegression | ||||
| from ..utils.validation import _deprecate_positional_args | ||||
| 
 | ||||
| 
 | ||||
def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
    """Run one SMACOF optimisation from a single starting configuration.

    Parameters
    ----------
    dissimilarities : ndarray, shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric
        (a non-symmetric input raises through ``check_symmetric``).

    metric : boolean, optional, default: True
        If true, the dissimilarities themselves are used as disparities
        (metric MDS). Otherwise the disparities are obtained at every
        iteration from an isotonic regression of the current distances on
        the dissimilarities (nonmetric MDS).

    n_components : int, optional, default: 2
        Dimensionality of the embedding. Ignored when ``init`` is given, in
        which case ``init.shape[1]`` is used instead.

    init : ndarray, shape (n_samples, n_components), optional, default: None
        Starting configuration. When ``None``, a uniform random configuration
        is drawn from ``random_state``.

    max_iter : int, optional, default: 300
        Maximum number of Guttman-transform iterations.

    verbose : int, optional, default: 0
        Any truthy value reports the iteration at which the loop stops early;
        values ``>= 2`` additionally print the stress at every iteration.

    eps : float, optional, default: 1e-3
        Threshold on the decrease of the normalised stress between two
        consecutive iterations below which the loop stops.

    random_state : int, RandomState instance, default=None
        Source of randomness for the initial configuration.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : ndarray, shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        Stress computed at the start of the last iteration: half the sum of
        squared differences between the embedding distances and the
        disparities.

    n_iter : int
        Number of iterations that were run.

    Raises
    ------
    ValueError
        If ``init`` does not have ``n_samples`` rows.
    """
    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
    n_samples = dissimilarities.shape[0]
    random_state = check_random_state(random_state)

    # Keep the strict upper triangle only; entries equal to zero are treated
    # as missing values by the nonmetric branch below.
    sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
    observed = sim_flat != 0
    sim_flat_w = sim_flat[observed]

    if init is not None:
        # The shape of ``init`` takes precedence over ``n_components``.
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init
    else:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))

    diag = np.arange(n_samples)
    previous = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Distances of the current configuration.
        dis = euclidean_distances(X)

        if not metric:
            dis_flat = dis.ravel()
            dis_flat_w = dis_flat[observed]

            # Monotonic regression of the distances on the dissimilarities.
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[observed] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            # Rescale so that the squared disparities sum to
            # n_samples * (n_samples - 1) / 2.
            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
                                   (disparities ** 2).sum())
        else:
            disparities = dissimilarities

        # Compute stress
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2

        # Guttman transform; zero distances are bumped to avoid dividing by 0.
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        B = - ratio
        B[diag, diag] += ratio.sum(axis=1)
        X = 1. / n_samples * np.dot(B, X)

        # Sum of the row norms of the new configuration, used to normalise
        # the stress in the stopping criterion.
        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))

        normalised = stress / dis
        if previous is not None and (previous - normalised) < eps:
            if verbose:
                print('breaking at iteration %d with stress %s' % (it,
                                                                   stress))
            break
        previous = normalised

    return X, stress, it + 1
| 
 | ||||
| 
 | ||||
@_deprecate_positional_args
def smacof(dissimilarities, *, metric=True, n_components=2, init=None,
           n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3,
           random_state=None, return_n_iter=False):
    """Compute multidimensional scaling using the SMACOF algorithm.

    The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
    multidimensional scaling algorithm which minimizes an objective function
    (the *stress*) using a majorization technique. Stress majorization, also
    known as the Guttman Transform, guarantees a monotone convergence of
    stress, and is more powerful than traditional techniques such as gradient
    descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress
    3. Compute the Guttman Transform
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    dissimilarities : ndarray, shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : boolean, optional, default: True
        Compute metric or nonmetric SMACOF algorithm.

    n_components : int, optional, default: 2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray, shape (n_samples, n_components), optional, default: None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.
        An array-like ``init`` is copied before use.

    n_init : int, optional, default: 8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress. If ``init`` is
        provided, this option is overridden (with a warning) and a single run
        is performed.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, optional, default: 0
        Level of verbosity.

    eps : float, optional, default: 1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used to draw the initial
        configurations.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    return_n_iter : bool, optional, default: False
        Whether or not to return the number of iterations.

    Returns
    -------
    X : ndarray, shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    n_iter : int
        The number of iterations corresponding to the best stress. Returned
        only if ``return_n_iter`` is set to ``True``.

    Notes
    -----
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
    """
    dissimilarities = check_array(dissimilarities)
    random_state = check_random_state(random_state)

    if hasattr(init, '__array__'):
        # Work on a private copy of the caller's starting configuration.
        init = np.asarray(init).copy()
        if n_init != 1:
            # A fixed start makes repeated runs pointless.
            warnings.warn(
                'Explicit initial positions passed: '
                'performing only one init of the MDS instead of %d'
                % n_init)
            n_init = 1

    # Keyword arguments shared by every single run, serial or parallel.
    run_kwargs = dict(metric=metric, n_components=n_components, init=init,
                      max_iter=max_iter, verbose=verbose, eps=eps)

    best_pos = None
    best_stress = None

    if effective_n_jobs(n_jobs) == 1:
        # Serial path: every run draws from the same RandomState instance.
        for _ in range(n_init):
            pos, stress, n_iter_ = _smacof_single(
                dissimilarities, random_state=random_state, **run_kwargs)
            if best_stress is None or stress < best_stress:
                best_pos = pos.copy()
                best_stress = stress
                best_iter = n_iter_
    else:
        # Parallel path: each run receives its own integer seed.
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        runner = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))
        results = runner(
            delayed(_smacof_single)(
                dissimilarities, random_state=seed, **run_kwargs)
            for seed in seeds)
        winner = np.argmin([run[1] for run in results])
        best_pos, best_stress, best_iter = results[winner]

    if not return_n_iter:
        return best_pos, best_stress
    return best_pos, best_stress, best_iter
| 
 | ||||
| 
 | ||||
class MDS(BaseEstimator):
    """Multidimensional scaling

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, optional, default: 2
        Number of dimensions in which to immerse the dissimilarities.

    metric : boolean, optional, default: True
        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.

    n_init : int, optional, default: 4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, optional, default: 0
        Level of verbosity.

    eps : float, optional, default: 1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used to draw the initial
        configurations.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    dissimilarity : 'euclidean' | 'precomputed', optional, default: 'euclidean'
        Dissimilarity measure to use:

        - 'euclidean':
            Pairwise Euclidean distances between points in the dataset.

        - 'precomputed':
            Pre-computed dissimilarities are passed directly to ``fit`` and
            ``fit_transform``.

    Attributes
    ----------
    embedding_ : array-like, shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    stress_ : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    dissimilarity_matrix_ : array-like, shape (n_samples, n_samples)
        Dissimilarities handed to SMACOF: the input itself when
        ``dissimilarity='precomputed'``, otherwise the pairwise Euclidean
        distances computed from the input.

    n_iter_ : int
        The number of iterations corresponding to the best stress.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import MDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = MDS(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)

    References
    ----------
    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
    Groenen P. Springer Series in Statistics (1997)

    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
    Psychometrika, 29 (1964)

    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
    hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    """
    @_deprecate_positional_args
    def __init__(self, n_components=2, *, metric=True, n_init=4,
                 max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
                 random_state=None, dissimilarity="euclidean"):
        self.n_components = n_components
        self.dissimilarity = dissimilarity
        self.metric = metric
        self.n_init = n_init
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state

    @property
    def _pairwise(self):
        # True when X is a precomputed (n_samples, n_samples) matrix.
        # The estimator has no ``kernel`` attribute; the relevant setting is
        # ``dissimilarity``, so reading ``self.kernel`` raised AttributeError.
        return self.dissimilarity == "precomputed"

    def fit(self, X, y=None, init=None):
        """
        Computes the position of the points in the embedding space

        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored

        init : ndarray, shape (n_samples, n_components), optional, \
default: None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        self : MDS
            The fitted estimator.
        """
        self.fit_transform(X, init=init)
        return self

    def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from X, and returns the embedded coordinates

        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored

        init : ndarray, shape (n_samples, n_components), optional, \
default: None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        embedding : ndarray, shape (n_samples, n_components)
            The embedded coordinates, also stored in ``embedding_``.

        Raises
        ------
        ValueError
            If ``dissimilarity`` is neither ``'precomputed'`` nor
            ``'euclidean'``.
        """
        X = self._validate_data(X)
        # A square input without ``dissimilarity='precomputed'`` is probably
        # a dissimilarity matrix passed by mistake, so warn about it.
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            warnings.warn("The MDS API has changed. ``fit`` now constructs an"
                          " dissimilarity matrix from data. To use a custom "
                          "dissimilarity matrix, set "
                          "``dissimilarity='precomputed'``.")

        if self.dissimilarity == "precomputed":
            self.dissimilarity_matrix_ = X
        elif self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)
        else:
            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                             " Got %s instead" % str(self.dissimilarity))

        self.embedding_, self.stress_, self.n_iter_ = smacof(
            self.dissimilarity_matrix_, metric=self.metric,
            n_components=self.n_components, init=init, n_init=self.n_init,
            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
            eps=self.eps, random_state=self.random_state,
            return_n_iter=True)

        return self.embedding_
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue