Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/manifold/_mds.py
+++ b/venv/Lib/site-packages/sklearn/manifold/_mds.py
@ -0,0 +1,439 @@
+"""
+Multi-dimensional Scaling (MDS)
+"""
+
+# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
+# License: BSD
+
+import numpy as np
+from joblib import Parallel, delayed, effective_n_jobs
+
+import warnings
+
+from ..base import BaseEstimator
+from ..metrics import euclidean_distances
+from ..utils import check_random_state, check_array, check_symmetric
+from ..isotonic import IsotonicRegression
+from ..utils.validation import _deprecate_positional_args
+
+
+def _smacof_single(dissimilarities, metric=True, n_components=2, init=None,
+                   max_iter=300, verbose=0, eps=1e-3, random_state=None):
+    """Computes multidimensional scaling using SMACOF algorithm
+
+    Parameters
+    ----------
+    dissimilarities : ndarray, shape (n_samples, n_samples)
+        Pairwise dissimilarities between the points. Must be symmetric.
+
+    metric : boolean, optional, default: True
+        Compute metric or nonmetric SMACOF algorithm.
+
+    n_components : int, optional, default: 2
+        Number of dimensions in which to immerse the dissimilarities. If an
+        ``init`` array is provided, this option is overridden and the shape of
+        ``init`` is used to determine the dimensionality of the embedding
+        space.
+
+    init : ndarray, shape (n_samples, n_components), optional, default: None
+        Starting configuration of the embedding to initialize the algorithm. By
+        default, the algorithm is initialized with a randomly chosen array.
+
+    max_iter : int, optional, default: 300
+        Maximum number of iterations of the SMACOF algorithm for a single run.
+
+    verbose : int, optional, default: 0
+        Level of verbosity.
+
+    eps : float, optional, default: 1e-3
+        Relative tolerance with respect to stress at which to declare
+        convergence.
+
+    random_state : int, RandomState instance, default=None
+        Determines the random number generator used to initialize the centers.
+        Pass an int for reproducible results across multiple function calls.
+        See :term: `Glossary <random_state>`.
+
+    Returns
+    -------
+    X : ndarray, shape (n_samples, n_components)
+        Coordinates of the points in a ``n_components``-space.
+
+    stress : float
+        The final value of the stress (sum of squared distance of the
+        disparities and the distances for all constrained points).
+
+    n_iter : int
+        The number of iterations corresponding to the best stress.
+    """
+    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)
+
+    n_samples = dissimilarities.shape[0]
+    random_state = check_random_state(random_state)
+
+    sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
+    sim_flat_w = sim_flat[sim_flat != 0]
+    if init is None:
+        # Randomly choose initial configuration
+        X = random_state.rand(n_samples * n_components)
+        X = X.reshape((n_samples, n_components))
+    else:
+        # overrides the parameter p
+        n_components = init.shape[1]
+        if n_samples != init.shape[0]:
+            raise ValueError("init matrix should be of shape (%d, %d)" %
+                             (n_samples, n_components))
+        X = init
+
+    old_stress = None
+    ir = IsotonicRegression()
+    for it in range(max_iter):
+        # Compute distance and monotonic regression
+        dis = euclidean_distances(X)
+
+        if metric:
+            disparities = dissimilarities
+        else:
+            dis_flat = dis.ravel()
+            # dissimilarities with 0 are considered as missing values
+            dis_flat_w = dis_flat[sim_flat != 0]
+
+            # Compute the disparities using a monotonic regression
+            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
+            disparities = dis_flat.copy()
+            disparities[sim_flat != 0] = disparities_flat
+            disparities = disparities.reshape((n_samples, n_samples))
+            disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
+                                   (disparities ** 2).sum())
+
+        # Compute stress
+        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
+
+        # Update X using the Guttman transform
+        dis[dis == 0] = 1e-5
+        ratio = disparities / dis
+        B = - ratio
+        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
+        X = 1. / n_samples * np.dot(B, X)
+
+        dis = np.sqrt((X ** 2).sum(axis=1)).sum()
+        if verbose >= 2:
+            print('it: %d, stress %s' % (it, stress))
+        if old_stress is not None:
+            if(old_stress - stress / dis) < eps:
+                if verbose:
+                    print('breaking at iteration %d with stress %s' % (it,
+                                                                       stress))
+                break
+        old_stress = stress / dis
+
+    return X, stress, it + 1
+
+
+@_deprecate_positional_args
+def smacof(dissimilarities, *, metric=True, n_components=2, init=None,
+           n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3,
+           random_state=None, return_n_iter=False):
+    """Computes multidimensional scaling using the SMACOF algorithm.
+
+    The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
+    multidimensional scaling algorithm which minimizes an objective function
+    (the *stress*) using a majorization technique. Stress majorization, also
+    known as the Guttman Transform, guarantees a monotone convergence of
+    stress, and is more powerful than traditional techniques such as gradient
+    descent.
+
+    The SMACOF algorithm for metric MDS can summarized by the following steps:
+
+    1. Set an initial start configuration, randomly or not.
+    2. Compute the stress
+    3. Compute the Guttman Transform
+    4. Iterate 2 and 3 until convergence.
+
+    The nonmetric algorithm adds a monotonic regression step before computing
+    the stress.
+
+    Parameters
+    ----------
+    dissimilarities : ndarray, shape (n_samples, n_samples)
+        Pairwise dissimilarities between the points. Must be symmetric.
+
+    metric : boolean, optional, default: True
+        Compute metric or nonmetric SMACOF algorithm.
+
+    n_components : int, optional, default: 2
+        Number of dimensions in which to immerse the dissimilarities. If an
+        ``init`` array is provided, this option is overridden and the shape of
+        ``init`` is used to determine the dimensionality of the embedding
+        space.
+
+    init : ndarray, shape (n_samples, n_components), optional, default: None
+        Starting configuration of the embedding to initialize the algorithm. By
+        default, the algorithm is initialized with a randomly chosen array.
+
+    n_init : int, optional, default: 8
+        Number of times the SMACOF algorithm will be run with different
+        initializations. The final results will be the best output of the runs,
+        determined by the run with the smallest final stress. If ``init`` is
+        provided, this option is overridden and a single run is performed.
+
+    n_jobs : int or None, optional (default=None)
+        The number of jobs to use for the computation. If multiple
+        initializations are used (``n_init``), each run of the algorithm is
+        computed in parallel.
+
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    max_iter : int, optional, default: 300
+        Maximum number of iterations of the SMACOF algorithm for a single run.
+
+    verbose : int, optional, default: 0
+        Level of verbosity.
+
+    eps : float, optional, default: 1e-3
+        Relative tolerance with respect to stress at which to declare
+        convergence.
+
+    random_state : int, RandomState instance, default=None
+        Determines the random number generator used to initialize the centers.
+        Pass an int for reproducible results across multiple function calls.
+        See :term: `Glossary <random_state>`.
+
+    return_n_iter : bool, optional, default: False
+        Whether or not to return the number of iterations.
+
+    Returns
+    -------
+    X : ndarray, shape (n_samples, n_components)
+        Coordinates of the points in a ``n_components``-space.
+
+    stress : float
+        The final value of the stress (sum of squared distance of the
+        disparities and the distances for all constrained points).
+
+    n_iter : int
+        The number of iterations corresponding to the best stress. Returned
+        only if ``return_n_iter`` is set to ``True``.
+
+    Notes
+    -----
+    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
+    Groenen P. Springer Series in Statistics (1997)
+
+    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
+    Psychometrika, 29 (1964)
+
+    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
+    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
+    """
+
+    dissimilarities = check_array(dissimilarities)
+    random_state = check_random_state(random_state)
+
+    if hasattr(init, '__array__'):
+        init = np.asarray(init).copy()
+        if not n_init == 1:
+            warnings.warn(
+                'Explicit initial positions passed: '
+                'performing only one init of the MDS instead of %d'
+                % n_init)
+            n_init = 1
+
+    best_pos, best_stress = None, None
+
+    if effective_n_jobs(n_jobs) == 1:
+        for it in range(n_init):
+            pos, stress, n_iter_ = _smacof_single(
+                dissimilarities, metric=metric,
+                n_components=n_components, init=init,
+                max_iter=max_iter, verbose=verbose,
+                eps=eps, random_state=random_state)
+            if best_stress is None or stress < best_stress:
+                best_stress = stress
+                best_pos = pos.copy()
+                best_iter = n_iter_
+    else:
+        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
+        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
+            delayed(_smacof_single)(
+                dissimilarities, metric=metric, n_components=n_components,
+                init=init, max_iter=max_iter, verbose=verbose, eps=eps,
+                random_state=seed)
+            for seed in seeds)
+        positions, stress, n_iters = zip(*results)
+        best = np.argmin(stress)
+        best_stress = stress[best]
+        best_pos = positions[best]
+        best_iter = n_iters[best]
+
+    if return_n_iter:
+        return best_pos, best_stress, best_iter
+    else:
+        return best_pos, best_stress
+
+
+class MDS(BaseEstimator):
+    """Multidimensional scaling
+
+    Read more in the :ref:`User Guide <multidimensional_scaling>`.
+
+    Parameters
+    ----------
+    n_components : int, optional, default: 2
+        Number of dimensions in which to immerse the dissimilarities.
+
+    metric : boolean, optional, default: True
+        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
+
+    n_init : int, optional, default: 4
+        Number of times the SMACOF algorithm will be run with different
+        initializations. The final results will be the best output of the runs,
+        determined by the run with the smallest final stress.
+
+    max_iter : int, optional, default: 300
+        Maximum number of iterations of the SMACOF algorithm for a single run.
+
+    verbose : int, optional, default: 0
+        Level of verbosity.
+
+    eps : float, optional, default: 1e-3
+        Relative tolerance with respect to stress at which to declare
+        convergence.
+
+    n_jobs : int or None, optional (default=None)
+        The number of jobs to use for the computation. If multiple
+        initializations are used (``n_init``), each run of the algorithm is
+        computed in parallel.
+
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
+        for more details.
+
+    random_state : int, RandomState instance, default=None
+        Determines the random number generator used to initialize the centers.
+        Pass an int for reproducible results across multiple function calls.
+        See :term: `Glossary <random_state>`.
+
+    dissimilarity : 'euclidean' | 'precomputed', optional, default: 'euclidean'
+        Dissimilarity measure to use:
+
+        - 'euclidean':
+            Pairwise Euclidean distances between points in the dataset.
+
+        - 'precomputed':
+            Pre-computed dissimilarities are passed directly to ``fit`` and
+            ``fit_transform``.
+
+    Attributes
+    ----------
+    embedding_ : array-like, shape (n_samples, n_components)
+        Stores the position of the dataset in the embedding space.
+
+    stress_ : float
+        The final value of the stress (sum of squared distance of the
+        disparities and the distances for all constrained points).
+
+    Examples
+    --------
+    >>> from sklearn.datasets import load_digits
+    >>> from sklearn.manifold import MDS
+    >>> X, _ = load_digits(return_X_y=True)
+    >>> X.shape
+    (1797, 64)
+    >>> embedding = MDS(n_components=2)
+    >>> X_transformed = embedding.fit_transform(X[:100])
+    >>> X_transformed.shape
+    (100, 2)
+
+    References
+    ----------
+    "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
+    Groenen P. Springer Series in Statistics (1997)
+
+    "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
+    Psychometrika, 29 (1964)
+
+    "Multidimensional scaling by optimizing goodness of fit to a nonmetric
+    hypothesis" Kruskal, J. Psychometrika, 29, (1964)
+
+    """
+    @_deprecate_positional_args
+    def __init__(self, n_components=2, *, metric=True, n_init=4,
+                 max_iter=300, verbose=0, eps=1e-3, n_jobs=None,
+                 random_state=None, dissimilarity="euclidean"):
+        self.n_components = n_components
+        self.dissimilarity = dissimilarity
+        self.metric = metric
+        self.n_init = n_init
+        self.max_iter = max_iter
+        self.eps = eps
+        self.verbose = verbose
+        self.n_jobs = n_jobs
+        self.random_state = random_state
+
+    @property
+    def _pairwise(self):
+        return self.kernel == "precomputed"
+
+    def fit(self, X, y=None, init=None):
+        """
+        Computes the position of the points in the embedding space
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
+            Input data. If ``dissimilarity=='precomputed'``, the input should
+            be the dissimilarity matrix.
+
+        y : Ignored
+
+        init : ndarray, shape (n_samples,), optional, default: None
+            Starting configuration of the embedding to initialize the SMACOF
+            algorithm. By default, the algorithm is initialized with a randomly
+            chosen array.
+        """
+        self.fit_transform(X, init=init)
+        return self
+
+    def fit_transform(self, X, y=None, init=None):
+        """
+        Fit the data from X, and returns the embedded coordinates
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
+            Input data. If ``dissimilarity=='precomputed'``, the input should
+            be the dissimilarity matrix.
+
+        y : Ignored
+
+        init : ndarray, shape (n_samples,), optional, default: None
+            Starting configuration of the embedding to initialize the SMACOF
+            algorithm. By default, the algorithm is initialized with a randomly
+            chosen array.
+        """
+        X = self._validate_data(X)
+        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
+            warnings.warn("The MDS API has changed. ``fit`` now constructs an"
+                          " dissimilarity matrix from data. To use a custom "
+                          "dissimilarity matrix, set "
+                          "``dissimilarity='precomputed'``.")
+
+        if self.dissimilarity == "precomputed":
+            self.dissimilarity_matrix_ = X
+        elif self.dissimilarity == "euclidean":
+            self.dissimilarity_matrix_ = euclidean_distances(X)
+        else:
+            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
+                             " Got %s instead" % str(self.dissimilarity))
+
+        self.embedding_, self.stress_, self.n_iter_ = smacof(
+            self.dissimilarity_matrix_, metric=self.metric,
+            n_components=self.n_components, init=init, n_init=self.n_init,
+            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
+            eps=self.eps, random_state=self.random_state,
+            return_n_iter=True)
+
+        return self.embedding_