Fixed database typo and removed unnecessary class identifier.

2020-10-14 10:10:37 -04:00 · 2020-10-14 10:10:37 -04:00 · 45fb349a7d
commit 45fb349a7d
parent 00ad49a143
5098 changed files with 952558 additions and 85 deletions
--- a/venv/Lib/site-packages/scipy/cluster/__init__.py
+++ b/venv/Lib/site-packages/scipy/cluster/__init__.py
@ -0,0 +1,29 @@
+"""
+=========================================
+Clustering package (:mod:`scipy.cluster`)
+=========================================
+
+.. currentmodule:: scipy.cluster
+
+:mod:`scipy.cluster.vq`
+
+Clustering algorithms are useful in information theory, target detection,
+communications, compression, and other areas. The `vq` module only
+supports vector quantization and the k-means algorithms.
+
+:mod:`scipy.cluster.hierarchy`
+
+The `hierarchy` module provides functions for hierarchical and
+agglomerative clustering.  Its features include generating hierarchical
+clusters from distance matrices,
+calculating statistics on clusters, cutting linkages
+to generate flat clusters, and visualizing clusters with dendrograms.
+
+"""
+# Names exported by ``from scipy.cluster import *``.
+__all__ = ['vq', 'hierarchy']
+
+from . import vq, hierarchy
+
+# Build the package-level ``test`` object from this package's name, then
+# delete the helper class name so it does not stay in the package namespace.
+from scipy._lib._testutils import PytestTester
+test = PytestTester(__name__)
+del PytestTester
--- a/venv/Lib/site-packages/scipy/cluster/__pycache__/__init__.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/__pycache__/__init__.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/__pycache__/hierarchy.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/__pycache__/hierarchy.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/__pycache__/setup.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/__pycache__/setup.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/__pycache__/vq.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/__pycache__/vq.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/_hierarchy.cp36-win32.pyd
+++ b/venv/Lib/site-packages/scipy/cluster/_hierarchy.cp36-win32.pyd
--- a/venv/Lib/site-packages/scipy/cluster/_optimal_leaf_ordering.cp36-win32.pyd
+++ b/venv/Lib/site-packages/scipy/cluster/_optimal_leaf_ordering.cp36-win32.pyd
--- a/venv/Lib/site-packages/scipy/cluster/_vq.cp36-win32.pyd
+++ b/venv/Lib/site-packages/scipy/cluster/_vq.cp36-win32.pyd
--- a/venv/Lib/site-packages/scipy/cluster/hierarchy.py
+++ b/venv/Lib/site-packages/scipy/cluster/hierarchy.py
--- a/venv/Lib/site-packages/scipy/cluster/setup.py
+++ b/venv/Lib/site-packages/scipy/cluster/setup.py
@ -0,0 +1,27 @@
+# Macro definition list; note that no add_extension() call in this file
+# passes it on.
+DEFINE_MACROS = [("SCIPY_PY3K", None)]
+
+
+def configuration(parent_package='', top_path=None):
+    """Build the numpy.distutils configuration for the ``cluster`` package.
+
+    Registers the ``tests`` data directory and one C extension per entry in
+    the name tuple below; each extension is built from ``<name>.c``.
+
+    Parameters
+    ----------
+    parent_package : str, optional
+        Forwarded unchanged to ``Configuration``.
+    top_path : str or None, optional
+        Forwarded unchanged to ``Configuration``.
+
+    Returns
+    -------
+    config
+        The populated ``Configuration`` object.
+    """
+    from numpy.distutils.misc_util import Configuration, get_numpy_include_dirs
+
+    config = Configuration('cluster', parent_package, top_path)
+    config.add_data_dir('tests')
+
+    # Same extensions, same order as before; only the repetition is folded.
+    for ext_name in ('_vq', '_hierarchy', '_optimal_leaf_ordering'):
+        config.add_extension(ext_name,
+                             sources=[ext_name + '.c'],
+                             include_dirs=[get_numpy_include_dirs()])
+
+    return config
+
+
+if __name__ == '__main__':
+    # Script entry point: hand this package's configuration to setup().
+    from numpy.distutils.core import setup
+    cluster_config = configuration(top_path='')
+    setup(**cluster_config.todict())
--- a/venv/Lib/site-packages/scipy/cluster/tests/__init__.py
+++ b/venv/Lib/site-packages/scipy/cluster/tests/__init__.py
--- a/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/__init__.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/__init__.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/hierarchy_test_data.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/hierarchy_test_data.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/test_hierarchy.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/test_hierarchy.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/test_vq.cpython-36.pyc
+++ b/venv/Lib/site-packages/scipy/cluster/tests/__pycache__/test_vq.cpython-36.pyc
--- a/venv/Lib/site-packages/scipy/cluster/tests/hierarchy_test_data.py
+++ b/venv/Lib/site-packages/scipy/cluster/tests/hierarchy_test_data.py
@ -0,0 +1,145 @@
+from numpy import array
+
+
+Q_X = array([[5.26563660e-01, 3.14160190e-01, 8.00656370e-02],
+             [7.50205180e-01, 4.60299830e-01, 8.98696460e-01],
+             [6.65461230e-01, 6.94011420e-01, 9.10465700e-01],
+             [9.64047590e-01, 1.43082200e-03, 7.39874220e-01],
+             [1.08159060e-01, 5.53028790e-01, 6.63804780e-02],
+             [9.31359130e-01, 8.25424910e-01, 9.52315440e-01],
+             [6.78086960e-01, 3.41903970e-01, 5.61481950e-01],
+             [9.82730940e-01, 7.04605210e-01, 8.70978630e-02],
+             [6.14691610e-01, 4.69989230e-02, 6.02406450e-01],
+             [5.80161260e-01, 9.17354970e-01, 5.88163850e-01],
+             [1.38246310e+00, 1.96358160e+00, 1.94437880e+00],
+             [2.10675860e+00, 1.67148730e+00, 1.34854480e+00],
+             [1.39880070e+00, 1.66142050e+00, 1.32224550e+00],
+             [1.71410460e+00, 1.49176380e+00, 1.45432170e+00],
+             [1.54102340e+00, 1.84374950e+00, 1.64658950e+00],
+             [2.08512480e+00, 1.84524350e+00, 2.17340850e+00],
+             [1.30748740e+00, 1.53801650e+00, 2.16007740e+00],
+             [1.41447700e+00, 1.99329070e+00, 1.99107420e+00],
+             [1.61943490e+00, 1.47703280e+00, 1.89788160e+00],
+             [1.59880600e+00, 1.54988980e+00, 1.57563350e+00],
+             [3.37247380e+00, 2.69635310e+00, 3.39981700e+00],
+             [3.13705120e+00, 3.36528090e+00, 3.06089070e+00],
+             [3.29413250e+00, 3.19619500e+00, 2.90700170e+00],
+             [2.65510510e+00, 3.06785900e+00, 2.97198540e+00],
+             [3.30941040e+00, 2.59283970e+00, 2.57714110e+00],
+             [2.59557220e+00, 3.33477370e+00, 3.08793190e+00],
+             [2.58206180e+00, 3.41615670e+00, 3.26441990e+00],
+             [2.71127000e+00, 2.77032450e+00, 2.63466500e+00],
+             [2.79617850e+00, 3.25473720e+00, 3.41801560e+00],
+             [2.64741750e+00, 2.54538040e+00, 3.25354110e+00]])
+
+ytdist = array([662., 877., 255., 412., 996., 295., 468., 268., 400., 754.,
+                564., 138., 219., 869., 669.])
+
+linkage_ytdist_single = array([[2., 5., 138., 2.],
+                               [3., 4., 219., 2.],
+                               [0., 7., 255., 3.],
+                               [1., 8., 268., 4.],
+                               [6., 9., 295., 6.]])
+
+linkage_ytdist_complete = array([[2., 5., 138., 2.],
+                                 [3., 4., 219., 2.],
+                                 [1., 6., 400., 3.],
+                                 [0., 7., 412., 3.],
+                                 [8., 9., 996., 6.]])
+
+linkage_ytdist_average = array([[2., 5., 138., 2.],
+                                [3., 4., 219., 2.],
+                                [0., 7., 333.5, 3.],
+                                [1., 6., 347.5, 3.],
+                                [8., 9., 680.77777778, 6.]])
+
+linkage_ytdist_weighted = array([[2., 5., 138., 2.],
+                                 [3., 4., 219., 2.],
+                                 [0., 7., 333.5, 3.],
+                                 [1., 6., 347.5, 3.],
+                                 [8., 9., 670.125, 6.]])
+
+# the optimal leaf ordering of linkage_ytdist_single
+linkage_ytdist_single_olo = array([[5., 2., 138., 2.],
+                                   [4., 3., 219., 2.],
+                                   [7., 0., 255., 3.],
+                                   [1., 8., 268., 4.],
+                                   [6., 9., 295., 6.]])
+
+X = array([[1.43054825, -7.5693489],
+           [6.95887839, 6.82293382],
+           [2.87137846, -9.68248579],
+           [7.87974764, -6.05485803],
+           [8.24018364, -6.09495602],
+           [7.39020262, 8.54004355]])
+ 
+linkage_X_centroid = array([[3., 4., 0.36265956, 2.],
+                            [1., 5., 1.77045373, 2.],
+                            [0., 2., 2.55760419, 2.],
+                            [6., 8., 6.43614494, 4.],
+                            [7., 9., 15.17363237, 6.]])
+
+linkage_X_median = array([[3., 4., 0.36265956, 2.],
+                          [1., 5., 1.77045373, 2.],
+                          [0., 2., 2.55760419, 2.],
+                          [6., 8., 6.43614494, 4.],
+                          [7., 9., 15.17363237, 6.]])
+
+linkage_X_ward = array([[3., 4., 0.36265956, 2.],
+                        [1., 5., 1.77045373, 2.],
+                        [0., 2., 2.55760419, 2.],
+                        [6., 8., 9.10208346, 4.],
+                        [7., 9., 24.7784379, 6.]])
+
+# the optimal leaf ordering of linkage_X_ward
+linkage_X_ward_olo = array([[4., 3., 0.36265956, 2.],
+                            [5., 1., 1.77045373, 2.],
+                            [2., 0., 2.55760419, 2.],
+                            [6., 8., 9.10208346, 4.],
+                            [7., 9., 24.7784379, 6.]])
+
+inconsistent_ytdist = {
+    1: array([[138., 0., 1., 0.],
+              [219., 0., 1., 0.],
+              [255., 0., 1., 0.],
+              [268., 0., 1., 0.],
+              [295., 0., 1., 0.]]),
+    2: array([[138., 0., 1., 0.],
+              [219., 0., 1., 0.],
+              [237., 25.45584412, 2., 0.70710678],
+              [261.5, 9.19238816, 2., 0.70710678],
+              [233.66666667, 83.9424406, 3., 0.7306594]]),
+    3: array([[138., 0., 1., 0.],
+              [219., 0., 1., 0.],
+              [237., 25.45584412, 2., 0.70710678],
+              [247.33333333, 25.38372182, 3., 0.81417007],
+              [239., 69.36377537, 4., 0.80733783]]),
+    4: array([[138., 0., 1., 0.],
+              [219., 0., 1., 0.],
+              [237., 25.45584412, 2., 0.70710678],
+              [247.33333333, 25.38372182, 3., 0.81417007],
+              [235., 60.73302232, 5., 0.98793042]])}
+
+fcluster_inconsistent = {
+    0.8: array([6, 2, 2, 4, 6, 2, 3, 7, 3, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1,
+                1, 1, 1, 1, 1, 1, 1, 1, 1]),
+    1.0: array([6, 2, 2, 4, 6, 2, 3, 7, 3, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1,
+                1, 1, 1, 1, 1, 1, 1, 1, 1]),
+    2.0: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1, 1, 1, 1, 1, 1, 1, 1, 1])}
+
+fcluster_distance = {
+    0.6: array([4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 3,
+                1, 1, 1, 2, 1, 1, 1, 1, 1]),
+    1.0: array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
+                1, 1, 1, 1, 1, 1, 1, 1, 1]),
+    2.0: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1, 1, 1, 1, 1, 1, 1, 1, 1])}
+
+fcluster_maxclust = {
+    8.0: array([5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 4,
+                1, 1, 1, 3, 1, 1, 1, 1, 2]),
+    4.0: array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2,
+                1, 1, 1, 1, 1, 1, 1, 1, 1]),
+    1.0: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                1, 1, 1, 1, 1, 1, 1, 1, 1])}
--- a/venv/Lib/site-packages/scipy/cluster/tests/test_hierarchy.py
+++ b/venv/Lib/site-packages/scipy/cluster/tests/test_hierarchy.py
--- a/venv/Lib/site-packages/scipy/cluster/tests/test_vq.py
+++ b/venv/Lib/site-packages/scipy/cluster/tests/test_vq.py
@ -0,0 +1,311 @@
+
+import warnings
+import sys
+
+import numpy as np
+from numpy.testing import (assert_array_equal, assert_array_almost_equal,
+                           assert_allclose, assert_equal, assert_,
+                           suppress_warnings)
+import pytest
+from pytest import raises as assert_raises
+
+from scipy.cluster.vq import (kmeans, kmeans2, py_vq, vq, whiten,
+                              ClusterError, _krandinit)
+from scipy.cluster import _vq
+from scipy.sparse.sputils import matrix
+
+
+TESTDATA_2D = np.array([
+    -2.2, 1.17, -1.63, 1.69, -2.04, 4.38, -3.09, 0.95, -1.7, 4.79, -1.68, 0.68,
+    -2.26, 3.34, -2.29, 2.55, -1.72, -0.72, -1.99, 2.34, -2.75, 3.43, -2.45,
+    2.41, -4.26, 3.65, -1.57, 1.87, -1.96, 4.03, -3.01, 3.86, -2.53, 1.28,
+    -4.0, 3.95, -1.62, 1.25, -3.42, 3.17, -1.17, 0.12, -3.03, -0.27, -2.07,
+    -0.55, -1.17, 1.34, -2.82, 3.08, -2.44, 0.24, -1.71, 2.48, -5.23, 4.29,
+    -2.08, 3.69, -1.89, 3.62, -2.09, 0.26, -0.92, 1.07, -2.25, 0.88, -2.25,
+    2.02, -4.31, 3.86, -2.03, 3.42, -2.76, 0.3, -2.48, -0.29, -3.42, 3.21,
+    -2.3, 1.73, -2.84, 0.69, -1.81, 2.48, -5.24, 4.52, -2.8, 1.31, -1.67,
+    -2.34, -1.18, 2.17, -2.17, 2.82, -1.85, 2.25, -2.45, 1.86, -6.79, 3.94,
+    -2.33, 1.89, -1.55, 2.08, -1.36, 0.93, -2.51, 2.74, -2.39, 3.92, -3.33,
+    2.99, -2.06, -0.9, -2.83, 3.35, -2.59, 3.05, -2.36, 1.85, -1.69, 1.8,
+    -1.39, 0.66, -2.06, 0.38, -1.47, 0.44, -4.68, 3.77, -5.58, 3.44, -2.29,
+    2.24, -1.04, -0.38, -1.85, 4.23, -2.88, 0.73, -2.59, 1.39, -1.34, 1.75,
+    -1.95, 1.3, -2.45, 3.09, -1.99, 3.41, -5.55, 5.21, -1.73, 2.52, -2.17,
+    0.85, -2.06, 0.49, -2.54, 2.07, -2.03, 1.3, -3.23, 3.09, -1.55, 1.44,
+    -0.81, 1.1, -2.99, 2.92, -1.59, 2.18, -2.45, -0.73, -3.12, -1.3, -2.83,
+    0.2, -2.77, 3.24, -1.98, 1.6, -4.59, 3.39, -4.85, 3.75, -2.25, 1.71, -3.28,
+    3.38, -1.74, 0.88, -2.41, 1.92, -2.24, 1.19, -2.48, 1.06, -1.68, -0.62,
+    -1.3, 0.39, -1.78, 2.35, -3.54, 2.44, -1.32, 0.66, -2.38, 2.76, -2.35,
+    3.95, -1.86, 4.32, -2.01, -1.23, -1.79, 2.76, -2.13, -0.13, -5.25, 3.84,
+    -2.24, 1.59, -4.85, 2.96, -2.41, 0.01, -0.43, 0.13, -3.92, 2.91, -1.75,
+    -0.53, -1.69, 1.69, -1.09, 0.15, -2.11, 2.17, -1.53, 1.22, -2.1, -0.86,
+    -2.56, 2.28, -3.02, 3.33, -1.12, 3.86, -2.18, -1.19, -3.03, 0.79, -0.83,
+    0.97, -3.19, 1.45, -1.34, 1.28, -2.52, 4.22, -4.53, 3.22, -1.97, 1.75,
+    -2.36, 3.19, -0.83, 1.53, -1.59, 1.86, -2.17, 2.3, -1.63, 2.71, -2.03,
+    3.75, -2.57, -0.6, -1.47, 1.33, -1.95, 0.7, -1.65, 1.27, -1.42, 1.09, -3.0,
+    3.87, -2.51, 3.06, -2.6, 0.74, -1.08, -0.03, -2.44, 1.31, -2.65, 2.99,
+    -1.84, 1.65, -4.76, 3.75, -2.07, 3.98, -2.4, 2.67, -2.21, 1.49, -1.21,
+    1.22, -5.29, 2.38, -2.85, 2.28, -5.6, 3.78, -2.7, 0.8, -1.81, 3.5, -3.75,
+    4.17, -1.29, 2.99, -5.92, 3.43, -1.83, 1.23, -1.24, -1.04, -2.56, 2.37,
+    -3.26, 0.39, -4.63, 2.51, -4.52, 3.04, -1.7, 0.36, -1.41, 0.04, -2.1, 1.0,
+    -1.87, 3.78, -4.32, 3.59, -2.24, 1.38, -1.99, -0.22, -1.87, 1.95, -0.84,
+    2.17, -5.38, 3.56, -1.27, 2.9, -1.79, 3.31, -5.47, 3.85, -1.44, 3.69,
+    -2.02, 0.37, -1.29, 0.33, -2.34, 2.56, -1.74, -1.27, -1.97, 1.22, -2.51,
+    -0.16, -1.64, -0.96, -2.99, 1.4, -1.53, 3.31, -2.24, 0.45, -2.46, 1.71,
+    -2.88, 1.56, -1.63, 1.46, -1.41, 0.68, -1.96, 2.76, -1.61,
+    2.11]).reshape((200, 2))
+
+
+# Global data
+# Eleven 2-D observations shared by the vq/kmeans tests below; the tests use
+# the first three rows as the initial code book.
+X = np.array([[3.0, 3], [4, 3], [4, 2],
+              [9, 2], [5, 1], [6, 2], [9, 4],
+              [5, 2], [5, 4], [7, 4], [6, 5]])
+
+# Expected centroids for kmeans2 on X with iter=1 (see test_kmeans2_simple).
+CODET1 = np.array([[3.0000, 3.0000],
+                   [6.2000, 4.0000],
+                   [5.8000, 1.8000]])
+
+# Expected centroids for kmeans with iter=1 and for kmeans2 with iter=2
+# (see test_kmeans_simple and test_kmeans2_simple).
+CODET2 = np.array([[11.0/3, 8.0/3],
+                   [6.7500, 4.2500],
+                   [6.2500, 1.7500]])
+
+# Expected code (row index into the initial code book) for each row of X.
+LABEL1 = np.array([0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1])
+
+
+class TestWhiten:
+    """Checks for ``whiten`` with both ndarray and ``matrix`` inputs."""
+
+    def test_whiten(self):
+        """Whitened output matches the reference values."""
+        expected = np.array([[5.08738849, 2.97091878],
+                             [3.19909255, 0.69660580],
+                             [4.51041982, 0.02640918],
+                             [4.38567074, 0.95120889],
+                             [2.32191480, 1.63195503]])
+        rows = [[0.98744510, 0.82766775],
+                [0.62093317, 0.19406729],
+                [0.87545741, 0.00735733],
+                [0.85124403, 0.26499712],
+                [0.45067590, 0.45464607]]
+        for make in (np.array, matrix):
+            assert_allclose(whiten(make(rows)), expected, rtol=1e-5)
+
+    def test_whiten_zero_std(self):
+        """Constant columns are left unchanged and warn exactly once."""
+        expected = np.array([[0., 1.0, 2.86666544],
+                             [0., 1.0, 1.32460034],
+                             [0., 1.0, 3.74382172]])
+        rows = [[0., 1., 0.74109533],
+                [0., 1., 0.34243798],
+                [0., 1., 0.96785929]]
+        for make in (np.array, matrix):
+            sample = make(rows)
+            with warnings.catch_warnings(record=True) as caught:
+                warnings.simplefilter('always')
+                assert_allclose(whiten(sample), expected, rtol=1e-5)
+                assert_equal(len(caught), 1)
+                assert_(issubclass(caught[-1].category, RuntimeWarning))
+
+    def test_whiten_not_finite(self):
+        """NaN and +/-inf entries are rejected with ValueError."""
+        for make in (np.array, matrix):
+            for bad_value in (np.nan, np.inf, -np.inf):
+                sample = make([[0.98744510, bad_value],
+                               [0.62093317, 0.19406729],
+                               [0.87545741, 0.00735733],
+                               [0.85124403, 0.26499712],
+                               [0.45067590, 0.45464607]])
+                with assert_raises(ValueError):
+                    whiten(sample)
+
+
+class TestVq(object):
+    """Tests for the C routine ``_vq.vq``, the Python fallback ``py_vq`` and
+    the public ``vq`` wrapper."""
+
+    def test_py_vq(self):
+        # The first three rows of X serve as the code book.
+        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
+        for tp in np.array, matrix:
+            label1 = py_vq(tp(X), tp(initc))[0]
+            assert_array_equal(label1, LABEL1)
+
+    def test_vq(self):
+        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
+        for tp in np.array, matrix:
+            label1, dist = _vq.vq(tp(X), tp(initc))
+            assert_array_equal(label1, LABEL1)
+            # The public wrapper's result used to be computed and discarded;
+            # check that it agrees with the C routine on the same input.
+            tlabel1, tdist = vq(tp(X), tp(initc))
+            assert_array_equal(tlabel1, LABEL1)
+            assert_allclose(tdist, dist)
+
+    def test_vq_1d(self):
+        # Test special rank 1 vq algo, python implementation.
+        data = X[:, 0]
+        initc = data[:3]
+        a, b = _vq.vq(data, initc)
+        ta, tb = py_vq(data[:, np.newaxis], initc[:, np.newaxis])
+        assert_array_equal(a, ta)
+        assert_array_equal(b, tb)
+
+    def test__vq_sametype(self):
+        # Mixed float64 / float32 arguments must be rejected by the C routine.
+        a = np.array([1.0, 2.0], dtype=np.float64)
+        b = a.astype(np.float32)
+        assert_raises(TypeError, _vq.vq, a, b)
+
+    def test__vq_invalid_type(self):
+        # Integer input must be rejected by the C routine.
+        a = np.array([1, 2], dtype=int)
+        assert_raises(TypeError, _vq.vq, a, a)
+
+    def test_vq_large_nfeat(self):
+        # Compare the C and Python implementations on 20 features, first in
+        # float64 and then in float32.
+        X = np.random.rand(20, 20)
+        code_book = np.random.rand(3, 20)
+
+        codes0, dis0 = _vq.vq(X, code_book)
+        codes1, dis1 = py_vq(X, code_book)
+        assert_allclose(dis0, dis1, rtol=1e-5)
+        assert_array_equal(codes0, codes1)
+
+        X = X.astype(np.float32)
+        code_book = code_book.astype(np.float32)
+
+        codes0, dis0 = _vq.vq(X, code_book)
+        codes1, dis1 = py_vq(X, code_book)
+        assert_allclose(dis0, dis1, rtol=1e-5)
+        assert_array_equal(codes0, codes1)
+
+    def test_vq_large_features(self):
+        # Compare the C and Python implementations on large-magnitude values.
+        X = np.random.rand(10, 5) * 1000000
+        code_book = np.random.rand(2, 5) * 1000000
+
+        codes0, dis0 = _vq.vq(X, code_book)
+        codes1, dis1 = py_vq(X, code_book)
+        assert_allclose(dis0, dis1, rtol=1e-5)
+        assert_array_equal(codes0, codes1)
+
+
+class TestKMean(object):
+    # Tests for kmeans, kmeans2 and the _krandinit initializer.  Several of
+    # them seed NumPy's global RNG immediately before the call under test, so
+    # statement order matters.
+
+    def test_large_features(self):
+        # Generate a data set with large values, and run kmeans on it to
+        # check that the call completes (regression for 1077).
+        d = 300
+        n = 100
+
+        m1 = np.random.randn(d)
+        m2 = np.random.randn(d)
+        x = 10000 * np.random.randn(n, d) - 20000 * m1
+        y = 10000 * np.random.randn(n, d) + 20000 * m2
+
+        # Stack the two samples into one (2n, d) array.
+        data = np.empty((x.shape[0] + y.shape[0], d), np.double)
+        data[:x.shape[0]] = x
+        data[x.shape[0]:] = y
+
+        kmeans(data, 2)
+
+    def test_kmeans_simple(self):
+        np.random.seed(54321)
+        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
+        for tp in np.array, matrix:
+            code1 = kmeans(tp(X), tp(initc), iter=1)[0]
+            assert_array_almost_equal(code1, CODET2)
+
+    def test_kmeans_lost_cluster(self):
+        # This will cause kmeans to have a cluster with no points.
+        data = TESTDATA_2D
+        initk = np.array([[-1.8127404, -0.67128041],
+                         [2.04621601, 0.07401111],
+                         [-2.31149087, -0.05160469]])
+
+        kmeans(data, initk)
+        # With missing='warn' the empty-cluster UserWarning is expected and
+        # filtered out here.
+        with suppress_warnings() as sup:
+            sup.filter(UserWarning,
+                       "One of the clusters is empty. Re-run kmeans with a "
+                       "different initialization")
+            kmeans2(data, initk, missing='warn')
+
+        # With missing='raise' the same situation must raise ClusterError.
+        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
+
+    def test_kmeans2_simple(self):
+        np.random.seed(12345678)
+        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
+        for tp in np.array, matrix:
+            code1 = kmeans2(tp(X), tp(initc), iter=1)[0]
+            code2 = kmeans2(tp(X), tp(initc), iter=2)[0]
+
+            assert_array_almost_equal(code1, CODET1)
+            assert_array_almost_equal(code2, CODET2)
+
+    def test_kmeans2_rank1(self):
+        # Smoke test on 1-D data; the results are not asserted.
+        data = TESTDATA_2D
+        data1 = data[:, 0]
+
+        initc = data1[:3]
+        code = initc.copy()
+        kmeans2(data1, code, iter=1)[0]
+        kmeans2(data1, code, iter=2)[0]
+
+    def test_kmeans2_rank1_2(self):
+        # Smoke test on 1-D data with an integer k; the result is not asserted.
+        data = TESTDATA_2D
+        data1 = data[:, 0]
+        kmeans2(data1, 2, iter=1)
+
+    def test_kmeans2_high_dim(self):
+        # test kmeans2 when the number of dimensions exceeds the number
+        # of input points
+        data = TESTDATA_2D
+        data = data.reshape((20, 20))[:10]
+        kmeans2(data, 2)
+
+    def test_kmeans2_init(self):
+        # Smoke-test each minit option on 2-D and single-column data.
+        np.random.seed(12345)
+        data = TESTDATA_2D
+
+        kmeans2(data, 3, minit='points')
+        kmeans2(data[:, :1], 3, minit='points')  # special case (1-D)
+
+        kmeans2(data, 3, minit='++')
+        kmeans2(data[:, :1], 3, minit='++')  # special case (1-D)
+
+        # minit='random' can give warnings, filter those
+        with suppress_warnings() as sup:
+            sup.filter(message="One of the clusters is empty. Re-run.")
+            kmeans2(data, 3, minit='random')
+            kmeans2(data[:, :1], 3, minit='random')  # special case (1-D)
+
+    @pytest.mark.skipif(sys.platform == 'win32',
+                        reason='Fails with MemoryError in Wine.')
+    def test_krandinit(self):
+        # Draw k = 1e6 initial points and check that their covariance matches
+        # the covariance of the data to within atol=1e-2.
+        data = TESTDATA_2D
+        datas = [data.reshape((200, 2)), data.reshape((20, 20))[:10]]
+        k = int(1e6)
+        for data in datas:
+            np.random.seed(1234)
+            init = _krandinit(data, k)
+            orig_cov = np.cov(data, rowvar=0)
+            init_cov = np.cov(init, rowvar=0)
+            assert_allclose(orig_cov, init_cov, atol=1e-2)
+
+    def test_kmeans2_empty(self):
+        # Regression test for gh-1032.
+        assert_raises(ValueError, kmeans2, [], 2)
+
+    def test_kmeans_0k(self):
+        # Regression test for gh-1073: fail when k arg is 0.
+        assert_raises(ValueError, kmeans, X, 0)
+        assert_raises(ValueError, kmeans2, X, 0)
+        assert_raises(ValueError, kmeans2, X, np.array([]))
+
+    def test_kmeans_large_thres(self):
+        # Regression test for gh-1774
+        x = np.array([1, 2, 3, 4, 10], dtype=float)
+        res = kmeans(x, 1, thresh=1e16)
+        assert_allclose(res[0], np.array([4.]))
+        assert_allclose(res[1], 2.3999999999999999)
+
+    def test_kmeans2_kpp_low_dim(self):
+        # Regression test for gh-11462
+        prev_res = np.array([[-1.95266667, 0.898],
+                             [-3.153375, 3.3945]])
+        np.random.seed(42)
+        res, _ = kmeans2(TESTDATA_2D, 2, minit='++')
+        assert_allclose(res, prev_res)
+
+    def test_kmeans2_kpp_high_dim(self):
+        # Regression test for gh-11462
+        n_dim = 100
+        size = 10
+        centers = np.vstack([5 * np.ones(n_dim),
+                             -5 * np.ones(n_dim)])
+        np.random.seed(42)
+        # Two groups of `size` points, one drawn around each center.
+        data = np.vstack([
+            np.random.multivariate_normal(centers[0], np.eye(n_dim), size=size),
+            np.random.multivariate_normal(centers[1], np.eye(n_dim), size=size)
+        ])
+        res, _ = kmeans2(data, 2, minit='++')
+        assert_array_almost_equal(res, centers, decimal=0)
--- a/venv/Lib/site-packages/scipy/cluster/vq.py
+++ b/venv/Lib/site-packages/scipy/cluster/vq.py
@ -0,0 +1,756 @@
+"""
+K-means clustering and vector quantization (:mod:`scipy.cluster.vq`)
+====================================================================
+
+Provides routines for k-means clustering, generating code books
+from k-means models and quantizing vectors by comparing them with
+centroids in a code book.
+
+.. autosummary::
+   :toctree: generated/
+
+   whiten -- Normalize a group of observations so each feature has unit variance
+   vq -- Calculate code book membership of a set of observation vectors
+   kmeans -- Perform k-means on a set of observation vectors forming k clusters
+   kmeans2 -- A different implementation of k-means with more methods
+           -- for initializing centroids
+
+Background information
+----------------------
+The k-means algorithm takes as input the number of clusters to
+generate, k, and a set of observation vectors to cluster. It
+returns a set of centroids, one for each of the k clusters. An
+observation vector is classified with the cluster number or
+centroid index of the centroid closest to it.
+
+A vector v belongs to cluster i if it is closer to centroid i than
+any other centroid. If v belongs to i, we say centroid i is the
+dominating centroid of v. The k-means algorithm tries to
+minimize distortion, which is defined as the sum of the squared distances
+between each observation vector and its dominating centroid.
+The minimization is achieved by iteratively reclassifying
+the observations into clusters and recalculating the centroids until
+a configuration is reached in which the centroids are stable. One can
+also define a maximum number of iterations.
+
+Since vector quantization is a natural application for k-means,
+information theory terminology is often used. The centroid index
+or cluster index is also referred to as a "code" and the table
+mapping codes to centroids, and vice versa, is often referred to as a
+"code book". The result of k-means, a set of centroids, can be
+used to quantize vectors. Quantization aims to find an encoding of
+vectors that reduces the expected distortion.
+
+All routines expect obs to be an M by N array, where the rows are
+the observation vectors. The codebook is a k by N array, where the
+ith row is the centroid of code word i. The observation vectors
+and centroids have the same feature dimension.
+
+As an example, suppose we wish to compress a 24-bit color image
+(each pixel is represented by one byte for red, one for blue, and
+one for green) before sending it over the web. By using a smaller
+8-bit encoding, we can reduce the amount of data by two
+thirds. Ideally, the colors for each of the 256 possible 8-bit
+encoding values should be chosen to minimize distortion of the
+color. Running k-means with k=256 generates a code book of 256
+codes, which fills up all possible 8-bit sequences. Instead of
+sending a 3-byte value for each pixel, the 8-bit centroid index
+(or code word) of the dominating centroid is transmitted. The code
+book is also sent over the wire so each 8-bit code can be
+translated back to a 24-bit pixel value representation. If the
+image of interest was of an ocean, we would expect many 24-bit
+blues to be represented by 8-bit codes. If it was an image of a
+human face, more flesh-tone colors would be represented in the
+code book.
+
+"""
+import warnings
+import numpy as np
+from collections import deque
+from scipy._lib._util import _asarray_validated
+from scipy.spatial.distance import cdist
+
+from . import _vq
+
+__docformat__ = 'restructuredtext'
+
+__all__ = ['whiten', 'vq', 'kmeans', 'kmeans2']
+
+
+class ClusterError(Exception):
+    """Exception type defined by this module for clustering errors."""
+
+
+def whiten(obs, check_finite=True):
+    """
+    Rescale every feature of an observation set to unit variance.
+
+    Each column of `obs` is divided by its standard deviation taken over
+    all rows. Doing this before k-means is beneficial.
+
+    Parameters
+    ----------
+    obs : ndarray
+        One observation per row, one feature per column.
+
+        >>> #         f0    f1    f2
+        >>> obs = [[  1.,   1.,   1.],  #o0
+        ...        [  2.,   2.,   2.],  #o1
+        ...        [  3.,   3.,   3.],  #o2
+        ...        [  4.,   4.,   4.]]  #o3
+
+    check_finite : bool, optional
+        Whether to check that the input contains only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the input does contain infinities or
+        NaNs.
+        Default: True
+
+    Returns
+    -------
+    result : ndarray
+        The values of `obs` divided, column by column, by that column's
+        standard deviation. Columns whose standard deviation is zero are
+        divided by 1.0 (left unchanged) and a ``RuntimeWarning`` is issued.
+
+    Examples
+    --------
+    >>> from scipy.cluster.vq import whiten
+    >>> features  = np.array([[1.9, 2.3, 1.7],
+    ...                       [1.5, 2.5, 2.2],
+    ...                       [0.8, 0.6, 1.7,]])
+    >>> whiten(features)
+    array([[ 4.17944278,  2.69811351,  7.21248917],
+           [ 3.29956009,  2.93273208,  9.33380951],
+           [ 1.75976538,  0.7038557 ,  7.21248917]])
+
+    """
+    observations = _asarray_validated(obs, check_finite=check_finite)
+    column_scale = observations.std(axis=0)
+
+    # Constant columns would divide by zero; scale them by 1.0 instead.
+    constant_columns = (column_scale == 0)
+    if constant_columns.any():
+        column_scale[constant_columns] = 1.0
+        message = ("Some columns have standard deviation zero. "
+                   "The values of these columns will not change.")
+        warnings.warn(message, RuntimeWarning)
+
+    return observations / column_scale
+
+
+def vq(obs, code_book, check_finite=True):
+    """
+    Quantize observations against a code book.
+
+    Every row of the 'M' by 'N' `obs` array is compared with the centroids
+    in `code_book` and receives the code of the closest one.
+
+    The features in `obs` should have unit variance, which can be achieved
+    by passing them through the whiten function. The code book can come
+    from the k-means algorithm or from a different encoding algorithm.
+
+    Parameters
+    ----------
+    obs : ndarray
+        'M' x 'N' array with one observation per row and one "feature" per
+        column. The features must be whitened first using the whiten
+        function or something equivalent.
+    code_book : ndarray
+        Usually generated using the k-means algorithm. Each row holds a
+        different code, and the columns are the features of the code.
+
+         >>> #              f0    f1    f2   f3
+         >>> code_book = [
+         ...             [  1.,   2.,   3.,   4.],  #c0
+         ...             [  1.,   2.,   3.,   4.],  #c1
+         ...             [  1.,   2.,   3.,   4.]]  #c2
+
+    check_finite : bool, optional
+        Whether to check that the inputs contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or
+        NaNs.
+        Default: True
+
+    Returns
+    -------
+    code : ndarray
+        A length M array holding the code book index for each observation.
+    dist : ndarray
+        The distortion (distance) between the observation and its nearest
+        code.
+
+    Examples
+    --------
+    >>> from numpy import array
+    >>> from scipy.cluster.vq import vq
+    >>> code_book = array([[1.,1.,1.],
+    ...                    [2.,2.,2.]])
+    >>> features  = array([[  1.9,2.3,1.7],
+    ...                    [  1.5,2.5,2.2],
+    ...                    [  0.8,0.6,1.7]])
+    >>> vq(features,code_book)
+    (array([1, 1, 0],'i'), array([ 0.43588989,  0.73484692,  0.83066239]))
+
+    """
+    obs_arr = _asarray_validated(obs, check_finite=check_finite)
+    book_arr = _asarray_validated(code_book, check_finite=check_finite)
+    common = np.common_type(obs_arr, book_arr)
+
+    # Cast both inputs to the common type before choosing an implementation.
+    obs_cast = obs_arr.astype(common, copy=False)
+    book_cast = book_arr.astype(common, copy=False)
+
+    # The C routine is used only for float64 / float32; every other common
+    # type goes to the Python implementation with the validated inputs.
+    use_c_routine = any(np.issubdtype(common, kind)
+                        for kind in (np.float64, np.float32))
+    if not use_c_routine:
+        return py_vq(obs_arr, book_arr, check_finite=False)
+    return _vq.vq(obs_cast, book_cast)
+
+
+def py_vq(obs, code_book, check_finite=True):
+    """Assign each observation to its nearest code (pure-Python fallback).
+
+    Every observation is compared against every entry of `code_book`
+    using the Euclidean distance, and the closest entry wins.
+
+    Parameters
+    ----------
+    obs : ndarray
+        Rank 1 or rank 2 array of observations. For rank 2 input each row
+        is one observation; rank 1 input is treated as a set of
+        single-feature observations.
+    code_book : ndarray
+        Code book to search, laid out like `obs`. It must have the same
+        rank as `obs` and should have the same number of features
+        (columns).
+    check_finite : bool, optional
+        Whether to check that the input matrices contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+        Default: True
+
+    Returns
+    -------
+    code : ndarray
+        code[i] gives the label of the ith observation; its code is
+        code_book[code[i]].
+    min_dist : ndarray
+        min_dist[i] gives the distance between the ith observation and its
+        corresponding code.
+
+    Raises
+    ------
+    ValueError
+        If `obs` and `code_book` do not have the same rank.
+
+    Notes
+    -----
+    This function is slower than the C version but works for
+    all input types. If the inputs have the wrong types for the
+    C versions of the function, this one is called as a last resort.
+
+    It is about 20 times slower than the C version.
+
+    """
+    points = _asarray_validated(obs, check_finite=check_finite)
+    codes = _asarray_validated(code_book, check_finite=check_finite)
+
+    if points.ndim != codes.ndim:
+        raise ValueError("Observation and code_book should have the same rank")
+
+    # cdist works on 2-D operands, so 1-D input becomes one feature column.
+    if points.ndim == 1:
+        points, codes = points[:, np.newaxis], codes[:, np.newaxis]
+
+    pairwise = cdist(points, codes)
+    nearest = pairwise.argmin(axis=1)
+    rows = np.arange(len(nearest))
+    return nearest, pairwise[rows, nearest]
+
+
+# Deprecated alias kept for backwards compatibility: py_vq2 was equivalent
+# to py_vq, so the old name simply forwards to it through np.deprecate.
+py_vq2 = np.deprecate(old_name='py_vq2', new_name='py_vq')(py_vq)
+
+
+def _kmeans(obs, guess, thresh=1e-5):
+    """ "raw" version of k-means.
+
+    Alternates between assigning every observation to its nearest code and
+    recomputing each code as the centroid of its members. Iteration stops
+    once the absolute change in average distortion between two successive
+    iterations is no greater than `thresh`. Codes that end up with no
+    members are dropped from the code book.
+
+    Parameters
+    ----------
+    obs : ndarray
+        Observations to cluster, passed unchanged to `vq`.
+    guess : array_like
+        Initial code book; its first axis indexes the codes.
+    thresh : float, optional
+        Convergence threshold on the absolute change in average distortion.
+
+    Returns
+    -------
+    code_book
+        The code book after the final centroid update. It may hold fewer
+        codes than `guess` if some codes lost all of their members.
+    avg_dist
+        The average distance an observation is from a code in the book,
+        as measured during the final assignment step.
+        Lower means the code_book matches the data better.
+
+    See Also
+    --------
+    kmeans : wrapper around k-means
+
+    Examples
+    --------
+    Note: not whitened in this example.
+
+    >>> from numpy import array
+    >>> from scipy.cluster.vq import _kmeans
+    >>> features  = array([[ 1.9,2.3],
+    ...                    [ 1.5,2.5],
+    ...                    [ 0.8,0.6],
+    ...                    [ 0.4,1.8],
+    ...                    [ 1.0,1.0]])
+    >>> book = array((features[0],features[2]))
+    >>> _kmeans(features,book)
+    (array([[ 1.7       ,  2.4       ],
+           [ 0.73333333,  1.13333333]]), 0.40563916697728591)
+
+    """
+
+    code_book = np.asarray(guess)
+    diff = np.inf
+    # Sliding window holding the two most recent average distortions.
+    prev_avg_dists = deque([diff], maxlen=2)
+    while diff > thresh:
+        # compute membership and distances between obs and code_book
+        obs_code, distort = vq(obs, code_book, check_finite=False)
+        prev_avg_dists.append(distort.mean(axis=-1))
+        # recalc code_book as centroids of associated obs
+        code_book, has_members = _vq.update_cluster_means(obs, obs_code,
+                                                          code_book.shape[0])
+        code_book = code_book[has_members]
+        # Compare the magnitude of the change: the mean (non-squared)
+        # distance may go up after a centroid update, and a negative signed
+        # difference would otherwise end the loop before convergence.
+        diff = abs(prev_avg_dists[0] - prev_avg_dists[1])
+
+    return code_book, prev_avg_dists[1]
+
+
+def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True):
+    """
+    Performs k-means on a set of observation vectors forming k clusters.
+
+    The k-means algorithm adjusts the classification of the observations
+    into clusters and updates the cluster centroids until the position of
+    the centroids is stable over successive iterations. In this
+    implementation of the algorithm, the stability of the centroids is
+    determined by comparing the absolute value of the change in the average
+    Euclidean distance between the observations and their corresponding
+    centroids against a threshold. This yields
+    a code book mapping centroids to codes and vice versa.
+
+    Parameters
+    ----------
+    obs : ndarray
+       Each row of the M by N array is an observation vector. The
+       columns are the features seen during each observation.
+       The features must be whitened first with the `whiten` function.
+
+    k_or_guess : int or ndarray
+       Either the number of centroids to generate, or a k by N array
+       holding the initial k centroids.
+
+       When a number is given, the initial k centroids are chosen by
+       randomly selecting observations from the observation matrix.
+       A code is assigned to each centroid, which is also the row index
+       of the centroid in the code_book matrix generated.
+
+    iter : int, optional
+       The number of times to run k-means, returning the codebook
+       with the lowest distortion. This argument is ignored if
+       initial centroids are specified with an array for the
+       ``k_or_guess`` parameter. This parameter does not represent the
+       number of iterations of the k-means algorithm.
+
+    thresh : float, optional
+       Terminates the k-means algorithm if the change in
+       distortion since the last k-means iteration is less than
+       or equal to threshold.
+
+    check_finite : bool, optional
+        Whether to check that the input matrices contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+        Default: True
+
+    Returns
+    -------
+    codebook : ndarray
+       A k by N array of k centroids. The ith centroid
+       codebook[i] is represented with the code i. The centroids
+       and codes generated represent the lowest distortion seen,
+       not necessarily the globally minimal distortion.
+
+    distortion : float
+       The mean (non-squared) Euclidean distance between the observations
+       passed and the centroids generated. Note the difference to the standard
+       definition of distortion in the context of the k-means algorithm, which
+       is the sum of the squared distances.
+
+    Raises
+    ------
+    ValueError
+        If `iter` is smaller than 1, if a scalar `k_or_guess` is not an
+        integer or is smaller than 1, or if an array `k_or_guess` is empty.
+
+    See Also
+    --------
+    kmeans2 : a different implementation of k-means clustering
+       with more methods for generating initial centroids but without
+       using a distortion change threshold as a stopping criterion.
+
+    whiten : must be called prior to passing an observation matrix
+       to kmeans.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from numpy import array
+    >>> from scipy.cluster.vq import vq, kmeans, whiten
+    >>> import matplotlib.pyplot as plt
+    >>> features  = array([[ 1.9,2.3],
+    ...                    [ 1.5,2.5],
+    ...                    [ 0.8,0.6],
+    ...                    [ 0.4,1.8],
+    ...                    [ 0.1,0.1],
+    ...                    [ 0.2,1.8],
+    ...                    [ 2.0,0.5],
+    ...                    [ 0.3,1.5],
+    ...                    [ 1.0,1.0]])
+    >>> whitened = whiten(features)
+    >>> book = np.array((whitened[0],whitened[2]))
+    >>> kmeans(whitened,book)
+    (array([[ 2.3110306 ,  2.86287398],    # random
+           [ 0.93218041,  1.24398691]]), 0.85684700941625547)
+
+    >>> from numpy import random
+    >>> random.seed((1000,2000))
+    >>> codes = 3
+    >>> kmeans(whitened,codes)
+    (array([[ 2.3110306 ,  2.86287398],    # random
+           [ 1.32544402,  0.65607529],
+           [ 0.40782893,  2.02786907]]), 0.5196582527686241)
+
+    >>> # Create 50 datapoints in two clusters a and b
+    >>> pts = 50
+    >>> a = np.random.multivariate_normal([0, 0], [[4, 1], [1, 4]], size=pts)
+    >>> b = np.random.multivariate_normal([30, 10],
+    ...                                   [[10, 2], [2, 1]],
+    ...                                   size=pts)
+    >>> features = np.concatenate((a, b))
+    >>> # Whiten data
+    >>> whitened = whiten(features)
+    >>> # Find 2 clusters in the data
+    >>> codebook, distortion = kmeans(whitened, 2)
+    >>> # Plot whitened data and cluster centers in red
+    >>> plt.scatter(whitened[:, 0], whitened[:, 1])
+    >>> plt.scatter(codebook[:, 0], codebook[:, 1], c='r')
+    >>> plt.show()
+    """
+    obs = _asarray_validated(obs, check_finite=check_finite)
+    if iter < 1:
+        raise ValueError("iter must be at least 1, got %s" % iter)
+
+    if np.isscalar(k_or_guess):
+        # A scalar is a cluster count; it has to be a whole, positive number.
+        n_codes = int(k_or_guess)
+        if n_codes != k_or_guess:
+            raise ValueError("If k_or_guess is a scalar, it must be an integer.")
+        if n_codes < 1:
+            raise ValueError("Asked for %d clusters." % n_codes)
+
+        # Restart from `iter` random draws of observations and keep the
+        # run with the smallest distortion.
+        lowest = np.inf
+        for _ in range(iter):
+            start = _kpoints(obs, n_codes)
+            candidate, distortion = _kmeans(obs, start, thresh=thresh)
+            if distortion < lowest:
+                best_book, lowest = candidate, distortion
+        return best_book, lowest
+
+    # Anything else is taken as the initial code book: a single run, no restarts.
+    initial = _asarray_validated(k_or_guess, check_finite=check_finite)
+    if initial.size < 1:
+        raise ValueError("Asked for 0 clusters. Initial book was %s" %
+                         initial)
+    return _kmeans(obs, initial, thresh=thresh)
+
+
+def _kpoints(data, k):
+    """Draw k distinct observations at random from data.
+
+    Parameters
+    ----------
+    data : ndarray
+        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
+        data, rank 2 multidimensional data, in which case one
+        row is one observation.
+    k : int
+        Number of samples to generate.
+
+    Returns
+    -------
+    x : ndarray
+        The k selected observations, to be used as initial centroids
+        ('k' by 'N' for rank 2 data).
+
+    """
+    n_obs = data.shape[0]
+    # Sampling without replacement, so no observation is picked twice.
+    chosen = np.random.choice(n_obs, size=k, replace=False)
+    return data[chosen]
+
+
+def _krandinit(data, k):
+    """Draw k initial centroids from a Gaussian fitted to data.
+
+    The samples come from a Gaussian random variable whose mean and
+    covariance are the ones estimated from `data`.
+
+    Parameters
+    ----------
+    data : ndarray
+        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
+        data, rank 2 multidimensional data, in which case one
+        row is one observation.
+    k : int
+        Number of samples to generate.
+
+    Returns
+    -------
+    x : ndarray
+        A 'k' by 'N' containing the initial centroids (length 'k' for
+        rank 1 data).
+
+    """
+    center = data.mean(axis=0)
+
+    if data.ndim == 1:
+        # Scalar case: scale standard normal draws by the standard deviation.
+        variance = np.cov(data)
+        samples = np.random.randn(k)
+        samples = samples * np.sqrt(variance)
+    elif data.shape[1] > data.shape[0]:
+        # More features than observations: the covariance matrix is rank
+        # deficient, so build the samples from the SVD of the centered data.
+        _, sing_vals, vh = np.linalg.svd(data - center, full_matrices=False)
+        noise = np.random.randn(k, sing_vals.size)
+        scaled_vh = sing_vals[:, None] * vh / np.sqrt(data.shape[0] - 1)
+        samples = noise.dot(scaled_vh)
+    else:
+        covariance = np.atleast_2d(np.cov(data, rowvar=False))
+
+        # k rows, d cols (one row = one sample): colour standard normal
+        # noise with the Cholesky factor of the covariance.
+        noise = np.random.randn(k, center.size)
+        samples = noise.dot(np.linalg.cholesky(covariance).T)
+
+    return samples + center
+
+
+def _kpp(data, k):
+    """ Picks k points in the data based on the kmeans++ method.
+
+    The first centroid is an observation drawn uniformly at random. Each
+    further centroid is an observation drawn with probability proportional
+    to its squared Euclidean distance from the nearest centroid chosen so
+    far.
+
+    Parameters
+    ----------
+    data : ndarray
+        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
+        data, rank 2 multidimensional data, in which case one
+        row is one observation.
+    k : int
+        Number of samples to generate.
+
+    Returns
+    -------
+    init : ndarray
+        A 'k' by 'N' containing the initial centroids, or a length 'k'
+        array when `data` is rank 1.
+
+    References
+    ----------
+    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
+       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
+       on Discrete Algorithms, 2007.
+    """
+
+    is_1d = len(data.shape) == 1
+    if is_1d:
+        # cdist only accepts 2-D operands, so handle 1-D data as a single
+        # feature column and flatten the result again before returning.
+        data = data[:, np.newaxis]
+
+    n_obs = data.shape[0]
+    dims = data.shape[1]
+    init = np.empty((k, dims))
+
+    for i in range(k):
+        if i == 0:
+            init[i, :] = data[np.random.randint(n_obs)]
+
+        else:
+            # Squared distance from every observation to its closest
+            # already-selected centroid.
+            D2 = cdist(init[:i, :], data, metric='sqeuclidean').min(axis=0)
+            probs = D2/D2.sum()
+            cumprobs = probs.cumsum()
+            r = np.random.rand()
+            # Rounding can leave cumprobs[-1] marginally below 1, in which
+            # case searchsorted may return n_obs; clamp to the last row.
+            idx = min(np.searchsorted(cumprobs, r), n_obs - 1)
+            init[i, :] = data[idx]
+
+    if is_1d:
+        return init[:, 0]
+    return init
+
+
+# Initialization strategies selectable by name through the `minit` argument
+# of kmeans2; each value is called as ``f(data, k)`` to build the initial
+# centroids. ('matrix' is handled separately and has no entry here.)
+_valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}
+
+
+def _missing_warn():
+    """Issue a warning that one of the clusters is empty."""
+    message = ("One of the clusters is empty. "
+               "Re-run kmeans with a different initialization.")
+    warnings.warn(message)
+
+
+def _missing_raise():
+    """Raise a ClusterError reporting that one of the clusters is empty."""
+    message = ("One of the clusters is empty. "
+               "Re-run kmeans with a different initialization.")
+    raise ClusterError(message)
+
+
+# Handlers selectable by name through the `missing` argument of kmeans2;
+# the chosen one is called with no arguments whenever a cluster turns out
+# to be empty.
+_valid_miss_meth = {'warn': _missing_warn, 'raise': _missing_raise}
+
+
+def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
+            missing='warn', check_finite=True):
+    """
+    Classify a set of observations into k clusters using the k-means algorithm.
+
+    The algorithm attempts to minimize the Euclidean distance between
+    observations and centroids. Several initialization methods are
+    included.
+
+    Parameters
+    ----------
+    data : ndarray
+        A 'M' by 'N' array of 'M' observations in 'N' dimensions or a length
+        'M' array of 'M' 1-D observations.
+    k : int or ndarray
+        The number of clusters to form as well as the number of
+        centroids to generate. If `minit` initialization string is
+        'matrix', or if a ndarray is given instead, it is
+        interpreted as initial cluster to use instead.
+    iter : int, optional
+        Number of iterations of the k-means algorithm to run. Note
+        that this differs in meaning from the iter parameter to
+        the kmeans function.
+    thresh : float, optional
+        (not used yet)
+    minit : str, optional
+        Method for initialization. Available methods are 'random',
+        'points', '++' and 'matrix':
+
+        'random': generate k centroids from a Gaussian with mean and
+        variance estimated from the data.
+
+        'points': choose k observations (rows) at random from data for
+        the initial centroids.
+
+        '++': choose k observations accordingly to the kmeans++ method
+        (careful seeding)
+
+        'matrix': interpret the k parameter as a k by N (or length k
+        array for 1-D data) array of initial centroids.
+    missing : str, optional
+        Method to deal with empty clusters. Available methods are
+        'warn' and 'raise':
+
+        'warn': give a warning and continue.
+
+        'raise': raise a ClusterError and terminate the algorithm.
+    check_finite : bool, optional
+        Whether to check that the input matrices contain only finite numbers.
+        Disabling may give a performance gain, but may result in problems
+        (crashes, non-termination) if the inputs do contain infinities or NaNs.
+        Default: True
+
+    Returns
+    -------
+    centroid : ndarray
+        A 'k' by 'N' array of centroids found at the last iteration of
+        k-means.
+    label : ndarray
+        label[i] is the code or index of the centroid the
+        ith observation is closest to.
+
+    Raises
+    ------
+    ValueError
+        If `iter` is smaller than 1, if `missing` or `minit` is not a
+        known method name, if `data` is empty or has rank greater than 2,
+        if an array `k` does not match the rank or dimension of `data`,
+        or if a scalar `k` is smaller than 1.
+    ClusterError
+        If a cluster becomes empty and `missing` is 'raise'.
+
+    See Also
+    --------
+    kmeans
+
+    References
+    ----------
+    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
+       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
+       on Discrete Algorithms, 2007.
+
+    Examples
+    --------
+    >>> from scipy.cluster.vq import kmeans2
+    >>> import matplotlib.pyplot as plt
+
+    Create z, an array with shape (100, 2) containing a mixture of samples
+    from three multivariate normal distributions.
+
+    >>> np.random.seed(12345678)
+    >>> a = np.random.multivariate_normal([0, 6], [[2, 1], [1, 1.5]], size=45)
+    >>> b = np.random.multivariate_normal([2, 0], [[1, -1], [-1, 3]], size=30)
+    >>> c = np.random.multivariate_normal([6, 4], [[5, 0], [0, 1.2]], size=25)
+    >>> z = np.concatenate((a, b, c))
+    >>> np.random.shuffle(z)
+
+    Compute three clusters.
+
+    >>> centroid, label = kmeans2(z, 3, minit='points')
+    >>> centroid
+    array([[-0.35770296,  5.31342524],
+           [ 2.32210289, -0.50551972],
+           [ 6.17653859,  4.16719247]])
+
+    How many points are in each cluster?
+
+    >>> counts = np.bincount(label)
+    >>> counts
+    array([52, 27, 21])
+
+    Plot the clusters.
+
+    >>> w0 = z[label == 0]
+    >>> w1 = z[label == 1]
+    >>> w2 = z[label == 2]
+    >>> plt.plot(w0[:, 0], w0[:, 1], 'o', alpha=0.5, label='cluster 0')
+    >>> plt.plot(w1[:, 0], w1[:, 1], 'd', alpha=0.5, label='cluster 1')
+    >>> plt.plot(w2[:, 0], w2[:, 1], 's', alpha=0.5, label='cluster 2')
+    >>> plt.plot(centroid[:, 0], centroid[:, 1], 'k*', label='centroids')
+    >>> plt.axis('equal')
+    >>> plt.legend(shadow=True)
+    >>> plt.show()
+
+    """
+    if int(iter) < 1:
+        raise ValueError("Invalid iter (%s), "
+                         "must be a positive integer." % iter)
+    try:
+        miss_meth = _valid_miss_meth[missing]
+    except KeyError:
+        raise ValueError("Unknown missing method %r" % (missing,))
+
+    data = _asarray_validated(data, check_finite=check_finite)
+    # d is the number of features per observation.
+    if data.ndim == 1:
+        d = 1
+    elif data.ndim == 2:
+        d = data.shape[1]
+    else:
+        raise ValueError("Input of rank > 2 is not supported.")
+
+    if data.size < 1:
+        raise ValueError("Empty input is not supported.")
+
+    # If k is not a single value, it should be compatible with data's shape
+    if minit == 'matrix' or not np.isscalar(k):
+        # Work on a copy so the caller's initial centroids are left untouched.
+        code_book = np.array(k, copy=True)
+        if data.ndim != code_book.ndim:
+            raise ValueError("k array doesn't match data rank")
+        nc = len(code_book)
+        if data.ndim > 1 and code_book.shape[1] != d:
+            raise ValueError("k array doesn't match data dimension")
+    else:
+        nc = int(k)
+
+        if nc < 1:
+            raise ValueError("Cannot ask kmeans2 for %d clusters"
+                             " (k was %s)" % (nc, k))
+        elif nc != k:
+            warnings.warn("k was not an integer, was converted.")
+
+        try:
+            init_meth = _valid_init_meth[minit]
+        except KeyError:
+            raise ValueError("Unknown init method %r" % (minit,))
+        else:
+            # Pass the converted integer count: the initializers size their
+            # arrays with it and cannot work with a float such as 3.0.
+            code_book = init_meth(data, nc)
+
+    for i in range(iter):
+        # Compute the nearest neighbor for each obs using the current code book
+        label = vq(data, code_book)[0]
+        # Update the code book by computing centroids
+        new_code_book, has_members = _vq.update_cluster_means(data, label, nc)
+        if not has_members.all():
+            miss_meth()
+            # Set the empty clusters to their previous positions
+            new_code_book[~has_members] = code_book[~has_members]
+        code_book = new_code_book
+
+    return code_book, label