Uploaded Test files

2020-11-12 11:05:57 -05:00 · 2020-11-12 11:05:57 -05:00 · 2e81cb7d99
commit 2e81cb7d99
parent f584ad9d97
16627 changed files with 2065359 additions and 102444 deletions
--- a/venv/Lib/site-packages/sklearn/datasets/tests/init.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/init.py
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/init.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/init.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/conftest.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/conftest.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_20news.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_20news.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_base.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_base.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_california_housing.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_california_housing.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_common.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_common.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_covtype.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_covtype.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_kddcup99.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_kddcup99.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_lfw.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_lfw.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_olivetti_faces.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_olivetti_faces.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_openml.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_openml.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_rcv1.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_rcv1.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_samples_generator.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_samples_generator.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_svmlight_format.cpython-36.pyc
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/pycache/test_svmlight_format.cpython-36.pyc
--- a/venv/Lib/site-packages/sklearn/datasets/tests/conftest.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/conftest.py
@ -0,0 +1,75 @@
+""" Network tests are only run, if data is already locally available,
+or if download is specifically requested by environment variable."""
+import builtins
+from os import environ
+import pytest
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.datasets import fetch_20newsgroups_vectorized
+from sklearn.datasets import fetch_california_housing
+from sklearn.datasets import fetch_covtype
+from sklearn.datasets import fetch_kddcup99
+from sklearn.datasets import fetch_olivetti_faces
+from sklearn.datasets import fetch_rcv1
+
+
+def _wrapped_fetch(f, dataset_name):
+    """Wrap a dataset fetcher so that an IOError skips the calling test.
+
+    Parameters
+    ----------
+    f : callable
+        Fetcher to wrap; it is always called with a ``download_if_missing``
+        keyword argument.
+    dataset_name : str
+        Name inserted in the skip message.
+
+    Returns
+    -------
+    wrapped : callable
+        Forwards its arguments to ``f``. Any ``download_if_missing`` passed
+        by the caller is overridden: the value is True only when the
+        environment variable ``SKLEARN_SKIP_NETWORK_TESTS`` equals ``'0'``.
+    """
+    from functools import wraps
+
+    # Read once per call to _wrapped_fetch, not on every fetch.
+    download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
+
+    # Keep the fetcher's name and docstring on the wrapper.
+    @wraps(f)
+    def wrapped(*args, **kwargs):
+        kwargs['download_if_missing'] = download_if_missing
+        try:
+            return f(*args, **kwargs)
+        except IOError:
+            # Presumably the data is missing and download is disabled:
+            # skip the test rather than fail it.
+            pytest.skip("Download {} to run this test".format(dataset_name))
+    return wrapped
+
+
+@pytest.fixture
+def fetch_20newsgroups_fxt():
+    """Provide ``fetch_20newsgroups`` wrapped by ``_wrapped_fetch``."""
+    fetcher = _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups')
+    return fetcher
+
+
+@pytest.fixture
+def fetch_20newsgroups_vectorized_fxt():
+    """Provide ``fetch_20newsgroups_vectorized`` wrapped by ``_wrapped_fetch``.
+    """
+    fetcher = _wrapped_fetch(fetch_20newsgroups_vectorized,
+                             dataset_name='20newsgroups_vectorized')
+    return fetcher
+
+
+@pytest.fixture
+def fetch_california_housing_fxt():
+    """Provide ``fetch_california_housing`` wrapped by ``_wrapped_fetch``."""
+    fetcher = _wrapped_fetch(fetch_california_housing,
+                             dataset_name='california_housing')
+    return fetcher
+
+
+@pytest.fixture
+def fetch_covtype_fxt():
+    """Provide ``fetch_covtype`` wrapped by ``_wrapped_fetch``."""
+    fetcher = _wrapped_fetch(fetch_covtype, dataset_name='covtype')
+    return fetcher
+
+
+@pytest.fixture
+def fetch_kddcup99_fxt():
+    """Provide ``fetch_kddcup99`` wrapped by ``_wrapped_fetch``."""
+    fetcher = _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99')
+    return fetcher
+
+
+@pytest.fixture
+def fetch_olivetti_faces_fxt():
+    """Provide ``fetch_olivetti_faces`` wrapped by ``_wrapped_fetch``."""
+    fetcher = _wrapped_fetch(fetch_olivetti_faces,
+                             dataset_name='olivetti_faces')
+    return fetcher
+
+
+@pytest.fixture
+def fetch_rcv1_fxt():
+    """Provide ``fetch_rcv1`` wrapped by ``_wrapped_fetch``."""
+    fetcher = _wrapped_fetch(fetch_rcv1, dataset_name='rcv1')
+    return fetcher
+
+
+@pytest.fixture
+def hide_available_pandas(monkeypatch):
+    """Patch ``builtins.__import__`` so importing pandas raises ImportError."""
+    real_import = builtins.__import__
+
+    def _import_without_pandas(name, *args, **kwargs):
+        # Only the exact module name 'pandas' is blocked; every other import
+        # is delegated to the real implementation.
+        if name != 'pandas':
+            return real_import(name, *args, **kwargs)
+        raise ImportError()
+
+    monkeypatch.setattr(builtins, '__import__', _import_without_pandas)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/api-v1-json-data-features-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/api-v1-json-data-features-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/api-v1-json-data-qualities-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/api-v1-json-data-qualities-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/data-v1-download-1.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1/data-v1-download-1.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-1119.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-1119.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-features-1119.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-features-1119.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-list-data_name-adult-census-limit-2-data_version-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-list-data_name-adult-census-limit-2-data_version-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-list-data_name-adult-census-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-list-data_name-adult-census-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-qualities-1119.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-qualities-1119.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/data-v1-download-54002.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/1119/data-v1-download-54002.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-features-2.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-features-2.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-list-data_name-anneal-limit-2-data_version-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-list-data_name-anneal-limit-2-data_version-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-list-data_name-anneal-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-list-data_name-anneal-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-qualities-2.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/api-v1-json-data-qualities-2.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/data-v1-download-1666876.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/2/data-v1-download-1666876.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-40981.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-40981.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-features-292.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-features-292.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-features-40981.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-features-40981.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-list-data_name-australian-limit-2-data_version-1-status-deactivated.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-list-data_name-australian-limit-2-data_version-1-status-deactivated.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-list-data_name-australian-limit-2-data_version-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-list-data_name-australian-limit-2-data_version-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-list-data_name-australian-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/api-v1-json-data-list-data_name-australian-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/data-v1-download-49822.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/292/data-v1-download-49822.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/api-v1-json-data-3.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/api-v1-json-data-3.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/api-v1-json-data-features-3.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/api-v1-json-data-features-3.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/api-v1-json-data-qualities-3.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/api-v1-json-data-qualities-3.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/data-v1-download-3.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/3/data-v1-download-3.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-40589.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-40589.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-features-40589.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-features-40589.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-list-data_name-emotions-limit-2-data_version-3.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-list-data_name-emotions-limit-2-data_version-3.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-list-data_name-emotions-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-list-data_name-emotions-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-qualities-40589.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-qualities-40589.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/data-v1-download-4644182.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40589/data-v1-download-4644182.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-40675.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-40675.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-features-40675.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-features-40675.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-list-data_name-glass2-limit-2-data_version-1-status-deactivated.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-list-data_name-glass2-limit-2-data_version-1-status-deactivated.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-list-data_name-glass2-limit-2-data_version-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-list-data_name-glass2-limit-2-data_version-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-list-data_name-glass2-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-list-data_name-glass2-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-qualities-40675.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-qualities-40675.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/data-v1-download-4965250.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40675/data-v1-download-4965250.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-40945.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-40945.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-features-40945.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-features-40945.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-qualities-40945.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/api-v1-json-data-qualities-40945.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/data-v1-download-16826755.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40945/data-v1-download-16826755.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-40966.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-40966.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-features-40966.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-features-40966.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-list-data_name-miceprotein-limit-2-data_version-4.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-list-data_name-miceprotein-limit-2-data_version-4.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-list-data_name-miceprotein-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-list-data_name-miceprotein-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-qualities-40966.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-qualities-40966.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/data-v1-download-17928620.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/40966/data-v1-download-17928620.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-561.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-561.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-features-561.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-features-561.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-list-data_name-cpu-limit-2-data_version-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-list-data_name-cpu-limit-2-data_version-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-list-data_name-cpu-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-list-data_name-cpu-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-qualities-561.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/api-v1-json-data-qualities-561.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/data-v1-download-52739.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/561/data-v1-download-52739.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-61.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-61.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-features-61.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-features-61.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-list-data_name-iris-limit-2-data_version-1.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-list-data_name-iris-limit-2-data_version-1.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-list-data_name-iris-limit-2-status-active-.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-list-data_name-iris-limit-2-status-active-.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-qualities-61.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/api-v1-json-data-qualities-61.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/data-v1-download-61.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/61/data-v1-download-61.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/api-v1-json-data-62.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/api-v1-json-data-62.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/api-v1-json-data-features-62.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/api-v1-json-data-features-62.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/api-v1-json-data-qualities-62.json.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/api-v1-json-data-qualities-62.json.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/data-v1-download-52352.arff.gz
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/openml/62/data-v1-download-52352.arff.gz
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_classification.txt
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_classification.txt
@ -0,0 +1,9 @@
+# comment
+# note: the next line contains a tab
+1.0 3:2.5 	   11:-5.2 16:1.5 # and an inline comment
+2.0 6:1.0 13:-3 
+# another comment
+3.0 21:27
+4.0 2:1.234567890123456e10 # double precision value
+1.0     # empty line, all zeros
+2.0 3:0 # explicit zeros
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_invalid.txt
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_invalid.txt
@ -0,0 +1,3 @@
+python 2:2.5 10:-5.2 15:1.5
+2.0 5:1.0 12:-3
+3.0 20:27
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_invalid_order.txt
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_invalid_order.txt
@ -0,0 +1 @@
+-1 5:2.5 2:-5.2 15:1.5
--- a/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_multilabel.txt
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/data/svmlight_multilabel.txt
@ -0,0 +1,5 @@
+# multilabel dataset in SVMlight format
+1,0 2:2.5   10:-5.2 15:1.5
+2 5:1.0 12:-3 
+ 2:3.5 11:26
+1,2 20:27
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_20news.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_20news.py
@ -0,0 +1,90 @@
+"""Test the 20news downloader, if the data is available,
+or if specifically requested via environment variable
+(e.g. for travis cron job)."""
+from functools import partial
+
+import numpy as np
+import scipy.sparse as sp
+
+from sklearn.utils._testing import assert_allclose_dense_sparse
+from sklearn.datasets.tests.test_common import check_return_X_y
+from sklearn.preprocessing import normalize
+
+
+def test_20news(fetch_20newsgroups_fxt):
+    """Check category filtering and ``return_X_y`` of the 20news fetcher."""
+    full = fetch_20newsgroups_fxt(subset='all', shuffle=False)
+
+    # Fetch only the last two categories, requested in reverse order.
+    last_two_reversed = full.target_names[-1:-3:-1]
+    reduced = fetch_20newsgroups_fxt(
+        subset='all', categories=last_two_reversed, shuffle=False)
+    # target_names must follow the ordering used by the full dataset
+    assert reduced.target_names == full.target_names[-2:]
+    # only the labels 0 and 1 remain
+    assert np.unique(reduced.target).tolist() == [0, 1]
+
+    # filenames, target and data all have the same length
+    n_files = len(reduced.filenames)
+    assert n_files == len(reduced.target)
+    assert n_files == len(reduced.data)
+
+    # The first entry of the reduced dataset is the first entry of the
+    # corresponding category in the full dataset.
+    first_category = reduced.target_names[reduced.target[0]]
+    full_label = full.target_names.index(first_category)
+    first_index = np.where(full.target == full_label)[0][0]
+    assert reduced.data[0] == full.data[first_index]
+
+    # return_X_y=True yields objects consistent with the bunch
+    X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True)
+    assert len(X) == len(full.data)
+    assert y.shape == full.target.shape
+
+
+def test_20news_length_consistency(fetch_20newsgroups_fxt):
+    """Item access and attribute access on the bunch give equal lengths.
+
+    This is a non-regression test for a bug present in 0.16.1.
+    """
+    # Work on the full dataset
+    bunch = fetch_20newsgroups_fxt(subset='all')
+    for key in ('data', 'target', 'filenames'):
+        assert len(bunch[key]) == len(getattr(bunch, key))
+
+
+def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt):
+    """Check type, shape and dtype of every vectorized 20news subset."""
+    n_train, n_test, n_features = 11314, 7532, 130107
+
+    def check_bunch(bunch, n_samples):
+        assert sp.isspmatrix_csr(bunch.data)
+        assert bunch.data.shape == (n_samples, n_features)
+        assert bunch.target.shape[0] == n_samples
+        assert bunch.data.dtype == np.float64
+
+    # subset = train
+    check_bunch(fetch_20newsgroups_vectorized_fxt(subset="train"), n_train)
+
+    # subset = test
+    test_bunch = fetch_20newsgroups_vectorized_fxt(subset="test")
+    check_bunch(test_bunch, n_test)
+
+    # return_X_y option, compared against the test subset
+    check_return_X_y(
+        test_bunch, partial(fetch_20newsgroups_vectorized_fxt, subset='test'))
+
+    # subset = all
+    check_bunch(fetch_20newsgroups_vectorized_fxt(subset='all'),
+                n_train + n_test)
+
+
+def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
+    """``normalize=True`` must match applying ``normalize`` to the raw data."""
+    raw_bunch = fetch_20newsgroups_vectorized_fxt(normalize=False)
+    normalized_bunch = fetch_20newsgroups_vectorized_fxt(normalize=True)
+    # Only the first 100 rows are compared.
+    X_norm = normalized_bunch['data'][:100]
+    X_raw = raw_bunch['data'][:100]
+
+    assert_allclose_dense_sparse(X_norm, normalize(X_raw))
+    row_norms = np.linalg.norm(X_norm.todense(), axis=1)
+    assert np.allclose(row_norms, 1)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_base.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_base.py
@ -0,0 +1,306 @@
+import os
+import shutil
+import tempfile
+import warnings
+import numpy
+from pickle import loads
+from pickle import dumps
+from functools import partial
+
+import pytest
+
+import numpy as np
+from sklearn.datasets import get_data_home
+from sklearn.datasets import clear_data_home
+from sklearn.datasets import load_files
+from sklearn.datasets import load_sample_images
+from sklearn.datasets import load_sample_image
+from sklearn.datasets import load_digits
+from sklearn.datasets import load_diabetes
+from sklearn.datasets import load_linnerud
+from sklearn.datasets import load_iris
+from sklearn.datasets import load_breast_cancer
+from sklearn.datasets import load_boston
+from sklearn.datasets import load_wine
+from sklearn.utils import Bunch
+from sklearn.datasets.tests.test_common import check_return_X_y
+from sklearn.datasets.tests.test_common import check_as_frame
+from sklearn.datasets.tests.test_common import check_pandas_dependency_message
+
+from sklearn.externals._pilutil import pillow_installed
+
+from sklearn.utils import IS_PYPY
+
+
+def _remove_dir(path):
+    """Recursively delete ``path`` when it is an existing directory."""
+    if not os.path.isdir(path):
+        return
+    shutil.rmtree(path)
+
+
+@pytest.fixture(scope="module")
+def data_home(tmpdir_factory):
+    """Yield the path of a module-scoped temporary directory, then remove it.
+    """
+    path = str(tmpdir_factory.mktemp("scikit_learn_data_home_test"))
+    yield path
+    _remove_dir(path)
+
+
+@pytest.fixture(scope="module")
+def load_files_root(tmpdir_factory):
+    """Yield the path of a module-scoped temporary directory, then remove it.
+    """
+    path = str(tmpdir_factory.mktemp("scikit_learn_load_files_test"))
+    yield path
+    _remove_dir(path)
+
+
+@pytest.fixture
+def test_category_dir_1(load_files_root):
+    """Yield a directory under ``load_files_root`` holding one sample file."""
+    category_dir = tempfile.mkdtemp(dir=load_files_root)
+    # delete=False keeps the file on disk after it is closed.
+    with tempfile.NamedTemporaryFile(dir=category_dir,
+                                     delete=False) as sample_file:
+        sample_file.write(b"Hello World!\n")
+    yield str(category_dir)
+    _remove_dir(category_dir)
+
+
+@pytest.fixture
+def test_category_dir_2(load_files_root):
+    """Yield an empty directory created under ``load_files_root``."""
+    category_dir = tempfile.mkdtemp(dir=load_files_root)
+    yield str(category_dir)
+    _remove_dir(category_dir)
+
+
+def test_data_home(data_home):
+    """Check get_data_home / clear_data_home on a temporary folder."""
+    # get_data_home will point to a pre-existing folder. Its result is kept
+    # under a separate name so the equality check compares it with the
+    # requested path rather than with itself.
+    returned_home = get_data_home(data_home=data_home)
+    assert returned_home == data_home
+    assert os.path.exists(returned_home)
+
+    # clear_data_home will delete both the content and the folder it-self
+    clear_data_home(data_home=returned_home)
+    assert not os.path.exists(returned_home)
+
+    # if the folder is missing it will be created again
+    returned_home = get_data_home(data_home=returned_home)
+    assert os.path.exists(returned_home)
+
+
+def test_default_empty_load_files(load_files_root):
+    """load_files on the root alone: no files, no target names, no DESCR."""
+    bunch = load_files(load_files_root)
+    for collection in (bunch.filenames, bunch.target_names):
+        assert len(collection) == 0
+    assert bunch.DESCR is None
+
+
+def test_default_load_files(test_category_dir_1, test_category_dir_2,
+                            load_files_root):
+    """With default options the sample file content is returned as bytes."""
+    if IS_PYPY:
+        pytest.xfail('[PyPy] fails due to string containing NUL characters')
+    bunch = load_files(load_files_root)
+    # two category directories, one file overall
+    assert len(bunch.target_names) == 2
+    assert len(bunch.filenames) == 1
+    assert bunch.data == [b"Hello World!\n"]
+    assert bunch.DESCR is None
+
+
+def test_load_files_w_categories_desc_and_encoding(
+        test_category_dir_1, test_category_dir_2, load_files_root):
+    """load_files honours ``categories``, ``description`` and ``encoding``."""
+    if IS_PYPY:
+        pytest.xfail('[PyPy] fails due to string containing NUL characters')
+    # The category is the directory name. os.path.basename works with the
+    # platform's path separator, unlike splitting on '/', which returns the
+    # whole path on Windows.
+    category = os.path.basename(os.path.abspath(test_category_dir_1))
+    res = load_files(load_files_root, description="test",
+                     categories=category, encoding="utf-8")
+    assert len(res.filenames) == 1
+    assert len(res.target_names) == 1
+    assert res.DESCR == "test"
+    # with an encoding the content is decoded to str
+    assert res.data == ["Hello World!\n"]
+
+
+def test_load_files_wo_load_content(
+        test_category_dir_1, test_category_dir_2, load_files_root):
+    """With ``load_content=False`` the bunch has no 'data' entry."""
+    bunch = load_files(load_files_root, load_content=False)
+    assert len(bunch.target_names) == 2
+    assert len(bunch.filenames) == 1
+    assert bunch.get('data') is None
+    assert bunch.DESCR is None
+
+
+def test_load_sample_images():
+    """Check the two sample images; only warn when an ImportError occurs."""
+    try:
+        bunch = load_sample_images()
+        assert len(bunch.images) == 2
+        assert len(bunch.filenames) == 2
+        china, flower = bunch.images
+
+        # first pixel of the china image
+        china_pixel = np.array([174, 201, 231], dtype=np.uint8)
+        assert np.all(china[0, 0, :] == china_pixel)
+        # first pixel of the flower image
+        flower_pixel = np.array([2, 19, 13], dtype=np.uint8)
+        assert np.all(flower[0, 0, :] == flower_pixel)
+        assert bunch.DESCR
+    except ImportError:
+        warnings.warn("Could not load sample images, PIL is not available.")
+
+
+def test_load_digits():
+    """Default digits: data of shape (1797, 64) and 10 distinct targets."""
+    bunch = load_digits()
+    n_classes = numpy.unique(bunch.target).size
+    assert bunch.data.shape == (1797, 64)
+    assert n_classes == 10
+
+    # test return_X_y option
+    check_return_X_y(bunch, partial(load_digits))
+
+
+def test_load_digits_n_class_lt_10():
+    """``n_class=9`` gives data of shape (1617, 64) and 9 distinct targets."""
+    bunch = load_digits(n_class=9)
+    n_classes = numpy.unique(bunch.target).size
+    assert bunch.data.shape == (1617, 64)
+    assert n_classes == 9
+
+
+def test_load_sample_image():
+    """Check dtype and shape of 'china.jpg'; warn on ImportError."""
+    try:
+        image = load_sample_image('china.jpg')
+        assert image.dtype == 'uint8'
+        assert image.shape == (427, 640, 3)
+    except ImportError:
+        warnings.warn("Could not load sample images, PIL is not available.")
+
+
+def test_load_missing_sample_image_error():
+    """An unknown sample image name raises AttributeError (needs pillow)."""
+    if not pillow_installed:
+        warnings.warn("Could not load sample images, PIL is not available.")
+        return
+    with pytest.raises(AttributeError):
+        load_sample_image('blop.jpg')
+
+
+def test_load_diabetes():
+    """Check shapes, feature names and DESCR of the diabetes dataset."""
+    res = load_diabetes()
+    assert res.data.shape == (442, 10)
+    # Compare explicitly: ``assert size, 442`` would only use 442 as the
+    # failure message and pass for any non-zero size.
+    assert res.target.size == 442
+    assert len(res.feature_names) == 10
+    assert res.DESCR
+
+    # test return_X_y option
+    check_return_X_y(res, partial(load_diabetes))
+
+
+def test_load_linnerud():
+    """Check shapes, target names, DESCR and file paths of linnerud."""
+    bunch = load_linnerud()
+    for array in (bunch.data, bunch.target):
+        assert array.shape == (20, 3)
+    assert len(bunch.target_names) == 3
+    assert bunch.DESCR
+    for filename in (bunch.data_filename, bunch.target_filename):
+        assert os.path.exists(filename)
+
+    # test return_X_y option
+    check_return_X_y(bunch, partial(load_linnerud))
+
+
+def test_load_iris():
+    """Check shapes, target names, DESCR and file path of iris."""
+    bunch = load_iris()
+    n_samples, n_features, n_classes = 150, 4, 3
+    assert bunch.data.shape == (n_samples, n_features)
+    assert bunch.target.size == n_samples
+    assert bunch.target_names.size == n_classes
+    assert bunch.DESCR
+    assert os.path.exists(bunch.filename)
+
+    # test return_X_y option
+    check_return_X_y(bunch, partial(load_iris))
+
+
+def test_load_wine():
+    """Check shapes, target names and DESCR of wine."""
+    bunch = load_wine()
+    n_samples, n_features, n_classes = 178, 13, 3
+    assert bunch.data.shape == (n_samples, n_features)
+    assert bunch.target.size == n_samples
+    assert bunch.target_names.size == n_classes
+    assert bunch.DESCR
+
+    # test return_X_y option
+    check_return_X_y(bunch, partial(load_wine))
+
+
+def test_load_breast_cancer():
+    """Check shapes, target names, DESCR and file path of breast_cancer."""
+    bunch = load_breast_cancer()
+    n_samples, n_features, n_classes = 569, 30, 2
+    assert bunch.data.shape == (n_samples, n_features)
+    assert bunch.target.size == n_samples
+    assert bunch.target_names.size == n_classes
+    assert bunch.DESCR
+    assert os.path.exists(bunch.filename)
+
+    # test return_X_y option
+    check_return_X_y(bunch, partial(load_breast_cancer))
+
+
+@pytest.mark.parametrize("loader_func, data_dtype, target_dtype", [
+    (load_breast_cancer, np.float64, np.int64),
+    (load_diabetes, np.float64, np.float64),
+    (load_digits, np.float64, np.int64),
+    (load_iris, np.float64, np.int64),
+    (load_linnerud, np.float64, np.float64),
+    (load_wine, np.float64, np.int64),
+])
+def test_toy_dataset_as_frame(loader_func, data_dtype, target_dtype):
+    """Run the shared ``check_as_frame`` helper on each toy loader."""
+    bunch = loader_func()
+    expected_dtypes = dict(expected_data_dtype=data_dtype,
+                           expected_target_dtype=target_dtype)
+    check_as_frame(bunch, partial(loader_func), **expected_dtypes)
+
+
+@pytest.mark.parametrize("loader_func", [
+    load_breast_cancer,
+    load_diabetes,
+    load_digits,
+    load_iris,
+    load_linnerud,
+    load_wine,
+])
+def test_toy_dataset_as_frame_no_pandas(loader_func):
+    # Delegates to the shared helper from test_common for each toy loader.
+    check_pandas_dependency_message(loader_func)
+
+
+def test_load_boston():
+    """Check shapes, feature names, DESCR and file path of boston."""
+    bunch = load_boston()
+    n_samples, n_features = 506, 13
+    assert bunch.data.shape == (n_samples, n_features)
+    assert bunch.target.size == n_samples
+    assert bunch.feature_names.size == n_features
+    assert bunch.DESCR
+    assert os.path.exists(bunch.filename)
+
+    # test return_X_y option
+    check_return_X_y(bunch, partial(load_boston))
+
+
+def test_loads_dumps_bunch():
+    """Attribute assignment on an unpickled Bunch shows up in item access."""
+    original = Bunch(x="x")
+    restored = loads(dumps(original))
+    restored.x = "y"
+    assert restored['x'] == restored.x
+
+
+def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
+    """A stale ``__dict__`` entry must not survive a pickle round trip."""
+    legacy_bunch = Bunch(key='original')
+    # This reproduces a problem when Bunch pickles have been created
+    # with scikit-learn 0.16 and are read with 0.17. Basically there
+    # is a surprising behaviour because reading bunch.key uses
+    # bunch.__dict__ (which is non empty for 0.16 Bunch objects)
+    # whereas assigning into bunch.key uses bunch.__setattr__. See
+    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
+    # more details
+    legacy_bunch.__dict__['key'] = 'set from __dict__'
+    restored = loads(dumps(legacy_bunch))
+
+    def check_both_accessors(expected):
+        assert restored.key == expected
+        assert restored['key'] == expected
+
+    # After loading from pickle the __dict__ should have been ignored
+    check_both_accessors('original')
+    # Making sure that changing the attr does change the value
+    # associated with __getitem__ as well
+    restored.key = 'changed'
+    check_both_accessors('changed')
+
+
+def test_bunch_dir():
+    """``dir`` on a Bunch lists its keys (important for autocomplete)."""
+    listed_names = dir(load_iris())
+    assert "data" in listed_names
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_california_housing.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_california_housing.py
@ -0,0 +1,37 @@
+"""Test the california_housing loader, if the data is available,
+or if specifically requested via environment variable
+(e.g. for travis cron job)."""
+import pytest
+
+from sklearn.datasets.tests.test_common import check_return_X_y
+from functools import partial
+
+
+def test_fetch(fetch_california_housing_fxt):
+    """Check the array shapes returned by the California housing fetcher."""
+    housing = fetch_california_housing_fxt()
+    n_samples = 20640
+    assert housing.data.shape == (n_samples, 8)
+    assert housing.target.shape == (n_samples,)
+
+    # the return_X_y=True form is checked against the bunch
+    check_return_X_y(housing, partial(fetch_california_housing_fxt))
+
+
+def test_fetch_asframe(fetch_california_housing_fxt):
+    """Check the pandas containers returned when ``as_frame=True``.
+
+    The test is skipped when pandas cannot be imported.
+    """
+    pd = pytest.importorskip('pandas')
+    bunch = fetch_california_housing_fxt(as_frame=True)
+    # Assert the attribute exists *before* reading it: the other way round,
+    # a missing ``frame`` would raise on access and this assertion could
+    # never be the one that fails.
+    assert hasattr(bunch, 'frame') is True
+    frame = bunch.frame
+    assert frame.shape == (20640, 9)
+    assert isinstance(bunch.data, pd.DataFrame)
+    assert isinstance(bunch.target, pd.Series)
+
+
+def test_pandas_dependency_message(fetch_california_housing_fxt,
+                                   hide_available_pandas):
+    """``as_frame=True`` must raise an informative error without pandas.
+
+    ``hide_available_pandas`` is requested only for its side effect
+    (presumably it makes pandas unimportable; it is defined outside this
+    file -- TODO confirm).
+    """
+    # pandas has to be imported lazily, and its absence reported clearly.
+    with pytest.raises(
+            ImportError,
+            match=('fetch_california_housing with as_frame=True'
+                   ' requires pandas')):
+        fetch_california_housing_fxt(as_frame=True)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_common.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_common.py
@ -0,0 +1,43 @@
+"""Test loaders for common functionality.
+"""
+import pytest
+import numpy as np
+
+
+def check_pandas_dependency_message(fetch_func):
+    """Check that ``fetch_func(as_frame=True)`` fails clearly without pandas.
+
+    The calling test is skipped when pandas is importable. Otherwise
+    ``fetch_func`` must raise an ImportError whose message names the
+    function (via ``fetch_func.__name__``) and the pandas requirement.
+    """
+    try:
+        import pandas  # noqa
+    except ImportError:
+        pass
+    else:
+        pytest.skip("This test requires pandas to be not installed")
+
+    # pandas must be imported lazily, and an informative error message
+    # raised when it is missing:
+    func_name = fetch_func.__name__
+    expected_msg = '{} with as_frame=True requires pandas'.format(func_name)
+    with pytest.raises(ImportError, match=expected_msg):
+        fetch_func(as_frame=True)
+
+
+def check_return_X_y(bunch, fetch_func_partial):
+    """Check the ``return_X_y=True`` output of a loader against its bunch.
+
+    ``fetch_func_partial`` is called with ``return_X_y=True``; the result
+    must be a tuple whose first two items have the same shapes as
+    ``bunch.data`` and ``bunch.target`` respectively.
+    """
+    result = fetch_func_partial(return_X_y=True)
+    assert isinstance(result, tuple)
+    features = result[0]
+    assert features.shape == bunch.data.shape
+    labels = result[1]
+    assert labels.shape == bunch.target.shape
+
+
+def check_as_frame(bunch, fetch_func_partial,
+                   expected_data_dtype=None, expected_target_dtype=None):
+    """Check the ``as_frame=True`` output of a loader against its bunch.
+
+    The calling test is skipped when pandas cannot be imported. The frame
+    bunch must expose ``frame`` and ``data`` as DataFrames, and ``target``
+    as a DataFrame when it has more than one dimension or a Series
+    otherwise. When an expected dtype is given, every dtype of the
+    corresponding container must equal it.
+    """
+    pd = pytest.importorskip('pandas')
+    framed = fetch_func_partial(as_frame=True)
+    assert hasattr(framed, 'frame')
+    assert isinstance(framed.frame, pd.DataFrame)
+    assert isinstance(framed.data, pd.DataFrame)
+    assert framed.data.shape == bunch.data.shape
+
+    target = framed.target
+    target_container = pd.DataFrame if target.ndim > 1 else pd.Series
+    assert isinstance(target, target_container)
+    assert target.shape[0] == bunch.target.shape[0]
+
+    if expected_data_dtype is not None:
+        assert np.all(framed.data.dtypes == expected_data_dtype)
+    if expected_target_dtype is not None:
+        assert np.all(target.dtypes == expected_target_dtype)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_covtype.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_covtype.py
@ -0,0 +1,25 @@
+"""Test the covtype loader, if the data is available,
+or if specifically requested via environment variable
+(e.g. for travis cron job)."""
+
+from sklearn.datasets.tests.test_common import check_return_X_y
+from functools import partial
+
+
+def test_fetch(fetch_covtype_fxt):
+    """Fetch covtype with two shuffle seeds and compare shapes and totals."""
+    first = fetch_covtype_fxt(shuffle=True, random_state=42)
+    second = fetch_covtype_fxt(shuffle=True, random_state=37)
+
+    X1 = first['data']
+    X2 = second['data']
+    assert X1.shape == (581012, 54)
+    assert X2.shape == X1.shape
+
+    # The grand totals must match whatever the shuffle seed.
+    assert X1.sum() == X2.sum()
+
+    y1 = first['target']
+    y2 = second['target']
+    expected_target_shape = (X1.shape[0],)
+    assert y1.shape == expected_target_shape
+    assert y2.shape == expected_target_shape
+
+    # the return_X_y=True form is checked against the first bunch
+    check_return_X_y(first, partial(fetch_covtype_fxt))
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_kddcup99.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_kddcup99.py
@ -0,0 +1,46 @@
+"""Test the kddcup99 loader, if the data is available,
+or if specifically requested via environment variable
+(e.g. for travis cron job).
+
+Only 'percent10' mode is tested, as the full data
+is too big to use in unit-testing.
+"""
+
+from sklearn.datasets.tests.test_common import check_return_X_y
+from functools import partial
+
+
+def test_percent10(fetch_kddcup99_fxt):
+    """Check the shapes of the default fetch and of every named subset."""
+    full = fetch_kddcup99_fxt()
+    assert full.data.shape == (494021, 41)
+    assert full.target.shape == (494021,)
+
+    # Shuffling must not change the shapes.
+    shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0)
+    assert full.data.shape == shuffled.data.shape
+    assert full.target.shape == shuffled.target.shape
+
+    # (subset name, expected number of samples, expected number of features)
+    expected_subsets = [
+        ('SA', 100655, 41),
+        ('SF', 73237, 4),
+        ('http', 58725, 3),
+        ('smtp', 9571, 3),
+    ]
+    for subset, n_samples, n_features in expected_subsets:
+        data = fetch_kddcup99_fxt(subset)
+        assert data.data.shape == (n_samples, n_features)
+        assert data.target.shape == (n_samples,)
+
+    # ``data`` still holds the 'smtp' subset fetched last in the loop.
+    check_return_X_y(data, partial(fetch_kddcup99_fxt, 'smtp'))
+
+
+def test_shuffle(fetch_kddcup99_fxt):
+    """After shuffling 'SA', a b'normal.' target is among the last 100."""
+    shuffled = fetch_kddcup99_fxt(random_state=0, subset='SA',
+                                  shuffle=True, percent10=True)
+    tail_targets = shuffled.target[-100:]
+    assert any(tail_targets == b'normal.')
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_lfw.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_lfw.py
@ -0,0 +1,196 @@
+"""These tests for the LFW loader require downloading and processing medium-size data
+
+If the data has not been already downloaded by running the examples,
+the tests won't run (skipped).
+
+If the tests are run, the first execution will be long (typically a bit
+more than a couple of minutes) but as the dataset loader is leveraging
+joblib, successive runs will be fast (less than 200ms).
+"""
+
+import random
+import os
+import shutil
+import tempfile
+import numpy as np
+import pytest
+from functools import partial
+from sklearn.externals._pilutil import pillow_installed, imsave
+from sklearn.datasets import fetch_lfw_pairs
+from sklearn.datasets import fetch_lfw_people
+
+from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import SkipTest
+from sklearn.datasets.tests.test_common import check_return_X_y
+
+
+# Paths filled in by setup_module(); the two temporary directories are
+# removed again by teardown_module().
+SCIKIT_LEARN_DATA = None
+SCIKIT_LEARN_EMPTY_DATA = None
+LFW_HOME = None
+
+# Person names used by setup_module() to build the fake LFW folder tree
+# (one sub-folder of random pictures per name).
+FAKE_NAMES = [
+    'Abdelatif_Smith',
+    'Abhati_Kepler',
+    'Camara_Alvaro',
+    'Chen_Dupont',
+    'John_Lee',
+    'Lin_Bauman',
+    'Onur_Lopez',
+]
+
+
+def setup_module():
+    """Test fixture run once and common to all tests of this module.
+
+    Creates two temporary directories: an empty one, and one holding a
+    fake ``lfw_home`` tree with random jpeg pictures for every name in
+    FAKE_NAMES plus the pairing metadata files. Their paths are stored in
+    the module globals SCIKIT_LEARN_EMPTY_DATA, SCIKIT_LEARN_DATA and
+    LFW_HOME.
+
+    Raises SkipTest when PIL is not installed.
+    """
+    if not pillow_installed:
+        raise SkipTest("PIL not installed.")
+
+    global SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA, LFW_HOME
+
+    SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_")
+    LFW_HOME = os.path.join(SCIKIT_LEARN_DATA, 'lfw_home')
+
+    SCIKIT_LEARN_EMPTY_DATA = tempfile.mkdtemp(
+        prefix="scikit_learn_empty_test_")
+
+    if not os.path.exists(LFW_HOME):
+        os.makedirs(LFW_HOME)
+
+    # Both generators are seeded, and the tests of this module assert exact
+    # target arrays, so the order of the random draws below must not change.
+    random_state = random.Random(42)
+    np_rng = np.random.RandomState(42)
+
+    # generate some random jpeg files for each person
+    counts = {}
+    for name in FAKE_NAMES:
+        folder_name = os.path.join(LFW_HOME, 'lfw_funneled', name)
+        if not os.path.exists(folder_name):
+            os.makedirs(folder_name)
+
+        # between 1 and 4 pictures per person (upper bound is exclusive)
+        n_faces = np_rng.randint(1, 5)
+        counts[name] = n_faces
+        for i in range(n_faces):
+            file_path = os.path.join(folder_name, name + '_%04d.jpg' % i)
+            # random 250x250 picture with 3 channels
+            uniface = np_rng.randint(0, 255, size=(250, 250, 3))
+            try:
+                imsave(file_path, uniface)
+            except ImportError:
+                raise SkipTest("PIL not installed")
+
+    # add some random file pollution to test robustness
+    with open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb') as f:
+        f.write(b'Text file to be ignored by the dataset loader.')
+
+    # generate some pairing metadata files using the same format as LFW
+    with open(os.path.join(LFW_HOME, 'pairsDevTrain.txt'), 'wb') as f:
+        f.write(b"10\n")
+        # first five lines: two distinct pictures of the same person, which
+        # requires a person with at least two pictures
+        more_than_two = [name for name, count in counts.items()
+                         if count >= 2]
+        for i in range(5):
+            name = random_state.choice(more_than_two)
+            first, second = random_state.sample(range(counts[name]), 2)
+            f.write(('%s\t%d\t%d\n' % (name, first, second)).encode())
+
+        # last five lines: one picture each of two different persons
+        for i in range(5):
+            first_name, second_name = random_state.sample(FAKE_NAMES, 2)
+            first_index = random_state.choice(np.arange(counts[first_name]))
+            second_index = random_state.choice(np.arange(counts[second_name]))
+            f.write(('%s\t%d\t%s\t%d\n' % (first_name, first_index,
+                                           second_name, second_index)
+                     ).encode())
+
+    with open(os.path.join(LFW_HOME, 'pairsDevTest.txt'), 'wb') as f:
+        f.write(b"Fake place holder that won't be tested")
+
+    with open(os.path.join(LFW_HOME, 'pairs.txt'), 'wb') as f:
+        f.write(b"Fake place holder that won't be tested")
+
+
+def teardown_module():
+    """Test fixture (clean up) run once after all tests of this module.
+
+    Removes the temporary directories created by setup_module(), if any.
+    """
+    for path in (SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA):
+        # The globals are still None when setup_module() never got as far as
+        # creating the directories (e.g. PIL missing); os.path.isdir(None)
+        # would raise TypeError, so skip those entries explicitly.
+        if path is not None and os.path.isdir(path):
+            shutil.rmtree(path)
+
+
+def test_load_empty_lfw_people():
+    """Fetching people from an empty data home without download fails."""
+    fetch_kwargs = dict(data_home=SCIKIT_LEARN_EMPTY_DATA,
+                        download_if_missing=False)
+    with pytest.raises(IOError):
+        fetch_lfw_people(**fetch_kwargs)
+
+
+def test_load_fake_lfw_people():
+    """Load the fake LFW people, first filtered and cropped, then raw."""
+    people = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
+                              min_faces_per_person=3,
+                              download_if_missing=False)
+
+    # The data is cropped around the center as a rectangular bounding box
+    # around the face, and colors are converted to gray levels:
+    n_kept, height, width = 10, 62, 47
+    assert people.images.shape == (n_kept, height, width)
+    assert people.data.shape == (n_kept, height * width)
+
+    # the target is an array of integer person ids
+    assert_array_equal(people.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])
+
+    # the names of the persons are found in the target_names array
+    assert_array_equal(people.target_names,
+                       ['Abdelatif Smith', 'Abhati Kepler', 'Onur Lopez'])
+
+    # It is possible to ask for the original data without any cropping or
+    # color conversion and with no limit on the number of pictures per person
+    fetch_raw = partial(fetch_lfw_people, data_home=SCIKIT_LEARN_DATA,
+                        resize=None, slice_=None, color=True,
+                        download_if_missing=False)
+    people = fetch_raw()
+    assert people.images.shape == (17, 250, 250, 3)
+
+    # ids and class names now cover all the fake persons
+    assert_array_equal(people.target,
+                       [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2])
+    assert_array_equal(people.target_names,
+                       ['Abdelatif Smith', 'Abhati Kepler', 'Camara Alvaro',
+                        'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez'])
+
+    # test return_X_y option with the same raw-data arguments
+    check_return_X_y(people, fetch_raw)
+
+
+def test_load_fake_lfw_people_too_restrictive():
+    """A ``min_faces_per_person`` of 100 on the fake data is an error."""
+    fetch_kwargs = dict(data_home=SCIKIT_LEARN_DATA,
+                        min_faces_per_person=100,
+                        download_if_missing=False)
+    with pytest.raises(ValueError):
+        fetch_lfw_people(**fetch_kwargs)
+
+
+def test_load_empty_lfw_pairs():
+    """Fetching pairs from an empty data home without download fails."""
+    fetch_kwargs = dict(data_home=SCIKIT_LEARN_EMPTY_DATA,
+                        download_if_missing=False)
+    with pytest.raises(IOError):
+        fetch_lfw_pairs(**fetch_kwargs)
+
+
+def test_load_fake_lfw_pairs():
+    """Load the fake LFW pairs, first cropped in gray levels, then raw."""
+    # the target tells whether both pictures show the same person
+    expected_target = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
+    # the meaning of each target value is found in target_names
+    expected_classes = ['Different persons', 'Same person']
+
+    # The data is cropped around the center as a rectangular bounding box
+    # around the face, and colors are converted to gray levels:
+    pairs_train = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA,
+                                  download_if_missing=False)
+    assert pairs_train.pairs.shape == (10, 2, 62, 47)
+    assert_array_equal(pairs_train.target, expected_target)
+    assert_array_equal(pairs_train.target_names, expected_classes)
+
+    # It is possible to ask for the original data without any cropping or
+    # color conversion
+    pairs_train = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA,
+                                  resize=None, slice_=None, color=True,
+                                  download_if_missing=False)
+    assert pairs_train.pairs.shape == (10, 2, 250, 250, 3)
+
+    # the targets and class names are the same as previously
+    assert_array_equal(pairs_train.target, expected_target)
+    assert_array_equal(pairs_train.target_names, expected_classes)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_olivetti_faces.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_olivetti_faces.py
@ -0,0 +1,26 @@
+"""Test Olivetti faces fetcher, if the data is available,
+or if specifically requested via environment variable
+(e.g. for travis cron job)."""
+
+import numpy as np
+
+from sklearn.utils import Bunch
+from sklearn.datasets.tests.test_common import check_return_X_y
+
+from sklearn.utils._testing import assert_array_equal
+
+
+def test_olivetti_faces(fetch_olivetti_faces_fxt):
+    """Check the type, keys, shapes and labels of the Olivetti faces."""
+    faces = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)
+
+    assert isinstance(faces, Bunch)
+    available_keys = faces.keys()
+    for key in ('data', 'images', 'target', 'DESCR'):
+        assert key in available_keys
+
+    n_samples, side = 400, 64
+    assert faces.data.shape == (n_samples, side * side)
+    assert faces.images.shape == (n_samples, side, side)
+    assert faces.target.shape == (n_samples,)
+    # The distinct labels are exactly 0..39 (np.unique returns them sorted).
+    assert_array_equal(np.unique(faces.target), np.arange(40))
+
+    # test the return_X_y option
+    check_return_X_y(faces, fetch_olivetti_faces_fxt)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_openml.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_openml.py
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_rcv1.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_rcv1.py
@ -0,0 +1,65 @@
+"""Test the rcv1 loader, if the data is available,
+or if specifically requested via environment variable
+(e.g. for travis cron job)."""
+
+import scipy.sparse as sp
+import numpy as np
+from functools import partial
+from sklearn.datasets.tests.test_common import check_return_X_y
+from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils._testing import assert_array_equal
+
+
+def test_fetch_rcv1(fetch_rcv1_fxt):
+    """Check sparsity, shapes, category order and shuffling of RCV1.
+
+    The full, unshuffled dataset is compared with a shuffled fetch of the
+    'train' subset: both must hold the same training sample ids, and the
+    feature and target rows of a few given sample ids must agree.
+    """
+    data1 = fetch_rcv1_fxt(shuffle=False)
+    X1, Y1 = data1.data, data1.target
+    cat_list, s1 = data1.target_names.tolist(), data1.sample_id
+
+    # test sparsity
+    assert sp.issparse(X1)
+    assert sp.issparse(Y1)
+    assert 60915113 == X1.data.size
+    assert 2606875 == Y1.data.size
+
+    # test shapes
+    assert (804414, 47236) == X1.shape
+    assert (804414, 103) == Y1.shape
+    assert (804414,) == s1.shape
+    assert 103 == len(cat_list)
+
+    # test ordering of categories
+    first_categories = ['C11', 'C12', 'C13', 'C14', 'C15', 'C151']
+    assert_array_equal(first_categories, cat_list[:6])
+
+    # test number of sample for some categories
+    some_categories = ('GMIL', 'E143', 'CCAT')
+    number_non_zero_in_cat = (5, 1206, 381327)
+    for num, cat in zip(number_non_zero_in_cat, some_categories):
+        j = cat_list.index(cat)
+        assert num == Y1[:, j].data.size
+
+    # test shuffling and subset
+    data2 = fetch_rcv1_fxt(shuffle=True, subset='train', random_state=77)
+    X2, Y2 = data2.data, data2.target
+    s2 = data2.sample_id
+
+    # test return_X_y option
+    fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset='train')
+    check_return_X_y(data2, fetch_func)
+
+    # The first 23149 samples are the training samples
+    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))
+
+    # test some precise values
+    # Convert the id arrays to lists once: doing it inside the loop would
+    # rebuild both full-length lists for every sample id looked up.
+    sample_ids_1 = s1.tolist()
+    sample_ids_2 = s2.tolist()
+    some_sample_ids = (2286, 3274, 14042)
+    for sample_id in some_sample_ids:
+        idx1 = sample_ids_1.index(sample_id)
+        idx2 = sample_ids_2.index(sample_id)
+
+        feature_values_1 = X1[idx1, :].toarray()
+        feature_values_2 = X2[idx2, :].toarray()
+        assert_almost_equal(feature_values_1, feature_values_2)
+
+        target_values_1 = Y1[idx1, :].toarray()
+        target_values_2 = Y2[idx2, :].toarray()
+        assert_almost_equal(target_values_1, target_values_2)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_samples_generator.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_samples_generator.py
@ -0,0 +1,559 @@
+
+from collections import defaultdict
+from functools import partial
+
+import numpy as np
+import pytest
+import scipy.sparse as sp
+
+from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import assert_almost_equal
+from sklearn.utils._testing import assert_array_almost_equal
+from sklearn.utils._testing import assert_raise_message
+
+from sklearn.datasets import make_classification
+from sklearn.datasets import make_multilabel_classification
+from sklearn.datasets import make_hastie_10_2
+from sklearn.datasets import make_regression
+from sklearn.datasets import make_blobs
+from sklearn.datasets import make_friedman1
+from sklearn.datasets import make_friedman2
+from sklearn.datasets import make_friedman3
+from sklearn.datasets import make_low_rank_matrix
+from sklearn.datasets import make_moons
+from sklearn.datasets import make_circles
+from sklearn.datasets import make_sparse_coded_signal
+from sklearn.datasets import make_sparse_uncorrelated
+from sklearn.datasets import make_spd_matrix
+from sklearn.datasets import make_swiss_roll
+from sklearn.datasets import make_s_curve
+from sklearn.datasets import make_biclusters
+from sklearn.datasets import make_checkerboard
+
+from sklearn.utils.validation import assert_all_finite
+
+
+def test_make_classification():
+    """Check shapes, class sizes and row uniqueness of generated data."""
+    weights = [0.1, 0.25]
+    X, y = make_classification(
+        n_samples=100, n_features=20, n_informative=5, n_redundant=1,
+        n_repeated=1, n_classes=3, n_clusters_per_class=1, hypercube=False,
+        shift=None, scale=None, weights=weights, random_state=0)
+
+    # the list passed as ``weights`` must be left untouched
+    assert weights == [0.1, 0.25]
+    assert X.shape == (100, 20), "X shape mismatch"
+    assert y.shape == (100,), "y shape mismatch"
+    assert np.unique(y).shape == (3,), "Unexpected number of classes"
+    assert sum(y == 0) == 10, "Unexpected number of samples in class #0"
+    assert sum(y == 1) == 25, "Unexpected number of samples in class #1"
+    assert sum(y == 2) == 65, "Unexpected number of samples in class #2"
+
+    # Test for n_features > 30
+    X, y = make_classification(
+        n_samples=2000, n_features=31, n_informative=31, n_redundant=0,
+        n_repeated=0, hypercube=True, scale=0.5, random_state=0)
+
+    assert X.shape == (2000, 31), "X shape mismatch"
+    assert y.shape == (2000,), "y shape mismatch"
+    # View every row as one structured element so that np.unique compares
+    # whole rows, then go back to a 2D array to count the distinct rows.
+    n_columns = X.shape[1]
+    rows_as_records = X.view([('', X.dtype)] * n_columns)
+    unique_rows = (np.unique(rows_as_records)
+                   .view(X.dtype)
+                   .reshape(-1, n_columns))
+    assert unique_rows.shape[0] == 2000, "Unexpected number of unique rows"
+
+
+def test_make_classification_informative_features():
+    """Test the construction of informative features in make_classification
+
+    Also tests `n_clusters_per_class`, `n_classes`, `hypercube` and
+    fully-specified `weights`.
+    """
+    # Create very separate clusters; check that vertices are unique and
+    # correspond to classes
+    class_sep = 1e6
+    make = partial(make_classification, class_sep=class_sep, n_redundant=0,
+                   n_repeated=0, flip_y=0, shift=0, scale=1, shuffle=False)
+
+    # The last case used ``np.int(64)``; ``np.int`` was only an alias of the
+    # builtin ``int`` and no longer exists in recent NumPy releases, so the
+    # builtin is used directly.
+    for n_informative, weights, n_clusters_per_class in [(2, [1], 1),
+                                                         (2, [1/3] * 3, 1),
+                                                         (2, [1/4] * 4, 1),
+                                                         (2, [1/2] * 2, 2),
+                                                         (2, [3/4, 1/4], 2),
+                                                         (10, [1/3] * 3, 10),
+                                                         (int(64), [1], 1)
+                                                         ]:
+        n_classes = len(weights)
+        n_clusters = n_classes * n_clusters_per_class
+        n_samples = n_clusters * 50
+
+        for hypercube in (False, True):
+            X, y = make(n_samples=n_samples, n_classes=n_classes,
+                        weights=weights, n_features=n_informative,
+                        n_informative=n_informative,
+                        n_clusters_per_class=n_clusters_per_class,
+                        hypercube=hypercube, random_state=0)
+
+            assert X.shape == (n_samples, n_informative)
+            assert y.shape == (n_samples,)
+
+            # Cluster by sign, viewed as strings to allow uniquing
+            signs = np.sign(X)
+            signs = signs.view(dtype='|S{0}'.format(signs.strides[0]))
+            unique_signs, cluster_index = np.unique(signs,
+                                                    return_inverse=True)
+
+            assert len(unique_signs) == n_clusters, (
+                "Wrong number of clusters, or not in distinct quadrants")
+
+            # Map every class to the set of clusters its samples fall in
+            clusters_by_class = defaultdict(set)
+            for cluster, cls in zip(cluster_index, y):
+                clusters_by_class[cls].add(cluster)
+            for clusters in clusters_by_class.values():
+                assert len(clusters) == n_clusters_per_class, (
+                    "Wrong number of clusters per class")
+            assert (len(clusters_by_class) == n_classes), (
+                "Wrong number of classes")
+
+            assert_array_almost_equal(np.bincount(y) / len(y) // weights,
+                                      [1] * n_classes,
+                                      err_msg="Wrong number of samples "
+                                              "per class")
+
+            # Ensure on vertices of hypercube
+            for cluster in range(len(unique_signs)):
+                centroid = X[cluster_index == cluster].mean(axis=0)
+                if hypercube:
+                    assert_array_almost_equal(np.abs(centroid) / class_sep,
+                                              np.ones(n_informative),
+                                              decimal=5,
+                                              err_msg="Clusters are not "
+                                                      "centered on hypercube "
+                                                      "vertices")
+                else:
+                    # Without ``hypercube`` the same comparison must fail.
+                    with pytest.raises(AssertionError):
+                        assert_array_almost_equal(np.abs(centroid) / class_sep,
+                                                  np.ones(n_informative),
+                                                  decimal=5,
+                                                  err_msg="Clusters should "
+                                                          "not be centered "
+                                                          "on hypercube "
+                                                          "vertices")
+
+    # Both calls below must be rejected with a ValueError.
+    with pytest.raises(ValueError):
+        make(n_features=2, n_informative=2, n_classes=5,
+             n_clusters_per_class=1)
+    with pytest.raises(ValueError):
+        make(n_features=2, n_informative=2, n_classes=3,
+             n_clusters_per_class=2)
+
+
+@pytest.mark.parametrize(
+    'weights, err_type, err_msg',
+    [
+        (bad_weights, ValueError,
+         "Weights specified but incompatible with number of classes.")
+        for bad_weights in (
+            [],
+            [.25, .75, .1],
+            np.array([]),
+            np.array([.25, .75, .1]),
+            np.random.random(3),
+        )
+    ]
+)
+def test_make_classification_weights_type(weights, err_type, err_msg):
+    """Each parametrized ``weights`` value must raise the given error."""
+    with pytest.raises(err_type, match=err_msg):
+        make_classification(weights=weights)
+
+
+@pytest.mark.parametrize("kwargs", [{}, {"n_classes": 3, "n_informative": 3}])
+def test_make_classification_weights_array_or_list_ok(kwargs):
+    """``weights`` as a list or as an array must give the same output."""
+    weights_list = [.1, .9]
+    X_list, y_list = make_classification(weights=weights_list,
+                                         random_state=0, **kwargs)
+    X_array, y_array = make_classification(weights=np.array([.1, .9]),
+                                           random_state=0, **kwargs)
+    assert_almost_equal(X_list, X_array)
+    assert_almost_equal(y_list, y_array)
+
+
+def test_make_multilabel_classification_return_sequences():
+    """Check the label sequences returned with ``return_indicator=False``."""
+    # (allow_unlabeled, smallest expected number of labels per sample)
+    for allow_unlabeled, min_length in [(True, 0), (False, 1)]:
+        X, Y = make_multilabel_classification(
+            n_samples=100, n_features=20, n_classes=3, random_state=0,
+            return_indicator=False, allow_unlabeled=allow_unlabeled)
+        assert X.shape == (100, 20), "X shape mismatch"
+        if not allow_unlabeled:
+            # the largest label over all samples is 2
+            assert max(max(labels) for labels in Y) == 2
+        labels_per_sample = [len(labels) for labels in Y]
+        assert min(labels_per_sample) == min_length
+        assert max(labels_per_sample) <= 3
+
+
+def test_make_multilabel_classification_return_indicator():
+    """Check the indicator output and the returned distributions."""
+    for allow_unlabeled, min_length in [(True, 0), (False, 1)]:
+        X, Y = make_multilabel_classification(
+            n_samples=25, n_features=20, n_classes=3, random_state=0,
+            allow_unlabeled=allow_unlabeled)
+        assert X.shape == (25, 20), "X shape mismatch"
+        assert Y.shape == (25, 3), "Y shape mismatch"
+        samples_per_class = np.sum(Y, axis=0)
+        assert np.all(samples_per_class > min_length)
+
+    # Also test return_distributions and return_indicator with True.
+    # X and Y still hold the output of the last iteration above, i.e. the
+    # one generated with allow_unlabeled=False.
+    X2, Y2, p_c, p_w_c = make_multilabel_classification(
+        n_samples=25, n_features=20, n_classes=3, random_state=0,
+        allow_unlabeled=False, return_distributions=True)
+
+    assert_array_almost_equal(X, X2)
+    assert_array_equal(Y, Y2)
+    assert p_c.shape == (3,)
+    assert_almost_equal(p_c.sum(), 1)
+    assert p_w_c.shape == (20, 3)
+    assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
+
+
+def test_make_multilabel_classification_return_indicator_sparse():
+    """``return_indicator='sparse'`` must give a sparse Y of the usual shape."""
+    for allow_unlabeled in (True, False):
+        X, Y = make_multilabel_classification(
+            n_samples=25, n_features=20, n_classes=3, random_state=0,
+            return_indicator='sparse', allow_unlabeled=allow_unlabeled)
+        assert X.shape == (25, 20), "X shape mismatch"
+        assert Y.shape == (25, 3), "Y shape mismatch"
+        assert sp.issparse(Y)
+
+
+@pytest.mark.parametrize("params, err_msg", [
+    (dict(n_classes=0), "'n_classes' should be an integer"),
+    (dict(length=0), "'length' should be an integer"),
+])
+def test_make_multilabel_classification_valid_arguments(params, err_msg):
+    """A zero ``n_classes`` or ``length`` must raise a ValueError."""
+    with pytest.raises(ValueError, match=err_msg):
+        make_multilabel_classification(**params)
+
+
+def test_make_hastie_10_2():
+    """Check the shapes and the number of classes of the Hastie data."""
+    n_samples = 100
+    X, y = make_hastie_10_2(n_samples=n_samples, random_state=0)
+    assert X.shape == (n_samples, 10), "X shape mismatch"
+    assert y.shape == (n_samples,), "y shape mismatch"
+    distinct_labels = np.unique(y)
+    assert distinct_labels.shape == (2,), "Unexpected number of classes"
+
+
+def test_make_regression():
+    """Check shapes, informative coefficients and noise level."""
+    X, y, coef = make_regression(
+        n_samples=100, n_features=10, n_informative=3, effective_rank=5,
+        coef=True, bias=0.0, noise=1.0, random_state=0)
+
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100,), "y shape mismatch"
+    assert coef.shape == (10,), "coef shape mismatch"
+    n_nonzero_coef = sum(coef != 0.0)
+    assert n_nonzero_coef == 3, "Unexpected number of informative features"
+
+    # Test that y ~= np.dot(X, coef) + bias + N(0, 1.0).
+    residuals = y - np.dot(X, coef)
+    assert_almost_equal(np.std(residuals), 1.0, decimal=1)
+
+    # Test with small number of features.
+    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
+    assert X.shape == (100, 1)
+
+
+def test_make_regression_multitarget():
+    """Check shapes, informative coefficients and noise with 3 targets."""
+    X, y, coef = make_regression(
+        n_samples=100, n_features=10, n_informative=3, n_targets=3,
+        coef=True, noise=1., random_state=0)
+
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100, 3), "y shape mismatch"
+    assert coef.shape == (10, 3), "coef shape mismatch"
+    # the builtin sum adds up the rows, giving one count per target
+    nonzero_per_target = sum(coef != 0.0)
+    assert_array_equal(nonzero_per_target, 3,
+                       "Unexpected number of informative features")
+
+    # Test that y ~= np.dot(X, coef) + bias + N(0, 1.0)
+    residuals = y - np.dot(X, coef)
+    assert_almost_equal(np.std(residuals), 1.0, decimal=1)
+
+
+def test_make_blobs():
+    """Check shapes, number of blobs and the spread around each center."""
+    cluster_stds = np.array([0.05, 0.2, 0.4])
+    cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+    X, y = make_blobs(random_state=0, n_samples=50, n_features=2,
+                      centers=cluster_centers, cluster_std=cluster_stds)
+
+    assert X.shape == (50, 2), "X shape mismatch"
+    assert y.shape == (50,), "y shape mismatch"
+    assert np.unique(y).shape == (3,), "Unexpected number of blobs"
+    # the spread of each blob around its center must match the requested
+    # standard deviation to one decimal
+    for label, (center, expected_std) in enumerate(zip(cluster_centers,
+                                                       cluster_stds)):
+        spread = (X[y == label] - center).std()
+        assert_almost_equal(spread, expected_std, 1, "Unexpected std")
+
+
+def test_make_blobs_n_samples_list():
+    """A list ``n_samples`` gives the number of samples of each blob."""
+    n_samples = [50, 30, 20]
+    X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0)
+
+    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
+    samples_per_blob = np.bincount(y, minlength=len(n_samples))
+    assert all(samples_per_blob == n_samples), \
+        "Incorrect number of samples per blob"
+
+
+def test_make_blobs_n_samples_list_with_centers():
+    """A list ``n_samples`` combined with explicit centers and stds."""
+    n_samples = [20, 20, 20]
+    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+    cluster_stds = np.array([0.05, 0.2, 0.4])
+    X, y = make_blobs(n_samples=n_samples, centers=centers,
+                      cluster_std=cluster_stds, random_state=0)
+
+    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
+    samples_per_blob = np.bincount(y, minlength=len(n_samples))
+    assert all(samples_per_blob == n_samples), \
+        "Incorrect number of samples per blob"
+    # the spread of each blob around its center must match the requested
+    # standard deviation to one decimal
+    for label, (center, expected_std) in enumerate(zip(centers,
+                                                       cluster_stds)):
+        spread = (X[y == label] - center).std()
+        assert_almost_equal(spread, expected_std, 1, "Unexpected std")
+
+
+@pytest.mark.parametrize(
+    "n_samples",
+    [[5, 3, 0],
+     np.array([5, 3, 0]),
+     tuple([5, 3, 0])]
+)
+def test_make_blobs_n_samples_centers_none(n_samples):
+    """A sequence ``n_samples`` must work when ``centers`` is None."""
+    X, y = make_blobs(n_samples=n_samples, centers=None, random_state=0)
+
+    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
+    samples_per_blob = np.bincount(y, minlength=len(n_samples))
+    assert all(samples_per_blob == n_samples), \
+        "Incorrect number of samples per blob"
+
+
+def test_make_blobs_return_centers():
+    # With return_centers=True a third value is returned; it must have one
+    # row per blob and one column per feature.
+    n_samples = [10, 20]
+    n_features = 3
+    result = make_blobs(n_samples=n_samples, n_features=n_features,
+                        return_centers=True, random_state=0)
+    X, y, centers = result
+
+    expected_shape = (len(n_samples), n_features)
+    assert centers.shape == expected_shape
+
+
+def test_make_blobs_error():
+    # Inconsistent arguments must raise ValueError with a message that
+    # echoes the offending values.
+    n_samples = [20, 20, 20]
+    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+    cluster_stds = np.array([0.05, 0.2, 0.4])
+
+    # One center fewer than there are entries in n_samples.
+    too_few_centers = centers[:-1]
+    wrong_centers_msg = ("Length of `n_samples` not consistent "
+                         "with number of centers. Got n_samples = {} "
+                         "and centers = {}".format(n_samples,
+                                                   too_few_centers))
+    assert_raise_message(ValueError, wrong_centers_msg,
+                         make_blobs, n_samples, centers=too_few_centers)
+
+    # One std fewer than there are centers.
+    too_few_stds = cluster_stds[:-1]
+    wrong_std_msg = ("Length of `clusters_std` not consistent with "
+                     "number of centers. Got centers = {} "
+                     "and cluster_std = {}".format(centers, too_few_stds))
+    assert_raise_message(ValueError, wrong_std_msg,
+                         make_blobs, n_samples,
+                         centers=centers, cluster_std=too_few_stds)
+
+    # A scalar `centers` together with a list-valued n_samples.
+    wrong_type_msg = ("Parameter `centers` must be array-like. "
+                      "Got {!r} instead".format(3))
+    assert_raise_message(ValueError, wrong_type_msg,
+                         make_blobs, n_samples, centers=3)
+
+
+def test_make_friedman1():
+    # With noise=0.0 the target must equal the closed-form expression in
+    # the first five feature columns.
+    X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0,
+                          random_state=0)
+
+    assert X.shape == (5, 10), "X shape mismatch"
+    assert y.shape == (5,), "y shape mismatch"
+
+    expected = (10 * np.sin(np.pi * X[:, 0] * X[:, 1])
+                + 20 * (X[:, 2] - 0.5) ** 2
+                + 10 * X[:, 3] + 5 * X[:, 4])
+    assert_array_almost_equal(y, expected)
+
+
+def test_make_friedman2():
+    # With noise=0.0 the target must equal the closed-form expression in
+    # the four feature columns.
+    X, y = make_friedman2(n_samples=5, noise=0.0, random_state=0)
+
+    assert X.shape == (5, 4), "X shape mismatch"
+    assert y.shape == (5,), "y shape mismatch"
+
+    inner = X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])
+    expected = (X[:, 0] ** 2 + inner ** 2) ** 0.5
+    assert_array_almost_equal(y, expected)
+
+
+def test_make_friedman3():
+    # With noise=0.0 the target must equal the closed-form arctan
+    # expression in the four feature columns.
+    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)
+
+    assert X.shape == (5, 4), "X shape mismatch"
+    assert y.shape == (5,), "y shape mismatch"
+
+    numerator = X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])
+    expected = np.arctan(numerator / X[:, 0])
+    assert_array_almost_equal(y, expected)
+
+
+def test_make_low_rank_matrix():
+    # The singular values of the generated matrix must sum to roughly the
+    # requested effective rank.
+    X = make_low_rank_matrix(n_samples=50, n_features=25, effective_rank=5,
+                             tail_strength=0.01, random_state=0)
+
+    assert X.shape == (50, 25), "X shape mismatch"
+
+    from numpy.linalg import svd
+    # Only the singular values are needed, so skip computing U and V.
+    s = svd(X, compute_uv=False)
+    # Compare the absolute deviation: a one-sided `sum(s) - 5 < 0.1` would
+    # also accept a matrix whose singular values sum to far less than 5.
+    assert abs(sum(s) - 5) < 0.1, "X rank is not approximately 5"
+
+
+def test_make_sparse_coded_signal():
+    # Check the shapes of signal Y, dictionary D and code X, the sparsity
+    # of the code, the reconstruction Y == D @ X, and that D has unit-norm
+    # columns.
+    n_nonzero = 3
+    Y, D, X = make_sparse_coded_signal(n_samples=5, n_components=8,
+                                       n_features=10,
+                                       n_nonzero_coefs=n_nonzero,
+                                       random_state=0)
+    assert Y.shape == (10, 5), "Y shape mismatch"
+    assert D.shape == (10, 8), "D shape mismatch"
+    assert X.shape == (8, 5), "X shape mismatch"
+
+    # Every column of X carries exactly the requested number of non-zeros.
+    for code in X.T:
+        assert len(np.flatnonzero(code)) == 3, 'Non-zero coefs mismatch'
+
+    assert_array_almost_equal(np.dot(D, X), Y)
+
+    column_norms = np.sqrt((D ** 2).sum(axis=0))
+    assert_array_almost_equal(column_norms, np.ones(D.shape[1]))
+
+
+def test_make_sparse_uncorrelated():
+    # Shape-only check of the generated design matrix and target.
+    n_samples, n_features = 5, 10
+    X, y = make_sparse_uncorrelated(n_samples=n_samples,
+                                    n_features=n_features, random_state=0)
+
+    assert X.shape == (5, 10), "X shape mismatch"
+    assert y.shape == (5,), "y shape mismatch"
+
+
+def test_make_spd_matrix():
+    # The generated matrix must be square, symmetric and have strictly
+    # positive eigenvalues.
+    X = make_spd_matrix(n_dim=5, random_state=0)
+
+    assert X.shape == (5, 5), "X shape mismatch"
+    assert_array_almost_equal(X, X.T)
+
+    # Use the symmetric solver: it always returns real eigenvalues, whereas
+    # the general `eig` may yield complex ones when X is symmetric only up
+    # to rounding, which would break the `> 0` comparison below. The
+    # eigenvectors are not needed.
+    from numpy.linalg import eigvalsh
+    eigenvalues = eigvalsh(X)
+    assert_array_equal(eigenvalues > 0, np.array([True] * 5),
+                       "X is not positive-definite")
+
+
+def test_make_swiss_roll():
+    # Without noise, the first and third coordinates must equal t*cos(t)
+    # and t*sin(t) for the returned parameter t.
+    X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0)
+
+    assert X.shape == (5, 3), "X shape mismatch"
+    assert t.shape == (5,), "t shape mismatch"
+
+    first_coord, third_coord = X[:, 0], X[:, 2]
+    assert_array_almost_equal(first_coord, t * np.cos(t))
+    assert_array_almost_equal(third_coord, t * np.sin(t))
+
+
+def test_make_s_curve():
+    # Without noise, the first and third coordinates must equal sin(t) and
+    # sign(t) * (cos(t) - 1) for the returned parameter t.
+    X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)
+
+    assert X.shape == (5, 3), "X shape mismatch"
+    assert t.shape == (5,), "t shape mismatch"
+
+    first_coord, third_coord = X[:, 0], X[:, 2]
+    assert_array_almost_equal(first_coord, np.sin(t))
+    assert_array_almost_equal(third_coord, np.sign(t) * (np.cos(t) - 1))
+
+
+def test_make_biclusters():
+    # Shapes, finiteness and seed-reproducibility of make_biclusters.
+    params = dict(shape=(100, 100), n_clusters=4, shuffle=True,
+                  random_state=0)
+
+    X, rows, cols = make_biclusters(**params)
+    assert X.shape == (100, 100), "X shape mismatch"
+    assert rows.shape == (4, 100), "rows shape mismatch"
+    assert cols.shape == (4, 100,), "columns shape mismatch"
+    for output in (X, rows, cols):
+        assert_all_finite(output)
+
+    # A second call with the same random_state must reproduce the data.
+    X2, _, _ = make_biclusters(**params)
+    assert_array_almost_equal(X, X2)
+
+
+def test_make_checkerboard():
+    # Shapes with a tuple n_clusters, finiteness with an int n_clusters,
+    # and seed-reproducibility of make_checkerboard.
+    common = dict(shape=(100, 100), shuffle=True, random_state=0)
+
+    X, rows, cols = make_checkerboard(n_clusters=(20, 5), **common)
+    assert X.shape == (100, 100), "X shape mismatch"
+    assert rows.shape == (100, 100), "rows shape mismatch"
+    assert cols.shape == (100, 100,), "columns shape mismatch"
+
+    X, rows, cols = make_checkerboard(n_clusters=2, **common)
+    for output in (X, rows, cols):
+        assert_all_finite(output)
+
+    # Two calls with the same random_state must produce the same data.
+    X1, _, _ = make_checkerboard(n_clusters=2, **common)
+    X2, _, _ = make_checkerboard(n_clusters=2, **common)
+    assert_array_almost_equal(X1, X2)
+
+
+def test_make_moons():
+    # Every point must be at distance 1 from the center that this test
+    # associates with its label.
+    X, y = make_moons(3, shuffle=False)
+    for point, label in zip(X, y):
+        if label == 0:
+            center = [0.0, 0.0]
+        else:
+            center = [1.0, 0.5]
+        offset = point - center
+        dist_sqr = (offset ** 2).sum()
+        assert_almost_equal(dist_sqr, 1.0,
+                            err_msg="Point is not on expected unit circle")
+
+
+def test_make_moons_unbalanced():
+    # A two-element tuple sets the number of samples per moon.
+    X, y = make_moons(n_samples=(7, 5))
+    n_label0, n_label1 = np.sum(y == 0), np.sum(y == 1)
+    assert n_label0 == 7 and n_label1 == 5, \
+        'Number of samples in a moon is wrong'
+    assert X.shape == (12, 2), "X shape mismatch"
+    assert y.shape == (12,), "y shape mismatch"
+
+    # Sequences whose length is not two are rejected.
+    for bad_n_samples in ([1, 2, 3], (10,)):
+        with pytest.raises(ValueError,
+                           match=r'`n_samples` can be either an int '
+                                 r'or a two-element tuple.'):
+            make_moons(n_samples=bad_n_samples)
+
+
+def test_make_circles():
+    # Check shapes, the radius of each circle and the split of samples
+    # between the circles, then check that out-of-range factors are
+    # rejected.
+    factor = 0.3
+    center = [0.0, 0.0]
+
+    for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]:
+        # Testing odd and even case, because in the past make_circles always
+        # created an even number of samples.
+        X, y = make_circles(n_samples, shuffle=False, noise=None,
+                            factor=factor)
+        assert X.shape == (n_samples, 2), "X shape mismatch"
+        assert y.shape == (n_samples,), "y shape mismatch"
+        for x, label in zip(X, y):
+            dist_sqr = ((x - center) ** 2).sum()
+            # Squared distances are compared: label 0 is expected on the
+            # unit circle, label 1 on the circle of radius `factor`.
+            dist_exp = 1.0 if label == 0 else factor ** 2
+            assert_almost_equal(dist_sqr, dist_exp,
+                                err_msg="Point is not on expected circle")
+
+        assert X[y == 0].shape == (n_outer, 2), (
+            "Samples not correctly distributed across circles.")
+        assert X[y == 1].shape == (n_inner, 2), (
+            "Samples not correctly distributed across circles.")
+
+    # A negative factor and a factor of 1 must both raise.
+    with pytest.raises(ValueError):
+        make_circles(factor=-0.01)
+    with pytest.raises(ValueError):
+        make_circles(factor=1.)
+
+
+def test_make_circles_unbalanced():
+    # A two-element tuple sets the number of samples per circle.
+    X, y = make_circles(n_samples=(2, 8))
+
+    n_label0 = np.sum(y == 0)
+    n_label1 = np.sum(y == 1)
+    assert n_label0 == 2, 'Number of samples in inner circle is wrong'
+    assert n_label1 == 8, 'Number of samples in outer circle is wrong'
+    assert X.shape == (10, 2), "X shape mismatch"
+    assert y.shape == (10,), "y shape mismatch"
+
+    # Sequences whose length is not two are rejected.
+    for bad_n_samples in ([1, 2, 3], (10,)):
+        with pytest.raises(ValueError,
+                           match=r'`n_samples` can be either an int '
+                                 r'or a two-element tuple.'):
+            make_circles(n_samples=bad_n_samples)
--- a/venv/Lib/site-packages/sklearn/datasets/tests/test_svmlight_format.py
+++ b/venv/Lib/site-packages/sklearn/datasets/tests/test_svmlight_format.py
@ -0,0 +1,521 @@
+from bz2 import BZ2File
+import gzip
+from io import BytesIO
+import numpy as np
+import scipy.sparse as sp
+import os
+import shutil
+from tempfile import NamedTemporaryFile
+
+import pytest
+
+from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import assert_array_almost_equal
+from sklearn.utils._testing import fails_if_pypy
+
+import sklearn
+from sklearn.datasets import (load_svmlight_file, load_svmlight_files,
+                              dump_svmlight_file)
+
+# Fixture files live in the "data" directory next to this test module.
+currdir = os.path.dirname(os.path.abspath(__file__))
+datafile, multifile, invalidfile, invalidfile2 = (
+    os.path.join(currdir, "data", fname)
+    for fname in ("svmlight_classification.txt",
+                  "svmlight_multilabel.txt",
+                  "svmlight_invalid.txt",
+                  "svmlight_invalid_order.txt")
+)
+
+# Module-wide pytest mark applied to every test in this file.
+pytestmark = fails_if_pypy
+
+
+def test_load_svmlight_file():
+    X, y = load_svmlight_file(datafile)
+
+    # test X's shape
+    n_rows, n_cols = X.shape
+    assert X.indptr.shape[0] == 7
+    assert n_rows == 6
+    assert n_cols == 21
+    assert y.shape[0] == 6
+
+    # test X's non-zero values
+    expected_nonzero = {(0, 2): 2.5, (0, 10): -5.2, (0, 15): 1.5,
+                        (1, 5): 1.0, (1, 12): -3,
+                        (2, 20): 27}
+    for (row, col), value in expected_nonzero.items():
+        assert X[row, col] == value
+
+    # tests X's zero values
+    for row, col in [(0, 3), (0, 5), (1, 8), (1, 16), (2, 18)]:
+        assert X[row, col] == 0
+
+    # test can change X's values
+    X[0, 2] *= 2
+    assert X[0, 2] == 5
+
+    # test y
+    expected_y = [1, 2, 3, 4, 1, 2]
+    assert_array_equal(y, expected_y)
+
+
+def test_load_svmlight_file_fd():
+    # test loading from file descriptor: the result must match loading the
+    # same file by path.
+    X_path, y_path = load_svmlight_file(datafile)
+
+    descriptor = os.open(datafile, os.O_RDONLY)
+    try:
+        X_fd, y_fd = load_svmlight_file(descriptor)
+    finally:
+        # Release the raw descriptor whether or not loading succeeded.
+        os.close(descriptor)
+
+    assert_array_almost_equal(X_path.data, X_fd.data)
+    assert_array_almost_equal(y_path, y_fd)
+
+
+def test_load_svmlight_file_multilabel():
+    # With multilabel=True the labels come back as one tuple per sample.
+    expected_labels = [(0, 1), (2,), (), (1, 2)]
+    _, y = load_svmlight_file(multifile, multilabel=True)
+    assert y == expected_labels
+
+
+def test_load_svmlight_files():
+    # Loading the same file twice as float32: both copies must be equal
+    # and carry the requested dtype.
+    loaded = load_svmlight_files([datafile] * 2, dtype=np.float32)
+    X_train, y_train, X_test, y_test = loaded
+    assert_array_equal(X_train.toarray(), X_test.toarray())
+    assert_array_almost_equal(y_train, y_test)
+    assert X_train.dtype == np.float32
+    assert X_test.dtype == np.float32
+
+    # Loading it three times as float64: all matrices share that dtype.
+    X1, _, X2, _, X3, _ = load_svmlight_files([datafile] * 3,
+                                              dtype=np.float64)
+    assert X1.dtype == X2.dtype
+    assert X2.dtype == X3.dtype
+    assert X3.dtype == np.float64
+
+
+def test_load_svmlight_file_n_features():
+    # Asking for more features than the file holds widens X accordingly.
+    X, y = load_svmlight_file(datafile, n_features=22)
+
+    # test X'shape
+    n_rows, n_cols = X.shape
+    assert X.indptr.shape[0] == 7
+    assert n_rows == 6
+    assert n_cols == 22
+
+    # test X's non-zero values
+    expected_nonzero = {(0, 2): 2.5, (0, 10): -5.2,
+                        (1, 5): 1.0, (1, 12): -3}
+    for (row, col), value in expected_nonzero.items():
+        assert X[row, col] == value
+
+    # 21 features in file
+    with pytest.raises(ValueError):
+        load_svmlight_file(datafile, n_features=20)
+
+
+def test_load_compressed():
+    # Loading a gzip- or bz2-compressed copy of the data file must give the
+    # same result as loading the plain file.
+    X, y = load_svmlight_file(datafile)
+
+    for suffix, opener in ((".gz", gzip.open), (".bz2", BZ2File)):
+        with NamedTemporaryFile(prefix="sklearn-test", suffix=suffix) as tmp:
+            tmp.close()  # necessary under windows
+            try:
+                with open(datafile, "rb") as f:
+                    with opener(tmp.name, "wb") as fh_out:
+                        shutil.copyfileobj(f, fh_out)
+                X_comp, y_comp = load_svmlight_file(tmp.name)
+            finally:
+                # because we "close" it manually and write to it,
+                # we need to remove it manually -- also when the copy or
+                # the load above fails, so no temp file is left behind.
+                if os.path.exists(tmp.name):
+                    os.remove(tmp.name)
+        assert_array_almost_equal(X.toarray(), X_comp.toarray())
+        assert_array_almost_equal(y, y_comp)
+
+
+def test_load_invalid_file():
+    # Loading the "svmlight_invalid.txt" fixture must raise ValueError.
+    with pytest.raises(ValueError):
+        load_svmlight_file(invalidfile)
+
+
+def test_load_invalid_order_file():
+    # Loading the "svmlight_invalid_order.txt" fixture must raise
+    # ValueError.
+    with pytest.raises(ValueError):
+        load_svmlight_file(invalidfile2)
+
+
+def test_load_zero_based():
+    # The second row uses feature index 0, which must be rejected when
+    # zero_based=False.
+    stream = BytesIO(b"-1 4:1.\n1 0:1\n")
+    with pytest.raises(ValueError):
+        load_svmlight_file(stream, zero_based=False)
+
+
+def test_load_zero_based_auto():
+    # First payload: smallest feature index is 1; second: smallest is 0.
+    starts_at_one = b"-1 1:1 2:2 3:3\n"
+    starts_at_zero = b"-1 0:0 1:1\n"
+
+    # Loaded alone with zero_based="auto", the first payload has 3 columns.
+    X, y = load_svmlight_file(BytesIO(starts_at_one), zero_based="auto")
+    assert X.shape == (1, 3)
+
+    # Loaded together with the second payload, both matrices get 4 columns.
+    streams = [BytesIO(starts_at_one), BytesIO(starts_at_zero)]
+    X1, y1, X2, y2 = load_svmlight_files(streams, zero_based="auto")
+    assert X1.shape == (1, 4)
+    assert X2.shape == (1, 4)
+
+
+def test_load_with_qid():
+    # load svmfile with qid attribute
+    data = b"""
+    3 qid:1 1:0.53 2:0.12
+    2 qid:1 1:0.13 2:0.1
+    7 qid:2 1:0.87 2:0.12"""
+    expected_y = [3, 2, 7]
+    expected_X = [[.53, .12], [.13, .1], [.87, .12]]
+
+    # query_id=False: the qid column is not returned.
+    X, y = load_svmlight_file(BytesIO(data), query_id=False)
+    assert_array_equal(y, expected_y)
+    assert_array_equal(X.toarray(), expected_X)
+
+    # query_id=True: both loaders return (X, y, qid).
+    from_files = load_svmlight_files([BytesIO(data)], query_id=True)
+    from_file = load_svmlight_file(BytesIO(data), query_id=True)
+    for X, y, qid in (from_files, from_file):
+        assert_array_equal(y, expected_y)
+        assert_array_equal(qid, [1, 1, 2])
+        assert_array_equal(X.toarray(), expected_X)
+
+
+@pytest.mark.skip("testing the overflow of 32 bit sparse indexing requires a"
+                  " large amount of memory")
+def test_load_large_qid():
+    """
+    Load a large libsvm / svmlight payload carrying a qid attribute, with
+    two rows per query id, and check the labels and query ids read back.
+    """
+    qid_stop = 40*1000*1000
+    row_pair = "3 qid:{0} 1:0.53 2:0.12\n2 qid:{0} 1:0.13 2:0.1"
+    data = b"\n".join(row_pair.format(i).encode()
+                      for i in range(1, qid_stop))
+    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)
+    assert_array_equal(y[-4:], [3, 2, 3, 2])
+    assert_array_equal(np.unique(qid), np.arange(1, qid_stop))
+
+
+def test_load_invalid_file2():
+    # One invalid file among valid ones makes the multi-file loader raise.
+    paths = [datafile, invalidfile, datafile]
+    with pytest.raises(ValueError):
+        load_svmlight_files(paths)
+
+
+def test_not_a_filename():
+    # in python 3 integers are valid file opening arguments (taken as unix
+    # file descriptors), so a float is used as the invalid argument here
+    not_a_path = .42
+    with pytest.raises(TypeError):
+        load_svmlight_file(not_a_path)
+
+
+def test_invalid_filename():
+    # A string that is passed as a path but cannot be opened raises IOError.
+    bogus_path = "trou pic nic douille"
+    with pytest.raises(IOError):
+        load_svmlight_file(bogus_path)
+
+
+def test_dump():
+    # Round-trip test: dump X/y and load them back, for every combination
+    # of X container (sparse, dense, sliced sparse), y container, index
+    # base and dtype, checking the header comments and the values.
+    X_sparse, y_dense = load_svmlight_file(datafile)
+    X_dense = X_sparse.toarray()
+    y_sparse = sp.csr_matrix(y_dense)
+
+    # slicing a csr_matrix can unsort its .indices, so test that we sort
+    # those correctly
+    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
+    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]
+
+    for X in (X_sparse, X_dense, X_sliced):
+        for y in (y_sparse, y_dense, y_sliced):
+            for zero_based in (True, False):
+                for dtype in [np.float32, np.float64, np.int32, np.int64]:
+                    f = BytesIO()
+                    # we need to pass a comment to get the version info in;
+                    # LibSVM doesn't grok comments so they're not put in by
+                    # default anymore.
+
+                    # NOTE: this rebinds the loop variable `y`, so the
+                    # transposed value is reused by the remaining
+                    # zero_based/dtype iterations for the same y.
+                    if (sp.issparse(y) and y.shape[0] == 1):
+                        # make sure y's shape is: (n_samples, n_labels)
+                        # when it is sparse
+                        y = y.T
+
+                    # Note: with dtype=np.int32 we are performing unsafe casts,
+                    # where X.astype(dtype) overflows. The result is
+                    # then platform dependent and X_dense.astype(dtype) may be
+                    # different from X_sparse.astype(dtype).toarray().
+                    X_input = X.astype(dtype)
+
+                    dump_svmlight_file(X_input, y, f, comment="test",
+                                       zero_based=zero_based)
+                    f.seek(0)
+
+                    # First header line: must mention the library version.
+                    comment = f.readline()
+                    comment = str(comment, "utf-8")
+
+                    assert "scikit-learn %s" % sklearn.__version__ in comment
+
+                    # Second header line: must state the index base used.
+                    comment = f.readline()
+                    comment = str(comment, "utf-8")
+
+                    assert ["one", "zero"][zero_based] + "-based" in comment
+
+                    # Load the remainder of the stream back.
+                    X2, y2 = load_svmlight_file(f, dtype=dtype,
+                                                zero_based=zero_based)
+                    assert X2.dtype == dtype
+                    # The loaded column indices must already be sorted.
+                    assert_array_equal(X2.sorted_indices().indices, X2.indices)
+
+                    X2_dense = X2.toarray()
+                    if sp.issparse(X_input):
+                        X_input_dense = X_input.toarray()
+                    else:
+                        X_input_dense = X_input
+
+                    if dtype == np.float32:
+                        # allow a rounding error at the last decimal place
+                        assert_array_almost_equal(
+                            X_input_dense, X2_dense, 4)
+                        assert_array_almost_equal(
+                            y_dense.astype(dtype, copy=False), y2, 4)
+                    else:
+                        # allow a rounding error at the last decimal place
+                        assert_array_almost_equal(
+                            X_input_dense, X2_dense, 15)
+                        assert_array_almost_equal(
+                            y_dense.astype(dtype, copy=False), y2, 15)
+
+
+def test_dump_multilabel():
+    X = [[1, 0, 3, 0, 5],
+         [0, 0, 0, 0, 0],
+         [0, 5, 0, 1, 0]]
+    y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
+    # make sure it dumps multilabel correctly, for dense and sparse labels
+    expected_lines = [b"1 0:1 2:3 4:5\n",
+                      b"0,2 \n",
+                      b"0,1 1:5 3:1\n"]
+    for y in (y_dense, sp.csr_matrix(y_dense)):
+        f = BytesIO()
+        dump_svmlight_file(X, y, f, multilabel=True)
+        f.seek(0)
+        for expected in expected_lines:
+            assert f.readline() == expected
+
+
+def test_dump_concise():
+    one = 1
+    two = 2.1
+    three = 3.01
+    exact = 1.000000000000001
+    # loses the last decimal place
+    almost = 1.0000000000000001
+    y = [one, two, three, exact, almost]
+    X = [list(y),
+         [1e9, 2e18, 3e27, 0, 0],
+         [0, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0],
+         [0, 0, 0, 0, 0]]
+    f = BytesIO()
+    dump_svmlight_file(X, y, f)
+    f.seek(0)
+    # make sure it's using the most concise format possible
+    expected_lines = [
+        b"1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n",
+        b"2.1 0:1000000000 1:2e+18 2:3e+27\n",
+        b"3.01 \n",
+        b"1.000000000000001 \n",
+        b"1 \n",
+    ]
+    for expected in expected_lines:
+        assert f.readline() == expected
+    f.seek(0)
+    # make sure it's correct too :)
+    X2, y2 = load_svmlight_file(f)
+    assert_array_almost_equal(X, X2.toarray())
+    assert_array_almost_equal(y, y2)
+
+
+def test_dump_comment():
+    X, y = load_svmlight_file(datafile)
+    X = X.toarray()
+
+    def check_roundtrip(comment):
+        # Dump with the given comment, reload, and compare with the input.
+        buf = BytesIO()
+        dump_svmlight_file(X, y, buf, comment=comment, zero_based=False)
+        buf.seek(0)
+        X2, y2 = load_svmlight_file(buf, zero_based=False)
+        assert_array_almost_equal(X, X2.toarray())
+        assert_array_almost_equal(y, y2)
+
+    # A multi-line ASCII comment survives the round trip.
+    check_roundtrip("This is a comment\nspanning multiple lines.")
+
+    # A bytes comment holding non-ASCII UTF-8 data is rejected ...
+    utf8_comment = b"It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
+    with pytest.raises(UnicodeDecodeError):
+        dump_svmlight_file(X, y, BytesIO(), comment=utf8_comment)
+
+    # ... whereas the same text as a str is accepted.
+    check_roundtrip(utf8_comment.decode("utf-8"))
+
+    # A comment containing a NUL character is rejected.
+    with pytest.raises(ValueError):
+        dump_svmlight_file(X, y, BytesIO(), comment="I've got a \0.")
+
+
+def test_dump_invalid():
+    # Labels with a wrong shape or a wrong length must be rejected.
+    X, y = load_svmlight_file(datafile)
+
+    nested_y = [y]       # y wrapped in an extra list
+    truncated_y = y[:-1]  # one label fewer than there are rows in X
+    for bad_y in (nested_y, truncated_y):
+        with pytest.raises(ValueError):
+            dump_svmlight_file(X, bad_y, BytesIO())
+
+
+def test_dump_query_id():
+    # test dumping a file with query_id
+    X_sparse, y = load_svmlight_file(datafile)
+    X = X_sparse.toarray()
+    # Consecutive pairs of rows share one query id: 0, 0, 1, 1, ...
+    query_id = np.arange(X.shape[0]) // 2
+
+    buf = BytesIO()
+    dump_svmlight_file(X, y, buf, query_id=query_id, zero_based=True)
+    buf.seek(0)
+
+    loaded = load_svmlight_file(buf, query_id=True, zero_based=True)
+    X1, y1, query_id1 = loaded
+    assert_array_almost_equal(X, X1.toarray())
+    assert_array_almost_equal(y, y1)
+    assert_array_almost_equal(query_id, query_id1)
+
+
+def test_load_with_long_qid():
+    # load svmfile with longint qid attribute
+    data = b"""
+    1 qid:0 0:1 1:2 2:3
+    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
+    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
+    3 qid:9223372036854775807  0:1440446648 1:72048431380967004 2:236784985"""
+
+    big_row = [1440446648, 72048431380967004, 236784985]
+    expected_X = [[1, 2, 3], big_row, big_row, big_row]
+    expected_y = [1, 0, 0, 3]
+    expected_qid = [0, 72048431380967004,
+                    -9223372036854775807, 9223372036854775807]
+
+    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)
+    assert_array_equal(y, expected_y)
+    assert_array_equal(X.toarray(), expected_X)
+    assert_array_equal(qid, expected_qid)
+
+    # Dump what was loaded and read it back with the query ids.
+    buf = BytesIO()
+    dump_svmlight_file(X, y, buf, query_id=qid, zero_based=True)
+    buf.seek(0)
+    X, y, qid = load_svmlight_file(buf, query_id=True, zero_based=True)
+    assert_array_equal(y, expected_y)
+    assert_array_equal(X.toarray(), expected_X)
+    assert_array_equal(qid, expected_qid)
+
+    # Read the same dump again, this time without the query ids.
+    buf.seek(0)
+    X, y = load_svmlight_file(buf, query_id=False, zero_based=True)
+    assert_array_equal(y, expected_y)
+    assert_array_equal(X.toarray(), expected_X)
+
+
+def test_load_zeros():
+    # An all-zero matrix must round-trip for every zero_based setting.
+    expected_X = sp.csr_matrix(np.zeros(shape=(3, 4)))
+    expected_y = np.array([0, 1, 0])
+    buf = BytesIO()
+    dump_svmlight_file(expected_X, expected_y, buf)
+
+    for zero_based in ('auto', True, False):
+        buf.seek(0)
+        X, y = load_svmlight_file(buf, n_features=4, zero_based=zero_based)
+        assert_array_almost_equal(y, expected_y)
+        assert_array_almost_equal(X.toarray(), expected_X.toarray())
+
+
+@pytest.mark.parametrize('sparsity', [0, 0.1, .5, 0.99, 1])
+@pytest.mark.parametrize('n_samples', [13, 101])
+@pytest.mark.parametrize('n_features', [2, 7, 41])
+def test_load_with_offsets(sparsity, n_samples, n_features):
+    # Dump a random matrix, load it back as three byte ranges and check
+    # that the concatenation of the pieces equals the original.
+    rng = np.random.RandomState(0)
+    X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
+    if sparsity:
+        X[X < sparsity] = 0.0
+    X = sp.csr_matrix(X)
+    y = rng.randint(low=0, high=2, size=n_samples)
+
+    buf = BytesIO()
+    dump_svmlight_file(X, y, buf)
+    buf.seek(0)
+
+    size = len(buf.getvalue())
+
+    # put some marks that are likely to happen anywhere in a row
+    first_cut = size // 3
+    second_cut = 4 * size // 5
+    windows = [dict(offset=0, length=first_cut),
+               dict(offset=first_cut, length=second_cut - first_cut),
+               dict(offset=second_cut)]
+
+    # load the original sparse matrix into 3 independent CSR matrices
+    pieces = [load_svmlight_file(buf, n_features=n_features, **window)
+              for window in windows]
+
+    y_concat = np.concatenate([y_piece for _, y_piece in pieces])
+    X_concat = sp.vstack([X_piece for X_piece, _ in pieces])
+    assert_array_almost_equal(y, y_concat)
+    assert_array_almost_equal(X.toarray(), X_concat.toarray())
+
+
+def test_load_offset_exhaustive_splits():
+    rng = np.random.RandomState(0)
+    dense_rows = np.array([
+        [0, 0, 0, 0, 0, 0],
+        [1, 2, 3, 4, 0, 6],
+        [1, 2, 3, 4, 0, 6],
+        [0, 0, 0, 0, 0, 0],
+        [1, 0, 3, 0, 0, 0],
+        [0, 0, 0, 0, 0, 1],
+        [1, 0, 0, 0, 0, 0],
+    ])
+    X = sp.csr_matrix(dense_rows)
+    n_samples, n_features = X.shape
+    y = rng.randint(low=0, high=2, size=n_samples)
+    query_id = np.arange(n_samples) // 2
+
+    buf = BytesIO()
+    dump_svmlight_file(X, y, buf, query_id=query_id)
+    buf.seek(0)
+
+    size = len(buf.getvalue())
+    shared = dict(n_features=n_features, query_id=True)
+
+    # load the same data in 2 parts with all the possible byte offsets to
+    # locate the split so as to test for particular boundary cases
+    for split_at in range(size):
+        buf.seek(0)
+        head = load_svmlight_file(buf, offset=0, length=split_at, **shared)
+        tail = load_svmlight_file(buf, offset=split_at, length=-1, **shared)
+        X_head, y_head, q_head = head
+        X_tail, y_tail, q_tail = tail
+
+        q_concat = np.concatenate([q_head, q_tail])
+        y_concat = np.concatenate([y_head, y_tail])
+        X_concat = sp.vstack([X_head, X_tail])
+        assert_array_almost_equal(y, y_concat)
+        assert_array_equal(query_id, q_concat)
+        assert_array_almost_equal(X.toarray(), X_concat.toarray())
+
+
+def test_load_with_offsets_error():
+    # Passing offset/length without n_features must raise a ValueError
+    # whose message matches "n_features is required".
+    with pytest.raises(ValueError, match="n_features is required"):
+        load_svmlight_file(datafile, offset=3, length=3)