Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
75
venv/Lib/site-packages/sklearn/datasets/tests/conftest.py
Normal file
75
venv/Lib/site-packages/sklearn/datasets/tests/conftest.py
Normal file
|
@ -0,0 +1,75 @@
|
|||
""" Network tests are only run, if data is already locally available,
|
||||
or if download is specifically requested by environment variable."""
|
||||
import builtins
|
||||
from os import environ
|
||||
import pytest
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
from sklearn.datasets import fetch_20newsgroups_vectorized
|
||||
from sklearn.datasets import fetch_california_housing
|
||||
from sklearn.datasets import fetch_covtype
|
||||
from sklearn.datasets import fetch_kddcup99
|
||||
from sklearn.datasets import fetch_olivetti_faces
|
||||
from sklearn.datasets import fetch_rcv1
|
||||
|
||||
|
||||
def _wrapped_fetch(f, dataset_name):
|
||||
""" Fetch dataset (download if missing and requested by environment) """
|
||||
download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
|
||||
|
||||
def wrapped(*args, **kwargs):
|
||||
kwargs['download_if_missing'] = download_if_missing
|
||||
try:
|
||||
return f(*args, **kwargs)
|
||||
except IOError:
|
||||
pytest.skip("Download {} to run this test".format(dataset_name))
|
||||
return wrapped
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_20newsgroups_fxt():
    """Provide ``fetch_20newsgroups`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups')
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_20newsgroups_vectorized_fxt():
    """Provide ``fetch_20newsgroups_vectorized`` wrapped by ``_wrapped_fetch``.
    """
    fetcher = _wrapped_fetch(
        fetch_20newsgroups_vectorized,
        dataset_name='20newsgroups_vectorized',
    )
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_california_housing_fxt():
    """Provide ``fetch_california_housing`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(
        fetch_california_housing,
        dataset_name='california_housing',
    )
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_covtype_fxt():
    """Provide ``fetch_covtype`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_covtype, dataset_name='covtype')
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_kddcup99_fxt():
    """Provide ``fetch_kddcup99`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99')
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_olivetti_faces_fxt():
    """Provide ``fetch_olivetti_faces`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(
        fetch_olivetti_faces,
        dataset_name='olivetti_faces',
    )
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def fetch_rcv1_fxt():
    """Provide ``fetch_rcv1`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_rcv1, dataset_name='rcv1')
    return fetcher
|
||||
|
||||
|
||||
@pytest.fixture
def hide_available_pandas(monkeypatch):
    """Make ``import pandas`` raise ``ImportError`` via a patched importer.

    ``builtins.__import__`` is replaced through ``monkeypatch`` by a function
    that delegates every import except the one named ``'pandas'``.
    """
    real_import = builtins.__import__

    def _import_without_pandas(name, *args, **kwargs):
        if name != 'pandas':
            return real_import(name, *args, **kwargs)
        raise ImportError()

    monkeypatch.setattr(builtins, '__import__', _import_without_pandas)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,9 @@
|
|||
# comment
|
||||
# note: the next line contains a tab
|
||||
1.0 3:2.5 11:-5.2 16:1.5 # and an inline comment
|
||||
2.0 6:1.0 13:-3
|
||||
# another comment
|
||||
3.0 21:27
|
||||
4.0 2:1.234567890123456e10 # double precision value
|
||||
1.0 # empty line, all zeros
|
||||
2.0 3:0 # explicit zeros
|
|
@ -0,0 +1,3 @@
|
|||
python 2:2.5 10:-5.2 15:1.5
|
||||
2.0 5:1.0 12:-3
|
||||
3.0 20:27
|
|
@ -0,0 +1 @@
|
|||
-1 5:2.5 2:-5.2 15:1.5
|
|
@ -0,0 +1,5 @@
|
|||
# multilabel dataset in SVMlight format
|
||||
1,0 2:2.5 10:-5.2 15:1.5
|
||||
2 5:1.0 12:-3
|
||||
2:3.5 11:26
|
||||
1,2 20:27
|
90
venv/Lib/site-packages/sklearn/datasets/tests/test_20news.py
Normal file
90
venv/Lib/site-packages/sklearn/datasets/tests/test_20news.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
"""Test the 20news downloader, if the data is available,
|
||||
or if specifically requested via environment variable
|
||||
(e.g. for travis cron job)."""
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils._testing import assert_allclose_dense_sparse
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
from sklearn.preprocessing import normalize
|
||||
|
||||
|
||||
def test_20news(fetch_20newsgroups_fxt):
    """Check category filtering and ``return_X_y`` on the 20 newsgroups data.
    """
    full = fetch_20newsgroups_fxt(subset='all', shuffle=False)

    # Ask for the last two categories only, listed in reverse order.
    last_two_reversed = full.target_names[-1:-3:-1]
    reduced = fetch_20newsgroups_fxt(
        subset='all', categories=last_two_reversed, shuffle=False)

    # target_names must keep the ordering used by the full dataset ...
    assert reduced.target_names == full.target_names[-2:]
    # ... and the labels must be exactly 0 and 1.
    assert np.unique(reduced.target).tolist() == [0, 1]

    # filenames, target and data must all have the same length
    n_files = len(reduced.filenames)
    assert n_files == len(reduced.target)
    assert n_files == len(reduced.data)

    # The first reduced entry must equal the first entry of the same
    # category in the full dataset.
    first_reduced = reduced.data[0]
    first_category = reduced.target_names[reduced.target[0]]
    full_label = full.target_names.index(first_category)
    first_full_idx = np.where(full.target == full_label)[0][0]
    assert first_reduced == full.data[first_full_idx]

    # return_X_y must yield objects consistent with the bunch
    X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True)
    assert len(X) == len(full.data)
    assert y.shape == full.target.shape
|
||||
|
||||
|
||||
def test_20news_length_consistency(fetch_20newsgroups_fxt):
    """Check that item access and attribute access agree on lengths.

    This is a non-regression test for a bug present in 0.16.1.
    """
    bunch = fetch_20newsgroups_fxt(subset='all')
    for key in ('data', 'target', 'filenames'):
        assert len(bunch[key]) == len(getattr(bunch, key))
|
||||
|
||||
|
||||
def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt):
    """Check type, shape and dtype of each vectorized 20 newsgroups subset."""
    n_train, n_test, n_features = 11314, 7532, 130107

    def _fetch_and_check(subset, n_samples):
        # Fetch one subset and verify the properties shared by all subsets.
        bunch = fetch_20newsgroups_vectorized_fxt(subset=subset)
        assert sp.isspmatrix_csr(bunch.data)
        assert bunch.data.shape == (n_samples, n_features)
        assert bunch.target.shape[0] == n_samples
        assert bunch.data.dtype == np.float64
        return bunch

    _fetch_and_check("train", n_train)
    test_bunch = _fetch_and_check("test", n_test)

    # return_X_y is exercised on the test subset
    fetch_test = partial(fetch_20newsgroups_vectorized_fxt, subset='test')
    check_return_X_y(test_bunch, fetch_test)

    _fetch_and_check('all', n_train + n_test)
|
||||
|
||||
|
||||
def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
    """Check ``normalize=True`` against ``normalize()`` on the raw data."""
    raw_bunch = fetch_20newsgroups_vectorized_fxt(normalize=False)
    normalized_bunch = fetch_20newsgroups_vectorized_fxt(normalize=True)

    # Only the first 100 rows are compared.
    X_norm = normalized_bunch['data'][:100]
    X_raw = raw_bunch['data'][:100]

    assert_allclose_dense_sparse(X_norm, normalize(X_raw))
    row_norms = np.linalg.norm(X_norm.todense(), axis=1)
    assert np.allclose(row_norms, 1)
|
306
venv/Lib/site-packages/sklearn/datasets/tests/test_base.py
Normal file
306
venv/Lib/site-packages/sklearn/datasets/tests/test_base.py
Normal file
|
@ -0,0 +1,306 @@
|
|||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import warnings
|
||||
import numpy
|
||||
from pickle import loads
|
||||
from pickle import dumps
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import get_data_home
|
||||
from sklearn.datasets import clear_data_home
|
||||
from sklearn.datasets import load_files
|
||||
from sklearn.datasets import load_sample_images
|
||||
from sklearn.datasets import load_sample_image
|
||||
from sklearn.datasets import load_digits
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.datasets import load_linnerud
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.datasets import load_breast_cancer
|
||||
from sklearn.datasets import load_boston
|
||||
from sklearn.datasets import load_wine
|
||||
from sklearn.utils import Bunch
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
from sklearn.datasets.tests.test_common import check_as_frame
|
||||
from sklearn.datasets.tests.test_common import check_pandas_dependency_message
|
||||
|
||||
from sklearn.externals._pilutil import pillow_installed
|
||||
|
||||
from sklearn.utils import IS_PYPY
|
||||
|
||||
|
||||
def _remove_dir(path):
|
||||
if os.path.isdir(path):
|
||||
shutil.rmtree(path)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def data_home(tmpdir_factory):
    """Yield the path of a module-scoped temp directory, removed afterwards.
    """
    tmp_dir = tmpdir_factory.mktemp("scikit_learn_data_home_test")
    tmp_path = str(tmp_dir)
    yield tmp_path
    _remove_dir(tmp_path)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def load_files_root(tmpdir_factory):
    """Yield the path of a module-scoped temp directory, removed afterwards.
    """
    tmp_dir = tmpdir_factory.mktemp("scikit_learn_load_files_test")
    tmp_path = str(tmp_dir)
    yield tmp_path
    _remove_dir(tmp_path)
|
||||
|
||||
|
||||
@pytest.fixture
def test_category_dir_1(load_files_root):
    """Yield a sub-directory of ``load_files_root`` holding one sample file.

    The directory is removed once the test is over.
    """
    category_dir = tempfile.mkdtemp(dir=load_files_root)
    # delete=False keeps the file on disk after it has been closed.
    with tempfile.NamedTemporaryFile(dir=category_dir,
                                     delete=False) as sample:
        sample.write(b"Hello World!\n")
    yield str(category_dir)
    _remove_dir(category_dir)
|
||||
|
||||
|
||||
@pytest.fixture
def test_category_dir_2(load_files_root):
    """Yield an empty sub-directory of ``load_files_root``, removed afterwards.
    """
    category_dir = tempfile.mkdtemp(dir=load_files_root)
    yield str(category_dir)
    _remove_dir(category_dir)
|
||||
|
||||
|
||||
def test_data_home(data_home):
    """Check creation, clearing and re-creation of the data home folder."""
    # get_data_home will point to a pre-existing folder. The result is kept
    # under a separate name so that the comparison with the fixture value
    # below is a real check and not ``x == x``.
    returned_home = get_data_home(data_home=data_home)
    assert returned_home == data_home
    assert os.path.exists(returned_home)

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=returned_home)
    assert not os.path.exists(returned_home)

    # if the folder is missing it will be created again
    returned_home = get_data_home(data_home=returned_home)
    assert os.path.exists(returned_home)
|
||||
|
||||
|
||||
def test_default_empty_load_files(load_files_root):
    """Check ``load_files`` on a root that yields no files and no categories.
    """
    bunch = load_files(load_files_root)
    n_files, n_categories = len(bunch.filenames), len(bunch.target_names)
    assert n_files == 0
    assert n_categories == 0
    assert bunch.DESCR is None
|
||||
|
||||
|
||||
def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    """Check ``load_files`` defaults with two category folders and one file.

    The two category fixtures are requested so that their directories exist
    below ``load_files_root``.
    """
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    bunch = load_files(load_files_root)
    n_files, n_categories = len(bunch.filenames), len(bunch.target_names)
    assert n_files == 1
    assert n_categories == 2
    assert bunch.DESCR is None
    # without an encoding the content is returned as bytes
    assert bunch.data == [b"Hello World!\n"]
|
||||
|
||||
|
||||
def test_load_files_w_categories_desc_and_encoding(
        test_category_dir_1, test_category_dir_2, load_files_root):
    """Check ``load_files`` with a category filter, description and encoding.
    """
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    # os.path.basename works with any path separator, whereas splitting on
    # '/' leaves the whole path untouched on Windows.
    category = os.path.basename(os.path.abspath(test_category_dir_1))
    # Pass the categories as a list: with a bare string the folder names
    # would be matched as substrings of that string.
    res = load_files(load_files_root, description="test",
                     categories=[category], encoding="utf-8")
    assert len(res.filenames) == 1
    assert len(res.target_names) == 1
    assert res.DESCR == "test"
    # with an encoding the content is decoded to str
    assert res.data == ["Hello World!\n"]
|
||||
|
||||
|
||||
def test_load_files_wo_load_content(
        test_category_dir_1, test_category_dir_2, load_files_root):
    """Check that ``load_content=False`` leaves ``data`` out of the bunch."""
    bunch = load_files(load_files_root, load_content=False)
    n_files, n_categories = len(bunch.filenames), len(bunch.target_names)
    assert n_files == 1
    assert n_categories == 2
    assert bunch.DESCR is None
    assert bunch.get('data') is None
|
||||
|
||||
|
||||
def test_load_sample_images():
    """Check the two sample images; only warn when PIL is unavailable."""
    try:
        bunch = load_sample_images()
    except ImportError:
        warnings.warn("Could not load sample images, PIL is not available.")
        return

    assert len(bunch.images) == 2
    assert len(bunch.filenames) == 2

    # Top-left pixel of the china image, then of the flower image.
    expected_first_pixels = ([174, 201, 231], [2, 19, 13])
    for image, pixel in zip(bunch.images, expected_first_pixels):
        assert np.all(image[0, 0, :] == np.array(pixel, dtype=np.uint8))
    assert bunch.DESCR
|
||||
|
||||
|
||||
def test_load_digits():
    """Check the shape and number of classes of the digits dataset."""
    bunch = load_digits()
    assert bunch.data.shape == (1797, 64)
    n_classes = numpy.unique(bunch.target).size
    assert n_classes == 10

    # return_X_y must be consistent with the bunch
    loader = partial(load_digits)
    check_return_X_y(bunch, loader)
|
||||
|
||||
|
||||
def test_load_digits_n_class_lt_10():
    """Check the digits dataset when restricted with ``n_class=9``."""
    bunch = load_digits(n_class=9)
    assert bunch.data.shape == (1617, 64)
    n_classes = numpy.unique(bunch.target).size
    assert n_classes == 9
|
||||
|
||||
|
||||
def test_load_sample_image():
    """Check dtype and shape of ``china.jpg``; only warn without PIL."""
    try:
        china = load_sample_image('china.jpg')
    except ImportError:
        warnings.warn("Could not load sample images, PIL is not available.")
        return
    assert china.dtype == 'uint8'
    assert china.shape == (427, 640, 3)
|
||||
|
||||
|
||||
def test_load_missing_sample_image_error():
    """Check that an unknown sample image name raises ``AttributeError``."""
    if not pillow_installed:
        warnings.warn("Could not load sample images, PIL is not available.")
        return
    with pytest.raises(AttributeError):
        load_sample_image('blop.jpg')
|
||||
|
||||
|
||||
def test_load_diabetes():
    """Check shapes and metadata of the diabetes dataset."""
    res = load_diabetes()
    assert res.data.shape == (442, 10)
    # Compare with ``==``: ``assert size, 442`` would treat 442 as the
    # failure message and pass for any non-zero size.
    assert res.target.size == 442
    assert len(res.feature_names) == 10
    assert res.DESCR

    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
|
||||
|
||||
|
||||
def test_load_linnerud():
    """Check shapes, metadata and source files of the linnerud dataset."""
    bunch = load_linnerud()
    for array in (bunch.data, bunch.target):
        assert array.shape == (20, 3)
    assert len(bunch.target_names) == 3
    assert bunch.DESCR
    for source_file in (bunch.data_filename, bunch.target_filename):
        assert os.path.exists(source_file)

    # return_X_y must be consistent with the bunch
    loader = partial(load_linnerud)
    check_return_X_y(bunch, loader)
|
||||
|
||||
|
||||
def test_load_iris():
    """Check shapes, metadata and source file of the iris dataset."""
    bunch = load_iris()
    n_samples, n_features, n_classes = 150, 4, 3
    assert bunch.data.shape == (n_samples, n_features)
    assert bunch.target.size == n_samples
    assert bunch.target_names.size == n_classes
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # return_X_y must be consistent with the bunch
    loader = partial(load_iris)
    check_return_X_y(bunch, loader)
|
||||
|
||||
|
||||
def test_load_wine():
    """Check shapes and metadata of the wine dataset."""
    bunch = load_wine()
    n_samples, n_features, n_classes = 178, 13, 3
    assert bunch.data.shape == (n_samples, n_features)
    assert bunch.target.size == n_samples
    assert bunch.target_names.size == n_classes
    assert bunch.DESCR

    # return_X_y must be consistent with the bunch
    loader = partial(load_wine)
    check_return_X_y(bunch, loader)
|
||||
|
||||
|
||||
def test_load_breast_cancer():
    """Check shapes, metadata and source file of the breast cancer dataset.
    """
    bunch = load_breast_cancer()
    n_samples, n_features, n_classes = 569, 30, 2
    assert bunch.data.shape == (n_samples, n_features)
    assert bunch.target.size == n_samples
    assert bunch.target_names.size == n_classes
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # return_X_y must be consistent with the bunch
    loader = partial(load_breast_cancer)
    check_return_X_y(bunch, loader)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("loader_func, data_dtype, target_dtype", [
    (load_breast_cancer, np.float64, np.int64),
    (load_diabetes, np.float64, np.float64),
    (load_digits, np.float64, np.int64),
    (load_iris, np.float64, np.int64),
    (load_linnerud, np.float64, np.float64),
    (load_wine, np.float64, np.int64),
])
def test_toy_dataset_as_frame(loader_func, data_dtype, target_dtype):
    """Run ``check_as_frame`` on each toy loader with its expected dtypes."""
    reference_bunch = loader_func()
    loader = partial(loader_func)
    check_as_frame(
        reference_bunch,
        loader,
        expected_data_dtype=data_dtype,
        expected_target_dtype=target_dtype,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "loader_func",
    [load_breast_cancer, load_diabetes, load_digits,
     load_iris, load_linnerud, load_wine],
)
def test_toy_dataset_as_frame_no_pandas(loader_func):
    """Run ``check_pandas_dependency_message`` on each toy loader."""
    check_pandas_dependency_message(loader_func)
|
||||
|
||||
|
||||
def test_load_boston():
    """Check shapes, metadata and source file of the boston dataset."""
    bunch = load_boston()
    n_samples, n_features = 506, 13
    assert bunch.data.shape == (n_samples, n_features)
    assert bunch.target.size == n_samples
    assert bunch.feature_names.size == n_features
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # return_X_y must be consistent with the bunch
    loader = partial(load_boston)
    check_return_X_y(bunch, loader)
|
||||
|
||||
|
||||
def test_loads_dumps_bunch():
    """Check that an unpickled Bunch keeps item and attribute access in sync.
    """
    original = Bunch(x="x")
    pickled = dumps(original)
    restored = loads(pickled)
    restored.x = "y"
    assert restored['x'] == restored.x
|
||||
|
||||
|
||||
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    """Check that a non-empty ``__dict__`` is ignored when unpickling a Bunch.

    Bunch pickles created with scikit-learn 0.16 carry a non-empty
    ``__dict__``. Reading ``bunch.key`` then uses ``__dict__`` whereas
    assigning to ``bunch.key`` uses ``__setattr__``, which gives surprising
    results once such a pickle is read with 0.17. See
    https://github.com/scikit-learn/scikit-learn/issues/6196 for details.
    """
    old_style = Bunch(key='original')
    # Mimic a 0.16 Bunch by writing directly into the instance __dict__.
    old_style.__dict__['key'] = 'set from __dict__'
    restored = loads(dumps(old_style))

    # The value stored in __dict__ must not survive the round trip.
    for value in (restored.key, restored['key']):
        assert value == 'original'

    # Attribute assignment must be visible through item access as well.
    restored.key = 'changed'
    for value in (restored.key, restored['key']):
        assert value == 'changed'
|
||||
|
||||
|
||||
def test_bunch_dir():
    """Check that ``dir()`` on a Bunch lists its keys (used by autocomplete).
    """
    iris = load_iris()
    listed_names = dir(iris)
    assert "data" in listed_names
|
|
@ -0,0 +1,37 @@
|
|||
"""Test the california_housing loader, if the data is available,
|
||||
or if specifically requested via environment variable
|
||||
(e.g. for travis cron job)."""
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
from functools import partial
|
||||
|
||||
|
||||
def test_fetch(fetch_california_housing_fxt):
    """Check the shapes returned by the california housing fetcher."""
    bunch = fetch_california_housing_fxt()
    n_samples, n_features = 20640, 8
    assert (n_samples, n_features) == bunch.data.shape
    assert (n_samples,) == bunch.target.shape

    # return_X_y must be consistent with the bunch
    fetcher = partial(fetch_california_housing_fxt)
    check_return_X_y(bunch, fetcher)
|
||||
|
||||
|
||||
def test_fetch_asframe(fetch_california_housing_fxt):
    """Check the pandas containers returned with ``as_frame=True``.

    The test is skipped when pandas cannot be imported.
    """
    pd = pytest.importorskip('pandas')
    bunch = fetch_california_housing_fxt(as_frame=True)
    # Verify the attribute exists before reading it, otherwise this
    # assertion could never be the one that fails.
    assert hasattr(bunch, 'frame') is True
    frame = bunch.frame
    assert frame.shape == (20640, 9)
    assert isinstance(bunch.data, pd.DataFrame)
    assert isinstance(bunch.target, pd.Series)
|
||||
|
||||
|
||||
def test_pandas_dependency_message(fetch_california_housing_fxt,
                                   hide_available_pandas):
    """Check the error raised by ``as_frame=True`` when pandas is hidden.

    pandas must be imported lazily and its absence must be reported with an
    informative ``ImportError`` message.
    """
    expected_msg = (
        'fetch_california_housing with as_frame=True requires pandas'
    )
    with pytest.raises(ImportError, match=expected_msg):
        fetch_california_housing_fxt(as_frame=True)
|
43
venv/Lib/site-packages/sklearn/datasets/tests/test_common.py
Normal file
43
venv/Lib/site-packages/sklearn/datasets/tests/test_common.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
"""Test loaders for common functionality.
|
||||
"""
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
|
||||
def check_pandas_dependency_message(fetch_func):
    """Check the ``ImportError`` raised by ``as_frame=True`` without pandas.

    The calling test is skipped when pandas can be imported. Otherwise
    ``fetch_func(as_frame=True)`` must raise an ``ImportError`` whose message
    names ``fetch_func`` and states that pandas is required.
    """
    try:
        import pandas  # noqa
    except ImportError:
        pandas_missing = True
    else:
        pandas_missing = False

    if not pandas_missing:
        pytest.skip("This test requires pandas to be not installed")

    # pandas must be imported lazily and its absence reported clearly
    expected_msg = '{} with as_frame=True requires pandas'.format(
        fetch_func.__name__)
    with pytest.raises(ImportError, match=expected_msg):
        fetch_func(as_frame=True)
|
||||
|
||||
|
||||
def check_return_X_y(bunch, fetch_func_partial):
    """Check that ``return_X_y=True`` yields a tuple matching ``bunch``.

    ``fetch_func_partial`` is called with ``return_X_y=True``; the result
    must be a tuple whose first two items have the same shapes as
    ``bunch.data`` and ``bunch.target`` respectively.
    """
    result = fetch_func_partial(return_X_y=True)
    assert isinstance(result, tuple)
    for position, attribute in enumerate(('data', 'target')):
        assert result[position].shape == getattr(bunch, attribute).shape
|
||||
|
||||
|
||||
def check_as_frame(bunch, fetch_func_partial,
                   expected_data_dtype=None, expected_target_dtype=None):
    """Check the pandas containers returned with ``as_frame=True``.

    The calling test is skipped when pandas cannot be imported. ``bunch`` is
    the reference result used for shape comparisons. When an expected dtype
    is given, the dtypes of the corresponding container are checked as well.
    """
    pd = pytest.importorskip('pandas')
    frame_bunch = fetch_func_partial(as_frame=True)

    assert hasattr(frame_bunch, 'frame')
    for container in (frame_bunch.frame, frame_bunch.data):
        assert isinstance(container, pd.DataFrame)
    assert frame_bunch.data.shape == bunch.data.shape

    # A target with more than one dimension must be a DataFrame,
    # otherwise a Series.
    target = frame_bunch.target
    expected_target_type = pd.DataFrame if target.ndim > 1 else pd.Series
    assert isinstance(target, expected_target_type)
    assert target.shape[0] == bunch.target.shape[0]

    if expected_data_dtype is not None:
        assert np.all(frame_bunch.data.dtypes == expected_data_dtype)
    if expected_target_dtype is not None:
        assert np.all(frame_bunch.target.dtypes == expected_target_dtype)
|
|
@ -0,0 +1,25 @@
|
|||
"""Test the covtype loader, if the data is available,
|
||||
or if specifically requested via environment variable
|
||||
(e.g. for travis cron job)."""
|
||||
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
from functools import partial
|
||||
|
||||
|
||||
def test_fetch(fetch_covtype_fxt):
    """Check covtype shapes for two shuffles with different random states."""
    first = fetch_covtype_fxt(shuffle=True, random_state=42)
    second = fetch_covtype_fxt(shuffle=True, random_state=37)

    X_first, X_second = first['data'], second['data']
    assert (581012, 54) == X_first.shape
    assert X_first.shape == X_second.shape

    # both shuffles must sum to the same total
    assert X_first.sum() == X_second.sum()

    n_samples = X_first.shape[0]
    for bunch in (first, second):
        assert (n_samples,) == bunch['target'].shape

    # return_X_y must be consistent with the bunch
    fetcher = partial(fetch_covtype_fxt)
    check_return_X_y(first, fetcher)
|
|
@ -0,0 +1,46 @@
|
|||
"""Test kddcup99 loader, if the data is available,
|
||||
or if specifically requested via environment variable
|
||||
(e.g. for travis cron job).
|
||||
|
||||
Only 'percent10' mode is tested, as the full data
|
||||
is too big to use in unit-testing.
|
||||
"""
|
||||
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
from functools import partial
|
||||
|
||||
|
||||
def test_percent10(fetch_kddcup99_fxt):
    """Check the shapes of the default kddcup99 data and of each subset."""
    bunch = fetch_kddcup99_fxt()
    assert bunch.data.shape == (494021, 41)
    assert bunch.target.shape == (494021,)

    # Shuffling must not change the shapes.
    shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0)
    assert bunch.data.shape == shuffled.data.shape
    assert bunch.target.shape == shuffled.target.shape

    # (subset, expected number of samples, expected number of features)
    expected_shapes = [
        ('SA', 100655, 41),
        ('SF', 73237, 4),
        ('http', 58725, 3),
        ('smtp', 9571, 3),
    ]
    for subset, n_samples, n_features in expected_shapes:
        bunch = fetch_kddcup99_fxt(subset)
        assert bunch.data.shape == (n_samples, n_features)
        assert bunch.target.shape == (n_samples,)

    # ``bunch`` now holds the 'smtp' subset, the last one fetched above.
    fetcher = partial(fetch_kddcup99_fxt, 'smtp')
    check_return_X_y(bunch, fetcher)
|
||||
|
||||
|
||||
def test_shuffle(fetch_kddcup99_fxt):
    """Check that the shuffled 'SA' subset ends with some normal samples."""
    bunch = fetch_kddcup99_fxt(
        random_state=0, subset='SA', shuffle=True, percent10=True)
    last_targets = bunch.target[-100:]
    assert any(last_targets == b'normal.')
|
196
venv/Lib/site-packages/sklearn/datasets/tests/test_lfw.py
Normal file
196
venv/Lib/site-packages/sklearn/datasets/tests/test_lfw.py
Normal file
|
@ -0,0 +1,196 @@
|
|||
"""This test for the LFW require medium-size data downloading and processing
|
||||
|
||||
If the data has not been already downloaded by running the examples,
|
||||
the tests won't run (skipped).
|
||||
|
||||
If the tests are run, the first execution will be long (typically a bit
|
||||
more than a couple of minutes) but as the dataset loader is leveraging
|
||||
joblib, successive runs will be fast (less than 200ms).
|
||||
"""
|
||||
|
||||
import random
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import numpy as np
|
||||
import pytest
|
||||
from functools import partial
|
||||
from sklearn.externals._pilutil import pillow_installed, imsave
|
||||
from sklearn.datasets import fetch_lfw_pairs
|
||||
from sklearn.datasets import fetch_lfw_people
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import SkipTest
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
|
||||
|
||||
# Directory paths shared by the tests; assigned in setup_module().
SCIKIT_LEARN_DATA = SCIKIT_LEARN_EMPTY_DATA = LFW_HOME = None

# Person names for which setup_module() generates image folders.
FAKE_NAMES = [
    'Abdelatif_Smith', 'Abhati_Kepler', 'Camara_Alvaro', 'Chen_Dupont',
    'John_Lee', 'Lin_Bauman', 'Onur_Lopez',
]
|
||||
|
||||
|
||||
def setup_module():
    """Build a fake LFW directory tree once for all tests of this module.

    The module is skipped when PIL is not installed. Both random generators
    are seeded with 42, so the generated images and pairs are reproducible.
    """
    if not pillow_installed:
        raise SkipTest("PIL not installed.")

    global SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA, LFW_HOME

    SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_")
    LFW_HOME = os.path.join(SCIKIT_LEARN_DATA, 'lfw_home')
    SCIKIT_LEARN_EMPTY_DATA = tempfile.mkdtemp(
        prefix="scikit_learn_empty_test_")

    os.makedirs(LFW_HOME, exist_ok=True)

    py_rng = random.Random(42)
    np_rng = np.random.RandomState(42)
    funneled_dir = os.path.join(LFW_HOME, 'lfw_funneled')

    # One folder per person, each holding a few random jpeg files.
    faces_per_person = {}
    for person in FAKE_NAMES:
        person_dir = os.path.join(funneled_dir, person)
        os.makedirs(person_dir, exist_ok=True)

        n_faces = np_rng.randint(1, 5)
        faces_per_person[person] = n_faces
        for face_idx in range(n_faces):
            image_name = person + '_%04d.jpg' % face_idx
            image_path = os.path.join(person_dir, image_name)
            pixels = np_rng.randint(0, 255, size=(250, 250, 3))
            try:
                imsave(image_path, pixels)
            except ImportError:
                raise SkipTest("PIL not installed")

    # A stray file that the dataset loader is expected to ignore.
    with open(os.path.join(funneled_dir, '.test.swp'), 'wb') as stray_file:
        stray_file.write(b'Text file to be ignored by the dataset loader.')

    # Pairing metadata written in the same format as LFW.
    train_pairs_path = os.path.join(LFW_HOME, 'pairsDevTrain.txt')
    with open(train_pairs_path, 'wb') as pairs_file:
        pairs_file.write(b"10\n")

        # Five pairs made of two distinct pictures of the same person.
        eligible = [person for person, n_faces in faces_per_person.items()
                    if n_faces >= 2]
        for _ in range(5):
            person = py_rng.choice(eligible)
            idx_a, idx_b = py_rng.sample(range(faces_per_person[person]), 2)
            line = '%s\t%d\t%d\n' % (person, idx_a, idx_b)
            pairs_file.write(line.encode())

        # Five pairs made of pictures of two different persons.
        for _ in range(5):
            person_a, person_b = py_rng.sample(FAKE_NAMES, 2)
            idx_a = py_rng.choice(np.arange(faces_per_person[person_a]))
            idx_b = py_rng.choice(np.arange(faces_per_person[person_b]))
            line = '%s\t%d\t%s\t%d\n' % (person_a, idx_a, person_b, idx_b)
            pairs_file.write(line.encode())

    # The remaining metadata files only need to exist.
    for placeholder_name in ('pairsDevTest.txt', 'pairs.txt'):
        placeholder_path = os.path.join(LFW_HOME, placeholder_name)
        with open(placeholder_path, 'wb') as placeholder:
            placeholder.write(b"Fake place holder that won't be tested")
|
||||
|
||||
|
||||
def teardown_module():
    """Remove the temporary directories once all tests of the module ran."""
    for tmp_dir in (SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA):
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)
|
||||
|
||||
|
||||
def test_load_empty_lfw_people():
    """Check that an empty data home without download raises ``IOError``."""
    fetch_kwargs = dict(data_home=SCIKIT_LEARN_EMPTY_DATA,
                        download_if_missing=False)
    with pytest.raises(IOError):
        fetch_lfw_people(**fetch_kwargs)
|
||||
|
||||
|
||||
def test_load_fake_lfw_people():
    """Check ``fetch_lfw_people`` on the fake data, processed and raw."""
    processed = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
                                 min_faces_per_person=3,
                                 download_if_missing=False)

    # The data is cropped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert processed.images.shape == (10, 62, 47)
    assert processed.data.shape == (10, 2914)

    # the target is an array of integer person ids
    assert_array_equal(processed.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])

    # names of the persons can be found using the target_names array
    assert_array_equal(
        processed.target_names,
        ['Abdelatif Smith', 'Abhati Kepler', 'Onur Lopez'])

    # It is possible to ask for the original data without any cropping or
    # color conversion and with no limit on the number of pictures per person
    raw_kwargs = dict(data_home=SCIKIT_LEARN_DATA, resize=None,
                      slice_=None, color=True,
                      download_if_missing=False)
    raw = fetch_lfw_people(**raw_kwargs)
    assert raw.images.shape == (17, 250, 250, 3)

    # ids and class names of the unrestricted data
    assert_array_equal(
        raw.target,
        [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2])
    assert_array_equal(
        raw.target_names,
        ['Abdelatif Smith', 'Abhati Kepler', 'Camara Alvaro',
         'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez'])

    # return_X_y must be consistent with the raw bunch
    check_return_X_y(raw, partial(fetch_lfw_people, **raw_kwargs))
|
||||
|
||||
|
||||
def test_load_fake_lfw_people_too_restrictive():
    """Requesting ``min_faces_per_person=100`` must raise ValueError."""
    with pytest.raises(ValueError):
        fetch_lfw_people(
            data_home=SCIKIT_LEARN_DATA,
            min_faces_per_person=100,
            download_if_missing=False,
        )
|
||||
|
||||
|
||||
def test_load_empty_lfw_pairs():
    """fetch_lfw_pairs must raise IOError for SCIKIT_LEARN_EMPTY_DATA.

    Downloading is disabled, so the loader cannot fall back to the network.
    """
    fetch_kwargs = dict(data_home=SCIKIT_LEARN_EMPTY_DATA,
                        download_if_missing=False)
    with pytest.raises(IOError):
        fetch_lfw_pairs(**fetch_kwargs)
|
||||
|
||||
|
||||
def test_load_fake_lfw_pairs():
    """Check shapes, targets and class names returned by fetch_lfw_pairs.

    The loader is called with its default processing and then with resizing
    and slicing disabled and colors kept; targets and class names are
    expected to be the same in both cases.
    """
    default_pairs = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA,
                                    download_if_missing=False)

    # The data is cropped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert default_pairs.pairs.shape == (10, 2, 62, 47)

    # The target says whether the two pictures show the same person.
    assert_array_equal(default_pairs.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    # Names of the classes can be found using the target_names array.
    expected_classes = ['Different persons', 'Same person']
    assert_array_equal(default_pairs.target_names, expected_classes)

    # It is possible to ask for the original data without any cropping or
    # color conversion.
    raw_pairs = fetch_lfw_pairs(
        data_home=SCIKIT_LEARN_DATA,
        resize=None,
        slice_=None,
        color=True,
        download_if_missing=False,
    )
    assert raw_pairs.pairs.shape == (10, 2, 250, 250, 3)

    # The ids and class names are the same as previously.
    assert_array_equal(raw_pairs.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    assert_array_equal(raw_pairs.target_names, expected_classes)
|
|
@ -0,0 +1,26 @@
|
|||
"""Test Olivetti faces fetcher, if the data is available,
|
||||
or if specifically requested via environment variable
|
||||
(e.g. for travis cron job)."""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils import Bunch
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
|
||||
def test_olivetti_faces(fetch_olivetti_faces_fxt):
    """Check the Bunch returned by the Olivetti faces fetcher fixture.

    Verifies the container type, the presence of the expected keys, the
    array shapes, that the targets cover the integers 0..39, and the
    ``return_X_y`` option.
    """
    faces = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)

    assert isinstance(faces, Bunch)
    available_keys = faces.keys()
    for key in ('data', 'images', 'target', 'DESCR'):
        assert key in available_keys

    assert faces.data.shape == (400, 4096)
    assert faces.images.shape == (400, 64, 64)
    assert faces.target.shape == (400,)

    distinct_targets = np.unique(np.sort(faces.target))
    assert_array_equal(distinct_targets, np.arange(40))

    # test the return_X_y option
    check_return_X_y(faces, fetch_olivetti_faces_fxt)
|
1192
venv/Lib/site-packages/sklearn/datasets/tests/test_openml.py
Normal file
1192
venv/Lib/site-packages/sklearn/datasets/tests/test_openml.py
Normal file
File diff suppressed because it is too large
Load diff
65
venv/Lib/site-packages/sklearn/datasets/tests/test_rcv1.py
Normal file
65
venv/Lib/site-packages/sklearn/datasets/tests/test_rcv1.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
"""Test the rcv1 loader, if the data is available,
|
||||
or if specifically requested via environment variable
|
||||
(e.g. for travis cron job)."""
|
||||
|
||||
import scipy.sparse as sp
|
||||
import numpy as np
|
||||
from functools import partial
|
||||
from sklearn.datasets.tests.test_common import check_return_X_y
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
|
||||
|
||||
def test_fetch_rcv1(fetch_rcv1_fxt):
    """Check the RCV1 loader through the ``fetch_rcv1_fxt`` fixture.

    The full, unshuffled dataset is checked for sparsity, sizes, shapes,
    category ordering and per-category sample counts. A shuffled 'train'
    subset is then fetched and compared row by row against the full data
    for a few sample ids; the ``return_X_y`` option is checked as well.
    """
    data1 = fetch_rcv1_fxt(shuffle=False)
    X1, Y1 = data1.data, data1.target
    cat_list, s1 = data1.target_names.tolist(), data1.sample_id

    # test sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert 60915113 == X1.data.size
    assert 2606875 == Y1.data.size

    # test shapes
    assert (804414, 47236) == X1.shape
    assert (804414, 103) == Y1.shape
    assert (804414,) == s1.shape
    assert 103 == len(cat_list)

    # test ordering of categories
    first_categories = ['C11', 'C12', 'C13', 'C14', 'C15', 'C151']
    assert_array_equal(first_categories, cat_list[:6])

    # test number of sample for some categories
    some_categories = ('GMIL', 'E143', 'CCAT')
    number_non_zero_in_cat = (5, 1206, 381327)
    for num, cat in zip(number_non_zero_in_cat, some_categories):
        j = cat_list.index(cat)
        assert num == Y1[:, j].data.size

    # test shuffling and subset
    data2 = fetch_rcv1_fxt(shuffle=True, subset='train', random_state=77)
    X2, Y2 = data2.data, data2.target
    s2 = data2.sample_id

    # test return_X_y option
    fetch_func = partial(fetch_rcv1_fxt, shuffle=False, subset='train')
    check_return_X_y(data2, fetch_func)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # test some precise values
    # Convert the id arrays to lists once: doing it inside the loop would
    # rebuild both lists for every sample id looked up.
    ids1 = s1.tolist()
    ids2 = s2.tolist()
    some_sample_ids = (2286, 3274, 14042)
    for sample_id in some_sample_ids:
        idx1 = ids1.index(sample_id)
        idx2 = ids2.index(sample_id)

        feature_values_1 = X1[idx1, :].toarray()
        feature_values_2 = X2[idx2, :].toarray()
        assert_almost_equal(feature_values_1, feature_values_2)

        target_values_1 = Y1[idx1, :].toarray()
        target_values_2 = Y2[idx2, :].toarray()
        assert_almost_equal(target_values_1, target_values_2)
|
|
@ -0,0 +1,559 @@
|
|||
|
||||
from collections import defaultdict
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_raise_message
|
||||
|
||||
from sklearn.datasets import make_classification
|
||||
from sklearn.datasets import make_multilabel_classification
|
||||
from sklearn.datasets import make_hastie_10_2
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.datasets import make_friedman1
|
||||
from sklearn.datasets import make_friedman2
|
||||
from sklearn.datasets import make_friedman3
|
||||
from sklearn.datasets import make_low_rank_matrix
|
||||
from sklearn.datasets import make_moons
|
||||
from sklearn.datasets import make_circles
|
||||
from sklearn.datasets import make_sparse_coded_signal
|
||||
from sklearn.datasets import make_sparse_uncorrelated
|
||||
from sklearn.datasets import make_spd_matrix
|
||||
from sklearn.datasets import make_swiss_roll
|
||||
from sklearn.datasets import make_s_curve
|
||||
from sklearn.datasets import make_biclusters
|
||||
from sklearn.datasets import make_checkerboard
|
||||
|
||||
from sklearn.utils.validation import assert_all_finite
|
||||
|
||||
|
||||
def test_make_classification():
    """Check output shapes and class balance of make_classification.

    Also checks that the ``weights`` list passed in is left unchanged and
    that a 31-feature dataset is made of unique rows.
    """
    weights = [0.1, 0.25]
    X, y = make_classification(
        n_samples=100, n_features=20, n_informative=5,
        n_redundant=1, n_repeated=1, n_classes=3,
        n_clusters_per_class=1, hypercube=False,
        shift=None, scale=None, weights=weights,
        random_state=0,
    )

    # The caller's list must not have been modified.
    assert weights == [0.1, 0.25]
    assert X.shape == (100, 20), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert np.unique(y).shape == (3,), "Unexpected number of classes"

    expected_counts = (
        (0, 10, "Unexpected number of samples in class #0"),
        (1, 25, "Unexpected number of samples in class #1"),
        (2, 65, "Unexpected number of samples in class #2"),
    )
    for label, count, message in expected_counts:
        assert sum(y == label) == count, message

    # Test for n_features > 30
    X, y = make_classification(
        n_samples=2000, n_features=31, n_informative=31,
        n_redundant=0, n_repeated=0, hypercube=True,
        scale=0.5, random_state=0,
    )

    assert X.shape == (2000, 31), "X shape mismatch"
    assert y.shape == (2000,), "y shape mismatch"

    # View each row as one structured record so np.unique compares whole
    # rows, then convert back to a 2-D array to count them.
    row_dtype = [('', X.dtype)] * X.shape[1]
    unique_rows = (np.unique(X.view(row_dtype))
                   .view(X.dtype)
                   .reshape(-1, X.shape[1]))
    assert unique_rows.shape[0] == 2000, "Unexpected number of unique rows"
|
||||
|
||||
|
||||
def test_make_classification_informative_features():
    """Test the construction of informative features in make_classification

    Also tests `n_clusters_per_class`, `n_classes`, `hypercube` and
    fully-specified `weights`.
    """
    # Create very separate clusters; check that vertices are unique and
    # correspond to classes
    class_sep = 1e6
    make = partial(make_classification, class_sep=class_sep, n_redundant=0,
                   n_repeated=0, flip_y=0, shift=0, scale=1, shuffle=False)

    # `np.int` was only an alias of the builtin `int`; it was deprecated in
    # NumPy 1.20 and later removed, so the plain integer is used instead.
    for n_informative, weights, n_clusters_per_class in [(2, [1], 1),
                                                         (2, [1/3] * 3, 1),
                                                         (2, [1/4] * 4, 1),
                                                         (2, [1/2] * 2, 2),
                                                         (2, [3/4, 1/4], 2),
                                                         (10, [1/3] * 3, 10),
                                                         (64, [1], 1)
                                                         ]:
        n_classes = len(weights)
        n_clusters = n_classes * n_clusters_per_class
        n_samples = n_clusters * 50

        for hypercube in (False, True):
            X, y = make(n_samples=n_samples, n_classes=n_classes,
                        weights=weights, n_features=n_informative,
                        n_informative=n_informative,
                        n_clusters_per_class=n_clusters_per_class,
                        hypercube=hypercube, random_state=0)

            assert X.shape == (n_samples, n_informative)
            assert y.shape == (n_samples,)

            # Cluster by sign, viewed as strings to allow uniquing.
            # The view has one column; ravel it so that the inverse index
            # returned by np.unique is 1-D on every NumPy version.
            signs = np.sign(X)
            signs = signs.view(dtype='|S{0}'.format(signs.strides[0])).ravel()
            unique_signs, cluster_index = np.unique(signs,
                                                    return_inverse=True)

            assert len(unique_signs) == n_clusters, (
                "Wrong number of clusters, or not in distinct quadrants")

            clusters_by_class = defaultdict(set)
            for cluster, cls in zip(cluster_index, y):
                clusters_by_class[cls].add(cluster)
            for clusters in clusters_by_class.values():
                assert len(clusters) == n_clusters_per_class, (
                    "Wrong number of clusters per class")
            assert (len(clusters_by_class) == n_classes), (
                "Wrong number of classes")

            assert_array_almost_equal(np.bincount(y) / len(y) // weights,
                                      [1] * n_classes,
                                      err_msg="Wrong number of samples "
                                              "per class")

            # Ensure on vertices of hypercube
            for cluster in range(len(unique_signs)):
                centroid = X[cluster_index == cluster].mean(axis=0)
                if hypercube:
                    assert_array_almost_equal(np.abs(centroid) / class_sep,
                                              np.ones(n_informative),
                                              decimal=5,
                                              err_msg="Clusters are not "
                                                      "centered on hypercube "
                                                      "vertices")
                else:
                    # Without hypercube the same check is expected to fail.
                    with pytest.raises(AssertionError):
                        assert_array_almost_equal(np.abs(centroid) / class_sep,
                                                  np.ones(n_informative),
                                                  decimal=5,
                                                  err_msg="Clusters should "
                                                          "not be centered "
                                                          "on hypercube "
                                                          "vertices")

    # These class/cluster combinations with 2 informative features are
    # expected to be rejected with a ValueError.
    with pytest.raises(ValueError):
        make(n_features=2, n_informative=2, n_classes=5,
             n_clusters_per_class=1)
    with pytest.raises(ValueError):
        make(n_features=2, n_informative=2, n_classes=3,
             n_clusters_per_class=2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'weights, err_type, err_msg',
    [
        (bad_weights, ValueError,
         "Weights specified but incompatible with number of classes.")
        for bad_weights in (
            [],
            [.25, .75, .1],
            np.array([]),
            np.array([.25, .75, .1]),
            np.random.random(3),
        )
    ]
)
def test_make_classification_weights_type(weights, err_type, err_msg):
    """Each parametrized ``weights`` value must be rejected.

    make_classification is called with default arguments apart from
    ``weights`` and must raise ``err_type`` with a message matching
    ``err_msg``.
    """
    with pytest.raises(err_type, match=err_msg):
        make_classification(weights=weights)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kwargs", [{}, {"n_classes": 3, "n_informative": 3}])
def test_make_classification_weights_array_or_list_ok(kwargs):
    """Weights given as a list or as an ndarray must give the same data."""
    weights_as_list = [.1, .9]
    X_list, y_list = make_classification(
        weights=weights_as_list, random_state=0, **kwargs)
    X_array, y_array = make_classification(
        weights=np.array([.1, .9]), random_state=0, **kwargs)

    assert_almost_equal(X_list, X_array)
    assert_almost_equal(y_list, y_array)
|
||||
|
||||
|
||||
def test_make_multilabel_classification_return_sequences():
    """Check label sequences returned when ``return_indicator=False``.

    Runs once with and once without unlabeled samples allowed and checks
    the shape of X and the bounds on the per-sample label sequences.
    """
    for allow_unlabeled, min_length in ((True, 0), (False, 1)):
        X, Y = make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=3, random_state=0,
            return_indicator=False, allow_unlabeled=allow_unlabeled,
        )
        assert X.shape == (100, 20), "X shape mismatch"
        if not allow_unlabeled:
            # Every sample has at least one label here, so max() is safe.
            assert max([max(labels) for labels in Y]) == 2

        sequence_lengths = [len(labels) for labels in Y]
        assert min(sequence_lengths) == min_length
        assert max(sequence_lengths) <= 3
|
||||
|
||||
|
||||
def test_make_multilabel_classification_return_indicator():
    """Check the indicator-matrix output of make_multilabel_classification.

    For both values of ``allow_unlabeled`` the shapes of X and Y are
    checked, then the call is repeated with ``return_distributions=True``
    and must yield the same X and Y plus two distributions whose columns
    sum to one.
    """
    for allow_unlabeled, min_length in ((True, 0), (False, 1)):
        common = dict(n_samples=25, n_features=20, n_classes=3,
                      random_state=0, allow_unlabeled=allow_unlabeled)

        X, Y = make_multilabel_classification(**common)
        assert X.shape == (25, 20), "X shape mismatch"
        assert Y.shape == (25, 3), "Y shape mismatch"
        assert np.all(np.sum(Y, axis=0) > min_length)

        # Also test return_distributions and return_indicator with True
        X2, Y2, p_c, p_w_c = make_multilabel_classification(
            return_distributions=True, **common)

        assert_array_almost_equal(X, X2)
        assert_array_equal(Y, Y2)
        assert p_c.shape == (3,)
        assert_almost_equal(p_c.sum(), 1)
        assert p_w_c.shape == (20, 3)
        assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
|
||||
|
||||
|
||||
def test_make_multilabel_classification_return_indicator_sparse():
    """``return_indicator='sparse'`` must give a sparse Y of shape (25, 3)."""
    # min_length is part of the original iteration but is not used here.
    for allow_unlabeled, min_length in ((True, 0), (False, 1)):
        X, Y = make_multilabel_classification(
            n_samples=25, n_features=20, n_classes=3, random_state=0,
            return_indicator='sparse', allow_unlabeled=allow_unlabeled,
        )
        assert X.shape == (25, 20), "X shape mismatch"
        assert Y.shape == (25, 3), "Y shape mismatch"
        assert sp.issparse(Y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "params, err_msg",
    [
        (dict(n_classes=0), "'n_classes' should be an integer"),
        (dict(length=0), "'length' should be an integer"),
    ]
)
def test_make_multilabel_classification_valid_arguments(params, err_msg):
    """A zero ``n_classes`` or ``length`` must raise a matching ValueError."""
    with pytest.raises(ValueError, match=err_msg):
        make_multilabel_classification(**params)
|
||||
|
||||
|
||||
def test_make_hastie_10_2():
    """Check output shapes and the number of classes of make_hastie_10_2."""
    n_samples = 100
    X, y = make_hastie_10_2(n_samples=n_samples, random_state=0)

    assert X.shape == (n_samples, 10), "X shape mismatch"
    assert y.shape == (n_samples,), "y shape mismatch"
    distinct_labels = np.unique(y)
    assert distinct_labels.shape == (2,), "Unexpected number of classes"
|
||||
|
||||
|
||||
def test_make_regression():
    """Check shapes, coefficient sparsity and noise level of make_regression.

    A second call with a single feature only checks the shape of X.
    """
    X, y, coef = make_regression(
        n_samples=100, n_features=10, n_informative=3,
        effective_rank=5, coef=True, bias=0.0,
        noise=1.0, random_state=0,
    )

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert coef.shape == (10,), "coef shape mismatch"
    assert sum(coef != 0.0) == 3, "Unexpected number of informative features"

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    residual = y - np.dot(X, coef)
    assert_almost_equal(np.std(residual), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
    assert X.shape == (100, 1)
|
||||
|
||||
|
||||
def test_make_regression_multitarget():
    """Check make_regression with three targets.

    Verifies the shapes of X, y and the coefficient matrix, the number of
    non-zero coefficients per target, and the noise level of the residual.
    """
    X, y, coef = make_regression(
        n_samples=100, n_features=10, n_informative=3,
        n_targets=3, coef=True, noise=1., random_state=0,
    )

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, 3), "y shape mismatch"
    assert coef.shape == (10, 3), "coef shape mismatch"

    # The builtin sum adds the rows of the boolean mask, giving one count
    # of non-zero coefficients per target column.
    nonzero_per_target = sum(coef != 0.0)
    assert_array_equal(nonzero_per_target, 3,
                       "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    residual = y - np.dot(X, coef)
    assert_almost_equal(np.std(residual), 1.0, decimal=1)
|
||||
|
||||
|
||||
def test_make_blobs():
    """Check shapes, blob count and per-blob spread of make_blobs."""
    cluster_stds = np.array([0.05, 0.2, 0.4])
    cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    X, y = make_blobs(
        random_state=0, n_samples=50, n_features=2,
        centers=cluster_centers, cluster_std=cluster_stds,
    )

    assert X.shape == (50, 2), "X shape mismatch"
    assert y.shape == (50,), "y shape mismatch"
    assert np.unique(y).shape == (3,), "Unexpected number of blobs"

    # The spread of each blob around its center must match the requested
    # standard deviation to one decimal.
    for blob, (ctr, std) in enumerate(zip(cluster_centers, cluster_stds)):
        offsets = X[y == blob] - ctr
        assert_almost_equal(offsets.std(), std, 1, "Unexpected std")
|
||||
|
||||
|
||||
def test_make_blobs_n_samples_list():
    """``n_samples`` given as a list must set the size of each blob."""
    n_samples = [50, 30, 20]
    X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    blob_sizes = np.bincount(y, minlength=len(n_samples))
    assert all(blob_sizes == n_samples), \
        "Incorrect number of samples per blob"
|
||||
|
||||
|
||||
def test_make_blobs_n_samples_list_with_centers():
    """Check make_blobs with a list of sizes and explicit centers.

    Verifies the shape of X, the number of samples per blob and the spread
    of each blob around its center.
    """
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])
    X, y = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=cluster_stds, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    blob_sizes = np.bincount(y, minlength=len(n_samples))
    assert all(blob_sizes == n_samples), \
        "Incorrect number of samples per blob"

    for blob, (ctr, std) in enumerate(zip(centers, cluster_stds)):
        offsets = X[y == blob] - ctr
        assert_almost_equal(offsets.std(), std, 1, "Unexpected std")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "n_samples",
    [
        [5, 3, 0],
        np.array([5, 3, 0]),
        (5, 3, 0),
    ]
)
def test_make_blobs_n_samples_centers_none(n_samples):
    """A list, array or tuple ``n_samples`` must work with ``centers=None``."""
    X, y = make_blobs(n_samples=n_samples, centers=None, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    blob_sizes = np.bincount(y, minlength=len(n_samples))
    assert all(blob_sizes == n_samples), \
        "Incorrect number of samples per blob"
|
||||
|
||||
|
||||
def test_make_blobs_return_centers():
    """``return_centers=True`` must return one center per blob."""
    n_samples = [10, 20]
    n_features = 3
    result = make_blobs(n_samples=n_samples, n_features=n_features,
                        return_centers=True, random_state=0)
    X, y, centers = result

    assert centers.shape == (len(n_samples), n_features)
|
||||
|
||||
|
||||
def test_make_blobs_error():
    """Check the ValueError messages raised by make_blobs.

    Three inconsistent inputs are tried: too few centers for ``n_samples``,
    too few ``cluster_std`` values for the centers, and an integer
    ``centers`` combined with a list ``n_samples``.
    """
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])

    # One center fewer than there are entries in n_samples.
    too_few_centers = centers[:-1]
    wrong_centers_msg = (
        "Length of `n_samples` not consistent "
        "with number of centers. Got n_samples = {} "
        "and centers = {}".format(n_samples, too_few_centers)
    )
    assert_raise_message(ValueError, wrong_centers_msg,
                         make_blobs, n_samples, centers=too_few_centers)

    # One standard deviation fewer than there are centers.
    too_few_stds = cluster_stds[:-1]
    wrong_std_msg = (
        "Length of `clusters_std` not consistent with "
        "number of centers. Got centers = {} "
        "and cluster_std = {}".format(centers, too_few_stds)
    )
    assert_raise_message(ValueError, wrong_std_msg,
                         make_blobs, n_samples,
                         centers=centers, cluster_std=too_few_stds)

    # An integer is not accepted for centers together with a list n_samples.
    wrong_type_msg = (
        "Parameter `centers` must be array-like. "
        "Got {!r} instead".format(3)
    )
    assert_raise_message(ValueError, wrong_type_msg,
                         make_blobs, n_samples, centers=3)
|
||||
|
||||
|
||||
def test_make_friedman1():
    """Noise-free make_friedman1 output must match its closed-form formula."""
    X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0,
                          random_state=0)

    assert X.shape == (5, 10), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"

    # Only the first five features enter the formula.
    x0, x1, x2, x3, x4 = X[:, :5].T
    expected = (10 * np.sin(np.pi * x0 * x1)
                + 20 * (x2 - 0.5) ** 2
                + 10 * x3 + 5 * x4)
    assert_array_almost_equal(y, expected)
|
||||
|
||||
|
||||
def test_make_friedman2():
    """Noise-free make_friedman2 output must match its closed-form formula."""
    X, y = make_friedman2(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 4), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"

    x0, x1, x2, x3 = X.T
    inner = x1 * x2 - 1 / (x1 * x3)
    expected = (x0 ** 2 + inner ** 2) ** 0.5
    assert_array_almost_equal(y, expected)
|
||||
|
||||
|
||||
def test_make_friedman3():
    """Noise-free make_friedman3 output must match its closed-form formula."""
    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 4), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"

    x0, x1, x2, x3 = X.T
    numerator = x1 * x2 - 1 / (x1 * x3)
    expected = np.arctan(numerator / x0)
    assert_array_almost_equal(y, expected)
|
||||
|
||||
|
||||
def test_make_low_rank_matrix():
    """Check the shape and singular-value mass of make_low_rank_matrix."""
    X = make_low_rank_matrix(
        n_samples=50, n_features=25, effective_rank=5,
        tail_strength=0.01, random_state=0,
    )

    assert X.shape == (50, 25), "X shape mismatch"

    # Only the singular values are needed for the check below.
    _, singular_values, _ = np.linalg.svd(X)
    assert sum(singular_values) - 5 < 0.1, "X rank is not approximately 5"
|
||||
|
||||
|
||||
def test_make_sparse_coded_signal():
    """Check the three arrays returned by make_sparse_coded_signal.

    Verifies their shapes, that each column of X has exactly three
    non-zero entries, that ``D @ X`` reproduces Y, and that the columns of
    D have unit Euclidean norm.
    """
    n_nonzero = 3
    Y, D, X = make_sparse_coded_signal(
        n_samples=5, n_components=8, n_features=10,
        n_nonzero_coefs=n_nonzero, random_state=0,
    )
    assert Y.shape == (10, 5), "Y shape mismatch"
    assert D.shape == (10, 8), "D shape mismatch"
    assert X.shape == (8, 5), "X shape mismatch"

    for code in X.T:
        assert len(np.flatnonzero(code)) == 3, 'Non-zero coefs mismatch'

    assert_array_almost_equal(np.dot(D, X), Y)

    column_norms = np.sqrt((D ** 2).sum(axis=0))
    assert_array_almost_equal(column_norms, np.ones(D.shape[1]))
|
||||
|
||||
|
||||
def test_make_sparse_uncorrelated():
    """Check the output shapes of make_sparse_uncorrelated."""
    n_samples, n_features = 5, 10
    X, y = make_sparse_uncorrelated(n_samples=n_samples,
                                    n_features=n_features,
                                    random_state=0)

    assert X.shape == (n_samples, n_features), "X shape mismatch"
    assert y.shape == (n_samples,), "y shape mismatch"
|
||||
|
||||
|
||||
def test_make_spd_matrix():
    """make_spd_matrix must return a symmetric matrix with positive eigenvalues."""
    X = make_spd_matrix(n_dim=5, random_state=0)

    assert X.shape == (5, 5), "X shape mismatch"
    # Symmetry.
    assert_array_almost_equal(X, X.T)

    # Positive-definiteness: every eigenvalue must be strictly positive.
    eigenvalues, _ = np.linalg.eig(X)
    all_positive = np.array([True] * 5)
    assert_array_equal(eigenvalues > 0, all_positive,
                       "X is not positive-definite")
|
||||
|
||||
|
||||
def test_make_swiss_roll():
    """Noise-free swiss roll: X[:, 0] and X[:, 2] must follow t*cos(t), t*sin(t)."""
    X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5,), "t shape mismatch"

    first_axis, third_axis = X[:, 0], X[:, 2]
    assert_array_almost_equal(first_axis, t * np.cos(t))
    assert_array_almost_equal(third_axis, t * np.sin(t))
|
||||
|
||||
|
||||
def test_make_s_curve():
    """Noise-free S curve: check X[:, 0] and X[:, 2] against their formulas."""
    X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5,), "t shape mismatch"

    first_axis, third_axis = X[:, 0], X[:, 2]
    assert_array_almost_equal(first_axis, np.sin(t))
    assert_array_almost_equal(third_axis, np.sign(t) * (np.cos(t) - 1))
|
||||
|
||||
|
||||
def test_make_biclusters():
    """Check shapes, finiteness and reproducibility of make_biclusters."""
    bicluster_kwargs = dict(shape=(100, 100), n_clusters=4,
                            shuffle=True, random_state=0)

    X, rows, cols = make_biclusters(**bicluster_kwargs)
    assert X.shape == (100, 100), "X shape mismatch"
    assert rows.shape == (4, 100), "rows shape mismatch"
    assert cols.shape == (4, 100,), "columns shape mismatch"
    for output in (X, rows, cols):
        assert_all_finite(output)

    # The same random_state must reproduce the same data matrix.
    X2, _, _ = make_biclusters(**bicluster_kwargs)
    assert_array_almost_equal(X, X2)
|
||||
|
||||
|
||||
def test_make_checkerboard():
    """Check shapes, finiteness and reproducibility of make_checkerboard."""
    common = dict(shape=(100, 100), shuffle=True, random_state=0)

    # A (20, 5) cluster grid yields 100 row and 100 column indicators.
    X, rows, cols = make_checkerboard(n_clusters=(20, 5), **common)
    assert X.shape == (100, 100), "X shape mismatch"
    assert rows.shape == (100, 100), "rows shape mismatch"
    assert cols.shape == (100, 100,), "columns shape mismatch"

    X, rows, cols = make_checkerboard(n_clusters=2, **common)
    for output in (X, rows, cols):
        assert_all_finite(output)

    # Two calls with the same random_state must give the same data matrix.
    X1, _, _ = make_checkerboard(n_clusters=2, **common)
    X2, _, _ = make_checkerboard(n_clusters=2, **common)
    assert_array_almost_equal(X1, X2)
|
||||
|
||||
|
||||
def test_make_moons():
    """Every point of make_moons must lie on the unit circle of its moon."""
    X, y = make_moons(3, shuffle=False)
    for point, label in zip(X, y):
        # Label 0 is measured from the origin, any other label from (1, 0.5).
        if label == 0:
            center = [0.0, 0.0]
        else:
            center = [1.0, 0.5]
        squared_distance = ((point - center) ** 2).sum()
        assert_almost_equal(squared_distance, 1.0,
                            err_msg="Point is not on expected unit circle")
|
||||
|
||||
|
||||
def test_make_moons_unbalanced():
    """Check make_moons with a two-element ``n_samples`` tuple.

    Also checks that a three-element list and a one-element tuple are
    rejected with a ValueError.
    """
    X, y = make_moons(n_samples=(7, 5))

    first_moon, second_moon = np.sum(y == 0), np.sum(y == 1)
    assert first_moon == 7 and second_moon == 5, \
        'Number of samples in a moon is wrong'
    assert X.shape == (12, 2), "X shape mismatch"
    assert y.shape == (12,), "y shape mismatch"

    for bad_n_samples in ([1, 2, 3], (10,)):
        with pytest.raises(ValueError,
                           match=r'`n_samples` can be either an int '
                                 r'or a two-element tuple.'):
            make_moons(n_samples=bad_n_samples)
|
||||
|
||||
|
||||
def test_make_circles():
    """Check the geometry and class balance of make_circles.

    For an odd and an even sample count, every point must lie on the circle
    matching its label (squared radius 1 for label 0, ``factor ** 2``
    otherwise) and the samples must be split as expected between the two
    circles. A negative factor and a factor of 1 must raise ValueError.
    """
    factor = 0.3

    for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]:
        # Testing odd and even case, because in the past make_circles always
        # created an even number of samples.
        X, y = make_circles(n_samples, shuffle=False, noise=None,
                            factor=factor)
        assert X.shape == (n_samples, 2), "X shape mismatch"
        assert y.shape == (n_samples,), "y shape mismatch"
        center = [0.0, 0.0]
        for x, label in zip(X, y):
            dist_sqr = ((x - center) ** 2).sum()
            # Expected squared distance to the center for this label.
            dist_exp = 1.0 if label == 0 else factor ** 2
            assert_almost_equal(dist_sqr, dist_exp,
                                err_msg="Point is not on expected circle")

        assert X[y == 0].shape == (n_outer, 2), (
            "Samples not correctly distributed across circles.")
        assert X[y == 1].shape == (n_inner, 2), (
            "Samples not correctly distributed across circles.")

    with pytest.raises(ValueError):
        make_circles(factor=-0.01)
    with pytest.raises(ValueError):
        make_circles(factor=1.)
|
||||
|
||||
|
||||
def test_make_circles_unbalanced():
    """Check make_circles with a two-element ``n_samples`` tuple.

    Also checks that a three-element list and a one-element tuple are
    rejected with a ValueError.
    """
    X, y = make_circles(n_samples=(2, 8))

    label_zero_count = np.sum(y == 0)
    label_one_count = np.sum(y == 1)
    assert label_zero_count == 2, 'Number of samples in inner circle is wrong'
    assert label_one_count == 8, 'Number of samples in outer circle is wrong'
    assert X.shape == (10, 2), "X shape mismatch"
    assert y.shape == (10,), "y shape mismatch"

    for bad_n_samples in ([1, 2, 3], (10,)):
        with pytest.raises(ValueError,
                           match=r'`n_samples` can be either an int '
                                 r'or a two-element tuple.'):
            make_circles(n_samples=bad_n_samples)
|
|
@ -0,0 +1,521 @@
|
|||
from bz2 import BZ2File
|
||||
import gzip
|
||||
from io import BytesIO
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
import os
|
||||
import shutil
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import fails_if_pypy
|
||||
|
||||
import sklearn
|
||||
from sklearn.datasets import (load_svmlight_file, load_svmlight_files,
|
||||
dump_svmlight_file)
|
||||
|
||||
# Fixture files are looked up in the "data" directory next to this module.
currdir = os.path.dirname(os.path.abspath(__file__))
datafile, multifile, invalidfile, invalidfile2 = (
    os.path.join(currdir, "data", fixture_name)
    for fixture_name in (
        "svmlight_classification.txt",
        "svmlight_multilabel.txt",
        "svmlight_invalid.txt",
        "svmlight_invalid_order.txt",
    )
)

# Applies the fails_if_pypy marker to every test in this module.
pytestmark = fails_if_pypy
|
||||
|
||||
|
||||
def test_load_svmlight_file():
    """Check shape, values and mutability of the classification fixture."""
    X, y = load_svmlight_file(datafile)

    # test X's shape
    assert X.indptr.shape[0] == 7
    assert X.shape[0] == 6
    assert X.shape[1] == 21
    assert y.shape[0] == 6

    # test X's non-zero values
    expected_entries = (
        (0, 2, 2.5), (0, 10, -5.2), (0, 15, 1.5),
        (1, 5, 1.0), (1, 12, -3),
        (2, 20, 27),
    )
    for row, col, value in expected_entries:
        assert X[row, col] == value

    # tests X's zero values
    for row, col in ((0, 3), (0, 5), (1, 8), (1, 16), (2, 18)):
        assert X[row, col] == 0

    # test can change X's values
    X[0, 2] *= 2
    assert X[0, 2] == 5

    # test y
    assert_array_equal(y, [1, 2, 3, 4, 1, 2])
|
||||
|
||||
|
||||
def test_load_svmlight_file_fd():
    """Loading from an OS-level file descriptor must match loading by path."""
    X_by_path, y_by_path = load_svmlight_file(datafile)

    descriptor = os.open(datafile, os.O_RDONLY)
    try:
        X_by_fd, y_by_fd = load_svmlight_file(descriptor)
        assert_array_almost_equal(X_by_path.data, X_by_fd.data)
        assert_array_almost_equal(y_by_path, y_by_fd)
    finally:
        # Always release the descriptor, even when an assertion fails.
        os.close(descriptor)
|
||||
|
||||
|
||||
def test_load_svmlight_file_multilabel():
    """With ``multilabel=True`` the labels come back as a list of tuples."""
    expected_labels = [(0, 1), (2,), (), (1, 2)]
    X, y = load_svmlight_file(multifile, multilabel=True)
    assert y == expected_labels
|
||||
|
||||
|
||||
def test_load_svmlight_files():
    """Check load_svmlight_files on repeated copies of the same file.

    Loading the file twice must give equal data with the requested float32
    dtype; loading it three times as float64 must give one common dtype.
    """
    X_train, y_train, X_test, y_test = load_svmlight_files(
        [datafile, datafile], dtype=np.float32)
    assert_array_equal(X_train.toarray(), X_test.toarray())
    assert_array_almost_equal(y_train, y_test)
    for matrix in (X_train, X_test):
        assert matrix.dtype == np.float32

    X1, y1, X2, y2, X3, y3 = load_svmlight_files(
        [datafile, datafile, datafile], dtype=np.float64)
    assert X1.dtype == X2.dtype
    assert X2.dtype == X3.dtype
    assert X3.dtype == np.float64
|
||||
|
||||
|
||||
def test_load_svmlight_file_n_features():
    """Check the ``n_features`` argument of load_svmlight_file.

    Asking for 22 features widens X to 22 columns; asking for 20 must raise
    ValueError because the file holds 21 features.
    """
    X, y = load_svmlight_file(datafile, n_features=22)

    # test X's shape
    assert X.indptr.shape[0] == 7
    assert X.shape[0] == 6
    assert X.shape[1] == 22

    # test X's non-zero values
    expected_entries = ((0, 2, 2.5), (0, 10, -5.2),
                        (1, 5, 1.0), (1, 12, -3))
    for row, col, value in expected_entries:
        assert X[row, col] == value

    # 21 features in file
    with pytest.raises(ValueError):
        load_svmlight_file(datafile, n_features=20)
|
||||
|
||||
|
||||
def test_load_compressed():
    """Gzip and bz2 copies of the fixture must load like the plain file."""
    X, y = load_svmlight_file(datafile)

    # The opener is chosen to match the suffix given to the temporary file.
    for suffix, open_compressed in ((".gz", gzip.open), (".bz2", BZ2File)):
        with NamedTemporaryFile(prefix="sklearn-test", suffix=suffix) as tmp:
            tmp.close()  # necessary under windows
            with open(datafile, "rb") as plain:
                with open_compressed(tmp.name, "wb") as compressed:
                    shutil.copyfileobj(plain, compressed)
            X_loaded, y_loaded = load_svmlight_file(tmp.name)
            # because we "close" it manually and write to it,
            # we need to remove it manually.
            os.remove(tmp.name)
        assert_array_almost_equal(X.toarray(), X_loaded.toarray())
        assert_array_almost_equal(y, y_loaded)
|
||||
|
||||
|
||||
def test_load_invalid_file():
    """Loading ``invalidfile`` must raise ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        load_svmlight_file(invalidfile)
|
||||
|
||||
|
||||
def test_load_invalid_order_file():
    """Loading ``invalidfile2`` must raise ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        load_svmlight_file(invalidfile2)
|
||||
|
||||
|
||||
def test_load_zero_based():
    """Feature index 0 is rejected when loading with ``zero_based=False``."""
    # the second row uses feature index 0
    stream = BytesIO(b"-1 4:1.\n1 0:1\n")
    with pytest.raises(ValueError):
        load_svmlight_file(stream, zero_based=False)
|
||||
|
||||
|
||||
def test_load_zero_based_auto():
    """Check the shapes obtained with ``zero_based="auto"``."""
    rows_from_one = b"-1 1:1 2:2 3:3\n"
    rows_from_zero = b"-1 0:0 1:1\n"

    # loaded alone, indices 1..3 give three columns
    X, _ = load_svmlight_file(BytesIO(rows_from_one), zero_based="auto")
    assert X.shape == (1, 3)

    # loaded together with data that uses index 0, both matrices end up
    # with four columns
    streams = [BytesIO(rows_from_one), BytesIO(rows_from_zero)]
    X1, _, X2, _ = load_svmlight_files(streams, zero_based="auto")
    assert X1.shape == (1, 4)
    assert X2.shape == (1, 4)
|
||||
|
||||
|
||||
def test_load_with_qid():
    """Load svmlight data that carries a ``qid`` attribute.

    The data is loaded once without returning the query ids and then, via
    both loader functions, with the query ids returned.
    """
    data = b"""
    3 qid:1 1:0.53 2:0.12
    2 qid:1 1:0.13 2:0.1
    7 qid:2 1:0.87 2:0.12"""
    expected_y = [3, 2, 7]
    expected_X = [[.53, .12], [.13, .1], [.87, .12]]

    # query ids are present in the data but not requested
    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, expected_y)
    assert_array_equal(X.toarray(), expected_X)

    # query ids requested through both entry points
    from_files = load_svmlight_files([BytesIO(data)], query_id=True)
    from_file = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (from_files, from_file):
        assert_array_equal(y, expected_y)
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.toarray(), expected_X)
|
||||
|
||||
|
||||
@pytest.mark.skip("testing the overflow of 32 bit sparse indexing requires a"
                  " large amount of memory")
def test_load_large_qid():
    """
    Load a large libsvm / svmlight payload with qid attribute (64-bit query
    ID). Two rows are generated for every query id in the range.
    """
    qid_stop = 40 * 1000 * 1000
    row_pair = "3 qid:{0} 1:0.53 2:0.12\n2 qid:{0} 1:0.13 2:0.1"
    data = b"\n".join(row_pair.format(i).encode()
                      for i in range(1, qid_stop))
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)
    assert_array_equal(y[-4:], [3, 2, 3, 2])
    assert_array_equal(np.unique(qid), np.arange(1, qid_stop))
|
||||
|
||||
|
||||
def test_load_invalid_file2():
    """``invalidfile`` placed between two valid files raises ``ValueError``."""
    paths = [datafile, invalidfile, datafile]
    with pytest.raises(ValueError):
        load_svmlight_files(paths)
|
||||
|
||||
|
||||
def test_not_a_filename():
    """Passing a float where a file is expected raises ``TypeError``."""
    # a float is used on purpose: in python 3 integers are valid file
    # opening arguments (taken as unix file descriptors)
    not_a_file = .42
    with pytest.raises(TypeError):
        load_svmlight_file(not_a_file)
|
||||
|
||||
|
||||
def test_invalid_filename():
    """Loading from the path below is expected to raise ``IOError``."""
    bogus_path = "trou pic nic douille"
    with pytest.raises(IOError):
        load_svmlight_file(bogus_path)
|
||||
|
||||
|
||||
def test_dump():
    """Dump and reload the reference data in many configurations.

    Every combination of X container (sparse, dense, sliced sparse),
    y container (sparse, dense, sliced sparse), index base and dtype is
    dumped with a comment, then the header lines and the reloaded values
    are checked.
    """
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32, np.int64]:
                    buf = BytesIO()

                    if sp.issparse(y) and y.shape[0] == 1:
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    # Note: with dtype=np.int32 we are performing unsafe casts,
                    # where X.astype(dtype) overflows. The result is
                    # then platform dependent and X_dense.astype(dtype) may be
                    # different from X_sparse.astype(dtype).toarray().
                    X_input = X.astype(dtype)

                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.
                    dump_svmlight_file(X_input, y, buf, comment="test",
                                       zero_based=zero_based)
                    buf.seek(0)

                    # first header line: library version
                    header = str(buf.readline(), "utf-8")
                    assert "scikit-learn %s" % sklearn.__version__ in header

                    # second header line: index base
                    header = str(buf.readline(), "utf-8")
                    base_name = "zero" if zero_based else "one"
                    assert base_name + "-based" in header

                    X2, y2 = load_svmlight_file(buf, dtype=dtype,
                                                zero_based=zero_based)
                    assert X2.dtype == dtype
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()
                    X_input_dense = (X_input.toarray()
                                     if sp.issparse(X_input) else X_input)

                    # allow a rounding error at the last decimal place
                    decimal = 4 if dtype == np.float32 else 15
                    assert_array_almost_equal(
                        X_input_dense, X2_dense, decimal)
                    assert_array_almost_equal(
                        y_dense.astype(dtype, copy=False), y2, decimal)
|
||||
|
||||
|
||||
def test_dump_multilabel():
    """Dump multilabel targets given as a dense list and as a CSR matrix."""
    X = [[1, 0, 3, 0, 5],
         [0, 0, 0, 0, 0],
         [0, 5, 0, 1, 0]]
    y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
    # make sure it dumps multilabel correctly
    expected_lines = (b"1 0:1 2:3 4:5\n", b"0,2 \n", b"0,1 1:5 3:1\n")

    for y in (y_dense, sp.csr_matrix(y_dense)):
        out = BytesIO()
        dump_svmlight_file(X, y, out, multilabel=True)
        out.seek(0)
        for expected in expected_lines:
            assert out.readline() == expected
|
||||
|
||||
|
||||
def test_dump_concise():
    """Check the exact text written for a handful of numeric values, then
    check that the dump loads back to the input."""
    one, two, three = 1, 2.1, 3.01
    exact = 1.000000000000001
    # loses the last decimal place
    almost = 1.0000000000000001
    y = [one, two, three, exact, almost]
    X = [[one, two, three, exact, almost],
         [1e9, 2e18, 3e27, 0, 0]]
    X.extend([0, 0, 0, 0, 0] for _ in range(3))

    out = BytesIO()
    dump_svmlight_file(X, y, out)

    # make sure it's using the most concise format possible
    out.seek(0)
    expected_lines = (
        b"1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n",
        b"2.1 0:1000000000 1:2e+18 2:3e+27\n",
        b"3.01 \n",
        b"1.000000000000001 \n",
        b"1 \n",
    )
    for expected in expected_lines:
        assert out.readline() == expected

    # make sure it's correct too :)
    out.seek(0)
    X2, y2 = load_svmlight_file(out)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_almost_equal(y, y2)
|
||||
|
||||
|
||||
def test_dump_comment():
    """Check how ``dump_svmlight_file`` treats its ``comment`` argument.

    ``str`` comments (ASCII and non-ASCII) must round-trip the data;
    non-ASCII bytes raise ``UnicodeDecodeError`` and a comment holding a
    NUL character raises ``ValueError``.
    """
    X_loaded, y = load_svmlight_file(datafile)
    X = X_loaded.toarray()

    def check_roundtrip(comment):
        # dump one-based with the comment, reload, compare with the input
        buf = BytesIO()
        dump_svmlight_file(X, y, buf, comment=comment, zero_based=False)
        buf.seek(0)

        X2, y2 = load_svmlight_file(buf, zero_based=False)
        assert_array_almost_equal(X, X2.toarray())
        assert_array_almost_equal(y, y2)

    check_roundtrip("This is a comment\nspanning multiple lines.")

    # XXX we have to update this to support Python 3.x
    utf8_comment = b"It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
    with pytest.raises(UnicodeDecodeError):
        dump_svmlight_file(X, y, BytesIO(), comment=utf8_comment)

    # the same text, decoded to str, is accepted
    check_roundtrip(utf8_comment.decode("utf-8"))

    with pytest.raises(ValueError):
        dump_svmlight_file(X, y, BytesIO(), comment="I've got a \0.")
|
||||
|
||||
|
||||
def test_dump_invalid():
    """Targets that do not fit X make ``dump_svmlight_file`` raise."""
    X, y = load_svmlight_file(datafile)

    bad_targets = (
        [y],     # y wrapped in an extra list
        y[:-1],  # one entry fewer than y
    )
    for bad_y in bad_targets:
        with pytest.raises(ValueError):
            dump_svmlight_file(X, bad_y, BytesIO())
|
||||
|
||||
|
||||
def test_dump_query_id():
    """Dump data together with query ids and read all three back."""
    X_loaded, y = load_svmlight_file(datafile)
    X = X_loaded.toarray()
    # consecutive pairs of rows share a query id
    query_id = np.arange(X.shape[0]) // 2

    buf = BytesIO()
    dump_svmlight_file(X, y, buf, query_id=query_id, zero_based=True)
    buf.seek(0)

    reloaded = load_svmlight_file(buf, query_id=True, zero_based=True)
    X1, y1, query_id1 = reloaded
    assert_array_almost_equal(X, X1.toarray())
    assert_array_almost_equal(y, y1)
    assert_array_almost_equal(query_id, query_id1)
|
||||
|
||||
|
||||
def test_load_with_long_qid():
    """Load svmlight data with longint qid attributes, dump it and reload
    it both with and without query ids."""
    data = b"""
    1 qid:0 0:1 1:2 2:3
    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
    3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985"""

    big_row = [1440446648, 72048431380967004, 236784985]
    true_X = [[1, 2, 3], list(big_row), list(big_row), list(big_row)]
    true_y = [1, 0, 0, 3]
    true_qid = [0, 72048431380967004,
                -9223372036854775807, 9223372036854775807]

    # initial load from the literal
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, true_qid)

    # dump, then reload with query ids
    buf = BytesIO()
    dump_svmlight_file(X, y, buf, query_id=qid, zero_based=True)
    buf.seek(0)
    X, y, qid = load_svmlight_file(buf, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, true_qid)

    # reload the same dump without query ids
    buf.seek(0)
    X, y = load_svmlight_file(buf, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
|
||||
|
||||
|
||||
def test_load_zeros():
    """An all-zero matrix round-trips for every ``zero_based`` setting."""
    true_X = sp.csr_matrix(np.zeros(shape=(3, 4)))
    true_y = np.array([0, 1, 0])

    buf = BytesIO()
    dump_svmlight_file(true_X, true_y, buf)

    for zero_based in ('auto', True, False):
        buf.seek(0)
        X, y = load_svmlight_file(buf, n_features=4, zero_based=zero_based)
        assert_array_almost_equal(y, true_y)
        assert_array_almost_equal(X.toarray(), true_X.toarray())
|
||||
|
||||
|
||||
@pytest.mark.parametrize('sparsity', [0, 0.1, .5, 0.99, 1])
@pytest.mark.parametrize('n_samples', [13, 101])
@pytest.mark.parametrize('n_features', [2, 7, 41])
def test_load_with_offsets(sparsity, n_samples, n_features):
    """Load a dump in three consecutive byte ranges and compare the stacked
    pieces with the data that was dumped."""
    rng = np.random.RandomState(0)
    dense = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
    if sparsity:
        dense[dense < sparsity] = 0.0
    X = sp.csr_matrix(dense)
    y = rng.randint(low=0, high=2, size=n_samples)

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    size = len(f.getvalue())

    # put some marks that are likely to happen anywhere in a row
    first_cut = size // 3
    second_cut = 4 * size // 5
    byte_ranges = (
        {"offset": 0, "length": first_cut},
        {"offset": first_cut, "length": second_cut - first_cut},
        {"offset": second_cut},  # length left at its default
    )

    # load the original sparse matrix into 3 independent CSR matrices
    X_parts = []
    y_parts = []
    for byte_range in byte_ranges:
        X_part, y_part = load_svmlight_file(f, n_features=n_features,
                                            **byte_range)
        X_parts.append(X_part)
        y_parts.append(y_part)

    assert_array_almost_equal(y, np.concatenate(y_parts))
    assert_array_almost_equal(X.toarray(), sp.vstack(X_parts).toarray())
|
||||
|
||||
|
||||
def test_load_offset_exhaustive_splits():
    """Split a small dump in two at every byte offset and check that the
    two loaded halves always add up to the original data."""
    rng = np.random.RandomState(0)
    rows = [
        [0, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 0, 6],
        [1, 2, 3, 4, 0, 6],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 3, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0],
    ]
    X = sp.csr_matrix(np.array(rows))
    n_samples, n_features = X.shape
    y = rng.randint(low=0, high=2, size=n_samples)
    query_id = np.arange(n_samples) // 2

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=query_id)
    f.seek(0)

    size = len(f.getvalue())

    # load the same data in 2 parts with all the possible byte offsets to
    # locate the split so as to test for particular boundary cases
    for mark in range(size):
        f.seek(0)
        head = load_svmlight_file(f, n_features=n_features,
                                  query_id=True, offset=0,
                                  length=mark)
        tail = load_svmlight_file(f, n_features=n_features,
                                  query_id=True, offset=mark,
                                  length=-1)
        X_head, y_head, q_head = head
        X_tail, y_tail, q_tail = tail

        q_concat = np.concatenate([q_head, q_tail])
        y_concat = np.concatenate([y_head, y_tail])
        X_concat = sp.vstack([X_head, X_tail])
        assert_array_almost_equal(y, y_concat)
        assert_array_equal(query_id, q_concat)
        assert_array_almost_equal(X.toarray(), X_concat.toarray())
|
||||
|
||||
|
||||
def test_load_with_offsets_error():
    """Using ``offset``/``length`` without ``n_features`` raises an error
    whose message mentions that ``n_features`` is required."""
    expected_message = "n_features is required"
    with pytest.raises(ValueError, match=expected_message):
        load_svmlight_file(datafile, offset=3, length=3)
|
Loading…
Add table
Add a link
Reference in a new issue