Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@ -0,0 +1,75 @@
"""Network tests are only run if the data is already available locally,
or if a download is specifically requested by environment variable."""
import builtins
from os import environ
import pytest
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import fetch_covtype
from sklearn.datasets import fetch_kddcup99
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_rcv1
def _wrapped_fetch(f, dataset_name):
    """Wrap a dataset fetcher so that unavailable data skips the test.

    The ``SKLEARN_SKIP_NETWORK_TESTS`` environment variable is read once,
    when the wrapper is built: downloading is allowed only when it equals
    ``'0'`` (the default is ``'1'``).  The returned callable forwards its
    arguments to ``f`` with ``download_if_missing`` forced to that value and
    converts an ``IOError`` raised by ``f`` into ``pytest.skip``.
    """
    skip_flag = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1')
    allow_download = skip_flag == '0'

    def wrapped(*args, **kwargs):
        kwargs['download_if_missing'] = allow_download
        try:
            fetched = f(*args, **kwargs)
        except IOError:
            pytest.skip("Download {} to run this test".format(dataset_name))
        else:
            return fetched

    return wrapped
@pytest.fixture
def fetch_20newsgroups_fxt():
    """Provide ``fetch_20newsgroups`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups')
    return fetcher
@pytest.fixture
def fetch_20newsgroups_vectorized_fxt():
    """Provide ``fetch_20newsgroups_vectorized`` wrapped by ``_wrapped_fetch``.
    """
    fetcher = _wrapped_fetch(
        fetch_20newsgroups_vectorized,
        dataset_name='20newsgroups_vectorized',
    )
    return fetcher
@pytest.fixture
def fetch_california_housing_fxt():
    """Provide ``fetch_california_housing`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(
        fetch_california_housing,
        dataset_name='california_housing',
    )
    return fetcher
@pytest.fixture
def fetch_covtype_fxt():
    """Provide ``fetch_covtype`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_covtype, dataset_name='covtype')
    return fetcher
@pytest.fixture
def fetch_kddcup99_fxt():
    """Provide ``fetch_kddcup99`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99')
    return fetcher
@pytest.fixture
def fetch_olivetti_faces_fxt():
    """Provide ``fetch_olivetti_faces`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(
        fetch_olivetti_faces,
        dataset_name='olivetti_faces',
    )
    return fetcher
@pytest.fixture
def fetch_rcv1_fxt():
    """Provide ``fetch_rcv1`` wrapped by ``_wrapped_fetch``."""
    fetcher = _wrapped_fetch(fetch_rcv1, dataset_name='rcv1')
    return fetcher
@pytest.fixture
def hide_available_pandas(monkeypatch):
    """Make ``import pandas`` raise ImportError while the fixture is active."""
    real_import = builtins.__import__

    def import_without_pandas(name, *args, **kwargs):
        # Every module other than pandas goes through the genuine importer.
        if name != 'pandas':
            return real_import(name, *args, **kwargs)
        raise ImportError()

    monkeypatch.setattr(builtins, '__import__', import_without_pandas)

View file

@ -0,0 +1,9 @@
# comment
# note: the next line contains a tab
1.0 3:2.5 11:-5.2 16:1.5 # and an inline comment
2.0 6:1.0 13:-3
# another comment
3.0 21:27
4.0 2:1.234567890123456e10 # double precision value
1.0 # empty line, all zeros
2.0 3:0 # explicit zeros

View file

@ -0,0 +1,3 @@
python 2:2.5 10:-5.2 15:1.5
2.0 5:1.0 12:-3
3.0 20:27

View file

@ -0,0 +1 @@
-1 5:2.5 2:-5.2 15:1.5

View file

@ -0,0 +1,5 @@
# multilabel dataset in SVMlight format
1,0 2:2.5 10:-5.2 15:1.5
2 5:1.0 12:-3
2:3.5 11:26
1,2 20:27

View file

@ -0,0 +1,90 @@
"""Test the 20news downloader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
from functools import partial
import numpy as np
import scipy.sparse as sp
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.datasets.tests.test_common import check_return_X_y
from sklearn.preprocessing import normalize
def test_20news(fetch_20newsgroups_fxt):
    """Check category sub-selection and ``return_X_y`` for 20newsgroups."""
    full = fetch_20newsgroups_fxt(subset='all', shuffle=False)

    # Reduced dataset: the last two categories, requested in reverse order
    requested = full.target_names[-1:-3:-1]
    reduced = fetch_20newsgroups_fxt(
        subset='all', categories=requested, shuffle=False)

    # target_names must follow the ordering of the full dataset,
    # not the ordering of the request
    assert reduced.target_names == full.target_names[-2:]
    # Only the labels 0 and 1 may appear
    assert np.unique(reduced.target).tolist() == [0, 1]

    # filenames, target and data must all have the same length
    n_files = len(reduced.filenames)
    assert n_files == len(reduced.target)
    assert n_files == len(reduced.data)

    # The first reduced entry must be the first entry of the same
    # category in the full dataset
    first_category = reduced.target_names[reduced.target[0]]
    full_label = full.target_names.index(first_category)
    first_in_full = np.where(full.target == full_label)[0][0]
    assert reduced.data[0] == full.data[first_in_full]

    # return_X_y option
    X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False,
                                  return_X_y=True)
    assert len(X) == len(full.data)
    assert y.shape == full.target.shape
def test_20news_length_consistency(fetch_20newsgroups_fxt):
    """Check that item access and attribute access give equal lengths.

    This is a non-regression test for a bug present in 0.16.1.
    """
    bunch = fetch_20newsgroups_fxt(subset='all')
    assert len(bunch['filenames']) == len(bunch.filenames)
    assert len(bunch['target']) == len(bunch.target)
    assert len(bunch['data']) == len(bunch.data)
def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt):
    """Check type, shape and dtype of each vectorized 20newsgroups subset."""
    n_train, n_test, n_features = 11314, 7532, 130107

    def check_bunch(bunch, n_samples):
        assert sp.isspmatrix_csr(bunch.data)
        assert bunch.data.shape == (n_samples, n_features)
        assert bunch.target.shape[0] == n_samples
        assert bunch.data.dtype == np.float64

    # subset = train
    train_bunch = fetch_20newsgroups_vectorized_fxt(subset="train")
    check_bunch(train_bunch, n_train)

    # subset = test
    test_bunch = fetch_20newsgroups_vectorized_fxt(subset="test")
    check_bunch(test_bunch, n_test)

    # return_X_y option, compared against the test subset
    fetch_test = partial(fetch_20newsgroups_vectorized_fxt, subset='test')
    check_return_X_y(test_bunch, fetch_test)

    # subset = all
    all_bunch = fetch_20newsgroups_vectorized_fxt(subset='all')
    check_bunch(all_bunch, n_train + n_test)
def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
    """Check ``normalize=True`` against explicitly normalised raw data."""
    raw_bunch = fetch_20newsgroups_vectorized_fxt(normalize=False)
    normalized_bunch = fetch_20newsgroups_vectorized_fxt(normalize=True)

    # Only the first 100 rows are compared
    head_normalized = normalized_bunch['data'][:100]
    head_raw = raw_bunch['data'][:100]

    assert_allclose_dense_sparse(head_normalized, normalize(head_raw))
    row_norms = np.linalg.norm(head_normalized.todense(), axis=1)
    assert np.allclose(row_norms, 1)

View file

@ -0,0 +1,306 @@
import os
import shutil
import tempfile
import warnings
import numpy
from pickle import loads
from pickle import dumps
from functools import partial
import pytest
import numpy as np
from sklearn.datasets import get_data_home
from sklearn.datasets import clear_data_home
from sklearn.datasets import load_files
from sklearn.datasets import load_sample_images
from sklearn.datasets import load_sample_image
from sklearn.datasets import load_digits
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_boston
from sklearn.datasets import load_wine
from sklearn.utils import Bunch
from sklearn.datasets.tests.test_common import check_return_X_y
from sklearn.datasets.tests.test_common import check_as_frame
from sklearn.datasets.tests.test_common import check_pandas_dependency_message
from sklearn.externals._pilutil import pillow_installed
from sklearn.utils import IS_PYPY
def _remove_dir(path):
    """Recursively delete ``path`` when it is an existing directory."""
    if not os.path.isdir(path):
        return
    shutil.rmtree(path)
@pytest.fixture(scope="module")
def data_home(tmpdir_factory):
    """Yield a module-scoped temporary directory path, removed afterwards."""
    home_dir = str(tmpdir_factory.mktemp("scikit_learn_data_home_test"))
    yield home_dir
    _remove_dir(home_dir)
@pytest.fixture(scope="module")
def load_files_root(tmpdir_factory):
    """Yield a module-scoped temporary root directory, removed afterwards."""
    root_dir = str(tmpdir_factory.mktemp("scikit_learn_load_files_test"))
    yield root_dir
    _remove_dir(root_dir)
@pytest.fixture
def test_category_dir_1(load_files_root):
    """Yield a category directory under the root that holds one sample file.
    """
    category_dir = tempfile.mkdtemp(dir=load_files_root)
    # delete=False keeps the file on disk once the handle is closed
    with tempfile.NamedTemporaryFile(dir=category_dir,
                                     delete=False) as sample:
        sample.write(b"Hello World!\n")
    yield str(category_dir)
    _remove_dir(category_dir)
@pytest.fixture
def test_category_dir_2(load_files_root):
    """Yield an empty category directory created under the root."""
    category_dir = tempfile.mkdtemp(dir=load_files_root)
    yield str(category_dir)
    _remove_dir(category_dir)
def test_data_home(data_home):
    """Check that get_data_home / clear_data_home create and delete a folder.

    The value returned by ``get_data_home`` is kept in its own variable so
    that it is really compared with the path supplied by the fixture
    (rebinding ``data_home`` made the comparison trivially true).
    """
    # get_data_home will point to a pre-existing folder
    returned_home = get_data_home(data_home=data_home)
    assert returned_home == data_home
    assert os.path.exists(returned_home)

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=returned_home)
    assert not os.path.exists(returned_home)

    # if the folder is missing it will be created again
    returned_home = get_data_home(data_home=returned_home)
    assert os.path.exists(returned_home)
def test_default_empty_load_files(load_files_root):
    """An empty root gives no filenames, no target names and no DESCR."""
    bunch = load_files(load_files_root)
    assert bunch.DESCR is None
    assert len(bunch.target_names) == 0
    assert len(bunch.filenames) == 0
def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    """Default loading returns the raw bytes of the single sample file."""
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')

    bunch = load_files(load_files_root)
    assert bunch.data == [b"Hello World!\n"]
    assert bunch.DESCR is None
    assert len(bunch.target_names) == 2
    assert len(bunch.filenames) == 1
def test_load_files_w_categories_desc_and_encoding(
        test_category_dir_1, test_category_dir_2, load_files_root):
    """Check ``load_files`` with a category filter, description and encoding.
    """
    if IS_PYPY:
        pytest.xfail('[PyPy] fails due to string containing NUL characters')
    # The category name is the last path component of the category folder;
    # os.path.basename extracts it whatever separator the platform uses
    # (splitting on '/' only works with POSIX-style paths).
    category = os.path.basename(os.path.abspath(test_category_dir_1))
    res = load_files(load_files_root, description="test",
                     categories=category, encoding="utf-8")
    assert len(res.filenames) == 1
    assert len(res.target_names) == 1
    assert res.DESCR == "test"
    # With an encoding given, the content is compared as text, not bytes
    assert res.data == ["Hello World!\n"]
def test_load_files_wo_load_content(
        test_category_dir_1, test_category_dir_2, load_files_root):
    """With ``load_content=False`` the bunch has no ``data`` entry."""
    bunch = load_files(load_files_root, load_content=False)
    assert bunch.get('data') is None
    assert bunch.DESCR is None
    assert len(bunch.target_names) == 2
    assert len(bunch.filenames) == 1
def test_load_sample_images():
    """Check the two sample images, warning instead of failing without PIL.
    """
    china_pixel = np.array([174, 201, 231], dtype=np.uint8)
    flower_pixel = np.array([2, 19, 13], dtype=np.uint8)
    try:
        bunch = load_sample_images()
        assert len(bunch.images) == 2
        assert len(bunch.filenames) == 2
        china, flower = bunch.images[0], bunch.images[1]
        # The top-left pixel identifies each image
        assert np.all(china[0, 0, :] == china_pixel)
        assert np.all(flower[0, 0, :] == flower_pixel)
        assert bunch.DESCR
    except ImportError:
        warnings.warn("Could not load sample images, PIL is not available.")
def test_load_digits():
    """Check the shape and class count of the digits dataset."""
    bunch = load_digits()
    n_classes = numpy.unique(bunch.target).size
    assert bunch.data.shape == (1797, 64)
    assert n_classes == 10

    # return_X_y option
    check_return_X_y(bunch, partial(load_digits))
def test_load_digits_n_class_lt_10():
    """Check the digits dataset when restricted to nine classes."""
    bunch = load_digits(n_class=9)
    n_classes = numpy.unique(bunch.target).size
    assert bunch.data.shape == (1617, 64)
    assert n_classes == 9
def test_load_sample_image():
    """Check dtype and shape of china.jpg, warning when PIL is missing."""
    try:
        image = load_sample_image('china.jpg')
        assert image.dtype == 'uint8'
        assert image.shape == (427, 640, 3)
    except ImportError:
        warnings.warn("Could not load sample images, PIL is not available.")
def test_load_missing_sample_image_error():
    """An unknown sample image name must raise AttributeError."""
    if not pillow_installed:
        warnings.warn("Could not load sample images, PIL is not available.")
        return
    with pytest.raises(AttributeError):
        load_sample_image('blop.jpg')
def test_load_diabetes():
    """Check the shapes and metadata of the diabetes dataset."""
    res = load_diabetes()
    assert res.data.shape == (442, 10)
    # Compare the size explicitly: ``assert size, 442`` only checked that the
    # size was truthy and used 442 as the assertion message.
    assert res.target.size == 442
    assert len(res.feature_names) == 10
    assert res.DESCR

    # test return_X_y option
    check_return_X_y(res, partial(load_diabetes))
def test_load_linnerud():
    """Check shapes, metadata and backing files of the linnerud dataset."""
    bunch = load_linnerud()
    assert bunch.data.shape == (20, 3)
    assert bunch.target.shape == (20, 3)
    assert len(bunch.target_names) == 3
    assert bunch.DESCR
    for backing_file in (bunch.data_filename, bunch.target_filename):
        assert os.path.exists(backing_file)

    # return_X_y option
    check_return_X_y(bunch, partial(load_linnerud))
def test_load_iris():
    """Check shapes, metadata and backing file of the iris dataset."""
    bunch = load_iris()
    assert bunch.data.shape == (150, 4)
    assert bunch.target.size == 150
    assert bunch.target_names.size == 3
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # return_X_y option
    check_return_X_y(bunch, partial(load_iris))
def test_load_wine():
    """Check shapes and metadata of the wine dataset."""
    bunch = load_wine()
    assert bunch.data.shape == (178, 13)
    assert bunch.target.size == 178
    assert bunch.target_names.size == 3
    assert bunch.DESCR

    # return_X_y option
    check_return_X_y(bunch, partial(load_wine))
def test_load_breast_cancer():
    """Check shapes, metadata and backing file of the breast cancer dataset.
    """
    bunch = load_breast_cancer()
    assert bunch.data.shape == (569, 30)
    assert bunch.target.size == 569
    assert bunch.target_names.size == 2
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # return_X_y option
    check_return_X_y(bunch, partial(load_breast_cancer))
@pytest.mark.parametrize(
    "loader_func, data_dtype, target_dtype",
    [
        (load_breast_cancer, np.float64, np.int64),
        (load_diabetes, np.float64, np.float64),
        (load_digits, np.float64, np.int64),
        (load_iris, np.float64, np.int64),
        (load_linnerud, np.float64, np.float64),
        (load_wine, np.float64, np.int64),
    ],
)
def test_toy_dataset_as_frame(loader_func, data_dtype, target_dtype):
    """Run ``check_as_frame`` on each toy loader with its expected dtypes."""
    reference_bunch = loader_func()
    check_as_frame(
        reference_bunch,
        partial(loader_func),
        expected_data_dtype=data_dtype,
        expected_target_dtype=target_dtype,
    )
@pytest.mark.parametrize(
    "loader_func",
    [
        load_breast_cancer,
        load_diabetes,
        load_digits,
        load_iris,
        load_linnerud,
        load_wine,
    ],
)
def test_toy_dataset_as_frame_no_pandas(loader_func):
    """Run ``check_pandas_dependency_message`` on each toy loader."""
    check_pandas_dependency_message(loader_func)
def test_load_boston():
    """Check shapes, metadata and backing file of the boston dataset."""
    bunch = load_boston()
    assert bunch.data.shape == (506, 13)
    assert bunch.target.size == 506
    assert bunch.feature_names.size == 13
    assert bunch.DESCR
    assert os.path.exists(bunch.filename)

    # return_X_y option
    check_return_X_y(bunch, partial(load_boston))
def test_loads_dumps_bunch():
    """Attribute assignment on an unpickled Bunch is visible via item access.
    """
    original = Bunch(x="x")
    restored = loads(dumps(original))
    restored.x = "y"
    assert restored['x'] == restored.x
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    """Non-regression test for stale ``__dict__`` content in pickled Bunches.

    Bunch pickles created with scikit-learn 0.16 and read with 0.17 behaved
    surprisingly: reading ``bunch.key`` used ``bunch.__dict__`` (non empty for
    0.16 Bunch objects) whereas assigning to ``bunch.key`` went through
    ``bunch.__setattr__``.  See
    https://github.com/scikit-learn/scikit-learn/issues/6196 for more details.
    """
    source = Bunch(key='original')
    # Reproduce the 0.16 situation: a __dict__ entry next to the mapping entry
    source.__dict__['key'] = 'set from __dict__'
    restored = loads(dumps(source))

    # After loading from pickle the __dict__ should have been ignored
    assert restored['key'] == 'original'
    assert restored.key == 'original'

    # Changing the attribute must change the __getitem__ value as well
    restored.key = 'changed'
    assert restored['key'] == 'changed'
    assert restored.key == 'changed'
def test_bunch_dir():
    """``dir`` on a Bunch lists its keys (important for autocomplete)."""
    iris = load_iris()
    listed_names = dir(iris)
    assert "data" in listed_names

View file

@ -0,0 +1,37 @@
"""Test the california_housing loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
import pytest
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial
def test_fetch(fetch_california_housing_fxt):
    """Check the shapes returned by the california housing fetcher."""
    bunch = fetch_california_housing_fxt()
    assert bunch.data.shape == (20640, 8)
    assert bunch.target.shape == (20640,)

    # return_X_y option
    check_return_X_y(bunch, partial(fetch_california_housing_fxt))
def test_fetch_asframe(fetch_california_housing_fxt):
    """Check the pandas objects returned with ``as_frame=True``.

    The test is skipped when pandas cannot be imported.
    """
    pd = pytest.importorskip('pandas')
    bunch = fetch_california_housing_fxt(as_frame=True)
    # Check for the attribute before reading it; reading ``bunch.frame``
    # first would raise and make this assertion impossible to fail.
    assert hasattr(bunch, 'frame') is True
    frame = bunch.frame
    assert frame.shape == (20640, 9)
    assert isinstance(bunch.data, pd.DataFrame)
    assert isinstance(bunch.target, pd.Series)
def test_pandas_dependency_message(fetch_california_housing_fxt,
                                   hide_available_pandas):
    """``as_frame=True`` must raise an informative ImportError without pandas.

    pandas is expected to be imported lazily, so hiding it only matters once
    a frame is requested.
    """
    pattern = ('fetch_california_housing with as_frame=True '
               'requires pandas')
    with pytest.raises(ImportError, match=pattern):
        fetch_california_housing_fxt(as_frame=True)

View file

@ -0,0 +1,43 @@
"""Test loaders for common functionality.
"""
import pytest
import numpy as np
def check_pandas_dependency_message(fetch_func):
    """Check the error raised by ``fetch_func(as_frame=True)`` without pandas.

    When pandas can be imported the calling test is skipped.  Otherwise the
    call must raise an ImportError whose message names ``fetch_func``.
    """
    try:
        import pandas  # noqa
    except ImportError:
        # pandas must be imported lazily, with an informative error message
        # when it is missing
        func_name = fetch_func.__name__
        expected_msg = '{} with as_frame=True requires pandas'.format(
            func_name)
        with pytest.raises(ImportError, match=expected_msg):
            fetch_func(as_frame=True)
    else:
        pytest.skip("This test requires pandas to be not installed")
def check_return_X_y(bunch, fetch_func_partial):
    """Check that ``return_X_y=True`` yields a tuple shaped like the bunch.

    ``fetch_func_partial`` is called with ``return_X_y=True``; its first two
    items must have the shapes of ``bunch.data`` and ``bunch.target``.
    """
    result = fetch_func_partial(return_X_y=True)
    assert isinstance(result, tuple)
    X, y = result[0], result[1]
    assert X.shape == bunch.data.shape
    assert y.shape == bunch.target.shape
def check_as_frame(bunch, fetch_func_partial,
                   expected_data_dtype=None, expected_target_dtype=None):
    """Check the pandas objects returned with ``as_frame=True``.

    ``fetch_func_partial`` is called with ``as_frame=True`` and the result is
    compared with ``bunch``, the output of the same loader without frames.
    Dtypes are checked only when an expected dtype is given.  The calling
    test is skipped when pandas cannot be imported.
    """
    pd = pytest.importorskip('pandas')
    frame_bunch = fetch_func_partial(as_frame=True)
    data, target = frame_bunch.data, frame_bunch.target

    assert hasattr(frame_bunch, 'frame')
    assert isinstance(frame_bunch.frame, pd.DataFrame)
    assert isinstance(data, pd.DataFrame)
    assert data.shape == bunch.data.shape

    # A multi-column target is a DataFrame, a single column is a Series
    target_type = pd.DataFrame if target.ndim > 1 else pd.Series
    assert isinstance(target, target_type)
    assert target.shape[0] == bunch.target.shape[0]

    if expected_data_dtype is not None:
        assert np.all(data.dtypes == expected_data_dtype)
    if expected_target_dtype is not None:
        assert np.all(target.dtypes == expected_target_dtype)

View file

@ -0,0 +1,25 @@
"""Test the covtype loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial
def test_fetch(fetch_covtype_fxt):
    """Check covtype shapes under two different shuffling seeds."""
    bunch_a = fetch_covtype_fxt(shuffle=True, random_state=42)
    bunch_b = fetch_covtype_fxt(shuffle=True, random_state=37)

    X_a, X_b = bunch_a['data'], bunch_b['data']
    assert X_a.shape == (581012, 54)
    assert X_b.shape == X_a.shape
    # Both shuffles must give the same total
    assert X_a.sum() == X_b.sum()

    n_samples = X_a.shape[0]
    for target in (bunch_a['target'], bunch_b['target']):
        assert target.shape == (n_samples,)

    # return_X_y option
    check_return_X_y(bunch_a, partial(fetch_covtype_fxt))

View file

@ -0,0 +1,46 @@
"""Test kddcup99 loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job).
Only 'percent10' mode is tested, as the full data
is too big to use in unit-testing.
"""
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial
def test_percent10(fetch_kddcup99_fxt):
    """Check the shapes of the default kddcup99 data and of each subset."""
    full = fetch_kddcup99_fxt()
    assert full.data.shape == (494021, 41)
    assert full.target.shape == (494021,)

    # Shuffling must not change the shapes
    shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0)
    assert shuffled.data.shape == full.data.shape
    assert shuffled.target.shape == full.target.shape

    # (subset, expected number of samples, expected number of features)
    expected_shapes = [
        ('SA', 100655, 41),
        ('SF', 73237, 4),
        ('http', 58725, 3),
        ('smtp', 9571, 3),
    ]
    for subset, n_samples, n_features in expected_shapes:
        data = fetch_kddcup99_fxt(subset)
        assert data.data.shape == (n_samples, n_features)
        assert data.target.shape == (n_samples,)

    # return_X_y option, compared with the last subset fetched ('smtp')
    check_return_X_y(data, partial(fetch_kddcup99_fxt, 'smtp'))
def test_shuffle(fetch_kddcup99_fxt):
    """After shuffling, the last 100 targets contain a ``b'normal.'`` label.
    """
    shuffled = fetch_kddcup99_fxt(random_state=0, subset='SA', shuffle=True,
                                  percent10=True)
    tail_is_normal = shuffled.target[-100:] == b'normal.'
    assert any(tail_is_normal)

View file

@ -0,0 +1,196 @@
"""The LFW tests require medium-size data downloading and processing.
If the data has not already been downloaded by running the examples,
the tests won't run (skipped).
If the tests are run, the first execution will be long (typically a bit
more than a couple of minutes) but as the dataset loader is leveraging
joblib, successive runs will be fast (less than 200ms).
"""
import random
import os
import shutil
import tempfile
import numpy as np
import pytest
from functools import partial
from sklearn.externals._pilutil import pillow_installed, imsave
from sklearn.datasets import fetch_lfw_pairs
from sklearn.datasets import fetch_lfw_people
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_return_X_y
# Module-level paths; setup_module() declares them global and assigns the
# temporary directories it creates.
SCIKIT_LEARN_DATA = None
SCIKIT_LEARN_EMPTY_DATA = None
LFW_HOME = None

# Names of the fake persons: setup_module() creates one image folder per name.
FAKE_NAMES = [
    'Abdelatif_Smith', 'Abhati_Kepler', 'Camara_Alvaro', 'Chen_Dupont',
    'John_Lee', 'Lin_Bauman', 'Onur_Lopez',
]
def setup_module():
    """Build a fake LFW data home once for all the tests of this module.

    Raises SkipTest when PIL is not installed.  Two temporary directories are
    created: one filled with random jpeg files and pairing metadata, and one
    left empty.  Both random generators are seeded with 42 and consumed in a
    fixed order, so the generated content is the same on every run.
    """
    global SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA, LFW_HOME

    if not pillow_installed:
        raise SkipTest("PIL not installed.")

    SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_")
    LFW_HOME = os.path.join(SCIKIT_LEARN_DATA, 'lfw_home')
    SCIKIT_LEARN_EMPTY_DATA = tempfile.mkdtemp(
        prefix="scikit_learn_empty_test_")

    if not os.path.exists(LFW_HOME):
        os.makedirs(LFW_HOME)

    random_state = random.Random(42)
    np_rng = np.random.RandomState(42)
    funneled_dir = os.path.join(LFW_HOME, 'lfw_funneled')

    # generate some random jpeg files for each person
    counts = {}
    for person in FAKE_NAMES:
        person_dir = os.path.join(funneled_dir, person)
        if not os.path.exists(person_dir):
            os.makedirs(person_dir)

        n_faces = np_rng.randint(1, 5)
        counts[person] = n_faces
        for face_index in range(n_faces):
            file_name = person + '_%04d.jpg' % face_index
            face_path = os.path.join(person_dir, file_name)
            pixels = np_rng.randint(0, 255, size=(250, 250, 3))
            try:
                imsave(face_path, pixels)
            except ImportError:
                raise SkipTest("PIL not installed")

    # add some random file pollution to test robustness
    with open(os.path.join(funneled_dir, '.test.swp'), 'wb') as f:
        f.write(b'Text file to be ignored by the dataset loader.')

    # generate some pairing metadata files using the same format as LFW
    with open(os.path.join(LFW_HOME, 'pairsDevTrain.txt'), 'wb') as f:
        f.write(b"10\n")
        more_than_two = [person for person, n_faces in counts.items()
                         if n_faces >= 2]

        # five pairs showing the same person
        for _ in range(5):
            person = random_state.choice(more_than_two)
            first, second = random_state.sample(range(counts[person]), 2)
            line = '%s\t%d\t%d\n' % (person, first, second)
            f.write(line.encode())

        # five pairs showing two different persons
        for _ in range(5):
            first_name, second_name = random_state.sample(FAKE_NAMES, 2)
            first_index = random_state.choice(np.arange(counts[first_name]))
            second_index = random_state.choice(np.arange(counts[second_name]))
            line = '%s\t%d\t%s\t%d\n' % (first_name, first_index,
                                         second_name, second_index)
            f.write(line.encode())

    for placeholder_name in ('pairsDevTest.txt', 'pairs.txt'):
        with open(os.path.join(LFW_HOME, placeholder_name), 'wb') as f:
            f.write(b"Fake place holder that won't be tested")
def teardown_module():
    """Remove the temporary directories once all tests of this module ran.

    Both module-level paths are initialised to None and are only assigned by
    ``setup_module``; a None path is ignored here instead of being handed to
    ``os.path.isdir``, which raises TypeError for None.
    """
    for path in (SCIKIT_LEARN_DATA, SCIKIT_LEARN_EMPTY_DATA):
        if path is not None and os.path.isdir(path):
            shutil.rmtree(path)
def test_load_empty_lfw_people():
    """Fetching people from the empty data home must raise IOError."""
    fetch_kwargs = dict(data_home=SCIKIT_LEARN_EMPTY_DATA,
                        download_if_missing=False)
    with pytest.raises(IOError):
        fetch_lfw_people(**fetch_kwargs)
def test_load_fake_lfw_people():
    """Check ``fetch_lfw_people`` on the fake data built by setup_module."""
    cropped = fetch_lfw_people(data_home=SCIKIT_LEARN_DATA,
                               min_faces_per_person=3,
                               download_if_missing=False)

    # The data is cropped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert cropped.images.shape == (10, 62, 47)
    assert cropped.data.shape == (10, 2914)

    # the target is an array of person integer ids
    assert_array_equal(cropped.target, [2, 0, 1, 0, 2, 0, 2, 1, 1, 2])

    # names of the persons can be found using the target_names array
    assert_array_equal(cropped.target_names,
                       ['Abdelatif Smith', 'Abhati Kepler', 'Onur Lopez'])

    # It is possible to ask for the original data without any cropping or
    # color conversion and no limit on the number of pictures per person
    raw_kwargs = dict(data_home=SCIKIT_LEARN_DATA, resize=None,
                      slice_=None, color=True, download_if_missing=False)
    lfw_people = fetch_lfw_people(**raw_kwargs)
    assert lfw_people.images.shape == (17, 250, 250, 3)

    # the ids and class names are the same as previously
    assert_array_equal(
        lfw_people.target,
        [0, 0, 1, 6, 5, 6, 3, 6, 0, 3, 6, 1, 2, 4, 5, 1, 2])
    assert_array_equal(
        lfw_people.target_names,
        ['Abdelatif Smith', 'Abhati Kepler', 'Camara Alvaro',
         'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez'])

    # return_X_y option
    check_return_X_y(lfw_people, partial(fetch_lfw_people, **raw_kwargs))
def test_load_fake_lfw_people_too_restrictive():
    """Asking for 100 faces per person on the fake data raises ValueError."""
    fetch_kwargs = dict(data_home=SCIKIT_LEARN_DATA,
                        min_faces_per_person=100,
                        download_if_missing=False)
    with pytest.raises(ValueError):
        fetch_lfw_people(**fetch_kwargs)
def test_load_empty_lfw_pairs():
    """Fetching pairs from the empty data home must raise IOError."""
    fetch_kwargs = dict(data_home=SCIKIT_LEARN_EMPTY_DATA,
                        download_if_missing=False)
    with pytest.raises(IOError):
        fetch_lfw_pairs(**fetch_kwargs)
def test_load_fake_lfw_pairs():
    """Check ``fetch_lfw_pairs`` on the fake data built by setup_module."""
    expected_target = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    expected_classes = ['Different persons', 'Same person']

    cropped_pairs = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA,
                                    download_if_missing=False)

    # The data is cropped around the center as a rectangular bounding box
    # around the face. Colors are converted to gray levels:
    assert cropped_pairs.pairs.shape == (10, 2, 62, 47)

    # the target is whether the person is the same or not
    assert_array_equal(cropped_pairs.target, expected_target)

    # names of the persons can be found using the target_names array
    assert_array_equal(cropped_pairs.target_names, expected_classes)

    # It is possible to ask for the original data without any cropping or
    # color conversion
    raw_pairs = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA, resize=None,
                                slice_=None, color=True,
                                download_if_missing=False)
    assert raw_pairs.pairs.shape == (10, 2, 250, 250, 3)

    # the ids and class names are the same as previously
    assert_array_equal(raw_pairs.target, expected_target)
    assert_array_equal(raw_pairs.target_names, expected_classes)

View file

@ -0,0 +1,26 @@
"""Test Olivetti faces fetcher, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
import numpy as np
from sklearn.utils import Bunch
from sklearn.datasets.tests.test_common import check_return_X_y
from sklearn.utils._testing import assert_array_equal
def test_olivetti_faces(fetch_olivetti_faces_fxt):
    """Check keys, shapes and labels of the Olivetti faces bunch."""
    bunch = fetch_olivetti_faces_fxt(shuffle=True, random_state=0)
    assert isinstance(bunch, Bunch)

    missing_keys = [key for key in ('data', 'images', 'target', 'DESCR')
                    if key not in bunch.keys()]
    assert not missing_keys

    assert bunch.data.shape == (400, 4096)
    assert bunch.images.shape == (400, 64, 64)
    assert bunch.target.shape == (400,)
    # The labels are exactly 0..39
    labels = np.unique(np.sort(bunch.target))
    assert_array_equal(labels, np.arange(40))

    # return_X_y option
    check_return_X_y(bunch, fetch_olivetti_faces_fxt)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,65 @@
"""Test the rcv1 loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
import scipy.sparse as sp
import numpy as np
from functools import partial
from sklearn.datasets.tests.test_common import check_return_X_y
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_equal
def test_fetch_rcv1(fetch_rcv1_fxt):
    """Check sparsity, shapes, categories and the train subset of rcv1."""
    full = fetch_rcv1_fxt(shuffle=False)
    X1, Y1 = full.data, full.target
    s1 = full.sample_id
    cat_list = full.target_names.tolist()

    # sparsity
    assert sp.issparse(X1)
    assert sp.issparse(Y1)
    assert X1.data.size == 60915113
    assert Y1.data.size == 2606875

    # shapes
    assert X1.shape == (804414, 47236)
    assert Y1.shape == (804414, 103)
    assert s1.shape == (804414,)
    assert len(cat_list) == 103

    # ordering of categories
    assert_array_equal(['C11', 'C12', 'C13', 'C14', 'C15', 'C151'],
                       cat_list[:6])

    # number of samples for some categories
    expected_counts = {'GMIL': 5, 'E143': 1206, 'CCAT': 381327}
    for cat, num in expected_counts.items():
        column = cat_list.index(cat)
        assert Y1[:, column].data.size == num

    # shuffling and subset
    train = fetch_rcv1_fxt(shuffle=True, subset='train', random_state=77)
    X2, Y2 = train.data, train.target
    s2 = train.sample_id

    # return_X_y option
    fetch_train = partial(fetch_rcv1_fxt, shuffle=False, subset='train')
    check_return_X_y(train, fetch_train)

    # The first 23149 samples are the training samples
    assert_array_equal(np.sort(s1[:23149]), np.sort(s2))

    # some precise values
    for sample_id in (2286, 3274, 14042):
        idx1 = s1.tolist().index(sample_id)
        idx2 = s2.tolist().index(sample_id)

        assert_almost_equal(X1[idx1, :].toarray(), X2[idx2, :].toarray())
        assert_almost_equal(Y1[idx1, :].toarray(), Y2[idx2, :].toarray())

View file

@ -0,0 +1,559 @@
from collections import defaultdict
from functools import partial
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.datasets import make_classification
from sklearn.datasets import make_multilabel_classification
from sklearn.datasets import make_hastie_10_2
from sklearn.datasets import make_regression
from sklearn.datasets import make_blobs
from sklearn.datasets import make_friedman1
from sklearn.datasets import make_friedman2
from sklearn.datasets import make_friedman3
from sklearn.datasets import make_low_rank_matrix
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles
from sklearn.datasets import make_sparse_coded_signal
from sklearn.datasets import make_sparse_uncorrelated
from sklearn.datasets import make_spd_matrix
from sklearn.datasets import make_swiss_roll
from sklearn.datasets import make_s_curve
from sklearn.datasets import make_biclusters
from sklearn.datasets import make_checkerboard
from sklearn.utils.validation import assert_all_finite
def test_make_classification():
    """Check shapes, class balance and row uniqueness of make_classification.
    """
    weights = [0.1, 0.25]
    X, y = make_classification(
        n_samples=100, n_features=20, n_informative=5, n_redundant=1,
        n_repeated=1, n_classes=3, n_clusters_per_class=1, hypercube=False,
        shift=None, scale=None, weights=weights, random_state=0)

    # The caller's weights list must be left untouched
    assert weights == [0.1, 0.25]
    assert X.shape == (100, 20), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert np.unique(y).shape == (3,), "Unexpected number of classes"
    assert sum(y == 0) == 10, "Unexpected number of samples in class #0"
    assert sum(y == 1) == 25, "Unexpected number of samples in class #1"
    assert sum(y == 2) == 65, "Unexpected number of samples in class #2"

    # Test for n_features > 30
    X, y = make_classification(
        n_samples=2000, n_features=31, n_informative=31, n_redundant=0,
        n_repeated=0, hypercube=True, scale=0.5, random_state=0)

    assert X.shape == (2000, 31), "X shape mismatch"
    assert y.shape == (2000,), "y shape mismatch"
    # View each row as a single structured element so np.unique compares
    # whole rows, then convert back to a 2D array of rows
    n_columns = X.shape[1]
    row_dtype = [('', X.dtype)] * n_columns
    unique_rows = np.unique(X.view(row_dtype)).view(X.dtype)
    unique_rows = unique_rows.reshape(-1, n_columns)
    assert unique_rows.shape[0] == 2000, (
        "Unexpected number of unique rows")
def test_make_classification_informative_features():
    """Test the construction of informative features in make_classification

    Also tests `n_clusters_per_class`, `n_classes`, `hypercube` and
    fully-specified `weights`.
    """
    # Create very separate clusters; check that vertices are unique and
    # correspond to classes
    class_sep = 1e6
    make = partial(make_classification, class_sep=class_sep, n_redundant=0,
                   n_repeated=0, flip_y=0, shift=0, scale=1, shuffle=False)

    for n_informative, weights, n_clusters_per_class in [
            (2, [1], 1),
            (2, [1/3] * 3, 1),
            (2, [1/4] * 4, 1),
            (2, [1/2] * 2, 2),
            (2, [3/4, 1/4], 2),
            (10, [1/3] * 3, 10),
            # NumPy integer scalar; the ``np.int`` alias used here before
            # was removed in NumPy 1.24 and raised AttributeError.
            (np.int64(64), [1], 1),
    ]:
        n_classes = len(weights)
        n_clusters = n_classes * n_clusters_per_class
        n_samples = n_clusters * 50

        for hypercube in (False, True):
            X, y = make(n_samples=n_samples, n_classes=n_classes,
                        weights=weights, n_features=n_informative,
                        n_informative=n_informative,
                        n_clusters_per_class=n_clusters_per_class,
                        hypercube=hypercube, random_state=0)

            assert X.shape == (n_samples, n_informative)
            assert y.shape == (n_samples,)

            # Cluster by sign, viewed as strings to allow uniquing
            signs = np.sign(X)
            # One bytes record per row; ravel so the inverse index returned
            # by np.unique is one-dimensional and can mask the rows of X.
            signs = signs.view(
                dtype='|S{0}'.format(signs.strides[0])).ravel()
            unique_signs, cluster_index = np.unique(signs,
                                                    return_inverse=True)

            assert len(unique_signs) == n_clusters, (
                "Wrong number of clusters, or not in distinct quadrants")

            clusters_by_class = defaultdict(set)
            for cluster, cls in zip(cluster_index, y):
                clusters_by_class[cls].add(cluster)
            for clusters in clusters_by_class.values():
                assert len(clusters) == n_clusters_per_class, (
                    "Wrong number of clusters per class")
            assert (len(clusters_by_class) == n_classes), (
                "Wrong number of classes")

            # Floor division: each class fraction must be at least its
            # weight and less than twice its weight.
            assert_array_almost_equal(np.bincount(y) / len(y) // weights,
                                      [1] * n_classes,
                                      err_msg="Wrong number of samples "
                                              "per class")

            # Ensure on vertices of hypercube
            for cluster in range(len(unique_signs)):
                centroid = X[cluster_index == cluster].mean(axis=0)
                if hypercube:
                    assert_array_almost_equal(np.abs(centroid) / class_sep,
                                              np.ones(n_informative),
                                              decimal=5,
                                              err_msg="Clusters are not "
                                                      "centered on hypercube "
                                                      "vertices")
                else:
                    # Without hypercube the same check is expected to fail.
                    with pytest.raises(AssertionError):
                        assert_array_almost_equal(np.abs(centroid) / class_sep,
                                                  np.ones(n_informative),
                                                  decimal=5,
                                                  err_msg="Clusters should "
                                                          "not be centered "
                                                          "on hypercube "
                                                          "vertices")

    # More clusters requested than these two informative features allow.
    with pytest.raises(ValueError):
        make(n_features=2, n_informative=2, n_classes=5,
             n_clusters_per_class=1)
    with pytest.raises(ValueError):
        make(n_features=2, n_informative=2, n_classes=3,
             n_clusters_per_class=2)
@pytest.mark.parametrize(
    'weights, err_type, err_msg',
    [
        (bad_weights, ValueError,
         "Weights specified but incompatible with number of classes.")
        for bad_weights in (
            [],
            [.25, .75, .1],
            np.array([]),
            np.array([.25, .75, .1]),
            np.random.random(3),
        )
    ]
)
def test_make_classification_weights_type(weights, err_type, err_msg):
    """Each parametrized ``weights`` must raise ``err_type`` matching ``err_msg``."""
    with pytest.raises(err_type, match=err_msg):
        make_classification(weights=weights)
@pytest.mark.parametrize("kwargs", [{}, {"n_classes": 3, "n_informative": 3}])
def test_make_classification_weights_array_or_list_ok(kwargs):
    """Weights given as a list or as an ndarray must produce the same data."""
    from_list = make_classification(weights=[.1, .9],
                                    random_state=0, **kwargs)
    from_array = make_classification(weights=np.array([.1, .9]),
                                     random_state=0, **kwargs)
    # Compare X first, then y.
    for list_part, array_part in zip(from_list, from_array):
        assert_almost_equal(list_part, array_part)
def test_make_multilabel_classification_return_sequences():
    """Check the label sequences returned with ``return_indicator=False``."""
    for allow_unlabeled, min_length in ((True, 0), (False, 1)):
        X, Y = make_multilabel_classification(
            n_samples=100, n_features=20, n_classes=3, random_state=0,
            return_indicator=False, allow_unlabeled=allow_unlabeled)
        assert X.shape == (100, 20), "X shape mismatch"

        if not allow_unlabeled:
            assert max(max(labels) for labels in Y) == 2

        labels_per_sample = [len(labels) for labels in Y]
        assert min(labels_per_sample) == min_length
        assert max(labels_per_sample) <= 3
def test_make_multilabel_classification_return_indicator():
    """Check the indicator matrix and the optional returned distributions."""
    for allow_unlabeled, min_length in ((True, 0), (False, 1)):
        common = dict(n_samples=25, n_features=20, n_classes=3,
                      random_state=0, allow_unlabeled=allow_unlabeled)

        X, Y = make_multilabel_classification(**common)
        assert X.shape == (25, 20), "X shape mismatch"
        assert Y.shape == (25, 3), "Y shape mismatch"
        samples_per_class = np.sum(Y, axis=0)
        assert np.all(samples_per_class > min_length)

        # Also test return_distributions and return_indicator with True
        X2, Y2, p_c, p_w_c = make_multilabel_classification(
            return_distributions=True, **common)

        assert_array_almost_equal(X, X2)
        assert_array_equal(Y, Y2)
        assert p_c.shape == (3,)
        assert_almost_equal(p_c.sum(), 1)
        assert p_w_c.shape == (20, 3)
        assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
def test_make_multilabel_classification_return_indicator_sparse():
    """``return_indicator='sparse'`` must yield a sparse (25, 3) label matrix."""
    for allow_unlabeled in (True, False):
        X, Y = make_multilabel_classification(
            n_samples=25, n_features=20, n_classes=3, random_state=0,
            return_indicator='sparse', allow_unlabeled=allow_unlabeled)
        assert X.shape == (25, 20), "X shape mismatch"
        assert Y.shape == (25, 3), "Y shape mismatch"
        assert sp.issparse(Y)
@pytest.mark.parametrize(
    "params, err_msg",
    [
        (dict(n_classes=0), "'n_classes' should be an integer"),
        (dict(length=0), "'length' should be an integer"),
    ]
)
def test_make_multilabel_classification_valid_arguments(params, err_msg):
    """Zero ``n_classes`` or ``length`` must raise a matching ValueError."""
    with pytest.raises(ValueError, match=err_msg):
        make_multilabel_classification(**params)
def test_make_hastie_10_2():
    """Check output shapes and that exactly two class labels are produced."""
    X, y = make_hastie_10_2(n_samples=100, random_state=0)
    distinct_labels = np.unique(y)
    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert distinct_labels.shape == (2,), "Unexpected number of classes"
def test_make_regression():
    """Check shapes, coefficient sparsity and noise level of make_regression."""
    X, y, coef = make_regression(
        n_samples=100, n_features=10, n_informative=3, effective_rank=5,
        coef=True, bias=0.0, noise=1.0, random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert coef.shape == (10,), "coef shape mismatch"
    n_nonzero = sum(coef != 0.0)
    assert n_nonzero == 3, "Unexpected number of informative features"

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0).
    residual = y - np.dot(X, coef)
    assert_almost_equal(np.std(residual), 1.0, decimal=1)

    # Test with small number of features.
    X, y = make_regression(n_samples=100, n_features=1)  # n_informative=3
    assert X.shape == (100, 1)
def test_make_regression_multitarget():
    """Check make_regression with three targets and returned coefficients."""
    X, y, coef = make_regression(
        n_samples=100, n_features=10, n_informative=3, n_targets=3,
        coef=True, noise=1., random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100, 3), "y shape mismatch"
    assert coef.shape == (10, 3), "coef shape mismatch"

    # Builtin sum over the rows gives one non-zero count per target column.
    nonzero_per_target = sum(coef != 0.0)
    assert_array_equal(nonzero_per_target, 3,
                       "Unexpected number of informative features")

    # Test that y ~= np.dot(X, c) + bias + N(0, 1.0)
    residual = y - np.dot(X, coef)
    assert_almost_equal(np.std(residual), 1.0, decimal=1)
def test_make_blobs():
    """Check shapes, blob count and per-blob spread for explicit centers."""
    cluster_stds = np.array([0.05, 0.2, 0.4])
    cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    X, y = make_blobs(random_state=0, n_samples=50, n_features=2,
                      centers=cluster_centers, cluster_std=cluster_stds)

    assert X.shape == (50, 2), "X shape mismatch"
    assert y.shape == (50,), "y shape mismatch"
    assert np.unique(y).shape == (3,), "Unexpected number of blobs"

    for blob, (center, expected_std) in enumerate(
            zip(cluster_centers, cluster_stds)):
        spread = (X[y == blob] - center).std()
        assert_almost_equal(spread, expected_std, 1, "Unexpected std")
def test_make_blobs_n_samples_list():
    """A list ``n_samples`` sets the number of samples of each blob."""
    n_samples = [50, 30, 20]
    X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    blob_sizes = np.bincount(y, minlength=len(n_samples))
    assert all(blob_sizes == n_samples), \
        "Incorrect number of samples per blob"
def test_make_blobs_n_samples_list_with_centers():
    """Combine a list ``n_samples`` with explicit centers and per-blob std."""
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])
    X, y = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=cluster_stds, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    blob_sizes = np.bincount(y, minlength=len(n_samples))
    assert all(blob_sizes == n_samples), \
        "Incorrect number of samples per blob"

    for blob, (center, expected_std) in enumerate(zip(centers, cluster_stds)):
        spread = (X[y == blob] - center).std()
        assert_almost_equal(spread, expected_std, 1, "Unexpected std")
@pytest.mark.parametrize(
    "n_samples",
    [[5, 3, 0],
     np.array([5, 3, 0]),
     tuple([5, 3, 0])]
)
def test_make_blobs_n_samples_centers_none(n_samples):
    """Sequence ``n_samples`` with ``centers=None``, including an empty blob."""
    X, y = make_blobs(n_samples=n_samples, centers=None, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    blob_sizes = np.bincount(y, minlength=len(n_samples))
    assert all(blob_sizes == n_samples), \
        "Incorrect number of samples per blob"
def test_make_blobs_return_centers():
    """``return_centers=True`` adds a (n_blobs, n_features) centers array."""
    n_samples = [10, 20]
    n_features = 3
    result = make_blobs(n_samples=n_samples, n_features=n_features,
                        return_centers=True, random_state=0)
    X, y, centers = result

    assert centers.shape == (len(n_samples), n_features)
def test_make_blobs_error():
    """Check the ValueError messages raised for inconsistent blob arguments."""
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])

    # One center fewer than entries in n_samples.
    too_few_centers = centers[:-1]
    wrong_centers_msg = ("Length of `n_samples` not consistent "
                         "with number of centers. Got n_samples = {} "
                         "and centers = {}".format(n_samples,
                                                   too_few_centers))
    assert_raise_message(ValueError, wrong_centers_msg,
                         make_blobs, n_samples, centers=too_few_centers)

    # One standard deviation fewer than centers.
    too_few_stds = cluster_stds[:-1]
    wrong_std_msg = ("Length of `clusters_std` not consistent with "
                     "number of centers. Got centers = {} "
                     "and cluster_std = {}".format(centers, too_few_stds))
    assert_raise_message(ValueError, wrong_std_msg,
                         make_blobs, n_samples,
                         centers=centers, cluster_std=too_few_stds)

    # An integer for centers together with a list n_samples.
    wrong_type_msg = ("Parameter `centers` must be array-like. "
                      "Got {!r} instead".format(3))
    assert_raise_message(ValueError, wrong_type_msg,
                         make_blobs, n_samples, centers=3)
def test_make_friedman1():
    """With zero noise, ``y`` must match the formula asserted below."""
    X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0,
                          random_state=0)

    assert X.shape == (5, 10), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"

    expected = (10 * np.sin(np.pi * X[:, 0] * X[:, 1])
                + 20 * (X[:, 2] - 0.5) ** 2
                + 10 * X[:, 3] + 5 * X[:, 4])
    assert_array_almost_equal(y, expected)
def test_make_friedman2():
    """With zero noise, ``y`` must match the formula asserted below."""
    X, y = make_friedman2(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 4), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"

    inner = X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])
    expected = (X[:, 0] ** 2 + inner ** 2) ** 0.5
    assert_array_almost_equal(y, expected)
def test_make_friedman3():
    """With zero noise, ``y`` must match the formula asserted below."""
    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 4), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"

    numerator = X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])
    expected = np.arctan(numerator / X[:, 0])
    assert_array_almost_equal(y, expected)
def test_make_low_rank_matrix():
    """Check the shape and the singular-value mass of make_low_rank_matrix.

    The sum of the singular values is used as a proxy for the rank and must
    lie within 0.1 of ``effective_rank`` (5) in either direction.
    """
    X = make_low_rank_matrix(n_samples=50, n_features=25, effective_rank=5,
                             tail_strength=0.01, random_state=0)

    assert X.shape == (50, 25), "X shape mismatch"

    # Only the singular values are needed, so skip computing U and V.
    singular_values = np.linalg.svd(X, compute_uv=False)
    # abs(): the previous one-sided check also passed when the sum was
    # arbitrarily far below 5.
    assert abs(sum(singular_values) - 5) < 0.1, (
        "X rank is not approximately 5")
def test_make_sparse_coded_signal():
    """Check shapes, code sparsity, reconstruction and unit-norm atoms."""
    Y, D, X = make_sparse_coded_signal(n_samples=5, n_components=8,
                                       n_features=10, n_nonzero_coefs=3,
                                       random_state=0)

    assert Y.shape == (10, 5), "Y shape mismatch"
    assert D.shape == (10, 8), "D shape mismatch"
    assert X.shape == (8, 5), "X shape mismatch"

    # Each column of X must hold exactly three non-zero entries.
    for code in X.T:
        assert len(np.flatnonzero(code)) == 3, 'Non-zero coefs mismatch'

    assert_array_almost_equal(np.dot(D, X), Y)

    column_norms = np.sqrt((D ** 2).sum(axis=0))
    assert_array_almost_equal(column_norms, np.ones(D.shape[1]))
def test_make_sparse_uncorrelated():
    """Check the output shapes of make_sparse_uncorrelated."""
    n_samples, n_features = 5, 10
    X, y = make_sparse_uncorrelated(n_samples=n_samples,
                                    n_features=n_features, random_state=0)

    assert X.shape == (5, 10), "X shape mismatch"
    assert y.shape == (5,), "y shape mismatch"
def test_make_spd_matrix():
    """The result must be 5x5, symmetric and have positive eigenvalues."""
    X = make_spd_matrix(n_dim=5, random_state=0)

    assert X.shape == (5, 5), "X shape mismatch"
    assert_array_almost_equal(X, X.T)

    from numpy.linalg import eig

    # Only the eigenvalues are checked; the eigenvectors are discarded.
    eigenvalues = eig(X)[0]
    all_positive = np.array([True] * 5)
    assert_array_equal(eigenvalues > 0, all_positive,
                       "X is not positive-definite")
def test_make_swiss_roll():
    """With zero noise, the first and third coordinates follow ``t``."""
    X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5,), "t shape mismatch"

    first_coord, third_coord = X[:, 0], X[:, 2]
    assert_array_almost_equal(first_coord, t * np.cos(t))
    assert_array_almost_equal(third_coord, t * np.sin(t))
def test_make_s_curve():
    """With zero noise, the first and third coordinates follow ``t``."""
    X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5,), "t shape mismatch"

    first_coord, third_coord = X[:, 0], X[:, 2]
    assert_array_almost_equal(first_coord, np.sin(t))
    assert_array_almost_equal(third_coord, np.sign(t) * (np.cos(t) - 1))
def test_make_biclusters():
    """Check shapes, finiteness and seed reproducibility of make_biclusters."""
    settings = dict(shape=(100, 100), n_clusters=4, shuffle=True,
                    random_state=0)

    X, rows, cols = make_biclusters(**settings)
    assert X.shape == (100, 100), "X shape mismatch"
    assert rows.shape == (4, 100), "rows shape mismatch"
    assert cols.shape == (4, 100,), "columns shape mismatch"
    for output in (X, rows, cols):
        assert_all_finite(output)

    # A second call with the same seed must reproduce the data.
    X2, _, _ = make_biclusters(**settings)
    assert_array_almost_equal(X, X2)
def test_make_checkerboard():
    """Check shapes, finiteness and seed reproducibility of make_checkerboard."""
    seeded = dict(shape=(100, 100), shuffle=True, random_state=0)

    # Tuple n_clusters.
    X, rows, cols = make_checkerboard(n_clusters=(20, 5), **seeded)
    assert X.shape == (100, 100), "X shape mismatch"
    assert rows.shape == (100, 100), "rows shape mismatch"
    assert cols.shape == (100, 100,), "columns shape mismatch"

    # Integer n_clusters.
    X, rows, cols = make_checkerboard(n_clusters=2, **seeded)
    for output in (X, rows, cols):
        assert_all_finite(output)

    # Two calls with the same seed must agree.
    X1, _, _ = make_checkerboard(n_clusters=2, **seeded)
    X2, _, _ = make_checkerboard(n_clusters=2, **seeded)
    assert_array_almost_equal(X1, X2)
def test_make_moons():
    """Every generated point must lie on the unit circle of its own moon."""
    X, y = make_moons(3, shuffle=False)
    for point, label in zip(X, y):
        if label == 0:
            center = [0.0, 0.0]
        else:
            center = [1.0, 0.5]
        squared_distance = ((point - center) ** 2).sum()
        assert_almost_equal(squared_distance, 1.0,
                            err_msg="Point is not on expected unit circle")
def test_make_moons_unbalanced():
    """Tuple ``n_samples`` sizes each moon; other sequence lengths are rejected."""
    X, y = make_moons(n_samples=(7, 5))

    n_first, n_second = np.sum(y == 0), np.sum(y == 1)
    assert n_first == 7 and n_second == 5, \
        'Number of samples in a moon is wrong'
    assert X.shape == (12, 2), "X shape mismatch"
    assert y.shape == (12,), "y shape mismatch"

    # Neither a three-element list nor a one-element tuple is accepted.
    for bad_n_samples in ([1, 2, 3], (10,)):
        with pytest.raises(ValueError,
                           match=r'`n_samples` can be either an int '
                                 r'or a two-element tuple.'):
            make_moons(n_samples=bad_n_samples)
def test_make_circles():
    """Check point radii, per-circle sample counts and ``factor`` validation.

    Label 0 points must lie on the unit circle and label 1 points on the
    circle of radius ``factor``. ``factor`` values of -0.01 and 1.0 must
    raise ValueError.
    """
    factor = 0.3

    for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]:
        # Testing odd and even case, because in the past make_circles always
        # created an even number of samples.
        X, y = make_circles(n_samples, shuffle=False, noise=None,
                            factor=factor)
        assert X.shape == (n_samples, 2), "X shape mismatch"
        assert y.shape == (n_samples,), "y shape mismatch"
        center = [0.0, 0.0]
        for x, label in zip(X, y):
            dist_sqr = ((x - center) ** 2).sum()
            # Squared radius: 1 for label 0, factor ** 2 for label 1.
            dist_exp = 1.0 if label == 0 else factor ** 2
            assert_almost_equal(dist_sqr, dist_exp,
                                err_msg="Point is not on expected circle")

        assert X[y == 0].shape == (n_outer, 2), (
            "Samples not correctly distributed across circles.")
        assert X[y == 1].shape == (n_inner, 2), (
            "Samples not correctly distributed across circles.")

    with pytest.raises(ValueError):
        make_circles(factor=-0.01)
    with pytest.raises(ValueError):
        make_circles(factor=1.)
def test_make_circles_unbalanced():
    """Tuple ``n_samples`` sizes each circle; other sequence lengths are rejected."""
    X, y = make_circles(n_samples=(2, 8))

    # Label 0 is the outer circle (radius 1) and label 1 the inner one, as
    # the radius check in test_make_circles shows; the messages were swapped.
    assert np.sum(y == 0) == 2, 'Number of samples in outer circle is wrong'
    assert np.sum(y == 1) == 8, 'Number of samples in inner circle is wrong'
    assert X.shape == (10, 2), "X shape mismatch"
    assert y.shape == (10,), "y shape mismatch"

    with pytest.raises(ValueError, match=r'`n_samples` can be either an int '
                                         r'or a two-element tuple.'):
        make_circles(n_samples=[1, 2, 3])

    with pytest.raises(ValueError, match=r'`n_samples` can be either an int '
                                         r'or a two-element tuple.'):
        make_circles(n_samples=(10,))

View file

@ -0,0 +1,521 @@
from bz2 import BZ2File
import gzip
from io import BytesIO
import numpy as np
import scipy.sparse as sp
import os
import shutil
from tempfile import NamedTemporaryFile
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import fails_if_pypy
import sklearn
from sklearn.datasets import (load_svmlight_file, load_svmlight_files,
dump_svmlight_file)
# Directory of this test module; the fixture files sit in its "data" folder.
currdir = os.path.dirname(os.path.abspath(__file__))
# Fixture loaded by most tests below (6 samples, 21 features).
datafile = os.path.join(currdir, "data", "svmlight_classification.txt")
# Fixture loaded with ``multilabel=True``.
multifile = os.path.join(currdir, "data", "svmlight_multilabel.txt")
# Fixtures for which the loader is expected to raise ValueError.
invalidfile = os.path.join(currdir, "data", "svmlight_invalid.txt")
invalidfile2 = os.path.join(currdir, "data", "svmlight_invalid_order.txt")
# Module-level pytest mark applied to every test in this file.
# NOTE(review): presumably flags the tests as expected failures on PyPy;
# confirm against sklearn.utils._testing.fails_if_pypy.
pytestmark = fails_if_pypy
def test_load_svmlight_file():
    """Load the reference file and check its shape, entries and labels."""
    X, y = load_svmlight_file(datafile)

    # test X's shape
    assert X.indptr.shape[0] == 7
    assert X.shape[0] == 6
    assert X.shape[1] == 21
    assert y.shape[0] == 6

    # test X's non-zero values
    nonzero_entries = ((0, 2, 2.5), (0, 10, -5.2), (0, 15, 1.5),
                       (1, 5, 1.0), (1, 12, -3),
                       (2, 20, 27))
    for row, col, expected in nonzero_entries:
        assert X[row, col] == expected

    # tests X's zero values
    for row, col in ((0, 3), (0, 5), (1, 8), (1, 16), (2, 18)):
        assert X[row, col] == 0

    # test can change X's values
    X[0, 2] *= 2
    assert X[0, 2] == 5

    # test y
    assert_array_equal(y, [1, 2, 3, 4, 1, 2])
def test_load_svmlight_file_fd():
    """Loading from an OS file descriptor must match loading from the path."""
    X_from_path, y_from_path = load_svmlight_file(datafile)

    descriptor = os.open(datafile, os.O_RDONLY)
    try:
        X_from_fd, y_from_fd = load_svmlight_file(descriptor)
        assert_array_almost_equal(X_from_path.data, X_from_fd.data)
        assert_array_almost_equal(y_from_path, y_from_fd)
    finally:
        # Close the descriptor even when an assertion fails.
        os.close(descriptor)
def test_load_svmlight_file_multilabel():
    """With ``multilabel=True`` the labels come back as a list of tuples."""
    _, labels = load_svmlight_file(multifile, multilabel=True)
    expected_labels = [(0, 1), (2,), (), (1, 2)]
    assert labels == expected_labels
def test_load_svmlight_files():
    """Load the same file several times at once and check data and dtypes."""
    loaded_pair = load_svmlight_files([datafile] * 2, dtype=np.float32)
    X_train, y_train, X_test, y_test = loaded_pair

    assert_array_equal(X_train.toarray(), X_test.toarray())
    assert_array_almost_equal(y_train, y_test)
    assert X_train.dtype == np.float32
    assert X_test.dtype == np.float32

    loaded_triple = load_svmlight_files([datafile] * 3, dtype=np.float64)
    X1, y1, X2, y2, X3, y3 = loaded_triple

    assert X1.dtype == X2.dtype
    assert X2.dtype == X3.dtype
    assert X3.dtype == np.float64
def test_load_svmlight_file_n_features():
    """A larger ``n_features`` widens X; a smaller one raises ValueError."""
    X, y = load_svmlight_file(datafile, n_features=22)

    # test X'shape
    assert X.indptr.shape[0] == 7
    assert X.shape[0] == 6
    assert X.shape[1] == 22

    # test X's non-zero values
    nonzero_entries = ((0, 2, 2.5), (0, 10, -5.2),
                       (1, 5, 1.0), (1, 12, -3))
    for row, col, expected in nonzero_entries:
        assert X[row, col] == expected

    # 21 features in file
    with pytest.raises(ValueError):
        load_svmlight_file(datafile, n_features=20)
def test_load_compressed():
    """Gzip and bz2 copies of the reference file must load to the same data."""
    X, y = load_svmlight_file(datafile)

    for suffix, open_compressed in ((".gz", gzip.open), (".bz2", BZ2File)):
        with NamedTemporaryFile(prefix="sklearn-test", suffix=suffix) as tmp:
            tmp.close()  # necessary under windows
            with open(datafile, "rb") as source, \
                    open_compressed(tmp.name, "wb") as target:
                shutil.copyfileobj(source, target)
            X_loaded, y_loaded = load_svmlight_file(tmp.name)
            # because we "close" it manually and write to it,
            # we need to remove it manually.
            os.remove(tmp.name)

        assert_array_almost_equal(X.toarray(), X_loaded.toarray())
        assert_array_almost_equal(y, y_loaded)
def test_load_invalid_file():
    """Loading the invalid fixture must raise ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        load_svmlight_file(invalidfile)
def test_load_invalid_order_file():
    """Loading the ``svmlight_invalid_order`` fixture must raise ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        load_svmlight_file(invalidfile2)
def test_load_zero_based():
    """A feature index of 0 must be rejected when ``zero_based=False``."""
    stream = BytesIO(b"-1 4:1.\n1 0:1\n")
    with pytest.raises(ValueError):
        load_svmlight_file(stream, zero_based=False)
def test_load_zero_based_auto():
    """Check the shapes produced with ``zero_based='auto'``."""
    one_based_data = b"-1 1:1 2:2 3:3\n"
    zero_based_data = b"-1 0:0 1:1\n"

    # Loaded alone, the first sample gives three columns.
    X, y = load_svmlight_file(BytesIO(one_based_data), zero_based="auto")
    assert X.shape == (1, 3)

    # Loaded together with data that uses index 0, both give four columns.
    streams = [BytesIO(one_based_data), BytesIO(zero_based_data)]
    X1, y1, X2, y2 = load_svmlight_files(streams, zero_based="auto")
    assert X1.shape == (1, 4)
    assert X2.shape == (1, 4)
def test_load_with_qid():
    """Load data carrying ``qid`` fields, with and without ``query_id``."""
    # load svmfile with qid attribute
    data = b"""
3 qid:1 1:0.53 2:0.12
2 qid:1 1:0.13 2:0.1
7 qid:2 1:0.87 2:0.12"""
    expected_X = [[.53, .12], [.13, .1], [.87, .12]]
    expected_y = [3, 2, 7]

    X, y = load_svmlight_file(BytesIO(data), query_id=False)
    assert_array_equal(y, expected_y)
    assert_array_equal(X.toarray(), expected_X)

    # The single-file and multi-file loaders must both return the qids.
    from_files = load_svmlight_files([BytesIO(data)], query_id=True)
    from_file = load_svmlight_file(BytesIO(data), query_id=True)
    for X, y, qid in (from_files, from_file):
        assert_array_equal(y, expected_y)
        assert_array_equal(qid, [1, 1, 2])
        assert_array_equal(X.toarray(), expected_X)
@pytest.mark.skip("testing the overflow of 32 bit sparse indexing requires a"
                  " large amount of memory")
def test_load_large_qid():
    """
    load large libsvm / svmlight file with qid attribute. Tests 64-bit query ID
    """
    qid_stop = 40*1000*1000
    line_pair = "3 qid:{0} 1:0.53 2:0.12\n2 qid:{0} 1:0.13 2:0.1"
    # Two samples per query id, for ids 1 .. qid_stop - 1.
    data = b"\n".join(line_pair.format(i).encode()
                      for i in range(1, qid_stop))

    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)
    assert_array_equal(y[-4:], [3, 2, 3, 2])
    assert_array_equal(np.unique(qid), np.arange(1, qid_stop))
def test_load_invalid_file2():
    """One invalid file among valid ones makes the multi-file loader raise."""
    paths = [datafile, invalidfile, datafile]
    with pytest.raises(ValueError):
        load_svmlight_files(paths)
def test_not_a_filename():
    """A float is neither a path nor a file descriptor: expect TypeError."""
    # in python 3 integers are valid file opening arguments (taken as unix
    # file descriptors)
    not_a_file = .42
    with pytest.raises(TypeError):
        load_svmlight_file(not_a_file)
def test_invalid_filename():
    """Loading this file name is expected to raise IOError."""
    bogus_name = "trou pic nic douille"
    with pytest.raises(IOError):
        load_svmlight_file(bogus_name)
def test_dump():
    """Round-trip the reference data through dump_svmlight_file.

    Every combination of X container (sparse, dense, sliced sparse),
    y container, index base and dtype is dumped to an in-memory buffer.
    The two header lines are checked, then the buffer is loaded back and
    compared with the input.
    """
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)
    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]
    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32, np.int64]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.
                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        # (y stays transposed for the remaining inner
                        # iterations, so this branch runs once per y)
                        y = y.T
                    # Note: with dtype=np.int32 we are performing unsafe casts,
                    # where X.astype(dtype) overflows. The result is
                    # then platform dependent and X_dense.astype(dtype) may be
                    # different from X_sparse.astype(dtype).toarray().
                    X_input = X.astype(dtype)
                    dump_svmlight_file(X_input, y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)
                    # first header line: library version
                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert "scikit-learn %s" % sklearn.__version__ in comment
                    # second header line: index base
                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    # bool index: False -> "one", True -> "zero"
                    assert ["one", "zero"][zero_based] + "-based" in comment
                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert X2.dtype == dtype
                    # the loaded indices must already be sorted
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)
                    X2_dense = X2.toarray()
                    if sp.issparse(X_input):
                        X_input_dense = X_input.toarray()
                    else:
                        X_input_dense = X_input
                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_input_dense, X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_input_dense, X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 15)
def test_dump_multilabel():
    """Dense and sparse label indicators must dump to the same three lines."""
    X = [[1, 0, 3, 0, 5],
         [0, 0, 0, 0, 0],
         [0, 5, 0, 1, 0]]
    y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
    y_sparse = sp.csr_matrix(y_dense)
    expected_lines = (
        b"1 0:1 2:3 4:5\n",
        b"0,2 \n",
        b"0,1 1:5 3:1\n",
    )

    for y in [y_dense, y_sparse]:
        f = BytesIO()
        dump_svmlight_file(X, y, f, multilabel=True)
        f.seek(0)
        # make sure it dumps multilabel correctly
        for expected in expected_lines:
            assert f.readline() == expected
def test_dump_concise():
    """Check the exact number formatting of the dump, then reload it."""
    one = 1
    two = 2.1
    three = 3.01
    exact = 1.000000000000001
    # loses the last decimal place
    almost = 1.0000000000000001
    empty_row = [0, 0, 0, 0, 0]
    X = [[one, two, three, exact, almost],
         [1e9, 2e18, 3e27, 0, 0],
         list(empty_row),
         list(empty_row),
         list(empty_row)]
    y = [one, two, three, exact, almost]

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    # make sure it's using the most concise format possible
    expected_lines = (
        b"1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n",
        b"2.1 0:1000000000 1:2e+18 2:3e+27\n",
        b"3.01 \n",
        b"1.000000000000001 \n",
        b"1 \n",
    )
    for expected in expected_lines:
        assert f.readline() == expected

    f.seek(0)
    # make sure it's correct too :)
    X2, y2 = load_svmlight_file(f)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_almost_equal(y, y2)
def test_dump_comment():
    """Check which ``comment`` values dump accepts and which it rejects."""
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    ascii_comment = "This is a comment\nspanning multiple lines."
    # XXX we have to update this to support Python 3.x
    utf8_comment = b"It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
    unicode_comment = utf8_comment.decode("utf-8")

    # Text comments (ASCII or not) must not disturb the round trip.
    for text_comment in (ascii_comment, unicode_comment):
        f = BytesIO()
        dump_svmlight_file(X, y, f, comment=text_comment, zero_based=False)
        f.seek(0)
        X2, y2 = load_svmlight_file(f, zero_based=False)
        assert_array_almost_equal(X, X2.toarray())
        assert_array_almost_equal(y, y2)

    # Non-ASCII bytes are rejected.
    f = BytesIO()
    with pytest.raises(UnicodeDecodeError):
        dump_svmlight_file(X, y, f, comment=utf8_comment)

    # So is a comment containing a NUL character.
    f = BytesIO()
    with pytest.raises(ValueError):
        dump_svmlight_file(X, y, f, comment="I've got a \0.")
def test_dump_invalid():
    """A nested or too-short ``y`` must make the dump raise ValueError."""
    X, y = load_svmlight_file(datafile)

    # y wrapped in an extra list, then y with its last label dropped.
    for bad_y in ([y], y[:-1]):
        f = BytesIO()
        with pytest.raises(ValueError):
            dump_svmlight_file(X, bad_y, f)
def test_dump_query_id():
    """Query ids passed to the dump must survive a dump/load round trip."""
    # test dumping a file with query_id
    X, y = load_svmlight_file(datafile)
    X = X.toarray()
    n_rows = X.shape[0]
    query_id = np.arange(n_rows) // 2  # consecutive rows share an id

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=query_id, zero_based=True)
    f.seek(0)

    reloaded = load_svmlight_file(f, query_id=True, zero_based=True)
    X1, y1, query_id1 = reloaded
    assert_array_almost_equal(X, X1.toarray())
    assert_array_almost_equal(y, y1)
    assert_array_almost_equal(query_id, query_id1)
def test_load_with_long_qid():
    """Load and round-trip data whose qid values span the int64 range."""
    # load svmfile with longint qid attribute
    data = b"""
1 qid:0 0:1 1:2 2:3
0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985"""
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)

    large_row = [1440446648, 72048431380967004, 236784985]
    true_X = [[1, 2, 3],
              list(large_row),
              list(large_row),
              list(large_row)]
    true_y = [1, 0, 0, 3]
    trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807]

    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    # Dump with the query ids and read them back.
    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=qid, zero_based=True)
    f.seek(0)
    X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    # The same dump must also load when the query ids are not requested.
    f.seek(0)
    X, y = load_svmlight_file(f, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
def test_load_zeros():
    """An all-zero matrix must reload unchanged for every ``zero_based`` mode."""
    true_X = sp.csr_matrix(np.zeros(shape=(3, 4)))
    true_y = np.array([0, 1, 0])
    f = BytesIO()
    dump_svmlight_file(true_X, true_y, f)

    expected_dense = true_X.toarray()
    for zero_based in ['auto', True, False]:
        f.seek(0)
        X, y = load_svmlight_file(f, n_features=4, zero_based=zero_based)
        assert_array_almost_equal(y, true_y)
        assert_array_almost_equal(X.toarray(), expected_dense)
@pytest.mark.parametrize('sparsity', [0, 0.1, .5, 0.99, 1])
@pytest.mark.parametrize('n_samples', [13, 101])
@pytest.mark.parametrize('n_features', [2, 7, 41])
def test_load_with_offsets(sparsity, n_samples, n_features):
    """Load a dump in three byte ranges and compare with the original data."""
    rng = np.random.RandomState(0)
    X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
    if sparsity:
        X[X < sparsity] = 0.0
    X = sp.csr_matrix(X)
    y = rng.randint(low=0, high=2, size=n_samples)

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)
    size = len(f.getvalue())

    # put some marks that are likely to happen anywhere in a row
    first_cut = size // 3
    second_cut = 4 * size // 5

    # load the original sparse matrix into 3 independent CSR matrices
    pieces = [
        load_svmlight_file(f, n_features=n_features,
                           offset=0, length=first_cut - 0),
        load_svmlight_file(f, n_features=n_features,
                           offset=first_cut, length=second_cut - first_cut),
        load_svmlight_file(f, n_features=n_features,
                           offset=second_cut),
    ]

    y_concat = np.concatenate([y_piece for _, y_piece in pieces])
    X_concat = sp.vstack([X_piece for X_piece, _ in pieces])
    assert_array_almost_equal(y, y_concat)
    assert_array_almost_equal(X.toarray(), X_concat.toarray())
def test_load_offset_exhaustive_splits():
    """Split the dump at every byte offset and check both halves reassemble."""
    rng = np.random.RandomState(0)
    X = sp.csr_matrix(np.array([
        [0, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 0, 6],
        [1, 2, 3, 4, 0, 6],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 3, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0],
    ]))
    n_samples, n_features = X.shape
    y = rng.randint(low=0, high=2, size=n_samples)
    query_id = np.arange(n_samples) // 2

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=query_id)
    f.seek(0)
    size = len(f.getvalue())
    expected_dense = X.toarray()

    # load the same data in 2 parts with all the possible byte offsets to
    # locate the split so has to test for particular boundary cases
    for mark in range(size):
        f.seek(0)
        head = load_svmlight_file(f, n_features=n_features,
                                  query_id=True, offset=0,
                                  length=mark)
        tail = load_svmlight_file(f, n_features=n_features,
                                  query_id=True, offset=mark,
                                  length=-1)
        X_head, y_head, q_head = head
        X_tail, y_tail, q_tail = tail

        q_concat = np.concatenate([q_head, q_tail])
        y_concat = np.concatenate([y_head, y_tail])
        X_concat = sp.vstack([X_head, X_tail])
        assert_array_almost_equal(y, y_concat)
        assert_array_equal(query_id, q_concat)
        assert_array_almost_equal(expected_dense, X_concat.toarray())
def test_load_with_offsets_error():
    """Passing offset and length without ``n_features`` must raise ValueError."""
    expected_error = pytest.raises(ValueError, match="n_features is required")
    with expected_error:
        load_svmlight_file(datafile, offset=3, length=3)