Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@ -0,0 +1,67 @@
import itertools
import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal
from sklearn.neighbors._ball_tree import BallTree
from sklearn.neighbors import DistanceMetric
from sklearn.utils import check_random_state
# Seeded generator so that the metric parameters below are reproducible.
rng = np.random.RandomState(10)

# A @ A.T is symmetric, which makes it usable as a Mahalanobis matrix.
V_mahalanobis = rng.rand(3, 3)
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)

DIMENSION = 3

# Real-valued metrics mapped to the keyword arguments each one requires.
METRICS = {
    'euclidean': {},
    'manhattan': {},
    'minkowski': {'p': 3},
    'chebyshev': {},
    'seuclidean': {'V': rng.random_sample(DIMENSION)},
    'wminkowski': {'p': 3, 'w': rng.random_sample(DIMENSION)},
    'mahalanobis': {'V': V_mahalanobis},
}

DISCRETE_METRICS = ['hamming', 'canberra', 'braycurtis']

BOOLEAN_METRICS = [
    'matching', 'jaccard', 'dice', 'kulsinski',
    'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath',
]
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find the ``k`` nearest rows of ``X`` for every row of ``Y`` exhaustively.

    The full pairwise distance matrix is computed with the requested
    ``DistanceMetric`` (``kwargs`` are forwarded to it) and each row is sorted.

    Returns
    -------
    dist, ind : arrays with one row per row of ``Y`` and ``k`` columns,
        holding the distances and the matching column indices into ``X``.
    """
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)
    all_dists = dist_metric.pairwise(Y, X)
    nearest = np.argsort(all_dists, axis=1)[:, :k]
    # Column vector of row numbers so fancy indexing pairs row i with nearest[i].
    row_ids = np.arange(Y.shape[0])[:, None]
    return all_dists[row_ids, nearest], nearest
@pytest.mark.parametrize('metric',
                         itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS))
def test_ball_tree_query_metrics(metric):
    """BallTree k-NN distances must equal the brute-force distances."""
    rng = check_random_state(0)
    n_neighbors = 5

    if metric in BOOLEAN_METRICS:
        # Rounding uniform [0, 1) samples yields 0/1-valued features.
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        # Scaling by 4 before rounding yields small integer-valued features.
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)

    tree = BallTree(X, leaf_size=1, metric=metric)
    tree_dist, _ = tree.query(Y, n_neighbors)
    brute_dist, _ = brute_force_neighbors(X, Y, n_neighbors, metric)

    # Only distances are compared; the returned indices are not checked here.
    assert_array_almost_equal(tree_dist, brute_dist)
def test_query_haversine():
    """BallTree haversine queries match brute force in distance and index."""
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))

    tree = BallTree(X, leaf_size=1, metric='haversine')
    tree_dist, tree_ind = tree.query(X, k=5)
    brute_dist, brute_ind = brute_force_neighbors(X, X, k=5,
                                                  metric='haversine')

    assert_array_almost_equal(tree_dist, brute_dist)
    assert_array_almost_equal(tree_ind, brute_ind)

View file

@ -0,0 +1,203 @@
import itertools
import pickle
import numpy as np
from numpy.testing import assert_array_almost_equal
import pytest
from scipy.spatial.distance import cdist
from sklearn.neighbors import DistanceMetric
from sklearn.neighbors import BallTree
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_raises_regex
from sklearn.utils.fixes import sp_version, parse_version
def dist_func(x1, x2, p):
    """Return the Minkowski distance of order ``p`` between two vectors.

    Parameters
    ----------
    x1, x2 : array-like of the same shape
        The two points to compare.
    p : number
        Order of the norm (``p=2`` gives the Euclidean distance).

    Returns
    -------
    float
        ``(sum_i |x1_i - x2_i| ** p) ** (1 / p)``.
    """
    # The absolute value is required: without it, negative differences raised
    # to an odd or fractional power make the sum negative or nan.
    return np.sum(np.abs(x1 - x2) ** p) ** (1. / p)
# Shared fixtures: two random sample sets of dimension ``d``, fixed seed.
rng = check_random_state(0)
n1, n2, d = 20, 25, 4
X1 = rng.random_sample((n1, d)).astype('float64', copy=False)
X2 = rng.random_sample((n2, d)).astype('float64', copy=False)

# make boolean arrays: ones and zeros
X1_bool = X1.round(0)
X2_bool = X2.round(0)

V = rng.random_sample((d, d))
VI = np.dot(V, V.T)

BOOL_METRICS = [
    'matching', 'jaccard', 'dice',
    'kulsinski', 'rogerstanimoto', 'russellrao',
    'sokalmichener', 'sokalsneath',
]

# Each metric maps to {argument name: tuple of values to try}; the tests
# iterate over the cartesian product of those tuples.
METRICS_DEFAULT_PARAMS = {
    'euclidean': {},
    'cityblock': {},
    'minkowski': {'p': (1, 1.5, 2, 3)},
    'chebyshev': {},
    'seuclidean': {'V': (rng.random_sample(d),)},
    'wminkowski': {'p': (1, 1.5, 3),
                   'w': (rng.random_sample(d),)},
    'mahalanobis': {'VI': (VI,)},
    'hamming': {},
    'canberra': {},
    'braycurtis': {},
}
@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
def test_cdist(metric):
    """Compare ``pairwise(X1, X2)`` with scipy's ``cdist`` for every setting."""
    param_grid = METRICS_DEFAULT_PARAMS[metric]
    names = param_grid.keys()
    for combo in itertools.product(*param_grid.values()):
        kwargs = dict(zip(names, combo))
        expected = cdist(X1, X2, metric, **kwargs)
        check_cdist(metric, kwargs, expected)
@pytest.mark.parametrize('metric', BOOL_METRICS)
def test_cdist_bool_metric(metric):
    """Compare boolean-metric cross distances with scipy's ``cdist``."""
    expected = cdist(X1_bool, X2_bool, metric)
    check_cdist_bool(metric, expected)
def check_cdist(metric, kwargs, D_true):
    """Assert that ``DistanceMetric.pairwise(X1, X2)`` reproduces ``D_true``."""
    computed = DistanceMetric.get_metric(metric, **kwargs).pairwise(X1, X2)
    assert_array_almost_equal(computed, D_true)
def check_cdist_bool(metric, D_true):
    """Assert that the boolean cross-distance matrix reproduces ``D_true``."""
    computed = DistanceMetric.get_metric(metric).pairwise(X1_bool, X2_bool)
    assert_array_almost_equal(computed, D_true)
@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
def test_pdist(metric):
    """Compare ``pairwise(X1)`` with scipy's ``cdist(X1, X1)`` per setting."""
    param_grid = METRICS_DEFAULT_PARAMS[metric]
    names = param_grid.keys()
    for combo in itertools.product(*param_grid.values()):
        kwargs = dict(zip(names, combo))
        expected = cdist(X1, X1, metric, **kwargs)
        check_pdist(metric, kwargs, expected)
@pytest.mark.parametrize('metric', BOOL_METRICS)
def test_pdist_bool_metrics(metric):
    """Compare boolean-metric self distances with scipy's ``cdist``."""
    expected = cdist(X1_bool, X1_bool, metric)
    check_pdist_bool(metric, expected)
def check_pdist(metric, kwargs, D_true):
    """Assert that ``DistanceMetric.pairwise(X1)`` reproduces ``D_true``."""
    computed = DistanceMetric.get_metric(metric, **kwargs).pairwise(X1)
    assert_array_almost_equal(computed, D_true)
def check_pdist_bool(metric, D_true):
    """Assert that the boolean self-distance matrix reproduces ``D_true``.

    ``D_true`` is modified in place for jaccard on old scipy versions.
    """
    computed = DistanceMetric.get_metric(metric).pairwise(X1_bool)
    # Based on https://github.com/scipy/scipy/pull/7373
    # When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
    # was changed to return 0, instead of nan.
    if metric == 'jaccard' and sp_version < parse_version('1.2.0'):
        nan_mask = np.isnan(D_true)
        D_true[nan_mask] = 0
    assert_array_almost_equal(computed, D_true)
@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
def test_pickle(metric):
    """Run the pickle round-trip check for every parameter combination."""
    param_grid = METRICS_DEFAULT_PARAMS[metric]
    names = param_grid.keys()
    for combo in itertools.product(*param_grid.values()):
        check_pickle(metric, dict(zip(names, combo)))
@pytest.mark.parametrize('metric', BOOL_METRICS)
def test_pickle_bool_metrics(metric):
    """A boolean metric gives the same distances after a pickle round trip."""
    original = DistanceMetric.get_metric(metric)
    restored = pickle.loads(pickle.dumps(original))
    dist_before = original.pairwise(X1_bool)
    dist_after = restored.pairwise(X1_bool)
    assert_array_almost_equal(dist_before, dist_after)
def check_pickle(metric, kwargs):
    """Assert a metric gives the same distances after a pickle round trip."""
    original = DistanceMetric.get_metric(metric, **kwargs)
    restored = pickle.loads(pickle.dumps(original))
    dist_before = original.pairwise(X1)
    dist_after = restored.pairwise(X1)
    assert_array_almost_equal(dist_before, dist_after)
def test_haversine_metric():
    """Check the haversine metric against a direct formula evaluation.

    Both the distance matrix and the ``dist_to_rdist`` conversion are
    compared with values computed point by point.
    """
    def haversine_slow(x1, x2):
        # Reference implementation of the haversine formula for one pair.
        return 2 * np.arcsin(np.sqrt(np.sin(0.5 * (x1[0] - x2[0])) ** 2
                                     + np.cos(x1[0]) * np.cos(x2[0]) *
                                     np.sin(0.5 * (x1[1] - x2[1])) ** 2))

    # Use a seeded generator: drawing from the global np.random state made
    # the test data differ from run to run.
    rng = check_random_state(0)
    X = rng.random_sample((10, 2))

    haversine = DistanceMetric.get_metric("haversine")
    D1 = haversine.pairwise(X)
    D2 = np.zeros_like(D1)
    for i, x1 in enumerate(X):
        for j, x2 in enumerate(X):
            D2[i, j] = haversine_slow(x1, x2)

    assert_array_almost_equal(D1, D2)
    assert_array_almost_equal(haversine.dist_to_rdist(D1),
                              np.sin(0.5 * D2) ** 2)
def test_pyfunc_metric():
    """A callable ("pyfunc") metric must agree with the built-in euclidean.

    The comparison is made both directly and after a pickle round trip of
    each metric object.
    """
    # Use a seeded generator: drawing from the global np.random state made
    # the test data differ from run to run.
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    euclidean = DistanceMetric.get_metric("euclidean")
    pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2)

    # Check if both callable metric and predefined metric initialized
    # DistanceMetric object is picklable
    euclidean_pkl = pickle.loads(pickle.dumps(euclidean))
    pyfunc_pkl = pickle.loads(pickle.dumps(pyfunc))

    D1 = euclidean.pairwise(X)
    D2 = pyfunc.pairwise(X)

    D1_pkl = euclidean_pkl.pairwise(X)
    D2_pkl = pyfunc_pkl.pairwise(X)

    assert_array_almost_equal(D1, D2)
    assert_array_almost_equal(D1_pkl, D2_pkl)
def test_bad_pyfunc_metric():
    """BallTree must reject a custom metric that does not return a number."""
    def wrong_distance(x, y):
        return "1"

    expected_msg = "Custom distance function must accept two vectors"
    data = np.ones((5, 2))
    assert_raises_regex(TypeError, expected_msg,
                        BallTree, data, metric=wrong_distance)
def test_input_data_size():
    """A custom metric must receive vectors with the data's feature count."""
    # Regression test for #6288
    # Previously, a metric requiring a particular input dimension would fail
    def custom_metric(x, y):
        assert x.shape[0] == 3
        return np.sum((x - y) ** 2)

    X = check_random_state(0).rand(10, 3)

    squared_custom = DistanceMetric.get_metric("pyfunc", func=custom_metric)
    euclidean = DistanceMetric.get_metric("euclidean")

    # custom_metric returns the squared euclidean distance.
    assert_array_almost_equal(squared_custom.pairwise(X),
                              euclidean.pairwise(X) ** 2)

View file

@ -0,0 +1,79 @@
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer
from sklearn.neighbors._base import _is_sorted_by_data
def test_transformer_result():
    """Check shape, stored-entry count, format and ordering of the graphs."""
    # Test the number of neighbors returned
    n_neighbors = 5
    n_samples_fit = 20
    n_queries = 18
    n_features = 10

    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_queries, n_features)
    radius = np.percentile(euclidean_distances(X), 10)

    modes = ['distance', 'connectivity']

    # with n_neighbors
    for mode in modes:
        # One extra stored entry per row is expected in 'distance' mode.
        per_row = n_neighbors + (mode == 'distance')
        transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                            mode=mode)
        graph_fit = transformer.fit_transform(X)
        graph_query = transformer.transform(X2)
        for graph, n_rows in ((graph_fit, n_samples_fit),
                              (graph_query, n_queries)):
            assert graph.shape == (n_rows, n_samples_fit)
            assert graph.data.shape == (n_rows * per_row, )
            assert graph.format == 'csr'
            assert _is_sorted_by_data(graph)

    # with radius
    for mode in modes:
        per_row = n_neighbors + (mode == 'distance')
        transformer = RadiusNeighborsTransformer(radius=radius, mode=mode)
        graph_fit = transformer.fit_transform(X)
        graph_query = transformer.transform(X2)
        for graph, n_rows in ((graph_fit, n_samples_fit),
                              (graph_query, n_queries)):
            assert graph.shape == (n_rows, n_samples_fit)
            # The radius graph must not have the fixed k-neighbors size.
            assert graph.data.shape != (n_rows * per_row, )
            assert graph.format == 'csr'
            assert _is_sorted_by_data(graph)
def _has_explicit_diagonal(X):
"""Return True if the diagonal is explicitly stored"""
X = X.tocoo()
explicit = X.row[X.row == X.col]
return len(explicit) == X.shape[0]
def test_explicit_diagonal():
    """The k-neighbors graph of the training data stores its zero diagonal."""
    # Test that the diagonal is explicitly stored in the sparse graph
    n_neighbors = 5
    n_samples_fit, n_samples_transform, n_features = 20, 18, 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_samples_transform, n_features)

    transformer = KNeighborsTransformer(n_neighbors=n_neighbors)
    row_shape = (n_samples_fit, n_neighbors + 1)

    # Both fit_transform(X) and transform(X) see the training points.
    graph = transformer.fit_transform(X)
    assert _has_explicit_diagonal(graph)
    assert np.all(graph.data.reshape(*row_shape)[:, 0] == 0)

    graph = transformer.transform(X)
    assert _has_explicit_diagonal(graph)
    assert np.all(graph.data.reshape(*row_shape)[:, 0] == 0)

    # Using transform on new data should not always have zero diagonal
    new_graph = transformer.transform(X2)
    assert not _has_explicit_diagonal(new_graph)

View file

@ -0,0 +1,6 @@
# Number of features used by the tests built on these metrics.
DIMENSION = 3

# Metric name -> keyword arguments needed to construct it.
METRICS = {
    'euclidean': {},
    'manhattan': {},
    'chebyshev': {},
    'minkowski': {'p': 3},
}

View file

@ -0,0 +1,250 @@
import numpy as np
import pytest
from sklearn.utils._testing import assert_allclose, assert_raises
from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors
from sklearn.neighbors._ball_tree import kernel_norm
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
import joblib
# XXX Duplicated in test_neighbors_tree, test_kde
def compute_kernel_slow(Y, X, kernel, h):
    """Evaluate a kernel density at the rows of ``Y`` from all pairwise distances.

    ``X`` holds the training points, ``kernel`` is the kernel name and ``h``
    the bandwidth. An unknown kernel name raises ``ValueError``.
    """
    diff = Y[:, None, :] - X
    d = np.sqrt((diff ** 2).sum(-1))
    norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]

    # Per-pair kernel contribution; compact kernels are masked with d < h.
    if kernel == 'gaussian':
        contrib = np.exp(-0.5 * (d * d) / (h * h))
    elif kernel == 'tophat':
        contrib = d < h
    elif kernel == 'epanechnikov':
        contrib = (1.0 - (d * d) / (h * h)) * (d < h)
    elif kernel == 'exponential':
        contrib = np.exp(-d / h)
    elif kernel == 'linear':
        contrib = (1 - d / h) * (d < h)
    elif kernel == 'cosine':
        contrib = np.cos(0.5 * np.pi * d / h) * (d < h)
    else:
        raise ValueError('kernel not recognized')

    return norm * contrib.sum(-1)
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true,
                  breadth_first=True):
    """Fit a KernelDensity on ``X`` and compare its output on ``Y``.

    Parameters
    ----------
    kernel, bandwidth, atol, rtol, breadth_first
        Forwarded to ``KernelDensity``. ``breadth_first`` defaults to the
        estimator's default so existing callers are unaffected.
    X, Y : arrays
        Training points and query points.
    dens_true : array
        Reference densities at ``Y``.
    """
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol, breadth_first=breadth_first)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    # score() returns the total log-likelihood, i.e. log of the product.
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))


@pytest.mark.parametrize(
    'kernel',
    ['gaussian', 'tophat', 'epanechnikov',
     'exponential', 'linear', 'cosine'])
@pytest.mark.parametrize('bandwidth', [0.01, 0.1, 1])
def test_kernel_density(kernel, bandwidth):
    """Compare KernelDensity with the slow reference for several tolerances."""
    n_samples, n_features = (100, 3)

    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_features)

    dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)

    for rtol in [0, 1E-5]:
        for atol in [1E-6, 1E-2]:
            for breadth_first in (True, False):
                # breadth_first was previously looped over but never used,
                # so both traversal strategies are now really exercised.
                check_results(kernel, bandwidth, atol, rtol,
                              X, Y, dens_true,
                              breadth_first=breadth_first)
def test_kernel_density_sampling(n_samples=100, n_features=3):
    """Check ``KernelDensity.sample`` for supported and unsupported kernels.

    Samples drawn from gaussian and tophat kernels must lie close to the
    training data; the other kernels must raise ``NotImplementedError``.
    """
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw as many samples as there are training points
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        # Seeded draw so a failure is reproducible.
        samp = kde.sample(n_samples, random_state=rng)
        assert X.shape == samp.shape

        # check that samples are in the right range: measure how far each
        # *sample* is from its nearest training point. (Querying X against
        # itself always gives distance 0 and checks nothing.)
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, _ = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert kde.sample().shape == (1, 1)
@pytest.mark.parametrize('algorithm', ['auto', 'ball_tree', 'kd_tree'])
@pytest.mark.parametrize('metric',
                         ['euclidean', 'minkowski', 'manhattan',
                          'chebyshev', 'haversine'])
def test_kde_algorithm_metric_choice(algorithm, metric):
    """Smoke test for every algorithm/metric combination."""
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)    # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    unsupported = (algorithm == 'kd_tree'
                   and metric not in KDTree.valid_metrics)
    if unsupported:
        # A metric the KD-tree cannot handle is rejected at construction.
        assert_raises(ValueError, KernelDensity,
                      algorithm=algorithm, metric=metric)
        return

    kde = KernelDensity(algorithm=algorithm, metric=metric)
    kde.fit(X)
    log_density = kde.score_samples(Y)
    assert log_density.shape == Y.shape[:1]
def test_kde_score(n_samples=100, n_features=3):
    """Placeholder: the score test is not implemented yet."""
    # FIXME
    # rng = np.random.RandomState(0)
    # X = rng.random_sample((n_samples, n_features))
    # Y = rng.random_sample((n_samples, n_features))
def test_kde_badargs():
    """Invalid constructor arguments and sample weights raise ValueError."""
    invalid_constructor_kwargs = (
        {'algorithm': 'blah'},
        {'bandwidth': 0},
        {'kernel': 'blah'},
        {'metric': 'blah'},
        {'algorithm': 'kd_tree', 'metric': 'blah'},
    )
    for bad_kwargs in invalid_constructor_kwargs:
        assert_raises(ValueError, KernelDensity, **bad_kwargs)

    kde = KernelDensity()
    # sample_weight with the wrong shape
    assert_raises(ValueError, kde.fit, np.random.random((200, 10)),
                  sample_weight=np.random.random((200, 10)))
    # sample_weight with negative values
    assert_raises(ValueError, kde.fit, np.random.random((200, 10)),
                  sample_weight=-np.random.random(200))
def test_kde_pipeline_gridsearch():
    """KernelDensity works inside a pipeline driven by a grid search."""
    # test that kde plays nice in pipelines and grid-searches
    centers = [[0, 1], [1, 0], [0, 0]]
    X, _ = make_blobs(cluster_std=.1, random_state=1, centers=centers)

    scaler = StandardScaler(with_mean=False, with_std=False)
    pipeline = make_pipeline(scaler, KernelDensity(kernel="gaussian"))
    param_grid = {'kerneldensity__bandwidth': [0.001, 0.01, 0.1, 1, 10]}

    search = GridSearchCV(pipeline, param_grid=param_grid)
    search.fit(X)
    assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_kde_sample_weights():
    """Check the effect of ``sample_weight`` on scores and drawn samples."""
    n_samples = 400
    size_test = 20
    weights_neutral = np.full(n_samples, 3.)
    algorithms = ['auto', 'ball_tree', 'kd_tree']
    metric_names = ['euclidean', 'minkowski', 'manhattan', 'chebyshev']

    for d in [1, 2, 10]:
        rng = np.random.RandomState(0)
        X = rng.rand(n_samples, d)
        weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
        # Repeating each row ``weights[i]`` times is the unweighted
        # equivalent of fitting with integer weights.
        X_repetitions = np.repeat(X, weights, axis=0)
        n_samples_test = size_test // d
        test_points = rng.rand(n_samples_test, d)

        for algorithm in algorithms:
            for metric in metric_names:
                if (algorithm == 'kd_tree'
                        and metric not in KDTree.valid_metrics):
                    continue

                kde = KernelDensity(algorithm=algorithm, metric=metric)

                # Test that adding a constant sample weight has no effect
                kde.fit(X, sample_weight=weights_neutral)
                scores_const_weight = kde.score_samples(test_points)
                sample_const_weight = kde.sample(random_state=1234)
                kde.fit(X)
                scores_no_weight = kde.score_samples(test_points)
                sample_no_weight = kde.sample(random_state=1234)
                assert_allclose(scores_const_weight, scores_no_weight)
                assert_allclose(sample_const_weight, sample_no_weight)

                # Test equivalence between sampling and (integer) weights
                kde.fit(X, sample_weight=weights)
                scores_weight = kde.score_samples(test_points)
                sample_weight = kde.sample(random_state=1234)
                kde.fit(X_repetitions)
                scores_ref_sampling = kde.score_samples(test_points)
                sample_ref_sampling = kde.sample(random_state=1234)
                assert_allclose(scores_weight, scores_ref_sampling)
                assert_allclose(sample_weight, sample_ref_sampling)

                # Test that sample weights has a non-trivial effect
                largest_gap = np.max(np.abs(scores_no_weight - scores_weight))
                assert largest_gap > 0.001

                # Test invariance with respect to arbitrary scaling
                scale_factor = rng.rand()
                kde.fit(X, sample_weight=(scale_factor * weights))
                scores_scaled_weight = kde.score_samples(test_points)
                assert_allclose(scores_scaled_weight, scores_weight)
def test_sample_weight_invalid():
    """Fitting with a negative sample weight raises a ValueError."""
    # Check sample weighting raises errors.
    estimator = KernelDensity()
    points = np.reshape([1., 2., 3.], (-1, 1))
    bad_weights = [0.1, -0.2, 0.3]

    with pytest.raises(ValueError,
                       match="sample_weight must have positive values"):
        estimator.fit(points, sample_weight=bad_weights)
@pytest.mark.parametrize('sample_weight', [None, [0.1, 0.2, 0.3]])
def test_pickling(tmpdir, sample_weight):
    """Scores must be unchanged after dumping and reloading the estimator."""
    # Make sure that predictions are the same before and after pickling. Used
    # to be a bug because sample_weights wasn't pickled and the resulting tree
    # would miss some info.
    train = np.reshape([1., 2., 3.], (-1, 1))
    queries = np.reshape([1.1, 2.1], (-1, 1))

    kde = KernelDensity()
    kde.fit(train, sample_weight=sample_weight)
    scores_before = kde.score_samples(queries)

    dump_path = str(tmpdir.join('dump.pkl'))
    joblib.dump(kde, dump_path)
    reloaded = joblib.load(dump_path)
    scores_after = reloaded.score_samples(queries)

    assert_allclose(scores_before, scores_after)
@pytest.mark.parametrize('method', ['score_samples', 'sample'])
def test_check_is_fitted(method):
    """Calling ``method`` on an unfitted KernelDensity raises NotFittedError."""
    # Check that predict raises an exception in an unfitted estimator.
    # Unfitted estimators should raise a NotFittedError.
    X = np.random.RandomState(0).randn(10, 2)
    unfitted = KernelDensity()

    bound_method = getattr(unfitted, method)
    with pytest.raises(NotFittedError):
        bound_method(X)

View file

@ -0,0 +1,232 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
# License: BSD 3 clause
from math import sqrt
import numpy as np
from sklearn import neighbors
import pytest
from numpy.testing import assert_array_equal
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import assert_raises
from sklearn.utils._testing import assert_raises_regex
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import check_outlier_corruption
from sklearn.datasets import load_iris
# Load the iris dataset and shuffle samples and targets with one shared,
# seeded permutation so they stay aligned.
rng = check_random_state(0)
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = iris.data[perm], iris.target[perm]
def test_lof():
    """LOF ranks and labels the two obvious outliers of a toy data set."""
    # Toy sample (the last two samples are outliers):
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]]
    expected_labels = 6 * [1] + 2 * [-1]

    # Test LocalOutlierFactor:
    lof = neighbors.LocalOutlierFactor(n_neighbors=5)
    scores = lof.fit(X).negative_outlier_factor_
    assert_array_equal(lof._fit_X, X)

    # Assert largest outlier score is smaller than smallest inlier score:
    inlier_scores, outlier_scores = scores[:-2], scores[-2:]
    assert np.min(inlier_scores) > np.max(outlier_scores)

    # Assert predict() works:
    lof = neighbors.LocalOutlierFactor(contamination=0.25,
                                       n_neighbors=5).fit(X)
    assert_array_equal(lof._predict(), expected_labels)
    assert_array_equal(lof.fit_predict(X), expected_labels)
def test_lof_performance():
    """Novelty-mode LOF separates uniform outliers from a tight cluster."""
    # Generate train/test data
    rng = check_random_state(2)
    inliers = 0.3 * rng.randn(120, 2)
    X_train, X_holdout = inliers[:100], inliers[100:]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X_holdout, X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model for novelty detection
    detector = neighbors.LocalOutlierFactor(novelty=True).fit(X_train)

    # predict scores (the lower, the more normal)
    abnormality = -detector.decision_function(X_test)

    # check that roc_auc is good
    assert roc_auc_score(y_test, abnormality) > .99
def test_lof_values():
    """LOF scores on a three-point toy set match hand-computed values."""
    # toy samples:
    X_train = [[1, 1], [1, 2], [2, 1]]
    with_contamination = neighbors.LocalOutlierFactor(
        n_neighbors=2, contamination=0.1, novelty=True).fit(X_train)
    default_contamination = neighbors.LocalOutlierFactor(
        n_neighbors=2, novelty=True).fit(X_train)

    s_0 = 2. * sqrt(2.) / (1. + sqrt(2.))
    s_1 = (1. + sqrt(2)) * (1. / (4. * sqrt(2.)) + 1. / (2. + 2. * sqrt(2)))

    # The contamination setting must not change any of the scores.
    for clf in (with_contamination, default_contamination):
        # check predict()
        assert_array_almost_equal(-clf.negative_outlier_factor_,
                                  [s_0, s_1, s_1])
        # check predict(one sample not in train)
        assert_array_almost_equal(-clf.score_samples([[2., 2.]]), [s_0])
        # check predict(one sample already in train)
        assert_array_almost_equal(-clf.score_samples([[1., 1.]]), [s_1])
def test_lof_precomputed(random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    dist_train = metrics.pairwise_distances(X, metric='euclidean')
    dist_query = metrics.pairwise_distances(Y, X, metric='euclidean')

    # As a feature matrix (n_samples by n_features)
    lof_features = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    lof_features.fit(X)
    labels_train_features = lof_features._predict()
    labels_query_features = lof_features.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_distances = neighbors.LocalOutlierFactor(
        n_neighbors=3, algorithm='brute', metric='precomputed', novelty=True)
    lof_distances.fit(dist_train)
    labels_train_distances = lof_distances._predict()
    labels_query_distances = lof_distances.predict(dist_query)

    assert_array_almost_equal(labels_train_features, labels_train_distances)
    assert_array_almost_equal(labels_query_features, labels_query_distances)
def test_n_neighbors_attribute():
    """An over-large ``n_neighbors`` is clipped to n_samples - 1 with a warning."""
    X = iris.data
    max_neighbors = X.shape[0] - 1

    lof = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
    assert lof.n_neighbors_ == max_neighbors

    lof = neighbors.LocalOutlierFactor(n_neighbors=500)
    assert_warns_message(UserWarning,
                         "n_neighbors will be set to (n_samples - 1)",
                         lof.fit, X)
    assert lof.n_neighbors_ == max_neighbors
def test_score_samples():
    """``score_samples`` equals ``decision_function`` shifted by ``offset_``."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    query = [[2., 2.]]
    with_contamination = neighbors.LocalOutlierFactor(
        n_neighbors=2, contamination=0.1, novelty=True).fit(X_train)
    default_contamination = neighbors.LocalOutlierFactor(
        n_neighbors=2, novelty=True).fit(X_train)

    for clf in (with_contamination, default_contamination):
        assert_array_equal(clf.score_samples(query),
                           clf.decision_function(query) + clf.offset_)

    # The raw scores do not depend on the contamination setting.
    assert_array_equal(with_contamination.score_samples(query),
                       default_contamination.score_samples(query))
def test_contamination():
    """Fitting with ``contamination=0.6`` raises a ValueError."""
    samples = [[1, 1], [1, 0]]
    lof = neighbors.LocalOutlierFactor(contamination=0.6)
    assert_raises(ValueError, lof.fit, samples)
def test_novelty_errors():
    """Methods unavailable for the chosen ``novelty`` raise AttributeError."""
    X = iris.data

    # check errors for novelty=False
    outlier_mode = neighbors.LocalOutlierFactor()
    outlier_mode.fit(X)
    # accessing predict, decision_function or score_samples raises
    # AttributeError
    for method in ['predict', 'decision_function', 'score_samples']:
        msg = method + ' is not available when novelty=False'
        assert_raises_regex(AttributeError, msg, getattr, outlier_mode, method)

    # check errors for novelty=True
    novelty_mode = neighbors.LocalOutlierFactor(novelty=True)
    msg = 'fit_predict is not available when novelty=True'
    assert_raises_regex(AttributeError, msg,
                        getattr, novelty_mode, 'fit_predict')
def test_novelty_training_scores():
    """Training scores are identical whether or not ``novelty`` is enabled."""
    # check that the scores of the training samples are still accessible
    # when novelty=True through the negative_outlier_factor_ attribute
    X = iris.data

    # fit with novelty=False
    outlier_mode = neighbors.LocalOutlierFactor()
    outlier_mode.fit(X)

    # fit with novelty=True
    novelty_mode = neighbors.LocalOutlierFactor(novelty=True)
    novelty_mode.fit(X)

    assert_array_almost_equal(outlier_mode.negative_outlier_factor_,
                              novelty_mode.negative_outlier_factor_)
def test_hasattr_prediction():
    """``hasattr`` reflects which prediction methods each mode exposes."""
    # check availability of prediction methods depending on novelty value.
    X = [[1, 1], [1, 2], [2, 1]]
    novelty_only = ('predict', 'decision_function', 'score_samples')

    # when novelty=True
    clf = neighbors.LocalOutlierFactor(novelty=True)
    clf.fit(X)
    for name in novelty_only:
        assert hasattr(clf, name)
    assert not hasattr(clf, 'fit_predict')

    # when novelty=False
    clf = neighbors.LocalOutlierFactor(novelty=False)
    clf.fit(X)
    assert hasattr(clf, 'fit_predict')
    for name in novelty_only:
        assert not hasattr(clf, name)
def test_novelty_true_common_tests():
    """Run the common estimator checks on LOF configured with novelty=True."""
    # the common tests are run for the default LOF (novelty=False).
    # here we run these common tests for LOF when novelty=True
    novelty_lof = neighbors.LocalOutlierFactor(novelty=True)
    check_estimator(novelty_lof)
@pytest.mark.parametrize('expected_outliers', [30, 53])
def test_predicted_outlier_number(expected_outliers):
    """The number of flagged samples follows the contamination level."""
    # the number of predicted outliers should be equal to the number of
    # expected outliers unless there are ties in the abnormality scores.
    X = iris.data
    contamination = float(expected_outliers) / X.shape[0]

    lof = neighbors.LocalOutlierFactor(contamination=contamination)
    labels = lof.fit_predict(X)

    num_outliers = np.sum(labels != 1)
    if num_outliers == expected_outliers:
        return
    # A mismatch is passed to the helper together with the raw scores.
    check_outlier_corruption(num_outliers, expected_outliers,
                             lof.negative_outlier_factor_)

View file

@ -0,0 +1,534 @@
# coding: utf-8
"""
Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca)
"""
# Authors: William de Vazelhes <wdevazelhes@gmail.com>
# John Chiotellis <ioannis.chiotellis@in.tum.de>
# License: BSD 3 clause
import pytest
import re
import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal
from scipy.optimize import check_grad
from sklearn import clone
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import check_random_state
from sklearn.utils._testing import (assert_raises,
assert_raise_message, assert_warns_message)
from sklearn.datasets import load_iris, make_classification, make_blobs
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.metrics import pairwise_distances
rng = check_random_state(0)

# load and shuffle iris dataset: one seeded permutation is applied to both
# the samples and the targets so they stay aligned.
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris_data, iris_target = iris.data[perm], iris.target[perm]

# Machine epsilon for Python floats.
EPS = np.finfo(float).eps
def test_simple_example():
    """Test on a simple example.

    Puts four points in the input space where the opposite labels points are
    next to each other. After transform the samples from the same class
    should be next to each other.
    """
    X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    y = np.array([1, 0, 1, 0])

    nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
                                         random_state=42)
    nca.fit(X, y)
    embedded = nca.transform(X)

    # Column 1 of the argsort is each point's nearest neighbour other than
    # itself (column 0 is the point itself at distance 0).
    nearest_other = pairwise_distances(embedded).argsort()[:, 1]
    assert_array_equal(nearest_other, np.array([2, 3, 0, 1]))
def test_toy_example_collapse_points():
    """Test on a toy example of three points that should collapse

    We build a simple example: two points from the same class and a point from
    a different class in the middle of them. On this simple example, the new
    (transformed) points should all collapse into one single point. Indeed, the
    objective is 2/(1 + exp(d/2)), with d the euclidean distance between the
    two samples from the same class. This is maximized for d=0 (because d>=0),
    with an objective equal to 1 (loss=-1.).
    """
    rng = np.random.RandomState(42)
    input_dim = 5
    two_points = rng.randn(2, input_dim)
    X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]])
    y = [0, 0, 1]

    class LossStorer:
        """Callback holder that records the loss of the latest iteration."""

        def __init__(self, X, y):
            self.loss = np.inf  # initialize the loss to very high
            # Initialize a fake NCA and variables needed to compute the loss:
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            self.X, y, _ = self.fake_nca._validate_params(X, y)
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the loss function"""
            self.loss, _ = self.fake_nca._loss_grad_lbfgs(transformation,
                                                          self.X,
                                                          self.same_class_mask,
                                                          -1.0)

    loss_storer = LossStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(random_state=42,
                                         callback=loss_storer.callback)
    X_t = nca.fit_transform(X, y)
    # (A leftover debugging print of X_t was removed here.)

    # test that points are collapsed into one point
    assert_array_almost_equal(X_t - X_t[0], 0.)
    assert abs(loss_storer.loss + 1) < 1e-10
def test_finite_differences():
    """Test gradient of loss function

    Assert that the gradient is almost equal to its finite differences
    approximation.
    """
    # Initialize the transformation `M`, as well as `X` and `y` and `NCA`
    rng = np.random.RandomState(42)
    # Seed the data set too: an unseeded make_classification() made the
    # checked data differ from run to run.
    X, y = make_classification(random_state=0)
    M = rng.randn(rng.randint(1, X.shape[1] + 1),
                  X.shape[1])
    nca = NeighborhoodComponentsAnalysis()
    nca.n_iter_ = 0
    mask = y[:, np.newaxis] == y[np.newaxis, :]

    def fun(M):
        # Loss value only.
        return nca._loss_grad_lbfgs(M, X, mask)[0]

    def grad(M):
        # Analytical gradient only.
        return nca._loss_grad_lbfgs(M, X, mask)[1]

    # compute relative error
    rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M))
    np.testing.assert_almost_equal(rel_diff, 0., decimal=5)
def test_params_validation():
    """Invalid constructor parameters are rejected when ``fit`` is called."""
    # Test that invalid parameters raise value error
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    NCA = NeighborhoodComponentsAnalysis
    rng = np.random.RandomState(42)

    # TypeError
    wrongly_typed = (
        {'max_iter': '21'},
        {'verbose': 'true'},
        {'tol': '1'},
        {'n_components': 'invalid'},
        {'warm_start': 1},
    )
    for bad_params in wrongly_typed:
        assert_raises(TypeError, NCA(**bad_params).fit, X, y)

    # ValueError
    init_msg = ("`init` must be 'auto', 'pca', 'lda', 'identity', "
                "'random' or a numpy array of shape "
                "(n_components, n_features).")
    assert_raise_message(ValueError, init_msg, NCA(init=1).fit, X, y)

    max_iter_msg = '`max_iter`= -1, must be >= 1.'
    assert_raise_message(ValueError, max_iter_msg,
                         NCA(max_iter=-1).fit, X, y)

    # An init matrix with more rows (outputs) than columns (inputs).
    init = rng.rand(5, 3)
    init_shape_msg = ('The output dimensionality ({}) of the given linear '
                      'transformation `init` cannot be greater than its '
                      'input dimensionality ({}).'
                      .format(init.shape[0], init.shape[1]))
    assert_raise_message(ValueError, init_shape_msg,
                         NCA(init=init).fit, X, y)

    n_components = 10
    n_components_msg = ('The preferred dimensionality of the '
                        'projected space `n_components` ({}) cannot '
                        'be greater than the given data '
                        'dimensionality ({})!'
                        .format(n_components, X.shape[1]))
    assert_raise_message(ValueError, n_components_msg,
                         NCA(n_components=n_components).fit, X, y)
def test_transformation_dimensions():
    """The shape of a user-supplied ``init`` matrix is validated by ``fit``."""
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]

    # Fail if transformation input dimension does not match inputs dimensions
    too_few_columns = np.array([[1, 2], [3, 4]])
    nca = NeighborhoodComponentsAnalysis(init=too_few_columns)
    assert_raises(ValueError, nca.fit, X, y)

    # Fail if transformation output dimension is larger than
    # transformation input dimension
    # len(transformation) > len(transformation[0])
    more_rows_than_columns = np.array([[1, 2], [3, 4], [5, 6]])
    nca = NeighborhoodComponentsAnalysis(init=more_rows_than_columns)
    assert_raises(ValueError, nca.fit, X, y)

    # Pass otherwise
    square_init = np.arange(9).reshape(3, 3)
    NeighborhoodComponentsAnalysis(init=square_init).fit(X, y)
def test_n_components():
    """Check how ``n_components`` is validated against ``init`` and ``X``."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    n_features = X.shape[1]

    init = rng.rand(n_features - 1, 3)

    # n_components = X.shape[1] != transformation.shape[0]
    n_components = n_features
    mismatch_msg = ('The preferred dimensionality of the '
                    'projected space `n_components` ({}) does not match '
                    'the output dimensionality of the given '
                    'linear transformation `init` ({})!'
                    .format(n_components, init.shape[0]))
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    assert_raise_message(ValueError, mismatch_msg, nca.fit, X, y)

    # n_components > X.shape[1]
    n_components = n_features + 2
    too_large_msg = ('The preferred dimensionality of the '
                     'projected space `n_components` ({}) cannot '
                     'be greater than the given data '
                     'dimensionality ({})!'
                     .format(n_components, n_features))
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    assert_raise_message(ValueError, too_large_msg, nca.fit, X, y)

    # n_components < X.shape[1]: fitting is expected to succeed.
    NeighborhoodComponentsAnalysis(n_components=2, init='identity').fit(X, y)
def test_init_transformation():
    """Check the accepted and rejected values of the ``init`` parameter."""
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    n_features = X.shape[1]

    # Every named initialization is fitted in turn: identity (start
    # learning from scratch), random, auto, PCA and LDA.
    for init_name in ('identity', 'random', 'auto', 'pca', 'lda'):
        NeighborhoodComponentsAnalysis(init=init_name).fit(X, y)

    # A square array matching the number of features is accepted.
    init = rng.rand(n_features, n_features)
    NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(n_features, n_features + 1)
    expected = ('The input dimensionality ({}) of the given '
                'linear transformation `init` must match the '
                'dimensionality of the given inputs `X` ({}).'
                .format(init.shape[1], n_features))
    nca = NeighborhoodComponentsAnalysis(init=init)
    assert_raise_message(ValueError, expected, nca.fit, X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(n_features + 1, n_features)
    expected = ('The output dimensionality ({}) of the given '
                'linear transformation `init` cannot be '
                'greater than its input dimensionality ({}).'
                .format(init.shape[0], init.shape[1]))
    nca = NeighborhoodComponentsAnalysis(init=init)
    assert_raise_message(ValueError, expected, nca.fit, X, y)

    # init.shape[0] must match n_components
    init = rng.rand(n_features, n_features)
    n_components = n_features - 2
    expected = ('The preferred dimensionality of the '
                'projected space `n_components` ({}) does not match '
                'the output dimensionality of the given '
                'linear transformation `init` ({})!'
                .format(n_components, init.shape[0]))
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    assert_raise_message(ValueError, expected, nca.fit, X, y)
@pytest.mark.parametrize('n_samples', [3, 5, 7, 11])
@pytest.mark.parametrize('n_features', [3, 5, 7, 11])
@pytest.mark.parametrize('n_classes', [5, 7, 11])
@pytest.mark.parametrize('n_components', [3, 5, 7, 11])
def test_auto_init(n_samples, n_features, n_classes, n_components):
    """Check which initialization ``init='auto'`` resolves to.

    For every ordering of n_samples, n_features, n_classes and n_components
    the components learned with 'auto' are compared with those learned with
    the initialization it is expected to select.
    """
    rng = np.random.RandomState(42)
    nca_base = NeighborhoodComponentsAnalysis(init='auto',
                                              n_components=n_components,
                                              max_iter=1,
                                              random_state=rng)

    if n_classes >= n_samples:
        # n_classes > n_samples is impossible, and n_classes == n_samples
        # throws an error from lda but is an absurd case
        return

    X = rng.randn(n_samples, n_features)
    y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]

    if n_components > n_features:
        # this would return a ValueError, which is already tested in
        # test_params_validation
        return

    nca = clone(nca_base)
    nca.fit(X, y)

    # Pick the explicit initialization that 'auto' is expected to match.
    if n_components <= min(n_classes - 1, n_features):
        expected_init = 'lda'
    elif n_components < min(n_features, n_samples):
        expected_init = 'pca'
    else:
        expected_init = 'identity'

    nca_other = clone(nca_base).set_params(init=expected_init)
    nca_other.fit(X, y)
    assert_array_almost_equal(nca.components_, nca_other.components_)
def test_warm_start_validation():
    """Check that a warm-started refit rejects data with fewer features."""
    X, y = make_classification(n_samples=30, n_features=5, n_classes=4,
                               n_redundant=0, n_informative=5, random_state=0)

    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    # Second data set: same number of samples, one feature less.
    X_less_features, y = make_classification(n_samples=30, n_features=4,
                                             n_classes=4, n_redundant=0,
                                             n_informative=4, random_state=0)
    expected = ('The new inputs dimensionality ({}) does not '
                'match the input dimensionality of the '
                'previously learned transformation ({}).'
                .format(X_less_features.shape[1],
                        nca.components_.shape[1]))
    assert_raise_message(ValueError, expected, nca.fit, X_less_features, y)
def test_warm_start_effectiveness():
    """Compare a one-iteration refit with and without warm starting.

    A 1-iteration second fit on same data should give almost same result
    with warm starting, and quite different result without warm starting.
    """
    change = {}
    # Warm-started estimator first, then the cold-started one.
    for warm_start in (True, False):
        nca = NeighborhoodComponentsAnalysis(warm_start=warm_start,
                                             random_state=0)
        nca.fit(iris_data, iris_target)
        components_before = nca.components_

        # Refit for a single iteration only.
        nca.max_iter = 1
        nca.fit(iris_data, iris_target)
        components_after = nca.components_

        change[warm_start] = np.sum(np.abs(components_after -
                                           components_before))

    diff_warm = change[True]
    diff_cold = change[False]
    assert diff_warm < 3.0, ("Transformer changed significantly after one "
                             "iteration even though it was warm-started.")
    assert diff_cold > diff_warm, ("Cold-started transformer changed less "
                                   "significantly than warm-started "
                                   "transformer after one iteration.")
@pytest.mark.parametrize('init_name', ['pca', 'lda', 'identity', 'random',
                                       'precomputed'])
def test_verbose(init_name, capsys):
    """Check the text printed by ``fit`` when ``verbose=1``.

    Every initialization is exercised except 'auto', because 'auto' will
    call one of the others.

    Parameters
    ----------
    init_name : str
        Name of the initialization; 'precomputed' stands for a random
        square array passed as ``init``.
    capsys : pytest fixture
        Used to capture what ``fit`` writes to stdout.
    """
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    regexp_init = r'... done in \ *\d+\.\d{2}s'
    msgs = {'pca': "Finding principal components" + regexp_init,
            'lda': "Finding most discriminative components" + regexp_init}
    if init_name == 'precomputed':
        init = rng.randn(X.shape[1], X.shape[1])
    else:
        init = init_name
    nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
    nca.fit(X, y)
    out, _ = capsys.readouterr()

    # check output
    lines = re.split('\n+', out)
    # if pca or lda init, an additional line is printed, so we test
    # it and remove it to test the rest equally among initializations
    if init_name in ['pca', 'lda']:
        assert re.match(msgs[init_name], lines[0])
        lines = lines[1:]
    assert lines[0] == '[NeighborhoodComponentsAnalysis]'
    header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value',
                                           'Time(s)')
    assert lines[1] == '[NeighborhoodComponentsAnalysis] {}'.format(header)
    assert lines[2] == ('[NeighborhoodComponentsAnalysis] {}'
                        .format('-' * len(header)))
    for line in lines[3:-2]:
        # The following regex will match for instance:
        # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01'
        # The exponent sign is matched with [+-]; the former class [+|-]
        # also accepted a literal '|' character.
        assert re.match(r'\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e'
                        r'[+-]\d+\ *\d+\.\d{2}', line)
    assert re.match(r'\[NeighborhoodComponentsAnalysis\] Training took\ *'
                    r'\d+\.\d{2}s\.', lines[-2])
    # The output ends with a newline, so the last split element is empty.
    assert lines[-1] == ''
def test_no_verbose(capsys):
    """Check that ``fit`` prints nothing by default (verbose=0)."""
    NeighborhoodComponentsAnalysis().fit(iris_data, iris_target)
    captured_out, _ = capsys.readouterr()
    # check output
    assert captured_out == ''
def test_singleton_class():
    """Check that NCA can be fitted when classes contain a single sample.

    Three situations are exercised in turn: one singleton class, a single
    non-singleton class, and only singleton classes. In the last case, with
    ``init='identity'``, the transformed data must equal the input.
    """
    X = iris_data
    # Work on a copy: the labels are rewritten in place below, and
    # `iris_target` is module-level data that other tests read too.
    y = iris_target.copy()

    # one singleton class
    singleton_class = 1
    ind_singleton, = np.where(y == singleton_class)
    y[ind_singleton] = 2
    y[ind_singleton[0]] = singleton_class

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # One non-singleton class
    ind_1, = np.where(y == 1)
    ind_2, = np.where(y == 2)
    y[ind_1] = 0
    y[ind_1[0]] = 1
    y[ind_2] = 0
    y[ind_2[0]] = 2

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # Only singleton classes: keep the first sample of each class.
    ind_0, = np.where(y == 0)
    ind_1, = np.where(y == 1)
    ind_2, = np.where(y == 2)
    keep = [ind_0[0], ind_1[0], ind_2[0]]
    # Fancy indexing creates new arrays, so X is rebound, not mutated.
    X = X[keep]
    y = y[keep]

    nca = NeighborhoodComponentsAnalysis(init='identity', max_iter=30)
    nca.fit(X, y)
    assert_array_equal(X, nca.transform(X))
def test_one_class():
    """Check that fitting on a single class leaves the data unchanged.

    Starting from the identity with as many components as features, the
    transformed data is expected to equal the input.
    """
    X = iris_data[iris_target == 0]
    y = iris_target[iris_target == 0]

    nca = NeighborhoodComponentsAnalysis(init='identity',
                                         n_components=X.shape[1],
                                         max_iter=30)
    nca.fit(X, y)
    X_transformed = nca.transform(X)
    assert_array_equal(X, X_transformed)
def test_callback(capsys):
    """Check validation and invocation of the ``callback`` parameter."""
    X = iris_data
    y = iris_target

    # A string is not an acceptable callback.
    invalid = NeighborhoodComponentsAnalysis(callback='my_cb')
    assert_raises(ValueError, invalid.fit, X, y)

    max_iter = 10

    def my_cb(transformation, n_iter):
        # The transformation is received flattened.
        assert transformation.shape == (iris_data.shape[1]**2,)
        rem_iter = max_iter - n_iter
        print('{} iterations remaining...'.format(rem_iter))

    # assert that my_cb is called
    nca = NeighborhoodComponentsAnalysis(max_iter=max_iter,
                                         callback=my_cb, verbose=1)
    nca.fit(iris_data, iris_target)
    out, _ = capsys.readouterr()

    # check output
    expected_line = '{} iterations remaining...'.format(max_iter - 1)
    assert expected_line in out
def test_expected_transformation_shape():
    """Test that the transformation has the expected shape."""
    X = iris_data
    y = iris_target

    class TransformationStorer:
        """Remember the last transformation handed to ``callback``."""

        def __init__(self, X, y):
            # Initialize a fake NCA and variables needed to call the loss
            # function:
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            validated = self.fake_nca._validate_params(X, y)
            self.X, y, _ = validated
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the transformation taken as input by
            the optimizer"""
            self.transformation = transformation

    storer = TransformationStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(max_iter=5,
                                         callback=storer.callback)
    nca.fit(X, y)
    assert storer.transformation.size == X.shape[1]**2
def test_convergence_warning():
    """Check that a ConvergenceWarning is issued when ``max_iter`` is 2."""
    nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
    expected = '[{}] NCA did not converge'.format(nca.__class__.__name__)
    assert_warns_message(ConvergenceWarning, expected,
                         nca.fit, iris_data, iris_target)
@pytest.mark.parametrize('param, value', [('n_components', np.int32(3)),
                                          ('max_iter', np.int32(100)),
                                          ('tol', np.float32(0.0001))])
def test_parameters_valid_types(param, value):
    """Check that numpy integer or floating parameters raise no error."""
    params = {param: value}
    nca = NeighborhoodComponentsAnalysis(**params)
    nca.fit(iris_data, iris_target)

View file

@ -0,0 +1,148 @@
"""
Testing for the nearest centroid module.
"""
import numpy as np
from scipy import sparse as sp
from numpy.testing import assert_array_equal
from sklearn.neighbors import NearestCentroid
from sklearn import datasets
from sklearn.utils._testing import assert_raises
# Toy training sample and its labels
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]

# Toy query points and the labels expected for them
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]

# Sparse (CSR) versions of the toy matrices
X_csr = sp.csr_matrix(X)
T_csr = sp.csr_matrix(T)

# Also load the iris dataset and shuffle its samples with a permutation
# drawn from a seeded generator.
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data, iris.target = iris.data[perm], iris.target[perm]
def test_classification_toy():
    """Check classification on a toy dataset, including sparse versions."""
    fit_predict_pairs = [
        # Dense data to fit and to test.
        (X, T),
        # Same test, but with a sparse matrix to fit and test.
        (X_csr, T_csr),
        # Fit with sparse, test with non-sparse
        (X_csr, T),
        # Fit with non-sparse, test with sparse
        (X, T_csr),
        # Fit and predict with non-CSR sparse matrices
        (X_csr.tocoo(), T_csr.tolil()),
    ]
    for train, query in fit_predict_pairs:
        clf = NearestCentroid()
        clf.fit(train, y)
        assert_array_equal(clf.predict(query), true_result)
def test_precomputed():
    """Check that fitting with ``metric='precomputed'`` raises ValueError."""
    precomputed_clf = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError):
        precomputed_clf.fit(X, y)
def test_iris():
    """Check consistency on dataset iris."""
    for metric in ('euclidean', 'cosine'):
        clf = NearestCentroid(metric=metric)
        clf = clf.fit(iris.data, iris.target)
        predictions = clf.predict(iris.data)
        # Fraction of training samples predicted correctly.
        score = np.mean(predictions == iris.target)
        assert score > 0.9, "Failed with score = " + str(score)
def test_iris_shrinkage():
    """Check consistency on dataset iris, when using shrinkage."""
    thresholds = [None, 0.1, 0.5]
    for metric in ('euclidean', 'cosine'):
        for shrink_threshold in thresholds:
            params = {'metric': metric,
                      'shrink_threshold': shrink_threshold}
            clf = NearestCentroid(**params)
            clf = clf.fit(iris.data, iris.target)
            predictions = clf.predict(iris.data)
            score = np.mean(predictions == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
def test_pickle():
    """Check that a fitted classifier scores the same after a pickle trip."""
    import pickle

    # classification
    fitted = NearestCentroid()
    fitted.fit(iris.data, iris.target)
    score = fitted.score(iris.data, iris.target)

    restored = pickle.loads(pickle.dumps(fitted))

    assert type(restored) == fitted.__class__
    score2 = restored.score(iris.data, iris.target)
    assert_array_equal(score, score2,
                       "Failed to generate same score"
                       " after pickling (classification).")
def test_shrinkage_correct():
    """Ensure that the shrinking is correct.

    The expected result is calculated by R (pamr), which is implemented by
    the author of the original paper. (One needs to modify the code to
    output the new centroid in pamr.predict.)
    """
    samples = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    labels = np.array([1, 1, 2, 2, 2])
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])

    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(samples, labels)
    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
def test_shrinkage_threshold_decoded_y():
    """Check that centroids do not depend on how the labels are encoded.

    The classifier is fitted once with the label -1 replaced by 0 and once
    with the original labels; both fits must give the same centroids.
    """
    clf = NearestCentroid(shrink_threshold=0.01)

    # np.asarray on the list `y` yields a new array, so `y` is untouched.
    relabelled = np.asarray(y)
    relabelled[relabelled == -1] = 0
    clf.fit(X, relabelled)
    centroid_encoded = clf.centroids_

    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)
def test_predict_translated_data():
    """Test that NearestCentroid gives same results on translated data."""
    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)

    # Predictions on the original data.
    reference = NearestCentroid(shrink_threshold=0.1)
    reference.fit(X, y)
    y_init = reference.predict(X)

    # Predictions after adding the same offset vector to every sample.
    X_noise = X + noise
    translated = NearestCentroid(shrink_threshold=0.1)
    translated.fit(X_noise, y)
    y_translate = translated.predict(X_noise)

    assert_array_equal(y_init, y_translate)
def test_manhattan_metric():
    """Test the manhattan metric on dense and sparse input."""
    clf = NearestCentroid(metric='manhattan')

    clf.fit(X, y)
    dense_centroid = clf.centroids_

    clf.fit(X_csr, y)
    sparse_centroid = clf.centroids_

    assert_array_equal(sparse_centroid, dense_centroid)
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,221 @@
"""
This is testing the equivalence between some estimators with internal nearest
neighbors computations, and the corresponding pipeline versions with
KNeighborsTransformer or RadiusNeighborsTransformer to precompute the
neighbors.
"""
import numpy as np
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.datasets import make_blobs
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.neighbors import KNeighborsTransformer
from sklearn.neighbors import RadiusNeighborsTransformer
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.manifold import SpectralEmbedding
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
def test_spectral_clustering():
    """Test chaining KNeighborsTransformer and SpectralClustering."""
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # compare the chained version and the compact version
    knn_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                      mode='connectivity')
    precomputed_clustering = SpectralClustering(n_neighbors=n_neighbors,
                                                affinity='precomputed',
                                                random_state=42)
    est_chain = make_pipeline(knn_graph, precomputed_clustering)
    est_compact = SpectralClustering(n_neighbors=n_neighbors,
                                     affinity='nearest_neighbors',
                                     random_state=42)

    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
def test_spectral_embedding():
    """Test chaining KNeighborsTransformer and SpectralEmbedding."""
    n_neighbors = 5
    n_samples = 1000
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=1., random_state=42)

    # compare the chained version and the compact version
    knn_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                      mode='connectivity')
    precomputed_embedding = SpectralEmbedding(n_neighbors=n_neighbors,
                                              affinity='precomputed',
                                              random_state=42)
    est_chain = make_pipeline(knn_graph, precomputed_embedding)
    est_compact = SpectralEmbedding(n_neighbors=n_neighbors,
                                    affinity='nearest_neighbors',
                                    random_state=42)

    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    assert_array_almost_equal(St_chain, St_compact)
def test_dbscan():
    """Test chaining RadiusNeighborsTransformer and DBSCAN."""
    radius = 0.3
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)

    # compare the chained version and the compact version
    radius_graph = RadiusNeighborsTransformer(radius=radius, mode='distance')
    precomputed_dbscan = DBSCAN(metric='precomputed', eps=radius)
    est_chain = make_pipeline(radius_graph, precomputed_dbscan)
    est_compact = DBSCAN(eps=radius)

    labels_chain = est_chain.fit_predict(X)
    labels_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
def test_isomap():
    """Test chaining KNeighborsTransformer and Isomap.

    The chained Isomap uses ``metric='precomputed'``; both versions are
    compared on the training data and on a second data set.
    """
    algorithm = 'auto'
    n_neighbors = 10

    X, _ = make_blobs(random_state=0)
    X2, _ = make_blobs(random_state=1)

    # compare the chained version and the compact version
    knn_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                      algorithm=algorithm,
                                      mode='distance')
    est_chain = make_pipeline(
        knn_graph,
        Isomap(n_neighbors=n_neighbors, metric='precomputed'))
    est_compact = Isomap(n_neighbors=n_neighbors,
                         neighbors_algorithm=algorithm)

    # Embedding of the training data.
    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    # Embedding of data not seen during fit.
    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_array_almost_equal(Xt_chain, Xt_compact)
def test_tsne():
    """Test chaining KNeighborsTransformer and TSNE."""
    n_iter = 250
    perplexity = 5
    n_neighbors = int(3. * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    # Settings shared by the chained and the compact TSNE.
    shared = dict(perplexity=perplexity, method="barnes_hut",
                  random_state=42, n_iter=n_iter)

    for metric in ['minkowski', 'sqeuclidean']:
        # compare the chained version and the compact version
        knn_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                          mode='distance', metric=metric)
        est_chain = make_pipeline(knn_graph,
                                  TSNE(metric='precomputed', **shared))
        est_compact = TSNE(metric=metric, **shared)

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        assert_array_almost_equal(Xt_chain, Xt_compact)
def test_lof_novelty_false():
    """Test chaining KNeighborsTransformer and LocalOutlierFactor.

    With ``novelty=False`` both versions are compared through
    ``fit_predict`` on the same data.
    """
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # compare the chained version and the compact version
    knn_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                      mode='distance')
    precomputed_lof = LocalOutlierFactor(metric='precomputed',
                                         n_neighbors=n_neighbors,
                                         novelty=False,
                                         contamination="auto")
    est_chain = make_pipeline(knn_graph, precomputed_lof)
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=False,
                                     contamination="auto")

    pred_chain = est_chain.fit_predict(X)
    pred_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(pred_chain, pred_compact)
def test_lof_novelty_true():
    """Test chaining KNeighborsTransformer and LocalOutlierFactor.

    With ``novelty=True`` both versions are fitted on one data set and
    compared through ``predict`` on another.
    """
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X1 = rng.randn(40, 2)
    X2 = rng.randn(40, 2)

    # compare the chained version and the compact version
    knn_graph = KNeighborsTransformer(n_neighbors=n_neighbors,
                                      mode='distance')
    precomputed_lof = LocalOutlierFactor(metric='precomputed',
                                         n_neighbors=n_neighbors,
                                         novelty=True,
                                         contamination="auto")
    est_chain = make_pipeline(knn_graph, precomputed_lof)
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True,
                                     contamination="auto")

    pred_chain = est_chain.fit(X1).predict(X2)
    pred_compact = est_compact.fit(X1).predict(X2)
    assert_array_almost_equal(pred_chain, pred_compact)
def test_kneighbors_regressor():
    """Test chaining neighbors transformers and neighbors regressors.

    Each transformer/regressor pair is compared with the regressor used on
    its own.
    """
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary, to have equivalence between
    # k-neighbors estimator after radius-neighbors transformer, and vice-versa.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance')
    k_trans_factor = KNeighborsTransformer(
        n_neighbors=int(n_neighbors * factor), mode='distance')

    r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance')
    r_trans_factor = RadiusNeighborsTransformer(
        radius=int(radius * factor), mode='distance')

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    test_list = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for trans, reg in test_list:
        # compare the chained version and the compact version
        reg_compact = clone(reg)
        reg_precomp = clone(reg)
        reg_precomp.set_params(metric='precomputed')

        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)

View file

@ -0,0 +1,279 @@
# License: BSD 3 clause
import pickle
import itertools
import numpy as np
import pytest
from sklearn.neighbors import DistanceMetric
from sklearn.neighbors._ball_tree import (
BallTree, kernel_norm, DTYPE, ITYPE,
NeighborsHeap as NeighborsHeapBT,
simultaneous_sort as simultaneous_sort_bt,
nodeheap_sort as nodeheap_sort_bt)
from sklearn.neighbors._kd_tree import (
KDTree, NeighborsHeap as NeighborsHeapKDT,
simultaneous_sort as simultaneous_sort_kdt,
nodeheap_sort as nodeheap_sort_kdt)
from sklearn.utils import check_random_state
from numpy.testing import assert_array_almost_equal, assert_allclose
rng = np.random.RandomState(42)

# Product of a random 3x3 matrix with its own transpose.
V_mahalanobis = rng.rand(3, 3)
V_mahalanobis = V_mahalanobis @ V_mahalanobis.T

DIMENSION = 3

# Metric name -> extra keyword arguments for that metric. The two random
# vectors are drawn in the order the entries appear.
METRICS = {
    'euclidean': {},
    'manhattan': {},
    'minkowski': {'p': 3},
    'chebyshev': {},
    'seuclidean': {'V': rng.random_sample(DIMENSION)},
    'wminkowski': {'p': 3, 'w': rng.random_sample(DIMENSION)},
    'mahalanobis': {'V': V_mahalanobis},
}

KD_TREE_METRICS = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
BALL_TREE_METRICS = [*METRICS]
def dist_func(x1, x2, p):
    """Return the Minkowski distance of order ``p`` between two vectors.

    Parameters
    ----------
    x1, x2 : array-like
        Operands of the subtraction ``x1 - x2``; they must be compatible
        for element-wise arithmetic.
    p : int or float
        Order of the norm.

    Returns
    -------
    float
        ``sum(|x1 - x2| ** p) ** (1 / p)``.
    """
    # The absolute value matters for odd or fractional p: without it,
    # signed differences cancel each other or give a negative base (NaN).
    return np.sum(np.abs(x1 - x2) ** p) ** (1. / p)
def compute_kernel_slow(Y, X, kernel, h):
    """Evaluate a kernel sum at every row of ``Y`` by brute force.

    Parameters
    ----------
    Y, X : ndarray
        Arrays whose rows are compared pairwise with the Euclidean distance.
    kernel : str
        One of 'gaussian', 'tophat', 'epanechnikov', 'exponential',
        'linear' or 'cosine'.
    h : float
        Kernel bandwidth.

    Raises
    ------
    ValueError
        If ``kernel`` is not one of the names above.
    """
    # Euclidean distance between every row of Y and every row of X.
    delta = Y[:, None, :] - X
    d = np.sqrt((delta ** 2).sum(-1))
    norm = kernel_norm(h, X.shape[1], kernel)

    # Un-normalized contribution of every (Y row, X row) pair.
    if kernel == 'gaussian':
        contrib = np.exp(-0.5 * (d * d) / (h * h))
    elif kernel == 'tophat':
        contrib = d < h
    elif kernel == 'epanechnikov':
        contrib = (1.0 - (d * d) / (h * h)) * (d < h)
    elif kernel == 'exponential':
        contrib = np.exp(-d / h)
    elif kernel == 'linear':
        contrib = (1 - d / h) * (d < h)
    elif kernel == 'cosine':
        contrib = np.cos(0.5 * np.pi * d / h) * (d < h)
    else:
        raise ValueError('kernel not recognized')

    return norm * contrib.sum(-1)
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find the ``k`` nearest rows of ``X`` for every row of ``Y``.

    All pairwise distances are computed with ``DistanceMetric`` and sorted.

    Returns
    -------
    dist, ind : ndarray
        Distances to, and indices of, the ``k`` closest rows of ``X``, one
        row per row of ``Y``, in increasing order of distance.
    """
    distance_metric = DistanceMetric.get_metric(metric, **kwargs)
    pairwise = distance_metric.pairwise(Y, X)

    order = np.argsort(pairwise, axis=1)
    ind = order[:, :k]

    # Column of row numbers, broadcast against `ind` to pick the distances.
    row_ids = np.arange(Y.shape[0])[:, None]
    dist = pairwise[row_ids, ind]
    return dist, ind
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
@pytest.mark.parametrize("kernel", ['gaussian', 'tophat', 'epanechnikov',
                                    'exponential', 'linear', 'cosine'])
@pytest.mark.parametrize("h", [0.01, 0.1, 1])
@pytest.mark.parametrize("rtol", [0, 1E-5])
@pytest.mark.parametrize("atol", [1E-6, 1E-2])
@pytest.mark.parametrize("breadth_first", [True, False])
def test_kernel_density(Cls, kernel, h, rtol, atol, breadth_first,
                        n_samples=100, n_features=3):
    """Compare the tree kernel density with the brute-force computation."""
    rng = check_random_state(1)
    shape = (n_samples, n_features)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)

    expected = compute_kernel_slow(Y, X, kernel, h)

    tree = Cls(X, leaf_size=10)
    computed = tree.kernel_density(Y, h, atol=atol, rtol=rtol,
                                   kernel=kernel,
                                   breadth_first=breadth_first)
    # The relative tolerance used for the comparison is at least 1e-7.
    assert_allclose(computed, expected,
                    atol=atol, rtol=max(rtol, 1e-7))
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10):
    """Check the indices returned by ``query_radius`` against brute force."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = Cls(X, leaf_size=5)
    # Distance of every sample to the query point.
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        found = tree.query_radius([query_pt], r + eps)[0]
        expected = np.where(rad <= r + eps)[0]

        # Sort both in place so that they can be compared element-wise.
        found.sort()
        expected.sort()

        assert_array_almost_equal(expected, found)
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
def test_neighbor_tree_query_radius_distance(Cls, n_samples=100,
                                             n_features=10):
    """Check the distances returned by ``query_radius``."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = Cls(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        all_ind, all_dist = tree.query_radius([query_pt], r + eps,
                                              return_distance=True)
        # A single query point was given: keep its results only.
        ind, dist = all_ind[0], all_dist[0]

        # Recompute the distances to the returned neighbors directly.
        expected = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_array_almost_equal(expected, dist)
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
@pytest.mark.parametrize('dualtree', (True, False))
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
    """Check ``two_point_correlation`` against counts from all distances."""
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    tree = Cls(X, leaf_size=10)

    # Number of (Y row, X row) pairs within each radius.
    pairwise = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = []
    for radius in r:
        counts_true.append((pairwise <= radius).sum())

    counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
    assert_array_almost_equal(counts, counts_true)
@pytest.mark.parametrize('NeighborsHeap', [NeighborsHeapBT, NeighborsHeapKDT])
def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10):
    """Check that the heap keeps the ``n_nbrs`` smallest pushed distances."""
    heap = NeighborsHeap(n_pts, n_nbrs)
    rng = check_random_state(0)
    n_pushed = 2 * n_nbrs

    for row in range(n_pts):
        # Push twice as many (distance, index) pairs as the heap can hold.
        d_in = rng.random_sample(n_pushed).astype(DTYPE, copy=False)
        i_in = np.arange(n_pushed, dtype=ITYPE)
        for dist_value, idx_value in zip(d_in, i_in):
            heap.push(row, dist_value, idx_value)

        # Reference: the pushed pairs in increasing order of distance.
        order = np.argsort(d_in)
        d_sorted = d_in[order]
        i_sorted = i_in[order]

        d_heap, i_heap = heap.get_arrays(sort=True)

        assert_array_almost_equal(d_sorted[:n_nbrs], d_heap[row])
        assert_array_almost_equal(i_sorted[:n_nbrs], i_heap[row])
@pytest.mark.parametrize('nodeheap_sort', [nodeheap_sort_bt,
                                           nodeheap_sort_kdt])
def test_node_heap(nodeheap_sort, n_nodes=50):
    """Compare ``nodeheap_sort`` with ``np.argsort`` on random values."""
    rng = check_random_state(0)
    vals = rng.random_sample(n_nodes).astype(DTYPE, copy=False)

    expected_order = np.argsort(vals)
    sorted_vals, order = nodeheap_sort(vals)

    assert_array_almost_equal(expected_order, order)
    assert_array_almost_equal(vals[expected_order], sorted_vals)
@pytest.mark.parametrize('simultaneous_sort', [simultaneous_sort_bt,
                                               simultaneous_sort_kdt])
def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201):
    """Compare ``simultaneous_sort`` with a row-wise numpy argsort."""
    rng = check_random_state(0)
    dist = rng.random_sample((n_rows, n_pts)).astype(DTYPE, copy=False)
    # Every row of `ind` is 0, 1, ..., n_pts - 1.
    ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(ITYPE, copy=False)

    # Copies taken before `dist` and `ind` are handed to the function.
    expected_dist = dist.copy()
    expected_ind = ind.copy()

    # simultaneous sort rows using function
    simultaneous_sort(dist, ind)

    # simultaneous sort rows using numpy
    order = np.argsort(expected_dist, axis=1)
    rows = np.arange(n_rows)[:, None]
    expected_dist = expected_dist[rows, order]
    expected_ind = expected_ind[rows, order]

    assert_array_almost_equal(dist, expected_dist)
    assert_array_almost_equal(ind, expected_ind)
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
def test_gaussian_kde(Cls, n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # The tree and gaussian_kde are given the 1-D data in different layouts.
    train_column = x_in[:, None]
    query_column = x_out[:, None]

    for h in [0.01, 0.1, 1]:
        tree = Cls(train_column)
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        # The tree result is divided by the sample count before comparing.
        dens_tree = tree.kernel_density(query_column, h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_tree, dens_gkde, decimal=3)
@pytest.mark.parametrize(
    'Cls, metric',
    itertools.chain(
        [(KDTree, metric) for metric in KD_TREE_METRICS],
        [(BallTree, metric) for metric in BALL_TREE_METRICS]))
@pytest.mark.parametrize('k', (1, 3, 5))
@pytest.mark.parametrize('dualtree', (True, False))
@pytest.mark.parametrize('breadth_first', (True, False))
def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first):
    """Compare tree k-nearest-neighbor distances with brute force."""
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    metric_params = METRICS[metric]

    tree = Cls(X, leaf_size=1, metric=metric, **metric_params)
    tree_dist, tree_ind = tree.query(Y, k, dualtree=dualtree,
                                     breadth_first=breadth_first)
    brute_dist, brute_ind = brute_force_neighbors(X, Y, k, metric,
                                                  **metric_params)

    # don't check indices here: if there are any duplicate distances,
    # the indices may not match. Distances should not have this problem.
    assert_array_almost_equal(tree_dist, brute_dist)
@pytest.mark.parametrize(
    "Cls, metric",
    [(KDTree, 'euclidean'), (BallTree, 'euclidean'),
     (BallTree, dist_func)])
@pytest.mark.parametrize('protocol', (0, 1, 2))
def test_pickle(Cls, metric, protocol):
    """Check that a tree gives the same query results after pickling.

    Parameters
    ----------
    Cls : type
        Tree class under test.
    metric : str or callable
        Metric given to the tree; a callable receives ``p=2`` as an extra
        keyword argument.
    protocol : int
        Pickle protocol used for serialization.
    """
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    kwargs = {'p': 2} if callable(metric) else {}

    tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs)

    # The results are unpacked as (dist, ind), the order used by
    # test_nn_tree_query in this module.
    dist1, ind1 = tree1.query(X)

    # The pickled bytes are produced just above by this test itself.
    s = pickle.dumps(tree1, protocol=protocol)
    tree2 = pickle.loads(s)

    dist2, ind2 = tree2.query(X)

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)

    assert isinstance(tree2, Cls)

View file

@ -0,0 +1,104 @@
import pickle
import numpy as np
import pytest
from sklearn.neighbors._quad_tree import _QuadTree
from sklearn.utils import check_random_state
def test_quadtree_boundary_computation():
    """Build quad trees from points whose boundaries are not easy to compute.

    Each tree is checked with ``_check_coherence`` after being built.
    """
    Xs = [
        # check a random case
        np.array([[-1, 1], [-4, -1]], dtype=np.float32),
        # check the case where only 0 are inserted
        np.array([[0, 0], [0, 0]], dtype=np.float32),
        # check the case where only negative are inserted
        np.array([[-1, -2], [-4, 0]], dtype=np.float32),
        # check the case where only small numbers are inserted
        np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32),
    ]

    for points in Xs:
        tree = _QuadTree(n_dimensions=2, verbose=0)
        tree.build_tree(points)
        tree._check_coherence()
def test_quadtree_similar_point():
    """Insert a point into a quad tree where a similar point already exists.

    Test will hang if it doesn't complete.
    """
    Xs = [
        # check the case where points are actually different
        np.array([[1, 2], [3, 4]], dtype=np.float32),
        # check the case where points are the same on X axis
        np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32),
        # check the case where points are arbitrarily close on X axis
        np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32),
        # check the case where points are the same on Y axis
        np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32),
        # check the case where points are arbitrarily close on Y axis
        np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32),
        # check the case where points are arbitrarily close on both axes
        np.array([[1.00001, 2.00001], [1.00002, 2.00002]],
                 dtype=np.float32),
        # check the case where points are arbitrarily close on both axes
        # close to machine epsilon - x axis
        np.array([[1, 0.0003817754041], [2, 0.0003817753750]],
                 dtype=np.float32),
        # check the case where points are arbitrarily close on both axes
        # close to machine epsilon - y axis
        np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]],
                 dtype=np.float32),
    ]

    for points in Xs:
        tree = _QuadTree(n_dimensions=2, verbose=0)
        tree.build_tree(points)
        tree._check_coherence()
@pytest.mark.parametrize('n_dimensions', (2, 3))
@pytest.mark.parametrize('protocol', (0, 1, 2))
def test_quad_tree_pickle(n_dimensions, protocol):
    """Check that a pickled quad tree maps every point to the same cell."""
    rng = check_random_state(0)

    X = rng.random_sample((10, n_dimensions))

    tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
    tree.build_tree(X)

    restored = pickle.loads(pickle.dumps(tree, protocol=protocol))

    for point in X:
        cell_original = tree.get_cell(point)
        cell_restored = restored.get_cell(point)
        assert cell_original == cell_restored
@pytest.mark.parametrize('n_dimensions', (2, 3))
def test_qt_insert_duplicate(n_dimensions):
    """Check the leaf sizes when some points are inserted twice."""
    rng = check_random_state(0)

    X = rng.random_sample((10, n_dimensions))
    # The first five points appear a second time at the end.
    Xd = np.r_[X, X[:5]]
    tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
    tree.build_tree(Xd)

    cumulative_size = tree.cumulative_size
    leafs = tree.leafs

    # Assert that the first 5 are indeed duplicated and that the next
    # ones are single point leaf
    for i, point in enumerate(X):
        cell_id = tree.get_cell(point)
        expected_size = 1 + (i < 5)
        assert leafs[cell_id]
        assert cumulative_size[cell_id] == expected_size
def test_summarize():
    """Run the ``test_summarize`` check exposed by ``_QuadTree`` itself."""
    run_check = _QuadTree.test_summarize
    run_check()