# # Author: Damian Eads # Date: April 17, 2008 # # Copyright (C) 2008 Damian Eads # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # 3. The name of the author may not be used to endorse or promote # products derived from this software without specific prior # written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os.path from functools import wraps, partial import numpy as np import warnings from numpy.linalg import norm from numpy.testing import (verbose, assert_, assert_array_equal, assert_equal, assert_almost_equal, assert_allclose, suppress_warnings) import pytest from pytest import raises as assert_raises from scipy.spatial.distance import (squareform, pdist, cdist, num_obs_y, num_obs_dm, is_valid_dm, is_valid_y, _validate_vector, _METRICS_NAMES) # these were missing: chebyshev cityblock kulsinski # jensenshannon, matching and seuclidean are referenced by string name. from scipy.spatial.distance import (braycurtis, canberra, chebyshev, cityblock, correlation, cosine, dice, euclidean, hamming, jaccard, jensenshannon, kulsinski, mahalanobis, matching, minkowski, rogerstanimoto, russellrao, seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule) from scipy.spatial.distance import wminkowski as old_wminkowski _filenames = [ "cdist-X1.txt", "cdist-X2.txt", "iris.txt", "pdist-boolean-inp.txt", "pdist-chebyshev-ml-iris.txt", "pdist-chebyshev-ml.txt", "pdist-cityblock-ml-iris.txt", "pdist-cityblock-ml.txt", "pdist-correlation-ml-iris.txt", "pdist-correlation-ml.txt", "pdist-cosine-ml-iris.txt", "pdist-cosine-ml.txt", "pdist-double-inp.txt", "pdist-euclidean-ml-iris.txt", "pdist-euclidean-ml.txt", "pdist-hamming-ml.txt", "pdist-jaccard-ml.txt", "pdist-jensenshannon-ml-iris.txt", "pdist-jensenshannon-ml.txt", "pdist-minkowski-3.2-ml-iris.txt", "pdist-minkowski-3.2-ml.txt", "pdist-minkowski-5.8-ml-iris.txt", "pdist-seuclidean-ml-iris.txt", "pdist-seuclidean-ml.txt", "pdist-spearman-ml.txt", "random-bool-data.txt", "random-double-data.txt", "random-int-data.txt", "random-uint-data.txt", ] _tdist = np.array([[0, 662, 877, 255, 412, 996], [662, 0, 295, 468, 268, 400], [877, 295, 0, 754, 564, 138], [255, 468, 754, 0, 219, 869], [412, 268, 564, 219, 0, 669], [996, 400, 138, 869, 669, 0]], dtype='double') _ytdist = squareform(_tdist) # A hashmap of expected output arrays for the tests. These arrays # come from a list of text files, which are read prior to testing. # Each test loads inputs and outputs from this dictionary. eo = {} def load_testing_files(): for fn in _filenames: name = fn.replace(".txt", "").replace("-ml", "") fqfn = os.path.join(os.path.dirname(__file__), 'data', fn) fp = open(fqfn) eo[name] = np.loadtxt(fp) fp.close() eo['pdist-boolean-inp'] = np.bool_(eo['pdist-boolean-inp']) eo['random-bool-data'] = np.bool_(eo['random-bool-data']) eo['random-float32-data'] = np.float32(eo['random-double-data']) eo['random-int-data'] = np.int_(eo['random-int-data']) eo['random-uint-data'] = np.uint(eo['random-uint-data']) load_testing_files() def _chk_asarrays(arrays, axis=None): arrays = [np.asanyarray(a) for a in arrays] if axis is None: # np < 1.10 ravel removes subclass from arrays arrays = [np.ravel(a) if a.ndim != 1 else a for a in arrays] axis = 0 arrays = tuple(np.atleast_1d(a) for a in arrays) if axis < 0: if not all(a.ndim == arrays[0].ndim for a in arrays): raise ValueError("array ndim must be the same for neg axis") axis = range(arrays[0].ndim)[axis] return arrays + (axis,) def _chk_weights(arrays, weights=None, axis=None, force_weights=False, simplify_weights=True, pos_only=False, neg_check=False, nan_screen=False, mask_screen=False, ddof=None): chked = _chk_asarrays(arrays, axis=axis) arrays, axis = chked[:-1], chked[-1] simplify_weights = simplify_weights and not force_weights if not force_weights and mask_screen: force_weights = any(np.ma.getmask(a) is not np.ma.nomask for a in arrays) if nan_screen: has_nans = [np.isnan(np.sum(a)) for a in arrays] if any(has_nans): mask_screen = True force_weights = True arrays = tuple(np.ma.masked_invalid(a) if has_nan else a for a, has_nan in zip(arrays, has_nans)) if weights is not None: weights = np.asanyarray(weights) elif force_weights: weights = np.ones(arrays[0].shape[axis]) else: return arrays + (weights, axis) if ddof: weights = _freq_weights(weights) if mask_screen: weights = _weight_masked(arrays, weights, axis) if not all(weights.shape == (a.shape[axis],) for a in arrays): raise ValueError("weights shape must match arrays along axis") if neg_check and (weights < 0).any(): raise ValueError("weights cannot be negative") if pos_only: pos_weights = np.nonzero(weights > 0)[0] if pos_weights.size < weights.size: arrays = tuple(np.take(a, pos_weights, axis=axis) for a in arrays) weights = weights[pos_weights] if simplify_weights and (weights == 1).all(): weights = None return arrays + (weights, axis) def _freq_weights(weights): if weights is None: return weights int_weights = weights.astype(int) if (weights != int_weights).any(): raise ValueError("frequency (integer count-type) weights required %s" % weights) return int_weights def _weight_masked(arrays, weights, axis): if axis is None: axis = 0 weights = np.asanyarray(weights) for a in arrays: axis_mask = np.ma.getmask(a) if axis_mask is np.ma.nomask: continue if a.ndim > 1: not_axes = tuple(i for i in range(a.ndim) if i != axis) axis_mask = axis_mask.any(axis=not_axes) weights *= 1 - axis_mask.astype(int) return weights def within_tol(a, b, tol): return np.abs(a - b).max() < tol def _assert_within_tol(a, b, atol=0, rtol=0, verbose_=False): if verbose_: print(np.abs(a - b).max()) assert_allclose(a, b, rtol=rtol, atol=atol) def _rand_split(arrays, weights, axis, split_per, seed=None): # inverse operation for stats.collapse_weights weights = np.array(weights, dtype=np.float64) # modified inplace; need a copy seeded_rand = np.random.RandomState(seed) def mytake(a, ix, axis): record = np.asanyarray(np.take(a, ix, axis=axis)) return record.reshape([a.shape[i] if i != axis else 1 for i in range(a.ndim)]) n_obs = arrays[0].shape[axis] assert all(a.shape[axis] == n_obs for a in arrays), "data must be aligned on sample axis" for i in range(int(split_per) * n_obs): split_ix = seeded_rand.randint(n_obs + i) prev_w = weights[split_ix] q = seeded_rand.rand() weights[split_ix] = q * prev_w weights = np.append(weights, (1. - q) * prev_w) arrays = [np.append(a, mytake(a, split_ix, axis=axis), axis=axis) for a in arrays] return arrays, weights def _rough_check(a, b, compare_assert=partial(assert_allclose, atol=1e-5), key=lambda x: x, w=None): check_a = key(a) check_b = key(b) try: if np.array(check_a != check_b).any(): # try strict equality for string types compare_assert(check_a, check_b) except AttributeError: # masked array compare_assert(check_a, check_b) except (TypeError, ValueError): # nested data structure for a_i, b_i in zip(check_a, check_b): _rough_check(a_i, b_i, compare_assert=compare_assert) # diff from test_stats: # n_args=2, weight_arg='w', default_axis=None # ma_safe = False, nan_safe = False def _weight_checked(fn, n_args=2, default_axis=None, key=lambda x: x, weight_arg='w', squeeze=True, silent=False, ones_test=True, const_test=True, dup_test=True, split_test=True, dud_test=True, ma_safe=False, ma_very_safe=False, nan_safe=False, split_per=1.0, seed=0, compare_assert=partial(assert_allclose, atol=1e-5)): """runs fn on its arguments 2 or 3 ways, checks that the results are the same, then returns the same thing it would have returned before""" @wraps(fn) def wrapped(*args, **kwargs): result = fn(*args, **kwargs) arrays = args[:n_args] rest = args[n_args:] weights = kwargs.get(weight_arg, None) axis = kwargs.get('axis', default_axis) chked = _chk_weights(arrays, weights=weights, axis=axis, force_weights=True, mask_screen=True) arrays, weights, axis = chked[:-2], chked[-2], chked[-1] if squeeze: arrays = [np.atleast_1d(a.squeeze()) for a in arrays] try: # WEIGHTS CHECK 1: EQUAL WEIGHTED OBESERVATIONS args = tuple(arrays) + rest if ones_test: kwargs[weight_arg] = weights _rough_check(result, fn(*args, **kwargs), key=key) if const_test: kwargs[weight_arg] = weights * 101.0 _rough_check(result, fn(*args, **kwargs), key=key) kwargs[weight_arg] = weights * 0.101 try: _rough_check(result, fn(*args, **kwargs), key=key) except Exception as e: raise type(e)((e, arrays, weights)) # WEIGHTS CHECK 2: ADDL 0-WEIGHTED OBS if dud_test: # add randomly resampled rows, weighted at 0 dud_arrays, dud_weights = _rand_split(arrays, weights, axis, split_per=split_per, seed=seed) dud_weights[:weights.size] = weights # not exactly 1 because of masked arrays dud_weights[weights.size:] = 0 dud_args = tuple(dud_arrays) + rest kwargs[weight_arg] = dud_weights _rough_check(result, fn(*dud_args, **kwargs), key=key) # increase the value of those 0-weighted rows for a in dud_arrays: indexer = [slice(None)] * a.ndim indexer[axis] = slice(weights.size, None) indexer = tuple(indexer) a[indexer] = a[indexer] * 101 dud_args = tuple(dud_arrays) + rest _rough_check(result, fn(*dud_args, **kwargs), key=key) # set those 0-weighted rows to NaNs for a in dud_arrays: indexer = [slice(None)] * a.ndim indexer[axis] = slice(weights.size, None) indexer = tuple(indexer) a[indexer] = a[indexer] * np.nan if kwargs.get("nan_policy", None) == "omit" and nan_safe: dud_args = tuple(dud_arrays) + rest _rough_check(result, fn(*dud_args, **kwargs), key=key) # mask out those nan values if ma_safe: dud_arrays = [np.ma.masked_invalid(a) for a in dud_arrays] dud_args = tuple(dud_arrays) + rest _rough_check(result, fn(*dud_args, **kwargs), key=key) if ma_very_safe: kwargs[weight_arg] = None _rough_check(result, fn(*dud_args, **kwargs), key=key) del dud_arrays, dud_args, dud_weights # WEIGHTS CHECK 3: DUPLICATE DATA (DUMB SPLITTING) if dup_test: dup_arrays = [np.append(a, a, axis=axis) for a in arrays] dup_weights = np.append(weights, weights) / 2.0 dup_args = tuple(dup_arrays) + rest kwargs[weight_arg] = dup_weights _rough_check(result, fn(*dup_args, **kwargs), key=key) del dup_args, dup_arrays, dup_weights # WEIGHT CHECK 3: RANDOM SPLITTING if split_test and split_per > 0: split_arrays, split_weights = _rand_split(arrays, weights, axis, split_per=split_per, seed=seed) split_args = tuple(split_arrays) + rest kwargs[weight_arg] = split_weights _rough_check(result, fn(*split_args, **kwargs), key=key) except NotImplementedError as e: # when some combination of arguments makes weighting impossible, # this is the desired response if not silent: warnings.warn("%s NotImplemented weights: %s" % (fn.__name__, e)) return result return wrapped wcdist = _weight_checked(cdist, default_axis=1, squeeze=False) wcdist_no_const = _weight_checked(cdist, default_axis=1, squeeze=False, const_test=False) wpdist = _weight_checked(pdist, default_axis=1, squeeze=False, n_args=1) wpdist_no_const = _weight_checked(pdist, default_axis=1, squeeze=False, const_test=False, n_args=1) wrogerstanimoto = _weight_checked(rogerstanimoto) wmatching = whamming = _weight_checked(hamming, dud_test=False) wyule = _weight_checked(yule) wdice = _weight_checked(dice) wcityblock = _weight_checked(cityblock) wchebyshev = _weight_checked(chebyshev) wcosine = _weight_checked(cosine) wcorrelation = _weight_checked(correlation) wkulsinski = _weight_checked(kulsinski) wminkowski = _weight_checked(minkowski, const_test=False) wjaccard = _weight_checked(jaccard) weuclidean = _weight_checked(euclidean, const_test=False) wsqeuclidean = _weight_checked(sqeuclidean, const_test=False) wbraycurtis = _weight_checked(braycurtis) wcanberra = _weight_checked(canberra, const_test=False) wsokalsneath = _weight_checked(sokalsneath) wsokalmichener = _weight_checked(sokalmichener) wrussellrao = _weight_checked(russellrao) class TestCdist(object): def setup_method(self): self.rnd_eo_names = ['random-float32-data', 'random-int-data', 'random-uint-data', 'random-double-data', 'random-bool-data'] self.valid_upcasts = {'bool': [np.uint, np.int_, np.float32, np.double], 'uint': [np.int_, np.float32, np.double], 'int': [np.float32, np.double], 'float32': [np.double]} def test_cdist_extra_args(self): # Tests that args and kwargs are correctly handled def _my_metric(x, y, arg, kwarg=1, kwarg2=2): return arg + kwarg + kwarg2 X1 = [[1., 2., 3.], [1.2, 2.3, 3.4], [2.2, 2.3, 4.4]] X2 = [[7., 5., 8.], [7.5, 5.8, 8.4], [5.5, 5.8, 4.4]] kwargs = {'N0tV4l1D_p4raM': 3.14, "w":np.arange(3)} args = [3.14] * 200 with suppress_warnings() as w: w.filter(DeprecationWarning) for metric in _METRICS_NAMES: assert_raises(TypeError, cdist, X1, X2, metric=metric, **kwargs) assert_raises(TypeError, cdist, X1, X2, metric=eval(metric), **kwargs) assert_raises(TypeError, cdist, X1, X2, metric="test_" + metric, **kwargs) assert_raises(TypeError, cdist, X1, X2, metric=metric, *args) assert_raises(TypeError, cdist, X1, X2, metric=eval(metric), *args) assert_raises(TypeError, cdist, X1, X2, metric="test_" + metric, *args) assert_raises(TypeError, cdist, X1, X2, _my_metric) assert_raises(TypeError, cdist, X1, X2, _my_metric, *args) assert_raises(TypeError, cdist, X1, X2, _my_metric, **kwargs) assert_raises(TypeError, cdist, X1, X2, _my_metric, kwarg=2.2, kwarg2=3.3) assert_raises(TypeError, cdist, X1, X2, _my_metric, 1, 2, kwarg=2.2) assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1, 2.2, 3.3) assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1, 2.2) assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1) assert_raises(TypeError, cdist, X1, X2, _my_metric, 1.1, kwarg=2.2, kwarg2=3.3) # this should work assert_allclose(cdist(X1, X2, metric=_my_metric, arg=1.1, kwarg2=3.3), 5.4) def test_cdist_euclidean_random_unicode(self): eps = 1e-07 X1 = eo['cdist-X1'] X2 = eo['cdist-X2'] Y1 = wcdist_no_const(X1, X2, 'euclidean') Y2 = wcdist_no_const(X1, X2, 'test_euclidean') _assert_within_tol(Y1, Y2, eps, verbose > 2) def test_cdist_minkowski_random_p3d8(self): eps = 1e-07 X1 = eo['cdist-X1'] X2 = eo['cdist-X2'] Y1 = wcdist_no_const(X1, X2, 'minkowski', p=3.8) Y2 = wcdist_no_const(X1, X2, 'test_minkowski', p=3.8) _assert_within_tol(Y1, Y2, eps, verbose > 2) def test_cdist_minkowski_random_p4d6(self): eps = 1e-07 X1 = eo['cdist-X1'] X2 = eo['cdist-X2'] Y1 = wcdist_no_const(X1, X2, 'minkowski', p=4.6) Y2 = wcdist_no_const(X1, X2, 'test_minkowski', p=4.6) _assert_within_tol(Y1, Y2, eps, verbose > 2) def test_cdist_minkowski_random_p1d23(self): eps = 1e-07 X1 = eo['cdist-X1'] X2 = eo['cdist-X2'] Y1 = wcdist_no_const(X1, X2, 'minkowski', p=1.23) Y2 = wcdist_no_const(X1, X2, 'test_minkowski', p=1.23) _assert_within_tol(Y1, Y2, eps, verbose > 2) def test_cdist_cosine_random(self): eps = 1e-07 X1 = eo['cdist-X1'] X2 = eo['cdist-X2'] Y1 = wcdist(X1, X2, 'cosine') # Naive implementation def norms(X): return np.linalg.norm(X, axis=1).reshape(-1, 1) Y2 = 1 - np.dot((X1 / norms(X1)), (X2 / norms(X2)).T) _assert_within_tol(Y1, Y2, eps, verbose > 2) def test_cdist_mahalanobis(self): # 1-dimensional observations x1 = np.array([[2], [3]]) x2 = np.array([[2], [5]]) dist = cdist(x1, x2, metric='mahalanobis') assert_allclose(dist, [[0.0, np.sqrt(4.5)], [np.sqrt(0.5), np.sqrt(2)]]) # 2-dimensional observations x1 = np.array([[0, 0], [-1, 0]]) x2 = np.array([[0, 2], [1, 0], [0, -2]]) dist = cdist(x1, x2, metric='mahalanobis') rt2 = np.sqrt(2) assert_allclose(dist, [[rt2, rt2, rt2], [2, 2 * rt2, 2]]) # Too few observations assert_raises(ValueError, cdist, [[0, 1]], [[2, 3]], metric='mahalanobis') def test_cdist_custom_notdouble(self): class myclass(object): pass def _my_metric(x, y): if not isinstance(x[0], myclass) or not isinstance(y[0], myclass): raise ValueError("Type has been changed") return 1.123 data = np.array([[myclass()]], dtype=object) cdist_y = cdist(data, data, metric=_my_metric) right_y = 1.123 assert_equal(cdist_y, right_y, verbose=verbose > 2) def _check_calling_conventions(self, X1, X2, metric, eps=1e-07, **kwargs): # helper function for test_cdist_calling_conventions try: y1 = cdist(X1, X2, metric=metric, **kwargs) y2 = cdist(X1, X2, metric=eval(metric), **kwargs) y3 = cdist(X1, X2, metric="test_" + metric, **kwargs) except Exception as e: e_cls = e.__class__ if verbose > 2: print(e_cls.__name__) print(e) assert_raises(e_cls, cdist, X1, X2, metric=metric, **kwargs) assert_raises(e_cls, cdist, X1, X2, metric=eval(metric), **kwargs) assert_raises(e_cls, cdist, X1, X2, metric="test_" + metric, **kwargs) else: _assert_within_tol(y1, y2, rtol=eps, verbose_=verbose > 2) _assert_within_tol(y1, y3, rtol=eps, verbose_=verbose > 2) def test_cdist_calling_conventions(self): # Ensures that specifying the metric with a str or scipy function # gives the same behaviour (i.e. same result or same exception). # NOTE: The correctness should be checked within each metric tests. for eo_name in self.rnd_eo_names: # subsampling input data to speed-up tests # NOTE: num samples needs to be > than dimensions for mahalanobis X1 = eo[eo_name][::5, ::-2] X2 = eo[eo_name][1::5, ::2] for metric in _METRICS_NAMES: if verbose > 2: print("testing: ", metric, " with: ", eo_name) if metric == 'wminkowski': continue if metric in {'dice', 'yule', 'kulsinski', 'matching', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath'} and 'bool' not in eo_name: # python version permits non-bools e.g. for fuzzy logic continue self._check_calling_conventions(X1, X2, metric) # Testing built-in metrics with extra args if metric == "seuclidean": X12 = np.vstack([X1, X2]).astype(np.double) V = np.var(X12, axis=0, ddof=1) self._check_calling_conventions(X1, X2, metric, V=V) elif metric == "mahalanobis": X12 = np.vstack([X1, X2]).astype(np.double) V = np.atleast_2d(np.cov(X12.T)) VI = np.array(np.linalg.inv(V).T) self._check_calling_conventions(X1, X2, metric, VI=VI) def test_cdist_dtype_equivalence(self): # Tests that the result is not affected by type up-casting eps = 1e-07 tests = [(eo['random-bool-data'], self.valid_upcasts['bool']), (eo['random-uint-data'], self.valid_upcasts['uint']), (eo['random-int-data'], self.valid_upcasts['int']), (eo['random-float32-data'], self.valid_upcasts['float32'])] for metric in _METRICS_NAMES: for test in tests: X1 = test[0][::5, ::-2] X2 = test[0][1::5, ::2] try: y1 = cdist(X1, X2, metric=metric) except Exception as e: e_cls = e.__class__ if verbose > 2: print(e_cls.__name__) print(e) for new_type in test[1]: X1new = new_type(X1) X2new = new_type(X2) assert_raises(e_cls, cdist, X1new, X2new, metric=metric) else: for new_type in test[1]: y2 = cdist(new_type(X1), new_type(X2), metric=metric) _assert_within_tol(y1, y2, eps, verbose > 2) def test_cdist_out(self): # Test that out parameter works properly eps = 1e-07 X1 = eo['cdist-X1'] X2 = eo['cdist-X2'] out_r, out_c = X1.shape[0], X2.shape[0] for metric in _METRICS_NAMES: kwargs = dict() if metric in ['minkowski', 'wminkowski']: kwargs['p'] = 1.23 if metric == 'wminkowski': kwargs['w'] = 1.0 / X1.std(axis=0) out1 = np.empty((out_r, out_c), dtype=np.double) Y1 = cdist(X1, X2, metric, **kwargs) Y2 = cdist(X1, X2, metric, out=out1, **kwargs) # test that output is numerically equivalent _assert_within_tol(Y1, Y2, eps, verbose > 2) # test that Y_test1 and out1 are the same object assert_(Y2 is out1) # test for incorrect shape out2 = np.empty((out_r-1, out_c+1), dtype=np.double) assert_raises(ValueError, cdist, X1, X2, metric, out=out2, **kwargs) # test for C-contiguous order out3 = np.empty((2 * out_r, 2 * out_c), dtype=np.double)[::2, ::2] out4 = np.empty((out_r, out_c), dtype=np.double, order='F') assert_raises(ValueError, cdist, X1, X2, metric, out=out3, **kwargs) assert_raises(ValueError, cdist, X1, X2, metric, out=out4, **kwargs) # test for incorrect dtype out5 = np.empty((out_r, out_c), dtype=np.int64) assert_raises(ValueError, cdist, X1, X2, metric, out=out5, **kwargs) def test_striding(self): # test that striding is handled correct with calls to # _copy_array_if_base_present eps = 1e-07 X1 = eo['cdist-X1'][::2, ::2] X2 = eo['cdist-X2'][::2, ::2] X1_copy = X1.copy() X2_copy = X2.copy() # confirm equivalence assert_equal(X1, X1_copy) assert_equal(X2, X2_copy) # confirm contiguity assert_(not X1.flags.c_contiguous) assert_(not X2.flags.c_contiguous) assert_(X1_copy.flags.c_contiguous) assert_(X2_copy.flags.c_contiguous) for metric in _METRICS_NAMES: kwargs = dict() if metric in ['minkowski', 'wminkowski']: kwargs['p'] = 1.23 if metric == 'wminkowski': kwargs['w'] = 1.0 / X1.std(axis=0) Y1 = cdist(X1, X2, metric, **kwargs) Y2 = cdist(X1_copy, X2_copy, metric, **kwargs) # test that output is numerically equivalent _assert_within_tol(Y1, Y2, eps, verbose > 2) class TestPdist(object): def setup_method(self): self.rnd_eo_names = ['random-float32-data', 'random-int-data', 'random-uint-data', 'random-double-data', 'random-bool-data'] self.valid_upcasts = {'bool': [np.uint, np.int_, np.float32, np.double], 'uint': [np.int_, np.float32, np.double], 'int': [np.float32, np.double], 'float32': [np.double]} def test_pdist_extra_args(self): # Tests that args and kwargs are correctly handled def _my_metric(x, y, arg, kwarg=1, kwarg2=2): return arg + kwarg + kwarg2 X1 = [[1., 2.], [1.2, 2.3], [2.2, 2.3]] kwargs = {'N0tV4l1D_p4raM': 3.14, "w":np.arange(2)} args = [3.14] * 200 with suppress_warnings() as w: w.filter(DeprecationWarning) for metric in _METRICS_NAMES: assert_raises(TypeError, pdist, X1, metric=metric, **kwargs) assert_raises(TypeError, pdist, X1, metric=eval(metric), **kwargs) assert_raises(TypeError, pdist, X1, metric="test_" + metric, **kwargs) assert_raises(TypeError, pdist, X1, metric=metric, *args) assert_raises(TypeError, pdist, X1, metric=eval(metric), *args) assert_raises(TypeError, pdist, X1, metric="test_" + metric, *args) assert_raises(TypeError, pdist, X1, _my_metric) assert_raises(TypeError, pdist, X1, _my_metric, *args) assert_raises(TypeError, pdist, X1, _my_metric, **kwargs) assert_raises(TypeError, pdist, X1, _my_metric, kwarg=2.2, kwarg2=3.3) assert_raises(TypeError, pdist, X1, _my_metric, 1, 2, kwarg=2.2) assert_raises(TypeError, pdist, X1, _my_metric, 1.1, 2.2, 3.3) assert_raises(TypeError, pdist, X1, _my_metric, 1.1, 2.2) assert_raises(TypeError, pdist, X1, _my_metric, 1.1) assert_raises(TypeError, pdist, X1, _my_metric, 1.1, kwarg=2.2, kwarg2=3.3) # these should work assert_allclose(pdist(X1, metric=_my_metric, arg=1.1, kwarg2=3.3), 5.4) def test_pdist_euclidean_random(self): eps = 1e-07 X = eo['pdist-double-inp'] Y_right = eo['pdist-euclidean'] Y_test1 = wpdist_no_const(X, 'euclidean') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_euclidean_random_u(self): eps = 1e-07 X = eo['pdist-double-inp'] Y_right = eo['pdist-euclidean'] Y_test1 = wpdist_no_const(X, 'euclidean') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_euclidean_random_float32(self): eps = 1e-07 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-euclidean'] Y_test1 = wpdist_no_const(X, 'euclidean') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_euclidean_random_nonC(self): eps = 1e-07 X = eo['pdist-double-inp'] Y_right = eo['pdist-euclidean'] Y_test2 = wpdist_no_const(X, 'test_euclidean') _assert_within_tol(Y_test2, Y_right, eps) @pytest.mark.slow def test_pdist_euclidean_iris_double(self): eps = 1e-07 X = eo['iris'] Y_right = eo['pdist-euclidean-iris'] Y_test1 = wpdist_no_const(X, 'euclidean') _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_euclidean_iris_float32(self): eps = 1e-06 X = np.float32(eo['iris']) Y_right = eo['pdist-euclidean-iris'] Y_test1 = wpdist_no_const(X, 'euclidean') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) @pytest.mark.slow def test_pdist_euclidean_iris_nonC(self): # Test pdist(X, 'test_euclidean') [the non-C implementation] on the # Iris data set. eps = 1e-07 X = eo['iris'] Y_right = eo['pdist-euclidean-iris'] Y_test2 = wpdist_no_const(X, 'test_euclidean') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_seuclidean_random(self): eps = 1e-05 X = eo['pdist-double-inp'] Y_right = eo['pdist-seuclidean'] Y_test1 = pdist(X, 'seuclidean') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_seuclidean_random_float32(self): eps = 1e-05 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-seuclidean'] Y_test1 = pdist(X, 'seuclidean') _assert_within_tol(Y_test1, Y_right, eps) # Check no error is raise when V has float32 dtype (#11171). V = np.var(X, axis=0, ddof=1) Y_test2 = pdist(X, 'seuclidean', V=V) _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_seuclidean_random_nonC(self): # Test pdist(X, 'test_sqeuclidean') [the non-C implementation] eps = 1e-05 X = eo['pdist-double-inp'] Y_right = eo['pdist-seuclidean'] Y_test2 = pdist(X, 'test_seuclidean') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_seuclidean_iris(self): eps = 1e-05 X = eo['iris'] Y_right = eo['pdist-seuclidean-iris'] Y_test1 = pdist(X, 'seuclidean') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_seuclidean_iris_float32(self): # Tests pdist(X, 'seuclidean') on the Iris data set (float32). eps = 1e-05 X = np.float32(eo['iris']) Y_right = eo['pdist-seuclidean-iris'] Y_test1 = pdist(X, 'seuclidean') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_seuclidean_iris_nonC(self): # Test pdist(X, 'test_seuclidean') [the non-C implementation] on the # Iris data set. eps = 1e-05 X = eo['iris'] Y_right = eo['pdist-seuclidean-iris'] Y_test2 = pdist(X, 'test_seuclidean') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_cosine_random(self): eps = 1e-08 X = eo['pdist-double-inp'] Y_right = eo['pdist-cosine'] Y_test1 = wpdist(X, 'cosine') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_cosine_random_float32(self): eps = 1e-08 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-cosine'] Y_test1 = wpdist(X, 'cosine') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_cosine_random_nonC(self): # Test pdist(X, 'test_cosine') [the non-C implementation] eps = 1e-08 X = eo['pdist-double-inp'] Y_right = eo['pdist-cosine'] Y_test2 = wpdist(X, 'test_cosine') _assert_within_tol(Y_test2, Y_right, eps) @pytest.mark.slow def test_pdist_cosine_iris(self): eps = 1e-08 X = eo['iris'] Y_right = eo['pdist-cosine-iris'] Y_test1 = wpdist(X, 'cosine') _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_cosine_iris_float32(self): eps = 1e-07 X = np.float32(eo['iris']) Y_right = eo['pdist-cosine-iris'] Y_test1 = wpdist(X, 'cosine') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) @pytest.mark.slow def test_pdist_cosine_iris_nonC(self): eps = 1e-08 X = eo['iris'] Y_right = eo['pdist-cosine-iris'] Y_test2 = wpdist(X, 'test_cosine') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_cosine_bounds(self): # Test adapted from @joernhees's example at gh-5208: case where # cosine distance used to be negative. XXX: very sensitive to the # specific norm computation. x = np.abs(np.random.RandomState(1337).rand(91)) X = np.vstack([x, x]) assert_(wpdist(X, 'cosine')[0] >= 0, msg='cosine distance should be non-negative') def test_pdist_cityblock_random(self): eps = 1e-06 X = eo['pdist-double-inp'] Y_right = eo['pdist-cityblock'] Y_test1 = wpdist_no_const(X, 'cityblock') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_cityblock_random_float32(self): eps = 1e-06 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-cityblock'] Y_test1 = wpdist_no_const(X, 'cityblock') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_cityblock_random_nonC(self): eps = 1e-06 X = eo['pdist-double-inp'] Y_right = eo['pdist-cityblock'] Y_test2 = wpdist_no_const(X, 'test_cityblock') _assert_within_tol(Y_test2, Y_right, eps) @pytest.mark.slow def test_pdist_cityblock_iris(self): eps = 1e-14 X = eo['iris'] Y_right = eo['pdist-cityblock-iris'] Y_test1 = wpdist_no_const(X, 'cityblock') _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_cityblock_iris_float32(self): eps = 1e-06 X = np.float32(eo['iris']) Y_right = eo['pdist-cityblock-iris'] Y_test1 = wpdist_no_const(X, 'cityblock') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) @pytest.mark.slow def test_pdist_cityblock_iris_nonC(self): # Test pdist(X, 'test_cityblock') [the non-C implementation] on the # Iris data set. eps = 1e-14 X = eo['iris'] Y_right = eo['pdist-cityblock-iris'] Y_test2 = wpdist_no_const(X, 'test_cityblock') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_correlation_random(self): eps = 1e-07 X = eo['pdist-double-inp'] Y_right = eo['pdist-correlation'] Y_test1 = wpdist(X, 'correlation') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_correlation_random_float32(self): eps = 1e-07 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-correlation'] Y_test1 = wpdist(X, 'correlation') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_correlation_random_nonC(self): eps = 1e-07 X = eo['pdist-double-inp'] Y_right = eo['pdist-correlation'] Y_test2 = wpdist(X, 'test_correlation') _assert_within_tol(Y_test2, Y_right, eps) @pytest.mark.slow def test_pdist_correlation_iris(self): eps = 1e-08 X = eo['iris'] Y_right = eo['pdist-correlation-iris'] Y_test1 = wpdist(X, 'correlation') _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_correlation_iris_float32(self): eps = 1e-07 X = eo['iris'] Y_right = np.float32(eo['pdist-correlation-iris']) Y_test1 = wpdist(X, 'correlation') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) @pytest.mark.slow def test_pdist_correlation_iris_nonC(self): eps = 1e-08 X = eo['iris'] Y_right = eo['pdist-correlation-iris'] Y_test2 = wpdist(X, 'test_correlation') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_minkowski_random(self): eps = 1e-05 X = eo['pdist-double-inp'] Y_right = eo['pdist-minkowski-3.2'] Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2) _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_minkowski_random_float32(self): eps = 1e-05 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-minkowski-3.2'] Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2) _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_minkowski_random_nonC(self): eps = 1e-05 X = eo['pdist-double-inp'] Y_right = eo['pdist-minkowski-3.2'] Y_test2 = wpdist_no_const(X, 'test_minkowski', p=3.2) _assert_within_tol(Y_test2, Y_right, eps) @pytest.mark.slow def test_pdist_minkowski_3_2_iris(self): eps = 1e-07 X = eo['iris'] Y_right = eo['pdist-minkowski-3.2-iris'] Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2) _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_minkowski_3_2_iris_float32(self): eps = 1e-06 X = np.float32(eo['iris']) Y_right = eo['pdist-minkowski-3.2-iris'] Y_test1 = wpdist_no_const(X, 'minkowski', p=3.2) _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_minkowski_3_2_iris_nonC(self): eps = 1e-07 X = eo['iris'] Y_right = eo['pdist-minkowski-3.2-iris'] Y_test2 = wpdist_no_const(X, 'test_minkowski', p=3.2) _assert_within_tol(Y_test2, Y_right, eps) @pytest.mark.slow def test_pdist_minkowski_5_8_iris(self): eps = 1e-07 X = eo['iris'] Y_right = eo['pdist-minkowski-5.8-iris'] Y_test1 = wpdist_no_const(X, 'minkowski', p=5.8) _assert_within_tol(Y_test1, Y_right, eps) @pytest.mark.slow def test_pdist_minkowski_5_8_iris_float32(self): eps = 1e-06 X = np.float32(eo['iris']) Y_right = eo['pdist-minkowski-5.8-iris'] Y_test1 = wpdist_no_const(X, 'minkowski', p=5.8) _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) @pytest.mark.slow def test_pdist_minkowski_5_8_iris_nonC(self): eps = 1e-07 X = eo['iris'] Y_right = eo['pdist-minkowski-5.8-iris'] Y_test2 = wpdist_no_const(X, 'test_minkowski', p=5.8) _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_mahalanobis(self): # 1-dimensional observations x = np.array([2.0, 2.0, 3.0, 5.0]).reshape(-1, 1) dist = pdist(x, metric='mahalanobis') assert_allclose(dist, [0.0, np.sqrt(0.5), np.sqrt(4.5), np.sqrt(0.5), np.sqrt(4.5), np.sqrt(2.0)]) # 2-dimensional observations x = np.array([[0, 0], [-1, 0], [0, 2], [1, 0], [0, -2]]) dist = pdist(x, metric='mahalanobis') rt2 = np.sqrt(2) assert_allclose(dist, [rt2, rt2, rt2, rt2, 2, 2 * rt2, 2, 2, 2 * rt2, 2]) # Too few observations assert_raises(ValueError, wpdist, [[0, 1], [2, 3]], metric='mahalanobis') def test_pdist_hamming_random(self): eps = 1e-07 X = eo['pdist-boolean-inp'] Y_right = eo['pdist-hamming'] Y_test1 = wpdist(X, 'hamming') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_hamming_random_float32(self): eps = 1e-07 X = np.float32(eo['pdist-boolean-inp']) Y_right = eo['pdist-hamming'] Y_test1 = wpdist(X, 'hamming') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_hamming_random_nonC(self): eps = 1e-07 X = eo['pdist-boolean-inp'] Y_right = eo['pdist-hamming'] Y_test2 = wpdist(X, 'test_hamming') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_dhamming_random(self): eps = 1e-07 X = np.float64(eo['pdist-boolean-inp']) Y_right = eo['pdist-hamming'] Y_test1 = wpdist(X, 'hamming') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_dhamming_random_float32(self): eps = 1e-07 X = np.float32(eo['pdist-boolean-inp']) Y_right = eo['pdist-hamming'] Y_test1 = wpdist(X, 'hamming') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_dhamming_random_nonC(self): eps = 1e-07 X = np.float64(eo['pdist-boolean-inp']) Y_right = eo['pdist-hamming'] Y_test2 = wpdist(X, 'test_hamming') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_jaccard_random(self): eps = 1e-08 X = eo['pdist-boolean-inp'] Y_right = eo['pdist-jaccard'] Y_test1 = wpdist(X, 'jaccard') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_jaccard_random_float32(self): eps = 1e-08 X = np.float32(eo['pdist-boolean-inp']) Y_right = eo['pdist-jaccard'] Y_test1 = wpdist(X, 'jaccard') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_jaccard_random_nonC(self): eps = 1e-08 X = eo['pdist-boolean-inp'] Y_right = eo['pdist-jaccard'] Y_test2 = wpdist(X, 'test_jaccard') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_djaccard_random(self): eps = 1e-08 X = np.float64(eo['pdist-boolean-inp']) Y_right = eo['pdist-jaccard'] Y_test1 = wpdist(X, 'jaccard') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_djaccard_random_float32(self): eps = 1e-08 X = np.float32(eo['pdist-boolean-inp']) Y_right = eo['pdist-jaccard'] Y_test1 = wpdist(X, 'jaccard') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_djaccard_allzeros(self): eps = 1e-08 Y = pdist(np.zeros((5, 3)), 'jaccard') _assert_within_tol(np.zeros(10), Y, eps) def test_pdist_djaccard_random_nonC(self): eps = 1e-08 X = np.float64(eo['pdist-boolean-inp']) Y_right = eo['pdist-jaccard'] Y_test2 = wpdist(X, 'test_jaccard') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_jensenshannon_random(self): eps = 1e-08 X = eo['pdist-double-inp'] Y_right = eo['pdist-jensenshannon'] Y_test1 = pdist(X, 'jensenshannon') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_jensenshannon_random_float32(self): eps = 1e-07 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-jensenshannon'] Y_test1 = pdist(X, 'jensenshannon') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) def test_pdist_jensenshannon_random_nonC(self): eps = 1e-08 X = eo['pdist-double-inp'] Y_right = eo['pdist-jensenshannon'] Y_test2 = pdist(X, 'test_jensenshannon') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_jensenshannon_iris(self): eps = 1e-12 X = eo['iris'] Y_right = eo['pdist-jensenshannon-iris'] Y_test1 = pdist(X, 'jensenshannon') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_jensenshannon_iris_float32(self): eps = 1e-06 X = np.float32(eo['iris']) Y_right = eo['pdist-jensenshannon-iris'] Y_test1 = pdist(X, 'jensenshannon') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) def test_pdist_jensenshannon_iris_nonC(self): eps = 5e-12 X = eo['iris'] Y_right = eo['pdist-jensenshannon-iris'] Y_test2 = pdist(X, 'test_jensenshannon') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_djaccard_allzeros_nonC(self): eps = 1e-08 Y = pdist(np.zeros((5, 3)), 'test_jaccard') _assert_within_tol(np.zeros(10), Y, eps) def test_pdist_chebyshev_random(self): eps = 1e-08 X = eo['pdist-double-inp'] Y_right = eo['pdist-chebyshev'] Y_test1 = pdist(X, 'chebyshev') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_chebyshev_random_float32(self): eps = 1e-07 X = np.float32(eo['pdist-double-inp']) Y_right = eo['pdist-chebyshev'] Y_test1 = pdist(X, 'chebyshev') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) def test_pdist_chebyshev_random_nonC(self): eps = 1e-08 X = eo['pdist-double-inp'] Y_right = eo['pdist-chebyshev'] Y_test2 = pdist(X, 'test_chebyshev') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_chebyshev_iris(self): eps = 1e-15 X = eo['iris'] Y_right = eo['pdist-chebyshev-iris'] Y_test1 = pdist(X, 'chebyshev') _assert_within_tol(Y_test1, Y_right, eps) def test_pdist_chebyshev_iris_float32(self): eps = 1e-06 X = np.float32(eo['iris']) Y_right = eo['pdist-chebyshev-iris'] Y_test1 = pdist(X, 'chebyshev') _assert_within_tol(Y_test1, Y_right, eps, verbose > 2) def test_pdist_chebyshev_iris_nonC(self): eps = 1e-15 X = eo['iris'] Y_right = eo['pdist-chebyshev-iris'] Y_test2 = pdist(X, 'test_chebyshev') _assert_within_tol(Y_test2, Y_right, eps) def test_pdist_matching_mtica1(self): # Test matching(*,*) with mtica example #1 (nums). m = wmatching(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = wmatching(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) assert_allclose(m, 0.6, rtol=0, atol=1e-10) assert_allclose(m2, 0.6, rtol=0, atol=1e-10) def test_pdist_matching_mtica2(self): # Test matching(*,*) with mtica example #2. m = wmatching(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wmatching(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) assert_allclose(m, 2 / 3, rtol=0, atol=1e-10) assert_allclose(m2, 2 / 3, rtol=0, atol=1e-10) def test_pdist_jaccard_mtica1(self): m = wjaccard(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = wjaccard(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) assert_allclose(m, 0.6, rtol=0, atol=1e-10) assert_allclose(m2, 0.6, rtol=0, atol=1e-10) def test_pdist_jaccard_mtica2(self): m = wjaccard(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wjaccard(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) assert_allclose(m, 2 / 3, rtol=0, atol=1e-10) assert_allclose(m2, 2 / 3, rtol=0, atol=1e-10) def test_pdist_yule_mtica1(self): m = wyule(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = wyule(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 2, rtol=0, atol=1e-10) assert_allclose(m2, 2, rtol=0, atol=1e-10) def test_pdist_yule_mtica2(self): m = wyule(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wyule(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 2, rtol=0, atol=1e-10) assert_allclose(m2, 2, rtol=0, atol=1e-10) def test_pdist_dice_mtica1(self): m = wdice(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = wdice(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 3 / 7, rtol=0, atol=1e-10) assert_allclose(m2, 3 / 7, rtol=0, atol=1e-10) def test_pdist_dice_mtica2(self): m = wdice(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wdice(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 0.5, rtol=0, atol=1e-10) assert_allclose(m2, 0.5, rtol=0, atol=1e-10) def test_pdist_sokalsneath_mtica1(self): m = sokalsneath(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = sokalsneath(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 3 / 4, rtol=0, atol=1e-10) assert_allclose(m2, 3 / 4, rtol=0, atol=1e-10) def test_pdist_sokalsneath_mtica2(self): m = wsokalsneath(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wsokalsneath(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 4 / 5, rtol=0, atol=1e-10) assert_allclose(m2, 4 / 5, rtol=0, atol=1e-10) def test_pdist_rogerstanimoto_mtica1(self): m = wrogerstanimoto(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = wrogerstanimoto(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 3 / 4, rtol=0, atol=1e-10) assert_allclose(m2, 3 / 4, rtol=0, atol=1e-10) def test_pdist_rogerstanimoto_mtica2(self): m = wrogerstanimoto(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wrogerstanimoto(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 4 / 5, rtol=0, atol=1e-10) assert_allclose(m2, 4 / 5, rtol=0, atol=1e-10) def test_pdist_russellrao_mtica1(self): m = wrussellrao(np.array([1, 0, 1, 1, 0]), np.array([1, 1, 0, 1, 1])) m2 = wrussellrao(np.array([1, 0, 1, 1, 0], dtype=bool), np.array([1, 1, 0, 1, 1], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 3 / 5, rtol=0, atol=1e-10) assert_allclose(m2, 3 / 5, rtol=0, atol=1e-10) def test_pdist_russellrao_mtica2(self): m = wrussellrao(np.array([1, 0, 1]), np.array([1, 1, 0])) m2 = wrussellrao(np.array([1, 0, 1], dtype=bool), np.array([1, 1, 0], dtype=bool)) if verbose > 2: print(m) assert_allclose(m, 2 / 3, rtol=0, atol=1e-10) assert_allclose(m2, 2 / 3, rtol=0, atol=1e-10) @pytest.mark.slow def test_pdist_canberra_match(self): D = eo['iris'] if verbose > 2: print(D.shape, D.dtype) eps = 1e-10 y1 = wpdist_no_const(D, "canberra") y2 = wpdist_no_const(D, "test_canberra") _assert_within_tol(y1, y2, eps, verbose > 2) def test_pdist_canberra_ticket_711(self): # Test pdist(X, 'canberra') to see if Canberra gives the right result # as reported on gh-1238. eps = 1e-8 pdist_y = wpdist_no_const(([3.3], [3.4]), "canberra") right_y = 0.01492537 _assert_within_tol(pdist_y, right_y, eps, verbose > 2) def test_pdist_custom_notdouble(self): # tests that when using a custom metric the data type is not altered class myclass(object): pass def _my_metric(x, y): if not isinstance(x[0], myclass) or not isinstance(y[0], myclass): raise ValueError("Type has been changed") return 1.123 data = np.array([[myclass()], [myclass()]], dtype=object) pdist_y = pdist(data, metric=_my_metric) right_y = 1.123 assert_equal(pdist_y, right_y, verbose=verbose > 2) def _check_calling_conventions(self, X, metric, eps=1e-07, **kwargs): # helper function for test_pdist_calling_conventions try: y1 = pdist(X, metric=metric, **kwargs) y2 = pdist(X, metric=eval(metric), **kwargs) y3 = pdist(X, metric="test_" + metric, **kwargs) except Exception as e: e_cls = e.__class__ if verbose > 2: print(e_cls.__name__) print(e) assert_raises(e_cls, pdist, X, metric=metric, **kwargs) assert_raises(e_cls, pdist, X, metric=eval(metric), **kwargs) assert_raises(e_cls, pdist, X, metric="test_" + metric, **kwargs) else: _assert_within_tol(y1, y2, rtol=eps, verbose_=verbose > 2) _assert_within_tol(y1, y3, rtol=eps, verbose_=verbose > 2) def test_pdist_calling_conventions(self): # Ensures that specifying the metric with a str or scipy function # gives the same behaviour (i.e. same result or same exception). # NOTE: The correctness should be checked within each metric tests. # NOTE: Extra args should be checked with a dedicated test for eo_name in self.rnd_eo_names: # subsampling input data to speed-up tests # NOTE: num samples needs to be > than dimensions for mahalanobis X = eo[eo_name][::5, ::2] for metric in _METRICS_NAMES: if metric == 'wminkowski': continue if verbose > 2: print("testing: ", metric, " with: ", eo_name) if metric in {'dice', 'yule', 'kulsinski', 'matching', 'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath'} and 'bool' not in eo_name: # python version permits non-bools e.g. for fuzzy logic continue self._check_calling_conventions(X, metric) # Testing built-in metrics with extra args if metric == "seuclidean": V = np.var(X.astype(np.double), axis=0, ddof=1) self._check_calling_conventions(X, metric, V=V) elif metric == "mahalanobis": V = np.atleast_2d(np.cov(X.astype(np.double).T)) VI = np.array(np.linalg.inv(V).T) self._check_calling_conventions(X, metric, VI=VI) def test_pdist_dtype_equivalence(self): # Tests that the result is not affected by type up-casting eps = 1e-07 tests = [(eo['random-bool-data'], self.valid_upcasts['bool']), (eo['random-uint-data'], self.valid_upcasts['uint']), (eo['random-int-data'], self.valid_upcasts['int']), (eo['random-float32-data'], self.valid_upcasts['float32'])] for metric in _METRICS_NAMES: for test in tests: X1 = test[0][::5, ::2] try: y1 = pdist(X1, metric=metric) except Exception as e: e_cls = e.__class__ if verbose > 2: print(e_cls.__name__) print(e) for new_type in test[1]: X2 = new_type(X1) assert_raises(e_cls, pdist, X2, metric=metric) else: for new_type in test[1]: y2 = pdist(new_type(X1), metric=metric) _assert_within_tol(y1, y2, eps, verbose > 2) def test_pdist_out(self): # Test that out parameter works properly eps = 1e-07 X = eo['random-float32-data'][::5, ::2] out_size = int((X.shape[0] * (X.shape[0] - 1)) / 2) for metric in _METRICS_NAMES: kwargs = dict() if metric in ['minkowski', 'wminkowski']: kwargs['p'] = 1.23 if metric == 'wminkowski': kwargs['w'] = 1.0 / X.std(axis=0) out1 = np.empty(out_size, dtype=np.double) Y_right = pdist(X, metric, **kwargs) Y_test1 = pdist(X, metric, out=out1, **kwargs) # test that output is numerically equivalent _assert_within_tol(Y_test1, Y_right, eps) # test that Y_test1 and out1 are the same object assert_(Y_test1 is out1) # test for incorrect shape out2 = np.empty(out_size + 3, dtype=np.double) assert_raises(ValueError, pdist, X, metric, out=out2, **kwargs) # test for (C-)contiguous output out3 = np.empty(2 * out_size, dtype=np.double)[::2] assert_raises(ValueError, pdist, X, metric, out=out3, **kwargs) # test for incorrect dtype out5 = np.empty(out_size, dtype=np.int64) assert_raises(ValueError, pdist, X, metric, out=out5, **kwargs) def test_striding(self): # test that striding is handled correct with calls to # _copy_array_if_base_present eps = 1e-07 X = eo['random-float32-data'][::5, ::2] X_copy = X.copy() # confirm contiguity assert_(not X.flags.c_contiguous) assert_(X_copy.flags.c_contiguous) for metric in _METRICS_NAMES: kwargs = dict() if metric in ['minkowski', 'wminkowski']: kwargs['p'] = 1.23 if metric == 'wminkowski': kwargs['w'] = 1.0 / X.std(axis=0) Y1 = pdist(X, metric, **kwargs) Y2 = pdist(X_copy, metric, **kwargs) # test that output is numerically equivalent _assert_within_tol(Y1, Y2, eps, verbose > 2) class TestSomeDistanceFunctions(object): def setup_method(self): # 1D arrays x = np.array([1.0, 2.0, 3.0]) y = np.array([1.0, 1.0, 5.0]) # 3x1 arrays x31 = x[:, np.newaxis] y31 = y[:, np.newaxis] # 1x3 arrays x13 = x31.T y13 = y31.T self.cases = [(x, y), (x31, y31), (x13, y13)] def test_minkowski(self): with suppress_warnings() as w: w.filter(message="`wminkowski` is deprecated") for x, y in self.cases: dist1 = wminkowski(x, y, p=1) assert_almost_equal(dist1, 3.0) dist1p5 = wminkowski(x, y, p=1.5) assert_almost_equal(dist1p5, (1.0 + 2.0**1.5)**(2. / 3)) wminkowski(x, y, p=2) # Check that casting input to minimum scalar type doesn't affect result (issue #10262). # This could be extended to more test inputs with np.min_scalar_type(np.max(input_matrix)). a = np.array([352, 916]) b = np.array([350, 660]) assert_equal(minkowski(a, b), minkowski(a.astype('uint16'), b.astype('uint16'))) def test_old_wminkowski(self): with suppress_warnings() as wrn: wrn.filter(message="`wminkowski` is deprecated") w = np.array([1.0, 2.0, 0.5]) for x, y in self.cases: dist1 = old_wminkowski(x, y, p=1, w=w) assert_almost_equal(dist1, 3.0) dist1p5 = old_wminkowski(x, y, p=1.5, w=w) assert_almost_equal(dist1p5, (2.0**1.5+1.0)**(2./3)) dist2 = old_wminkowski(x, y, p=2, w=w) assert_almost_equal(dist2, np.sqrt(5)) # test weights Issue #7893 arr = np.arange(4) w = np.full_like(arr, 4) assert_almost_equal(old_wminkowski(arr, arr + 1, p=2, w=w), 8.0) assert_almost_equal(wminkowski(arr, arr + 1, p=2, w=w), 4.0) def test_euclidean(self): for x, y in self.cases: dist = weuclidean(x, y) assert_almost_equal(dist, np.sqrt(5)) def test_sqeuclidean(self): for x, y in self.cases: dist = wsqeuclidean(x, y) assert_almost_equal(dist, 5.0) def test_cosine(self): for x, y in self.cases: dist = wcosine(x, y) assert_almost_equal(dist, 1.0 - 18.0 / (np.sqrt(14) * np.sqrt(27))) def test_correlation(self): xm = np.array([-1.0, 0, 1.0]) ym = np.array([-4.0 / 3, -4.0 / 3, 5.0 - 7.0 / 3]) for x, y in self.cases: dist = wcorrelation(x, y) assert_almost_equal(dist, 1.0 - np.dot(xm, ym) / (norm(xm) * norm(ym))) def test_correlation_positive(self): # Regression test for gh-12320 (negative return value due to rounding x = np.array([0., 0., 0., 0., 0., 0., -2., 0., 0., 0., -2., -2., -2., 0., -2., 0., -2., 0., 0., -1., -2., 0., 1., 0., 0., -2., 0., 0., -2., 0., -2., -2., -2., -2., -2., -2., 0.]) y = np.array([1., 1., 1., 1., 1., 1., -1., 1., 1., 1., -1., -1., -1., 1., -1., 1., -1., 1., 1., 0., -1., 1., 2., 1., 1., -1., 1., 1., -1., 1., -1., -1., -1., -1., -1., -1., 1.]) dist = correlation(x, y) assert 0 <= dist <= 10 * np.finfo(np.float64).eps def test_mahalanobis(self): x = np.array([1.0, 2.0, 3.0]) y = np.array([1.0, 1.0, 5.0]) vi = np.array([[2.0, 1.0, 0.0], [1.0, 2.0, 1.0], [0.0, 1.0, 2.0]]) for x, y in self.cases: dist = mahalanobis(x, y, vi) assert_almost_equal(dist, np.sqrt(6.0)) class TestSquareForm(object): checked_dtypes = [np.float64, np.float32, np.int32, np.int8, bool] def test_squareform_matrix(self): for dtype in self.checked_dtypes: self.check_squareform_matrix(dtype) def test_squareform_vector(self): for dtype in self.checked_dtypes: self.check_squareform_vector(dtype) def check_squareform_matrix(self, dtype): A = np.zeros((0, 0), dtype=dtype) rA = squareform(A) assert_equal(rA.shape, (0,)) assert_equal(rA.dtype, dtype) A = np.zeros((1, 1), dtype=dtype) rA = squareform(A) assert_equal(rA.shape, (0,)) assert_equal(rA.dtype, dtype) A = np.array([[0, 4.2], [4.2, 0]], dtype=dtype) rA = squareform(A) assert_equal(rA.shape, (1,)) assert_equal(rA.dtype, dtype) assert_array_equal(rA, np.array([4.2], dtype=dtype)) def check_squareform_vector(self, dtype): v = np.zeros((0,), dtype=dtype) rv = squareform(v) assert_equal(rv.shape, (1, 1)) assert_equal(rv.dtype, dtype) assert_array_equal(rv, [[0]]) v = np.array([8.3], dtype=dtype) rv = squareform(v) assert_equal(rv.shape, (2, 2)) assert_equal(rv.dtype, dtype) assert_array_equal(rv, np.array([[0, 8.3], [8.3, 0]], dtype=dtype)) def test_squareform_multi_matrix(self): for n in range(2, 5): self.check_squareform_multi_matrix(n) def check_squareform_multi_matrix(self, n): X = np.random.rand(n, 4) Y = wpdist_no_const(X) assert_equal(len(Y.shape), 1) A = squareform(Y) Yr = squareform(A) s = A.shape k = 0 if verbose >= 3: print(A.shape, Y.shape, Yr.shape) assert_equal(len(s), 2) assert_equal(len(Yr.shape), 1) assert_equal(s[0], s[1]) for i in range(0, s[0]): for j in range(i + 1, s[1]): if i != j: assert_equal(A[i, j], Y[k]) k += 1 else: assert_equal(A[i, j], 0) class TestNumObsY(object): def test_num_obs_y_multi_matrix(self): for n in range(2, 10): X = np.random.rand(n, 4) Y = wpdist_no_const(X) assert_equal(num_obs_y(Y), n) def test_num_obs_y_1(self): # Tests num_obs_y(y) on a condensed distance matrix over 1 # observations. Expecting exception. assert_raises(ValueError, self.check_y, 1) def test_num_obs_y_2(self): # Tests num_obs_y(y) on a condensed distance matrix over 2 # observations. assert_(self.check_y(2)) def test_num_obs_y_3(self): assert_(self.check_y(3)) def test_num_obs_y_4(self): assert_(self.check_y(4)) def test_num_obs_y_5_10(self): for i in range(5, 16): self.minit(i) def test_num_obs_y_2_100(self): # Tests num_obs_y(y) on 100 improper condensed distance matrices. # Expecting exception. a = set([]) for n in range(2, 16): a.add(n * (n - 1) / 2) for i in range(5, 105): if i not in a: assert_raises(ValueError, self.bad_y, i) def minit(self, n): assert_(self.check_y(n)) def bad_y(self, n): y = np.random.rand(n) return num_obs_y(y) def check_y(self, n): return num_obs_y(self.make_y(n)) == n def make_y(self, n): return np.random.rand((n * (n - 1)) // 2) class TestNumObsDM(object): def test_num_obs_dm_multi_matrix(self): for n in range(1, 10): X = np.random.rand(n, 4) Y = wpdist_no_const(X) A = squareform(Y) if verbose >= 3: print(A.shape, Y.shape) assert_equal(num_obs_dm(A), n) def test_num_obs_dm_0(self): # Tests num_obs_dm(D) on a 0x0 distance matrix. Expecting exception. assert_(self.check_D(0)) def test_num_obs_dm_1(self): # Tests num_obs_dm(D) on a 1x1 distance matrix. assert_(self.check_D(1)) def test_num_obs_dm_2(self): assert_(self.check_D(2)) def test_num_obs_dm_3(self): assert_(self.check_D(2)) def test_num_obs_dm_4(self): assert_(self.check_D(4)) def check_D(self, n): return num_obs_dm(self.make_D(n)) == n def make_D(self, n): return np.random.rand(n, n) def is_valid_dm_throw(D): return is_valid_dm(D, throw=True) class TestIsValidDM(object): def test_is_valid_dm_improper_shape_1D_E(self): D = np.zeros((5,), dtype=np.double) assert_raises(ValueError, is_valid_dm_throw, (D)) def test_is_valid_dm_improper_shape_1D_F(self): D = np.zeros((5,), dtype=np.double) assert_equal(is_valid_dm(D), False) def test_is_valid_dm_improper_shape_3D_E(self): D = np.zeros((3, 3, 3), dtype=np.double) assert_raises(ValueError, is_valid_dm_throw, (D)) def test_is_valid_dm_improper_shape_3D_F(self): D = np.zeros((3, 3, 3), dtype=np.double) assert_equal(is_valid_dm(D), False) def test_is_valid_dm_nonzero_diagonal_E(self): y = np.random.rand(10) D = squareform(y) for i in range(0, 5): D[i, i] = 2.0 assert_raises(ValueError, is_valid_dm_throw, (D)) def test_is_valid_dm_nonzero_diagonal_F(self): y = np.random.rand(10) D = squareform(y) for i in range(0, 5): D[i, i] = 2.0 assert_equal(is_valid_dm(D), False) def test_is_valid_dm_asymmetric_E(self): y = np.random.rand(10) D = squareform(y) D[1, 3] = D[3, 1] + 1 assert_raises(ValueError, is_valid_dm_throw, (D)) def test_is_valid_dm_asymmetric_F(self): y = np.random.rand(10) D = squareform(y) D[1, 3] = D[3, 1] + 1 assert_equal(is_valid_dm(D), False) def test_is_valid_dm_correct_1_by_1(self): D = np.zeros((1, 1), dtype=np.double) assert_equal(is_valid_dm(D), True) def test_is_valid_dm_correct_2_by_2(self): y = np.random.rand(1) D = squareform(y) assert_equal(is_valid_dm(D), True) def test_is_valid_dm_correct_3_by_3(self): y = np.random.rand(3) D = squareform(y) assert_equal(is_valid_dm(D), True) def test_is_valid_dm_correct_4_by_4(self): y = np.random.rand(6) D = squareform(y) assert_equal(is_valid_dm(D), True) def test_is_valid_dm_correct_5_by_5(self): y = np.random.rand(10) D = squareform(y) assert_equal(is_valid_dm(D), True) def is_valid_y_throw(y): return is_valid_y(y, throw=True) class TestIsValidY(object): # If test case name ends on "_E" then an exception is expected for the # given input, if it ends in "_F" then False is expected for the is_valid_y # check. Otherwise the input is expected to be valid. def test_is_valid_y_improper_shape_2D_E(self): y = np.zeros((3, 3,), dtype=np.double) assert_raises(ValueError, is_valid_y_throw, (y)) def test_is_valid_y_improper_shape_2D_F(self): y = np.zeros((3, 3,), dtype=np.double) assert_equal(is_valid_y(y), False) def test_is_valid_y_improper_shape_3D_E(self): y = np.zeros((3, 3, 3), dtype=np.double) assert_raises(ValueError, is_valid_y_throw, (y)) def test_is_valid_y_improper_shape_3D_F(self): y = np.zeros((3, 3, 3), dtype=np.double) assert_equal(is_valid_y(y), False) def test_is_valid_y_correct_2_by_2(self): y = self.correct_n_by_n(2) assert_equal(is_valid_y(y), True) def test_is_valid_y_correct_3_by_3(self): y = self.correct_n_by_n(3) assert_equal(is_valid_y(y), True) def test_is_valid_y_correct_4_by_4(self): y = self.correct_n_by_n(4) assert_equal(is_valid_y(y), True) def test_is_valid_y_correct_5_by_5(self): y = self.correct_n_by_n(5) assert_equal(is_valid_y(y), True) def test_is_valid_y_2_100(self): a = set([]) for n in range(2, 16): a.add(n * (n - 1) / 2) for i in range(5, 105): if i not in a: assert_raises(ValueError, self.bad_y, i) def bad_y(self, n): y = np.random.rand(n) return is_valid_y(y, throw=True) def correct_n_by_n(self, n): y = np.random.rand((n * (n - 1)) // 2) return y def test_bad_p(): # Raise ValueError if p < 1. p = 0.5 with suppress_warnings() as w: w.filter(message="`wminkowski` is deprecated") assert_raises(ValueError, wminkowski, [1, 2], [3, 4], p) assert_raises(ValueError, wminkowski, [1, 2], [3, 4], p, [1, 1]) def test_sokalsneath_all_false(): # Regression test for ticket #876 assert_raises(ValueError, sokalsneath, [False, False, False], [False, False, False]) def test_canberra(): # Regression test for ticket #1430. assert_equal(wcanberra([1, 2, 3], [2, 4, 6]), 1) assert_equal(wcanberra([1, 1, 0, 0], [1, 0, 1, 0]), 2) def test_braycurtis(): # Regression test for ticket #1430. assert_almost_equal(wbraycurtis([1, 2, 3], [2, 4, 6]), 1. / 3, decimal=15) assert_almost_equal(wbraycurtis([1, 1, 0, 0], [1, 0, 1, 0]), 0.5, decimal=15) def test_euclideans(): # Regression test for ticket #1328. x1 = np.array([1, 1, 1]) x2 = np.array([0, 0, 0]) # Basic test of the calculation. assert_almost_equal(wsqeuclidean(x1, x2), 3.0, decimal=14) assert_almost_equal(weuclidean(x1, x2), np.sqrt(3), decimal=14) # Check flattening for (1, N) or (N, 1) inputs assert_almost_equal(weuclidean(x1[np.newaxis, :], x2[np.newaxis, :]), np.sqrt(3), decimal=14) assert_almost_equal(wsqeuclidean(x1[np.newaxis, :], x2[np.newaxis, :]), 3.0, decimal=14) assert_almost_equal(wsqeuclidean(x1[:, np.newaxis], x2[:, np.newaxis]), 3.0, decimal=14) # Distance metrics only defined for vectors (= 1-D) x = np.arange(4).reshape(2, 2) assert_raises(ValueError, weuclidean, x, x) assert_raises(ValueError, wsqeuclidean, x, x) # Another check, with random data. rs = np.random.RandomState(1234567890) x = rs.rand(10) y = rs.rand(10) d1 = weuclidean(x, y) d2 = wsqeuclidean(x, y) assert_almost_equal(d1**2, d2, decimal=14) def test_hamming_unequal_length(): # Regression test for gh-4290. x = [0, 0, 1] y = [1, 0, 1, 0] # Used to give an AttributeError from ndarray.mean called on bool assert_raises(ValueError, whamming, x, y) def test_hamming_string_array(): # https://github.com/scikit-learn/scikit-learn/issues/4014 a = np.array(['eggs', 'spam', 'spam', 'eggs', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'eggs', 'eggs', 'spam', 'eggs', 'eggs', 'eggs', 'eggs', 'eggs', 'spam'], dtype='|S4') b = np.array(['eggs', 'spam', 'spam', 'eggs', 'eggs', 'spam', 'spam', 'spam', 'spam', 'eggs', 'spam', 'eggs', 'spam', 'eggs', 'spam', 'spam', 'eggs', 'spam', 'spam', 'eggs'], dtype='|S4') desired = 0.45 assert_allclose(whamming(a, b), desired) def test_minkowski_w(): # Regression test for gh-8142. arr_in = np.array([[83.33333333, 100., 83.33333333, 100., 36., 60., 90., 150., 24., 48.], [83.33333333, 100., 83.33333333, 100., 36., 60., 90., 150., 24., 48.]]) p0 = pdist(arr_in, metric='minkowski', p=1, w=None) c0 = cdist(arr_in, arr_in, metric='minkowski', p=1, w=None) p1 = pdist(arr_in, metric='minkowski', p=1) c1 = cdist(arr_in, arr_in, metric='minkowski', p=1) assert_allclose(p0, p1, rtol=1e-15) assert_allclose(c0, c1, rtol=1e-15) def test_sqeuclidean_dtypes(): # Assert that sqeuclidean returns the right types of values. # Integer types should be converted to floating for stability. # Floating point types should be the same as the input. x = [1, 2, 3] y = [4, 5, 6] for dtype in [np.int8, np.int16, np.int32, np.int64]: d = wsqeuclidean(np.asarray(x, dtype=dtype), np.asarray(y, dtype=dtype)) assert_(np.issubdtype(d.dtype, np.floating)) for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]: d1 = wsqeuclidean([0], np.asarray([-1], dtype=dtype)) d2 = wsqeuclidean(np.asarray([-1], dtype=dtype), [0]) assert_equal(d1, d2) assert_equal(d1, np.float64(np.iinfo(dtype).max)**2) dtypes = [np.float32, np.float64, np.complex64, np.complex128] for dtype in ['float16', 'float128']: # These aren't present in older numpy versions; float128 may also not # be present on all platforms. if hasattr(np, dtype): dtypes.append(getattr(np, dtype)) for dtype in dtypes: d = wsqeuclidean(np.asarray(x, dtype=dtype), np.asarray(y, dtype=dtype)) assert_equal(d.dtype, dtype) def test_sokalmichener(): # Test that sokalmichener has the same result for bool and int inputs. p = [True, True, False] q = [True, False, True] x = [int(b) for b in p] y = [int(b) for b in q] dist1 = sokalmichener(p, q) dist2 = sokalmichener(x, y) # These should be exactly the same. assert_equal(dist1, dist2) def test_modifies_input(): # test whether cdist or pdist modifies input arrays X1 = np.asarray([[1., 2., 3.], [1.2, 2.3, 3.4], [2.2, 2.3, 4.4], [22.2, 23.3, 44.4]]) X1_copy = X1.copy() with suppress_warnings() as w: w.filter(message="`wminkowski` is deprecated") for metric in _METRICS_NAMES: kwargs = {"w": 1.0 / X1.std(axis=0)} if metric == "wminkowski" else {} cdist(X1, X1, metric, **kwargs) pdist(X1, metric, **kwargs) assert_array_equal(X1, X1_copy) def test_Xdist_deprecated_args(): # testing both cdist and pdist deprecated warnings X1 = np.asarray([[1., 2., 3.], [1.2, 2.3, 3.4], [2.2, 2.3, 4.4], [22.2, 23.3, 44.4]]) weights = np.arange(3) warn_msg_kwargs = "Got unexpected kwarg" warn_msg_args = "[0-9]* metric parameters have been passed as positional" for metric in _METRICS_NAMES: kwargs = {"w": weights} if metric == "wminkowski" else dict() with suppress_warnings() as w: log = w.record(message=warn_msg_args) w.filter(message=warn_msg_kwargs) w.filter(message="`wminkowski` is deprecated") cdist(X1, X1, metric, 2., **kwargs) pdist(X1, metric, 2., **kwargs) assert_(len(log) == 2) for arg in ["p", "V", "VI"]: kwargs = {arg:"foo"} if metric == "wminkowski": if "p" in kwargs or "w" in kwargs: continue kwargs["w"] = weights if((arg == "V" and metric == "seuclidean") or (arg == "VI" and metric == "mahalanobis") or (arg == "p" and metric == "minkowski")): continue with suppress_warnings() as w: log = w.record(message=warn_msg_kwargs) w.filter(message="`wminkowski` is deprecated") cdist(X1, X1, metric, **kwargs) pdist(X1, metric, **kwargs) assert_(len(log) == 2) def test_Xdist_non_negative_weights(): X = eo['random-float32-data'][::5, ::2] w = np.ones(X.shape[1]) w[::5] = -w[::5] for metric in _METRICS_NAMES: if metric in ['seuclidean', 'mahalanobis', 'jensenshannon']: continue for m in [metric, eval(metric), "test_" + metric]: assert_raises(ValueError, pdist, X, m, w=w) assert_raises(ValueError, cdist, X, X, m, w=w) def test__validate_vector(): x = [1, 2, 3] y = _validate_vector(x) assert_array_equal(y, x) y = _validate_vector(x, dtype=np.float64) assert_array_equal(y, x) assert_equal(y.dtype, np.float64) x = [1] y = _validate_vector(x) assert_equal(y.ndim, 1) assert_equal(y, x) x = 1 y = _validate_vector(x) assert_equal(y.ndim, 1) assert_equal(y, [x]) x = np.arange(5).reshape(1, -1, 1) y = _validate_vector(x) assert_equal(y.ndim, 1) assert_array_equal(y, x[0, :, 0]) x = [[1, 2], [3, 4]] assert_raises(ValueError, _validate_vector, x)