Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
BIN
venv/Lib/site-packages/sklearn/.libs/vcomp140.dll
Normal file
Binary file not shown.
46
venv/Lib/site-packages/sklearn/__check_build/__init__.py
Normal file
@@ -0,0 +1,46 @@
""" Module to give helpful messages to the user that did not
compile scikit-learn properly.
"""
import os

INPLACE_MSG = """
It appears that you are importing a local scikit-learn source tree. For
this, you need to have an inplace install. Maybe you are in the source
directory and you need to try from another location."""

STANDARD_MSG = """
If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform."""


def raise_build_error(e):
    # Raise a comprehensible error and list the contents of the
    # directory to help debugging on the mailing list.
    local_dir = os.path.split(__file__)[0]
    msg = STANDARD_MSG
    if local_dir == "sklearn/__check_build":
        # Picking up the local install: this will work only if the
        # install is an 'inplace build'
        msg = INPLACE_MSG
    dir_content = list()
    for i, filename in enumerate(os.listdir(local_dir)):
        if ((i + 1) % 3):
            dir_content.append(filename.ljust(26))
        else:
            dir_content.append(filename + '\n')
    raise ImportError("""%s
___________________________________________________________________________
Contents of %s:
%s
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.

If you have installed scikit-learn from source, please do not forget
to build the package before using it: run `python setup.py install` or
`make` in the source directory.
%s""" % (e, local_dir, ''.join(dir_content).strip(), msg))


try:
    from ._check_build import check_build  # noqa
except ImportError as e:
    raise_build_error(e)
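For illustration only (not part of the uploaded files), a minimal sketch of the message this guard assembles; the ImportError passed in below is a stand-in for a missing compiled _check_build extension:

# Illustrative sketch, assuming an installed scikit-learn.
import sklearn.__check_build as check_build_pkg

try:
    check_build_pkg.raise_build_error(ImportError("DLL load failed"))
except ImportError as exc:
    # The message embeds the original error, a listing of the package
    # directory, and either INPLACE_MSG or STANDARD_MSG.
    print(exc)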
18
venv/Lib/site-packages/sklearn/__check_build/setup.py
Normal file
@@ -0,0 +1,18 @@
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
# License: BSD 3 clause

import numpy


def configuration(parent_package='', top_path=None):
    from numpy.distutils.misc_util import Configuration
    config = Configuration('__check_build', parent_package, top_path)
    config.add_extension('_check_build',
                         sources=['_check_build.pyx'],
                         include_dirs=[numpy.get_include()])

    return config

if __name__ == '__main__':
    from numpy.distutils.core import setup
    setup(**configuration(top_path='').todict())
111
venv/Lib/site-packages/sklearn/__init__.py
Normal file
@@ -0,0 +1,111 @@
"""
Machine learning module for Python
==================================

sklearn is a Python module integrating classical machine
learning algorithms in the tightly-knit world of scientific Python
packages (numpy, scipy, matplotlib).

It aims to provide simple and efficient solutions to learning problems
that are accessible to everybody and reusable in various contexts:
machine-learning as a versatile tool for science and engineering.

See http://scikit-learn.org for complete documentation.
"""
import sys
import logging
import os

from ._config import get_config, set_config, config_context

logger = logging.getLogger(__name__)


# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
#   X.Y
#   X.Y.Z   # For bugfix releases
#
# Admissible pre-release markers:
#   X.YaN   # Alpha release
#   X.YbN   # Beta release
#   X.YrcN  # Release Candidate
#   X.Y     # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = '0.23.2'


# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performances since we manually
# take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

try:
    # This variable is injected in the __builtins__ by the build
    # process. It is used to enable importing subpackages of sklearn when
    # the binaries are not built
    # mypy error: Cannot determine type of '__SKLEARN_SETUP__'
    __SKLEARN_SETUP__  # type: ignore
except NameError:
    __SKLEARN_SETUP__ = False

if __SKLEARN_SETUP__:
    sys.stderr.write('Partial import of sklearn during the build process.\n')
    # We are not importing the rest of scikit-learn during the build
    # process, as it may not be compiled yet
else:
    # `_distributor_init` allows distributors to run custom init code.
    # For instance, for the Windows wheel, this is used to pre-load the
    # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
    # sub-folder.
    # It is necessary to do this prior to importing show_versions as the
    # latter is linked to the OpenMP runtime to make it possible to introspect
    # it and importing it first would fail if the OpenMP dll cannot be found.
    from . import _distributor_init  # noqa: F401
    from . import __check_build  # noqa: F401
    from .base import clone
    from .utils._show_versions import show_versions

    __all__ = ['calibration', 'cluster', 'covariance', 'cross_decomposition',
               'datasets', 'decomposition', 'dummy', 'ensemble', 'exceptions',
               'experimental', 'externals', 'feature_extraction',
               'feature_selection', 'gaussian_process', 'inspection',
               'isotonic', 'kernel_approximation', 'kernel_ridge',
               'linear_model', 'manifold', 'metrics', 'mixture',
               'model_selection', 'multiclass', 'multioutput',
               'naive_bayes', 'neighbors', 'neural_network', 'pipeline',
               'preprocessing', 'random_projection', 'semi_supervised',
               'svm', 'tree', 'discriminant_analysis', 'impute', 'compose',
               # Non-modules:
               'clone', 'get_config', 'set_config', 'config_context',
               'show_versions']


def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs"""
    import os
    import numpy as np
    import random

    # Check if a random seed exists in the environment, if not create one.
    _random_seed = os.environ.get('SKLEARN_SEED', None)
    if _random_seed is None:
        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
    _random_seed = int(_random_seed)
    print("I: Seeding RNGs with %r" % _random_seed)
    np.random.seed(_random_seed)
    random.seed(_random_seed)
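As a short sketch (illustrative only, assuming scikit-learn and NumPy are installed), the SKLEARN_SEED environment variable read by setup_module above can be pinned to make a test run reproducible:

# Illustrative sketch: the fixture seeds numpy and random from SKLEARN_SEED.
import os
import numpy as np
import sklearn

os.environ["SKLEARN_SEED"] = "42"
sklearn.setup_module(None)            # the module argument is unused here
print(np.random.randint(0, 100, 3))   # identical output on every run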
BIN
venv/Lib/site-packages/sklearn/__pycache__/base.cpython-36.pyc
Normal file
Binary file not shown.
BIN
venv/Lib/site-packages/sklearn/__pycache__/dummy.cpython-36.pyc
Normal file
Binary file not shown.
BIN
venv/Lib/site-packages/sklearn/__pycache__/setup.cpython-36.pyc
Normal file
Binary file not shown.
101
venv/Lib/site-packages/sklearn/_build_utils/__init__.py
Normal file
@@ -0,0 +1,101 @@
"""
Utilities useful during the build.
"""
# author: Andy Mueller, Gael Varoquaux
# license: BSD


import os
import sklearn
import contextlib

from distutils.version import LooseVersion

from .pre_build_helpers import basic_check_build
from .openmp_helpers import check_openmp_support


DEFAULT_ROOT = 'sklearn'

# The following places need to be in sync with regard to Cython version:
# - .circleci config file
# - sklearn/_build_utils/__init__.py
# - advanced installation guide
CYTHON_MIN_VERSION = '0.28.5'


def _check_cython_version():
    message = ('Please install Cython with a version >= {0} in order '
               'to build a scikit-learn from source.').format(
                   CYTHON_MIN_VERSION)
    try:
        import Cython
    except ModuleNotFoundError:
        # Re-raise with more informative error message instead:
        raise ModuleNotFoundError(message)

    if LooseVersion(Cython.__version__) < CYTHON_MIN_VERSION:
        message += (' The current version of Cython is {} installed in {}.'
                    .format(Cython.__version__, Cython.__path__))
        raise ValueError(message)


def cythonize_extensions(top_path, config):
    """Check that a recent Cython is available and cythonize extensions"""
    _check_cython_version()
    from Cython.Build import cythonize

    # Fast fail before cythonization if compiler fails compiling basic test
    # code even without OpenMP
    basic_check_build()

    # check simple compilation with OpenMP. If it fails scikit-learn will be
    # built without OpenMP and the test test_openmp_supported in the test suite
    # will fail.
    # `check_openmp_support` compiles a small test program to see if the
    # compilers are properly configured to build with OpenMP. This is expensive
    # and we only want to call this function once.
    # The result of this check is cached as a private attribute on the sklearn
    # module (only at build-time) to be used twice:
    # - First to set the value of SKLEARN_OPENMP_PARALLELISM_ENABLED, the
    #   cython build-time variable passed to the cythonize() call.
    # - Then in the build_ext subclass defined in the top-level setup.py file
    #   to actually build the compiled extensions with OpenMP flags if needed.
    sklearn._OPENMP_SUPPORTED = check_openmp_support()

    n_jobs = 1
    with contextlib.suppress(ImportError):
        import joblib
        if LooseVersion(joblib.__version__) > LooseVersion("0.13.0"):
            # earlier joblib versions don't account for CPU affinity
            # constraints, and may over-estimate the number of available
            # CPU particularly in CI (cf loky#114)
            n_jobs = joblib.cpu_count()

    config.ext_modules = cythonize(
        config.ext_modules,
        nthreads=n_jobs,
        compile_time_env={
            'SKLEARN_OPENMP_PARALLELISM_ENABLED': sklearn._OPENMP_SUPPORTED},
        compiler_directives={'language_level': 3})


def gen_from_templates(templates, top_path):
    """Generate cython files from a list of templates"""
    # Lazy import because cython is not a runtime dependency.
    from Cython import Tempita

    for template in templates:
        outfile = template.replace('.tp', '')

        # if the template is not updated, no need to output the cython file
        if not (os.path.exists(outfile) and
                os.stat(template).st_mtime < os.stat(outfile).st_mtime):

            with open(template, "r") as f:
                tmpl = f.read()

            tmpl_ = Tempita.sub(tmpl)

            with open(outfile, "w") as f:
                f.write(tmpl_)
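A hedged sketch of driving gen_from_templates the way the sub-package setup.py files do; the template file name and its contents below are made up for the demo, and Cython must be installed since Tempita is imported lazily:

# Illustrative sketch only.
import pathlib
from sklearn._build_utils import gen_from_templates

template = pathlib.Path("_example.pyx.tp")   # hypothetical template file
template.write_text("{{for T in ['float', 'double']}}\n"
                    "# generated code for {{T}}\n"
                    "{{endfor}}\n")

gen_from_templates([str(template)], top_path='')
print(pathlib.Path("_example.pyx").read_text())
# The .pyx output is rewritten only when the .tp file is newer, mirroring
# the mtime check in gen_from_templates.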
324
venv/Lib/site-packages/sklearn/_build_utils/deprecated_modules.py
Normal file
|
@ -0,0 +1,324 @@
|
|||
"""Generates submodule to allow deprecation of submodules and keeping git
|
||||
blame."""
|
||||
from pathlib import Path
|
||||
from contextlib import suppress
|
||||
|
||||
# TODO: Remove the whole file in 0.24
|
||||
|
||||
# This is a set of 4-tuples consisting of
|
||||
# (new_module_name, deprecated_path, correct_import_path, importee)
|
||||
# importee is used by test_import_deprecations to check for DeprecationWarnings
|
||||
_DEPRECATED_MODULES = [
|
||||
('_mocking', 'sklearn.utils.mocking', 'sklearn.utils',
|
||||
'MockDataFrame'),
|
||||
|
||||
('_bagging', 'sklearn.ensemble.bagging', 'sklearn.ensemble',
|
||||
'BaggingClassifier'),
|
||||
('_base', 'sklearn.ensemble.base', 'sklearn.ensemble',
|
||||
'BaseEnsemble'),
|
||||
('_forest', 'sklearn.ensemble.forest', 'sklearn.ensemble',
|
||||
'RandomForestClassifier'),
|
||||
('_gb', 'sklearn.ensemble.gradient_boosting', 'sklearn.ensemble',
|
||||
'GradientBoostingClassifier'),
|
||||
('_iforest', 'sklearn.ensemble.iforest', 'sklearn.ensemble',
|
||||
'IsolationForest'),
|
||||
('_voting', 'sklearn.ensemble.voting', 'sklearn.ensemble',
|
||||
'VotingClassifier'),
|
||||
('_weight_boosting', 'sklearn.ensemble.weight_boosting',
|
||||
'sklearn.ensemble', 'AdaBoostClassifier'),
|
||||
('_classes', 'sklearn.tree.tree', 'sklearn.tree',
|
||||
'DecisionTreeClassifier'),
|
||||
('_export', 'sklearn.tree.export', 'sklearn.tree', 'export_graphviz'),
|
||||
|
||||
('_rbm', 'sklearn.neural_network.rbm', 'sklearn.neural_network',
|
||||
'BernoulliRBM'),
|
||||
('_multilayer_perceptron', 'sklearn.neural_network.multilayer_perceptron',
|
||||
'sklearn.neural_network', 'MLPClassifier'),
|
||||
|
||||
('_weight_vector', 'sklearn.utils.weight_vector', 'sklearn.utils',
|
||||
'WeightVector'),
|
||||
('_seq_dataset', 'sklearn.utils.seq_dataset', 'sklearn.utils',
|
||||
'ArrayDataset32'),
|
||||
('_fast_dict', 'sklearn.utils.fast_dict', 'sklearn.utils', 'IntFloatDict'),
|
||||
|
||||
('_affinity_propagation', 'sklearn.cluster.affinity_propagation_',
|
||||
'sklearn.cluster', 'AffinityPropagation'),
|
||||
('_bicluster', 'sklearn.cluster.bicluster', 'sklearn.cluster',
|
||||
'SpectralBiclustering'),
|
||||
('_birch', 'sklearn.cluster.birch', 'sklearn.cluster', 'Birch'),
|
||||
('_dbscan', 'sklearn.cluster.dbscan_', 'sklearn.cluster', 'DBSCAN'),
|
||||
('_agglomerative', 'sklearn.cluster.hierarchical', 'sklearn.cluster',
|
||||
'FeatureAgglomeration'),
|
||||
('_kmeans', 'sklearn.cluster.k_means_', 'sklearn.cluster', 'KMeans'),
|
||||
('_mean_shift', 'sklearn.cluster.mean_shift_', 'sklearn.cluster',
|
||||
'MeanShift'),
|
||||
('_optics', 'sklearn.cluster.optics_', 'sklearn.cluster', 'OPTICS'),
|
||||
('_spectral', 'sklearn.cluster.spectral', 'sklearn.cluster',
|
||||
'SpectralClustering'),
|
||||
|
||||
('_base', 'sklearn.mixture.base', 'sklearn.mixture', 'BaseMixture'),
|
||||
('_gaussian_mixture', 'sklearn.mixture.gaussian_mixture',
|
||||
'sklearn.mixture', 'GaussianMixture'),
|
||||
('_bayesian_mixture', 'sklearn.mixture.bayesian_mixture',
|
||||
'sklearn.mixture', 'BayesianGaussianMixture'),
|
||||
|
||||
('_empirical_covariance', 'sklearn.covariance.empirical_covariance_',
|
||||
'sklearn.covariance', 'EmpiricalCovariance'),
|
||||
('_shrunk_covariance', 'sklearn.covariance.shrunk_covariance_',
|
||||
'sklearn.covariance', 'ShrunkCovariance'),
|
||||
('_robust_covariance', 'sklearn.covariance.robust_covariance',
|
||||
'sklearn.covariance', 'MinCovDet'),
|
||||
('_graph_lasso', 'sklearn.covariance.graph_lasso_',
|
||||
'sklearn.covariance', 'GraphicalLasso'),
|
||||
('_elliptic_envelope', 'sklearn.covariance.elliptic_envelope',
|
||||
'sklearn.covariance', 'EllipticEnvelope'),
|
||||
|
||||
('_cca', 'sklearn.cross_decomposition.cca_',
|
||||
'sklearn.cross_decomposition', 'CCA'),
|
||||
('_pls', 'sklearn.cross_decomposition.pls_',
|
||||
'sklearn.cross_decomposition', 'PLSSVD'),
|
||||
|
||||
('_base', 'sklearn.svm.base', 'sklearn.svm', 'BaseLibSVM'),
|
||||
('_bounds', 'sklearn.svm.bounds', 'sklearn.svm', 'l1_min_c'),
|
||||
('_classes', 'sklearn.svm.classes', 'sklearn.svm', 'SVR'),
|
||||
('_libsvm', 'sklearn.svm.libsvm', 'sklearn.svm', 'fit'),
|
||||
('_libsvm_sparse', 'sklearn.svm.libsvm_sparse', 'sklearn.svm',
|
||||
'set_verbosity_wrap'),
|
||||
('_liblinear', 'sklearn.svm.liblinear', 'sklearn.svm', 'train_wrap'),
|
||||
|
||||
('_base', 'sklearn.decomposition.base', 'sklearn.decomposition',
|
||||
'BaseEstimator'),
|
||||
('_dict_learning', 'sklearn.decomposition.dict_learning',
|
||||
'sklearn.decomposition', 'MiniBatchDictionaryLearning'),
|
||||
('_cdnmf_fast', 'sklearn.decomposition.cdnmf_fast',
|
||||
'sklearn.decomposition', '__dict__'),
|
||||
('_factor_analysis', 'sklearn.decomposition.factor_analysis',
|
||||
'sklearn.decomposition', 'FactorAnalysis'),
|
||||
('_fastica', 'sklearn.decomposition.fastica_', 'sklearn.decomposition',
|
||||
'FastICA'),
|
||||
('_incremental_pca', 'sklearn.decomposition.incremental_pca',
|
||||
'sklearn.decomposition', 'IncrementalPCA'),
|
||||
('_kernel_pca', 'sklearn.decomposition.kernel_pca',
|
||||
'sklearn.decomposition', 'KernelPCA'),
|
||||
('_nmf', 'sklearn.decomposition.nmf', 'sklearn.decomposition', 'NMF'),
|
||||
('_lda', 'sklearn.decomposition.online_lda',
|
||||
'sklearn.decomposition', 'LatentDirichletAllocation'),
|
||||
('_online_lda_fast', 'sklearn.decomposition.online_lda_fast',
|
||||
'sklearn.decomposition', 'mean_change'),
|
||||
('_pca', 'sklearn.decomposition.pca', 'sklearn.decomposition', 'PCA'),
|
||||
('_sparse_pca', 'sklearn.decomposition.sparse_pca',
|
||||
'sklearn.decomposition', 'SparsePCA'),
|
||||
('_truncated_svd', 'sklearn.decomposition.truncated_svd',
|
||||
'sklearn.decomposition', 'TruncatedSVD'),
|
||||
|
||||
('_gpr', 'sklearn.gaussian_process.gpr', 'sklearn.gaussian_process',
|
||||
'GaussianProcessRegressor'),
|
||||
('_gpc', 'sklearn.gaussian_process.gpc', 'sklearn.gaussian_process',
|
||||
'GaussianProcessClassifier'),
|
||||
|
||||
('_base', 'sklearn.datasets.base', 'sklearn.datasets', 'get_data_home'),
|
||||
('_california_housing', 'sklearn.datasets.california_housing',
|
||||
'sklearn.datasets', 'fetch_california_housing'),
|
||||
('_covtype', 'sklearn.datasets.covtype', 'sklearn.datasets',
|
||||
'fetch_covtype'),
|
||||
('_kddcup99', 'sklearn.datasets.kddcup99', 'sklearn.datasets',
|
||||
'fetch_kddcup99'),
|
||||
('_lfw', 'sklearn.datasets.lfw', 'sklearn.datasets',
|
||||
'fetch_lfw_people'),
|
||||
('_olivetti_faces', 'sklearn.datasets.olivetti_faces', 'sklearn.datasets',
|
||||
'fetch_olivetti_faces'),
|
||||
('_openml', 'sklearn.datasets.openml', 'sklearn.datasets', 'fetch_openml'),
|
||||
('_rcv1', 'sklearn.datasets.rcv1', 'sklearn.datasets', 'fetch_rcv1'),
|
||||
('_samples_generator', 'sklearn.datasets.samples_generator',
|
||||
'sklearn.datasets', 'make_classification'),
|
||||
('_species_distributions', 'sklearn.datasets.species_distributions',
|
||||
'sklearn.datasets', 'fetch_species_distributions'),
|
||||
('_svmlight_format_io', 'sklearn.datasets.svmlight_format',
|
||||
'sklearn.datasets', 'load_svmlight_file'),
|
||||
('_twenty_newsgroups', 'sklearn.datasets.twenty_newsgroups',
|
||||
'sklearn.datasets', 'strip_newsgroup_header'),
|
||||
|
||||
('_dict_vectorizer', 'sklearn.feature_extraction.dict_vectorizer',
|
||||
'sklearn.feature_extraction', 'DictVectorizer'),
|
||||
('_hash', 'sklearn.feature_extraction.hashing',
|
||||
'sklearn.feature_extraction', 'FeatureHasher'),
|
||||
('_stop_words', 'sklearn.feature_extraction.stop_words',
|
||||
'sklearn.feature_extraction.text', 'ENGLISH_STOP_WORDS'),
|
||||
|
||||
('_base', 'sklearn.linear_model.base', 'sklearn.linear_model',
|
||||
'LinearRegression'),
|
||||
('_cd_fast', 'sklearn.linear_model.cd_fast', 'sklearn.linear_model',
|
||||
'sparse_enet_coordinate_descent'),
|
||||
('_bayes', 'sklearn.linear_model.bayes', 'sklearn.linear_model',
|
||||
'BayesianRidge'),
|
||||
('_coordinate_descent', 'sklearn.linear_model.coordinate_descent',
|
||||
'sklearn.linear_model', 'Lasso'),
|
||||
('_huber', 'sklearn.linear_model.huber', 'sklearn.linear_model',
|
||||
'HuberRegressor'),
|
||||
('_least_angle', 'sklearn.linear_model.least_angle',
|
||||
'sklearn.linear_model', 'LassoLarsCV'),
|
||||
('_logistic', 'sklearn.linear_model.logistic', 'sklearn.linear_model',
|
||||
'LogisticRegression'),
|
||||
('_omp', 'sklearn.linear_model.omp', 'sklearn.linear_model',
|
||||
'OrthogonalMatchingPursuit'),
|
||||
('_passive_aggressive', 'sklearn.linear_model.passive_aggressive',
|
||||
'sklearn.linear_model', 'PassiveAggressiveClassifier'),
|
||||
('_perceptron', 'sklearn.linear_model.perceptron', 'sklearn.linear_model',
|
||||
'Perceptron'),
|
||||
('_ransac', 'sklearn.linear_model.ransac', 'sklearn.linear_model',
|
||||
'RANSACRegressor'),
|
||||
('_ridge', 'sklearn.linear_model.ridge', 'sklearn.linear_model',
|
||||
'Ridge'),
|
||||
('_sag', 'sklearn.linear_model.sag', 'sklearn.linear_model',
|
||||
'get_auto_step_size'),
|
||||
('_sag_fast', 'sklearn.linear_model.sag_fast', 'sklearn.linear_model',
|
||||
'MultinomialLogLoss64'),
|
||||
('_sgd_fast', 'sklearn.linear_model.sgd_fast', 'sklearn.linear_model',
|
||||
'Hinge'),
|
||||
('_stochastic_gradient', 'sklearn.linear_model.stochastic_gradient',
|
||||
'sklearn.linear_model', 'SGDClassifier'),
|
||||
('_theil_sen', 'sklearn.linear_model.theil_sen', 'sklearn.linear_model',
|
||||
'TheilSenRegressor'),
|
||||
|
||||
('_bicluster', 'sklearn.metrics.cluster.bicluster',
|
||||
'sklearn.metrics.cluster', 'consensus_score'),
|
||||
('_supervised', 'sklearn.metrics.cluster.supervised',
|
||||
'sklearn.metrics.cluster', 'entropy'),
|
||||
('_unsupervised', 'sklearn.metrics.cluster.unsupervised',
|
||||
'sklearn.metrics.cluster', 'silhouette_score'),
|
||||
('_expected_mutual_info_fast',
|
||||
'sklearn.metrics.cluster.expected_mutual_info_fast',
|
||||
'sklearn.metrics.cluster', 'expected_mutual_information'),
|
||||
|
||||
('_base', 'sklearn.metrics.base', 'sklearn.metrics', 'combinations'),
|
||||
('_classification', 'sklearn.metrics.classification', 'sklearn.metrics',
|
||||
'accuracy_score'),
|
||||
('_regression', 'sklearn.metrics.regression', 'sklearn.metrics',
|
||||
'max_error'),
|
||||
('_ranking', 'sklearn.metrics.ranking', 'sklearn.metrics', 'roc_curve'),
|
||||
('_pairwise_fast', 'sklearn.metrics.pairwise_fast', 'sklearn.metrics',
|
||||
'np'),
|
||||
('_scorer', 'sklearn.metrics.scorer', 'sklearn.metrics', 'get_scorer'),
|
||||
|
||||
('_partial_dependence', 'sklearn.inspection.partial_dependence',
|
||||
'sklearn.inspection', 'partial_dependence'),
|
||||
|
||||
('_ball_tree', 'sklearn.neighbors.ball_tree', 'sklearn.neighbors',
|
||||
'BallTree'),
|
||||
('_base', 'sklearn.neighbors.base', 'sklearn.neighbors',
|
||||
'VALID_METRICS'),
|
||||
('_classification', 'sklearn.neighbors.classification',
|
||||
'sklearn.neighbors', 'KNeighborsClassifier'),
|
||||
('_dist_metrics', 'sklearn.neighbors.dist_metrics', 'sklearn.neighbors',
|
||||
'DistanceMetric'),
|
||||
('_graph', 'sklearn.neighbors.graph', 'sklearn.neighbors',
|
||||
'KNeighborsTransformer'),
|
||||
('_kd_tree', 'sklearn.neighbors.kd_tree', 'sklearn.neighbors',
|
||||
'KDTree'),
|
||||
('_kde', 'sklearn.neighbors.kde', 'sklearn.neighbors',
|
||||
'KernelDensity'),
|
||||
('_lof', 'sklearn.neighbors.lof', 'sklearn.neighbors',
|
||||
'LocalOutlierFactor'),
|
||||
('_nca', 'sklearn.neighbors.nca', 'sklearn.neighbors',
|
||||
'NeighborhoodComponentsAnalysis'),
|
||||
('_nearest_centroid', 'sklearn.neighbors.nearest_centroid',
|
||||
'sklearn.neighbors', 'NearestCentroid'),
|
||||
('_quad_tree', 'sklearn.neighbors.quad_tree', 'sklearn.neighbors',
|
||||
'CELL_DTYPE'),
|
||||
('_regression', 'sklearn.neighbors.regression', 'sklearn.neighbors',
|
||||
'KNeighborsRegressor'),
|
||||
('_typedefs', 'sklearn.neighbors.typedefs', 'sklearn.neighbors',
|
||||
'DTYPE'),
|
||||
('_unsupervised', 'sklearn.neighbors.unsupervised', 'sklearn.neighbors',
|
||||
'NearestNeighbors'),
|
||||
|
||||
('_isomap', 'sklearn.manifold.isomap', 'sklearn.manifold', 'Isomap'),
|
||||
('_locally_linear', 'sklearn.manifold.locally_linear', 'sklearn.manifold',
|
||||
'LocallyLinearEmbedding'),
|
||||
('_mds', 'sklearn.manifold.mds', 'sklearn.manifold', 'MDS'),
|
||||
('_spectral_embedding', 'sklearn.manifold.spectral_embedding_',
|
||||
'sklearn.manifold', 'SpectralEmbedding'),
|
||||
('_t_sne', 'sklearn.manifold.t_sne', 'sklearn.manifold', 'TSNE'),
|
||||
|
||||
('_label_propagation', 'sklearn.semi_supervised.label_propagation',
|
||||
'sklearn.semi_supervised', 'LabelPropagation'),
|
||||
|
||||
('_data', 'sklearn.preprocessing.data', 'sklearn.preprocessing',
|
||||
'Binarizer'),
|
||||
('_label', 'sklearn.preprocessing.label', 'sklearn.preprocessing',
|
||||
'LabelEncoder'),
|
||||
|
||||
('_base', 'sklearn.feature_selection.base', 'sklearn.feature_selection',
|
||||
'SelectorMixin'),
|
||||
('_from_model', 'sklearn.feature_selection.from_model',
|
||||
'sklearn.feature_selection', 'SelectFromModel'),
|
||||
('_mutual_info', 'sklearn.feature_selection.mutual_info',
|
||||
'sklearn.feature_selection', 'mutual_info_regression'),
|
||||
('_rfe', 'sklearn.feature_selection.rfe',
|
||||
'sklearn.feature_selection.rfe', 'RFE'),
|
||||
('_univariate_selection',
|
||||
'sklearn.feature_selection.univariate_selection',
|
||||
'sklearn.feature_selection', 'chi2'),
|
||||
('_variance_threshold',
|
||||
'sklearn.feature_selection.variance_threshold',
|
||||
'sklearn.feature_selection', 'VarianceThreshold'),
|
||||
|
||||
('_testing', 'sklearn.utils.testing', 'sklearn.utils',
|
||||
'all_estimators'),
|
||||
]
|
||||
|
||||
|
||||
_FILE_CONTENT_TEMPLATE = """
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import {new_module_name} # type: ignore
|
||||
from {relative_dots}externals._pep562 import Pep562
|
||||
from {relative_dots}utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = '{deprecated_path}'
|
||||
correct_import_path = '{correct_import_path}'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr({new_module_name}, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
||||
"""
|
||||
|
||||
|
||||
def _get_deprecated_path(deprecated_path):
|
||||
deprecated_parts = deprecated_path.split(".")
|
||||
deprecated_parts[-1] = deprecated_parts[-1] + ".py"
|
||||
return Path(*deprecated_parts)
|
||||
|
||||
|
||||
def _create_deprecated_modules_files():
|
||||
"""Add submodules that will be deprecated. A file is created based
|
||||
on the deprecated submodule's name. When this submodule is imported a
|
||||
deprecation warning will be raised.
|
||||
"""
|
||||
for (new_module_name, deprecated_path,
|
||||
correct_import_path, _) in _DEPRECATED_MODULES:
|
||||
relative_dots = deprecated_path.count(".") * "."
|
||||
deprecated_content = _FILE_CONTENT_TEMPLATE.format(
|
||||
new_module_name=new_module_name,
|
||||
relative_dots=relative_dots,
|
||||
deprecated_path=deprecated_path,
|
||||
correct_import_path=correct_import_path)
|
||||
|
||||
with _get_deprecated_path(deprecated_path).open('w') as f:
|
||||
f.write(deprecated_content)
|
||||
|
||||
|
||||
def _clean_deprecated_modules_files():
|
||||
"""Removes submodules created by _create_deprecated_modules_files."""
|
||||
for _, deprecated_path, _, _ in _DEPRECATED_MODULES:
|
||||
with suppress(FileNotFoundError):
|
||||
_get_deprecated_path(deprecated_path).unlink()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
_clean_deprecated_modules_files()
|
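To make the template concrete, this is the shim that _create_deprecated_modules_files would write to sklearn/utils/mocking.py for the first _DEPRECATED_MODULES entry, i.e. _FILE_CONTENT_TEMPLATE rendered with new_module_name='_mocking', relative_dots='..', deprecated_path='sklearn.utils.mocking' and correct_import_path='sklearn.utils' (shown here for illustration only):

# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _mocking  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.utils.mocking'
correct_import_path = 'sklearn.utils'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)

def __getattr__(name):
    return getattr(_mocking, name)

if not sys.version_info >= (3, 7):
    Pep562(__name__)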
114
venv/Lib/site-packages/sklearn/_build_utils/openmp_helpers.py
Normal file
@@ -0,0 +1,114 @@
"""Helpers for OpenMP support during the build."""

# This code is adapted for a large part from the astropy openmp helpers, which
# can be found at: https://github.com/astropy/astropy-helpers/blob/master/astropy_helpers/openmp_helpers.py  # noqa


import os
import sys
import textwrap
import warnings
import subprocess

from distutils.errors import CompileError, LinkError

from .pre_build_helpers import compile_test_program


def get_openmp_flag(compiler):
    if hasattr(compiler, 'compiler'):
        compiler = compiler.compiler[0]
    else:
        compiler = compiler.__class__.__name__

    if sys.platform == "win32" and ('icc' in compiler or 'icl' in compiler):
        return ['/Qopenmp']
    elif sys.platform == "win32":
        return ['/openmp']
    elif sys.platform == "darwin" and ('icc' in compiler or 'icl' in compiler):
        return ['-openmp']
    elif sys.platform == "darwin" and 'openmp' in os.getenv('CPPFLAGS', ''):
        # -fopenmp can't be passed as compile flag when using Apple-clang.
        # OpenMP support has to be enabled during preprocessing.
        #
        # For example, our macOS wheel build jobs use the following environment
        # variables to build with Apple-clang and the brew installed "libomp":
        #
        # export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
        # export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
        # export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
        # export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib
        #                          -L/usr/local/opt/libomp/lib -lomp"
        return []
    # Default flag for GCC and clang:
    return ['-fopenmp']


def check_openmp_support():
    """Check whether OpenMP test code can be compiled and run"""
    code = textwrap.dedent(
        """\
        #include <omp.h>
        #include <stdio.h>
        int main(void) {
        #pragma omp parallel
        printf("nthreads=%d\\n", omp_get_num_threads());
        return 0;
        }
        """)

    extra_preargs = os.getenv('LDFLAGS', None)
    if extra_preargs is not None:
        extra_preargs = extra_preargs.strip().split(" ")
        extra_preargs = [
            flag for flag in extra_preargs
            if flag.startswith(('-L', '-Wl,-rpath', '-l'))]

    extra_postargs = get_openmp_flag

    try:
        output = compile_test_program(code,
                                      extra_preargs=extra_preargs,
                                      extra_postargs=extra_postargs)

        if 'nthreads=' in output[0]:
            nthreads = int(output[0].strip().split('=')[1])
            openmp_supported = len(output) == nthreads
        else:
            openmp_supported = False

    except (CompileError, LinkError, subprocess.CalledProcessError):
        openmp_supported = False

    if not openmp_supported:
        if os.getenv("SKLEARN_FAIL_NO_OPENMP"):
            raise CompileError("Failed to build with OpenMP")
        else:
            message = textwrap.dedent(
                """

                            ***********
                            * WARNING *
                            ***********

                It seems that scikit-learn cannot be built with OpenMP.

                - Make sure you have followed the installation instructions:

                    https://scikit-learn.org/dev/developers/advanced_installation.html

                - If your compiler supports OpenMP but you still see this
                  message, please submit a bug report at:

                    https://github.com/scikit-learn/scikit-learn/issues

                - The build will continue with OpenMP-based parallelism
                  disabled. Note however that some estimators will run in
                  sequential mode instead of leveraging thread-based
                  parallelism.

                                    ***
                """)
            warnings.warn(message)

    return openmp_supported
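A small sketch (illustrative only; assumes a C compiler configured for the current Python and numpy installed) of how this helper is meant to be called at build time:

# Illustrative sketch: probing OpenMP support the way setup.py does.
# Exporting SKLEARN_FAIL_NO_OPENMP turns the warning into a hard error.
from sklearn._build_utils.openmp_helpers import check_openmp_support

if check_openmp_support():
    print("compiler can build and run the OpenMP test program")
else:
    print("building without OpenMP-based parallelism")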
70
venv/Lib/site-packages/sklearn/_build_utils/pre_build_helpers.py
Normal file
@@ -0,0 +1,70 @@
"""Helpers to check build environment before actual build of scikit-learn"""

import os
import sys
import glob
import tempfile
import textwrap
import subprocess

from distutils.sysconfig import customize_compiler
from numpy.distutils.ccompiler import new_compiler


def compile_test_program(code, extra_preargs=[], extra_postargs=[]):
    """Check that some C code can be compiled and run"""
    ccompiler = new_compiler()
    customize_compiler(ccompiler)

    # extra_(pre/post)args can be a callable to make it possible to get its
    # value from the compiler
    if callable(extra_preargs):
        extra_preargs = extra_preargs(ccompiler)
    if callable(extra_postargs):
        extra_postargs = extra_postargs(ccompiler)

    start_dir = os.path.abspath('.')

    with tempfile.TemporaryDirectory() as tmp_dir:
        try:
            os.chdir(tmp_dir)

            # Write test program
            with open('test_program.c', 'w') as f:
                f.write(code)

            os.mkdir('objects')

            # Compile, test program
            ccompiler.compile(['test_program.c'], output_dir='objects',
                              extra_postargs=extra_postargs)

            # Link test program
            objects = glob.glob(
                os.path.join('objects', '*' + ccompiler.obj_extension))
            ccompiler.link_executable(objects, 'test_program',
                                      extra_preargs=extra_preargs,
                                      extra_postargs=extra_postargs)

            # Run test program
            # will raise a CalledProcessError if return code was non-zero
            output = subprocess.check_output('./test_program')
            output = output.decode(sys.stdout.encoding or 'utf-8').splitlines()
        except Exception:
            raise
        finally:
            os.chdir(start_dir)

    return output


def basic_check_build():
    """Check basic compilation and linking of C code"""
    code = textwrap.dedent(
        """\
        #include <stdio.h>
        int main(void) {
        return 0;
        }
        """)
    compile_test_program(code)
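A minimal sketch of using compile_test_program directly (illustrative only; it assumes a working C compiler is configured for the current Python interpreter):

# Compile, link and run a tiny C program and read back its stdout, which is
# exactly the mechanism basic_check_build relies on.
import textwrap
from sklearn._build_utils.pre_build_helpers import compile_test_program

code = textwrap.dedent(
    """\
    #include <stdio.h>
    int main(void) {
    printf("hello from C\\n");
    return 0;
    }
    """)
output = compile_test_program(code)
print(output)  # expected: ['hello from C']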
150
venv/Lib/site-packages/sklearn/_config.py
Normal file
@@ -0,0 +1,150 @@
"""Global configuration state and functions for management
"""
import os
from contextlib import contextmanager as contextmanager

_global_config = {
    'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)),
    'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
    'print_changed_only': True,
    'display': 'text',
}


def get_config():
    """Retrieve current values for configuration set by :func:`set_config`

    Returns
    -------
    config : dict
        Keys are parameter names that can be passed to :func:`set_config`.

    See Also
    --------
    config_context: Context manager for global scikit-learn configuration
    set_config: Set global scikit-learn configuration
    """
    return _global_config.copy()


def set_config(assume_finite=None, working_memory=None,
               print_changed_only=None, display=None):
    """Set global scikit-learn configuration

    .. versionadded:: 0.19

    Parameters
    ----------
    assume_finite : bool, optional
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

        .. versionadded:: 0.19

    working_memory : int, optional
        If set, scikit-learn will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

        .. versionadded:: 0.20

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()' while the default
        behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
        all the non-changed parameters.

        .. versionadded:: 0.21

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a Jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        .. versionadded:: 0.23

    See Also
    --------
    config_context: Context manager for global scikit-learn configuration
    get_config: Retrieve current values of the global configuration
    """
    if assume_finite is not None:
        _global_config['assume_finite'] = assume_finite
    if working_memory is not None:
        _global_config['working_memory'] = working_memory
    if print_changed_only is not None:
        _global_config['print_changed_only'] = print_changed_only
    if display is not None:
        _global_config['display'] = display


@contextmanager
def config_context(**new_config):
    """Context manager for global scikit-learn configuration

    Parameters
    ----------
    assume_finite : bool, optional
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

    working_memory : int, optional
        If set, scikit-learn will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', but would print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
        when False. Default is True.

        .. versionchanged:: 0.23
           Default changed from False to True.

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a Jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        .. versionadded:: 0.23

    Notes
    -----
    All settings, not just those presently modified, will be returned to
    their previous values when the context manager is exited. This is not
    thread-safe.

    Examples
    --------
    >>> import sklearn
    >>> from sklearn.utils.validation import assert_all_finite
    >>> with sklearn.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with sklearn.config_context(assume_finite=True):
    ...     with sklearn.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN, ...

    See Also
    --------
    set_config: Set global scikit-learn configuration
    get_config: Retrieve current values of the global configuration
    """
    old_config = get_config().copy()
    set_config(**new_config)

    try:
        yield
    finally:
        set_config(**old_config)
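A short usage sketch of the three entry points defined above (illustrative only, assuming an installed scikit-learn):

# Read, set and temporarily override the global configuration.
import sklearn

print(sklearn.get_config()['working_memory'])     # 1024 by default

sklearn.set_config(working_memory=512)            # persistent change

with sklearn.config_context(assume_finite=True):
    # finiteness validation is skipped only inside this block
    print(sklearn.get_config()['assume_finite'])  # True

# back to the default (False unless SKLEARN_ASSUME_FINITE is set)
print(sklearn.get_config()['assume_finite'])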
19
venv/Lib/site-packages/sklearn/_distributor_init.py
Normal file
@@ -0,0 +1,19 @@
'''
Helper to preload the OpenMP dll to prevent "dll not found"
errors.
Once a DLL is preloaded, its namespace is made available to any
subsequent DLL. This file originated in the scikit-learn-wheels
github repo, and is created as part of the scripts that build the
wheel.
'''
import os
import os.path as op
from ctypes import WinDLL


if os.name == 'nt':
    # Pre-load the DLL stored in sklearn/.libs by convention.
    dll_path = op.join(op.dirname(__file__), '.libs', 'vcomp140.dll')
    WinDLL(op.abspath(dll_path))
BIN
venv/Lib/site-packages/sklearn/_isotonic.cp36-win32.pyd
Normal file
Binary file not shown.
0
venv/Lib/site-packages/sklearn/_loss/__init__.py
Normal file
355
venv/Lib/site-packages/sklearn/_loss/glm_distribution.py
Normal file
@@ -0,0 +1,355 @@
"""
Distribution functions used in GLM
"""

# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
from collections import namedtuple
import numbers

import numpy as np
from scipy.special import xlogy


DistributionBoundary = namedtuple("DistributionBoundary",
                                  ("value", "inclusive"))


class ExponentialDispersionModel(metaclass=ABCMeta):
    r"""Base class for reproductive Exponential Dispersion Models (EDM).

    The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by

    .. math:: p(y| \theta, \phi) = c(y, \phi)
        \exp\left(\frac{\theta y-A(\theta)}{\phi}\right)
        = \tilde{c}(y, \phi)
        \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right)

    with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`,
    variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`,
    unit variance :math:`v(y_\textrm{pred})` and
    unit deviance :math:`d(y,y_\textrm{pred})`.

    Methods
    -------
    deviance
    deviance_derivative
    in_y_range
    unit_deviance
    unit_deviance_derivative
    unit_variance

    References
    ----------
    https://en.wikipedia.org/wiki/Exponential_dispersion_model.
    """

    def in_y_range(self, y):
        """Returns ``True`` if y is in the valid range of Y~EDM.

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.
        """
        # Note that currently supported distributions have +inf upper bound

        if not isinstance(self._lower_bound, DistributionBoundary):
            raise TypeError('_lower_bound attribute must be of type '
                            'DistributionBoundary')

        if self._lower_bound.inclusive:
            return np.greater_equal(y, self._lower_bound.value)
        else:
            return np.greater(y, self._lower_bound.value)

    @abstractmethod
    def unit_variance(self, y_pred):
        r"""Compute the unit variance function.

        The unit variance :math:`v(y_\textrm{pred})` determines the variance as
        a function of the mean :math:`y_\textrm{pred}` by
        :math:`\mathrm{Var}[Y_i] = \phi/s_i \cdot v(y_{\textrm{pred},i})`.
        It can also be derived from the unit deviance
        :math:`d(y,y_\textrm{pred})` as

        .. math:: v(y_\textrm{pred}) = \frac{2}{
            \frac{\partial^2 d(y,y_\textrm{pred})}{
            \partial y_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}}

        See also :func:`variance`.

        Parameters
        ----------
        y_pred : array of shape (n_samples,)
            Predicted mean.
        """

    @abstractmethod
    def unit_deviance(self, y, y_pred, check_input=False):
        r"""Compute the unit deviance.

        The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
        log-likelihood as
        :math:`d(y,y_\textrm{pred}) = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.

        check_input : bool, default=False
            If True raise an exception on invalid y or y_pred values, otherwise
            they will be propagated as NaN.

        Returns
        -------
        deviance : array of shape (n_samples,)
            Computed deviance
        """

    def unit_deviance_derivative(self, y, y_pred):
        r"""Compute the derivative of the unit deviance w.r.t. y_pred.

        The derivative of the unit deviance is given by
        :math:`\frac{\partial}{\partial y_\textrm{pred}}d(y,y_\textrm{pred})
        = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}`
        with unit variance :math:`v(y_\textrm{pred})`.

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.
        """
        return -2 * (y - y_pred) / self.unit_variance(y_pred)

    def deviance(self, y, y_pred, weights=1):
        r"""Compute the deviance.

        The deviance is a weighted sum of the per sample unit deviances,
        :math:`D = \sum_i s_i \cdot d(y_i, y_{\textrm{pred},i})`
        with weights :math:`s_i` and unit deviance
        :math:`d(y,y_\textrm{pred})`.
        In terms of the log-likelihood it is :math:`D = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\frac{\phi}{s})
        - loglike(y,y,\frac{\phi}{s})\right)`.

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.

        weights : {int, array of shape (n_samples,)}, default=1
            Weights or exposure to which variance is inversely proportional.
        """
        return np.sum(weights * self.unit_deviance(y, y_pred))

    def deviance_derivative(self, y, y_pred, weights=1):
        r"""Compute the derivative of the deviance w.r.t. y_pred.

        It gives :math:`\frac{\partial}{\partial y_\textrm{pred}}
        D(y, y_\textrm{pred}; weights)`.

        Parameters
        ----------
        y : array, shape (n_samples,)
            Target values.

        y_pred : array, shape (n_samples,)
            Predicted mean.

        weights : {int, array of shape (n_samples,)}, default=1
            Weights or exposure to which variance is inversely proportional.
        """
        return weights * self.unit_deviance_derivative(y, y_pred)


class TweedieDistribution(ExponentialDispersionModel):
    r"""A class for the Tweedie distribution.

    A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]`
    is uniquely defined by its mean-variance relationship
    :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^{power}`.

    Special cases are:

    ===== ================
    Power Distribution
    ===== ================
    0     Normal
    1     Poisson
    (1,2) Compound Poisson
    2     Gamma
    3     Inverse Gaussian
    ===== ================

    Parameters
    ----------
    power : float, default=0
        The variance power of the `unit_variance`
        :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.
        For ``0<power<1``, no distribution exists.
    """
    def __init__(self, power=0):
        self.power = power

    @property
    def power(self):
        return self._power

    @power.setter
    def power(self, power):
        # We use a property with a setter, to update lower and
        # upper bound when the power parameter is updated e.g. in grid
        # search.
        if not isinstance(power, numbers.Real):
            raise TypeError('power must be a real number, input was {0}'
                            .format(power))

        if power <= 0:
            # Extreme Stable or Normal distribution
            self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False)
        elif 0 < power < 1:
            raise ValueError('Tweedie distribution is only defined for '
                             'power<=0 and power>=1.')
        elif 1 <= power < 2:
            # Poisson or Compound Poisson distribution
            self._lower_bound = DistributionBoundary(0, inclusive=True)
        elif power >= 2:
            # Gamma, Positive Stable, Inverse Gaussian distributions
            self._lower_bound = DistributionBoundary(0, inclusive=False)
        else:  # pragma: no cover
            # this branch should be unreachable.
            raise ValueError

        self._power = power

    def unit_variance(self, y_pred):
        r"""Compute the unit variance of a Tweedie distribution
        :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.

        Parameters
        ----------
        y_pred : array of shape (n_samples,)
            Predicted mean.
        """
        return np.power(y_pred, self.power)

    def unit_deviance(self, y, y_pred, check_input=False):
        r"""Compute the unit deviance.

        The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
        log-likelihood as
        :math:`d(y,y_\textrm{pred}) = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.

        check_input : bool, default=False
            If True raise an exception on invalid y or y_pred values, otherwise
            they will be propagated as NaN.

        Returns
        -------
        deviance : array of shape (n_samples,)
            Computed deviance
        """
        p = self.power

        if check_input:
            message = ("Mean Tweedie deviance error with power={} can only be "
                       "used on ".format(p))
            if p < 0:
                # 'Extreme stable', y any real number, y_pred > 0
                if (y_pred <= 0).any():
                    raise ValueError(message + "strictly positive y_pred.")
            elif p == 0:
                # Normal, y and y_pred can be any real number
                pass
            elif 0 < p < 1:
                raise ValueError("Tweedie deviance is only defined for "
                                 "power<=0 and power>=1.")
            elif 1 <= p < 2:
                # Poisson and Compound Poisson distribution, y >= 0, y_pred > 0
                if (y < 0).any() or (y_pred <= 0).any():
                    raise ValueError(message + "non-negative y and strictly "
                                     "positive y_pred.")
            elif p >= 2:
                # Gamma and Extreme stable distribution, y and y_pred > 0
                if (y <= 0).any() or (y_pred <= 0).any():
                    raise ValueError(message
                                     + "strictly positive y and y_pred.")
            else:  # pragma: nocover
                # Unreachable statement
                raise ValueError

        if p < 0:
            # 'Extreme stable', y any real number, y_pred > 0
            dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p))
                       - y * np.power(y_pred, 1-p) / (1-p)
                       + np.power(y_pred, 2-p) / (2-p))

        elif p == 0:
            # Normal distribution, y and y_pred any real number
            dev = (y - y_pred)**2
        elif p < 1:
            raise ValueError("Tweedie deviance is only defined for power<=0 "
                             "and power>=1.")
        elif p == 1:
            # Poisson distribution
            dev = 2 * (xlogy(y, y/y_pred) - y + y_pred)
        elif p == 2:
            # Gamma distribution
            dev = 2 * (np.log(y_pred/y) + y/y_pred - 1)
        else:
            dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p))
                       - y * np.power(y_pred, 1-p) / (1-p)
                       + np.power(y_pred, 2-p) / (2-p))
        return dev


class NormalDistribution(TweedieDistribution):
    """Class for the Normal (aka Gaussian) distribution"""
    def __init__(self):
        super().__init__(power=0)


class PoissonDistribution(TweedieDistribution):
    """Class for the scaled Poisson distribution"""
    def __init__(self):
        super().__init__(power=1)


class GammaDistribution(TweedieDistribution):
    """Class for the Gamma distribution"""
    def __init__(self):
        super().__init__(power=2)


class InverseGaussianDistribution(TweedieDistribution):
    """Class for the scaled Inverse Gaussian distribution"""
    def __init__(self):
        super().__init__(power=3)


EDM_DISTRIBUTIONS = {
    'normal': NormalDistribution,
    'poisson': PoissonDistribution,
    'gamma': GammaDistribution,
    'inverse-gaussian': InverseGaussianDistribution,
}
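To connect the formulas above to the code, a small sketch (illustrative only, assuming scikit-learn and NumPy are installed) evaluating the Poisson case power=1, where the unit deviance reduces to 2*(xlogy(y, y/y_pred) - y + y_pred):

# Poisson (power=1) deviance computed with the classes defined above.
import numpy as np
from sklearn._loss.glm_distribution import TweedieDistribution

y_true = np.array([1.0, 2.0, 4.0])
y_pred = np.array([1.5, 2.0, 3.0])

dist = TweedieDistribution(power=1)   # equivalent to PoissonDistribution()
print(dist.unit_variance(y_pred))     # v(y_pred) = y_pred ** 1
print(dist.deviance(y_true, y_pred))  # sum of 2*(y*log(y/y_pred) - y + y_pred)
print(dist.in_y_range(y_true))        # Poisson support: y >= 0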
0
venv/Lib/site-packages/sklearn/_loss/tests/__init__.py
Normal file
112
venv/Lib/site-packages/sklearn/_loss/tests/test_glm_distribution.py
Normal file
@ -0,0 +1,112 @@
|
|||
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from numpy.testing import (
    assert_allclose,
    assert_array_equal,
)
from scipy.optimize import check_grad
import pytest

from sklearn._loss.glm_distribution import (
    TweedieDistribution,
    NormalDistribution, PoissonDistribution,
    GammaDistribution, InverseGaussianDistribution,
    DistributionBoundary
)


@pytest.mark.parametrize(
    'family, expected',
    [(NormalDistribution(), [True, True, True]),
     (PoissonDistribution(), [False, True, True]),
     (TweedieDistribution(power=1.5), [False, True, True]),
     (GammaDistribution(), [False, False, True]),
     (InverseGaussianDistribution(), [False, False, True]),
     (TweedieDistribution(power=4.5), [False, False, True])])
def test_family_bounds(family, expected):
    """Test the valid range of distributions at -1, 0, 1."""
    result = family.in_y_range([-1, 0, 1])
    assert_array_equal(result, expected)


def test_invalid_distribution_bound():
    dist = TweedieDistribution()
    dist._lower_bound = 0
    with pytest.raises(TypeError,
                       match="must be of type DistributionBoundary"):
        dist.in_y_range([-1, 0, 1])


def test_tweedie_distribution_power():
    msg = "distribution is only defined for power<=0 and power>=1"
    with pytest.raises(ValueError, match=msg):
        TweedieDistribution(power=0.5)

    with pytest.raises(TypeError, match="must be a real number"):
        TweedieDistribution(power=1j)

    with pytest.raises(TypeError, match="must be a real number"):
        dist = TweedieDistribution()
        dist.power = 1j

    dist = TweedieDistribution()
    assert isinstance(dist._lower_bound, DistributionBoundary)

    assert dist._lower_bound.inclusive is False
    dist.power = 1
    assert dist._lower_bound.value == 0.0
    assert dist._lower_bound.inclusive is True


@pytest.mark.parametrize(
    'family, chk_values',
    [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]),
     (PoissonDistribution(), [0.1, 1.5]),
     (GammaDistribution(), [0.1, 1.5]),
     (InverseGaussianDistribution(), [0.1, 1.5]),
     (TweedieDistribution(power=-2.5), [0.1, 1.5]),
     (TweedieDistribution(power=-1), [0.1, 1.5]),
     (TweedieDistribution(power=1.5), [0.1, 1.5]),
     (TweedieDistribution(power=2.5), [0.1, 1.5]),
     (TweedieDistribution(power=-4), [0.1, 1.5])])
def test_deviance_zero(family, chk_values):
    """Test deviance(y,y) = 0 for different families."""
    for x in chk_values:
        assert_allclose(family.deviance(x, x), 0, atol=1e-9)


@pytest.mark.parametrize(
    'family',
    [NormalDistribution(),
     PoissonDistribution(),
     GammaDistribution(),
     InverseGaussianDistribution(),
     TweedieDistribution(power=-2.5),
     TweedieDistribution(power=-1),
     TweedieDistribution(power=1.5),
     TweedieDistribution(power=2.5),
     TweedieDistribution(power=-4)],
    ids=lambda x: x.__class__.__name__
)
def test_deviance_derivative(family):
    """Test deviance derivative for different families."""
    rng = np.random.RandomState(0)
    y_true = rng.rand(10)
    # make data positive
    y_true += np.abs(y_true.min()) + 1e-2

    y_pred = y_true + np.fmax(rng.rand(10), 0.)

    dev = family.deviance(y_true, y_pred)
    assert isinstance(dev, float)
    dev_derivative = family.deviance_derivative(y_true, y_pred)
    assert dev_derivative.shape == y_pred.shape

    err = check_grad(
        lambda y_pred: family.deviance(y_true, y_pred),
        lambda y_pred: family.deviance_derivative(y_true, y_pred),
        y_pred,
    ) / np.linalg.norm(dev_derivative)
    assert abs(err) < 1e-6
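The last test compares the analytic ``deviance_derivative`` against finite differences via ``scipy.optimize.check_grad``. The same pattern works as a quick standalone check; a sketch under the same imports as the test module:

import numpy as np
from scipy.optimize import check_grad
from sklearn._loss.glm_distribution import GammaDistribution

rng = np.random.RandomState(42)
y_true = rng.rand(20) + 0.1      # strictly positive targets for the Gamma family
y_pred = y_true + 0.05

family = GammaDistribution()
err = check_grad(lambda yp: family.deviance(y_true, yp),
                 lambda yp: family.deviance_derivative(y_true, yp),
                 y_pred)
print(err / np.linalg.norm(family.deviance_derivative(y_true, y_pred)))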
806
venv/Lib/site-packages/sklearn/base.py
Normal file
@@ -0,0 +1,806 @@
"""
|
||||
Base classes for all estimators.
|
||||
|
||||
Used for VotingClassifier
|
||||
"""
|
||||
|
||||
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import copy
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
import platform
|
||||
import inspect
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
|
||||
from . import __version__
|
||||
from ._config import get_config
|
||||
from .utils import _IS_32BIT
|
||||
from .utils.validation import check_X_y
|
||||
from .utils.validation import check_array
|
||||
from .utils._estimator_html_repr import estimator_html_repr
|
||||
from .utils.validation import _deprecate_positional_args
|
||||
|
||||
_DEFAULT_TAGS = {
|
||||
'non_deterministic': False,
|
||||
'requires_positive_X': False,
|
||||
'requires_positive_y': False,
|
||||
'X_types': ['2darray'],
|
||||
'poor_score': False,
|
||||
'no_validation': False,
|
||||
'multioutput': False,
|
||||
"allow_nan": False,
|
||||
'stateless': False,
|
||||
'multilabel': False,
|
||||
'_skip_test': False,
|
||||
'_xfail_checks': False,
|
||||
'multioutput_only': False,
|
||||
'binary_only': False,
|
||||
'requires_fit': True,
|
||||
'requires_y': False,
|
||||
}
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def clone(estimator, *, safe=True):
|
||||
"""Constructs a new estimator with the same parameters.
|
||||
|
||||
Clone does a deep copy of the model in an estimator
|
||||
without actually copying attached data. It yields a new estimator
|
||||
with the same parameters that has not been fit on any data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : {list, tuple, set} of estimator objects or estimator object
|
||||
The estimator or group of estimators to be cloned.
|
||||
|
||||
safe : bool, default=True
|
||||
If safe is false, clone will fall back to a deep copy on objects
|
||||
that are not estimators.
|
||||
|
||||
"""
|
||||
estimator_type = type(estimator)
|
||||
# XXX: not handling dictionaries
|
||||
if estimator_type in (list, tuple, set, frozenset):
|
||||
return estimator_type([clone(e, safe=safe) for e in estimator])
|
||||
elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
|
||||
if not safe:
|
||||
return copy.deepcopy(estimator)
|
||||
else:
|
||||
if isinstance(estimator, type):
|
||||
raise TypeError("Cannot clone object. " +
|
||||
"You should provide an instance of " +
|
||||
"scikit-learn estimator instead of a class.")
|
||||
else:
|
||||
raise TypeError("Cannot clone object '%s' (type %s): "
|
||||
"it does not seem to be a scikit-learn "
|
||||
"estimator as it does not implement a "
|
||||
"'get_params' method."
|
||||
% (repr(estimator), type(estimator)))
|
||||
|
||||
klass = estimator.__class__
|
||||
new_object_params = estimator.get_params(deep=False)
|
||||
for name, param in new_object_params.items():
|
||||
new_object_params[name] = clone(param, safe=False)
|
||||
new_object = klass(**new_object_params)
|
||||
params_set = new_object.get_params(deep=False)
|
||||
|
||||
# quick sanity check of the parameters of the clone
|
||||
for name in new_object_params:
|
||||
param1 = new_object_params[name]
|
||||
param2 = params_set[name]
|
||||
if param1 is not param2:
|
||||
raise RuntimeError('Cannot clone object %s, as the constructor '
|
||||
'either does not set or modifies parameter %s' %
|
||||
(estimator, name))
|
||||
return new_object
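# Usage note (illustrative only, not part of the original file): clone copies
# constructor parameters but never fitted state, so a clone of a fitted
# estimator comes back unfitted. Sketch, using LogisticRegression purely as a
# convenient example estimator:
#
#     from sklearn.base import clone
#     from sklearn.linear_model import LogisticRegression
#
#     est = LogisticRegression(C=0.5).fit([[0.], [1.]], [0, 1])
#     new_est = clone(est)
#     new_est.C                   # 0.5 -- parameters survive the clone
#     hasattr(new_est, 'coef_')   # False -- fitted attributes do not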


def _pprint(params, offset=0, printer=repr):
    """Pretty print the dictionary 'params'

    Parameters
    ----------
    params : dict
        The dictionary to pretty print

    offset : int, default=0
        The offset in characters to add at the begin of each line.

    printer : callable, default=repr
        The function to convert entries to strings, typically
        the builtin str or repr

    """
    # Do a multi-line justified repr:
    options = np.get_printoptions()
    np.set_printoptions(precision=5, threshold=64, edgeitems=2)
    params_list = list()
    this_line_length = offset
    line_sep = ',\n' + (1 + offset // 2) * ' '
    for i, (k, v) in enumerate(sorted(params.items())):
        if type(v) is float:
            # use str for representing floating point numbers
            # this way we get consistent representation across
            # architectures and versions.
            this_repr = '%s=%s' % (k, str(v))
        else:
            # use repr of the rest
            this_repr = '%s=%s' % (k, printer(v))
        if len(this_repr) > 500:
            this_repr = this_repr[:300] + '...' + this_repr[-100:]
        if i > 0:
            if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr):
                params_list.append(line_sep)
                this_line_length = len(line_sep)
            else:
                params_list.append(', ')
                this_line_length += 2
        params_list.append(this_repr)
        this_line_length += len(this_repr)

    np.set_printoptions(**options)
    lines = ''.join(params_list)
    # Strip trailing space to avoid nightmare in doctests
    lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))
    return lines


class BaseEstimator:
    """Base class for all estimators in scikit-learn

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator"""
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("scikit-learn estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't "
                                   " follow this convention."
                                   % (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_param_names():
            try:
                value = getattr(self, key)
            except AttributeError:
                warnings.warn('From version 0.24, get_params will raise an '
                              'AttributeError if a parameter cannot be '
                              'retrieved as an instance attribute. Previously '
                              'it would return None.',
                              FutureWarning)
                value = None
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : object
            Estimator instance.
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __repr__(self, N_CHAR_MAX=700):
        # N_CHAR_MAX is the (approximate) maximum number of non-blank
        # characters to render. We pass it as an optional parameter to ease
        # the tests.

        from .utils._pprint import _EstimatorPrettyPrinter

        N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences

        # use ellipsis for sequences with a lot of elements
        pp = _EstimatorPrettyPrinter(
            compact=True, indent=1, indent_at_name=True,
            n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)

        repr_ = pp.pformat(self)

        # Use bruteforce ellipsis when there are a lot of non-blank characters
        n_nonblank = len(''.join(repr_.split()))
        if n_nonblank > N_CHAR_MAX:
            lim = N_CHAR_MAX // 2  # apprx number of chars to keep on both ends
            regex = r'^(\s*\S){%d}' % lim
            # The regex '^(\s*\S){%d}' % n
            # matches from the start of the string until the nth non-blank
            # character:
            # - ^ matches the start of string
            # - (pattern){n} matches n repetitions of pattern
            # - \s*\S matches a non-blank char following zero or more blanks
            left_lim = re.match(regex, repr_).end()
            right_lim = re.match(regex, repr_[::-1]).end()

            if '\n' in repr_[left_lim:-right_lim]:
                # The left side and right side aren't on the same line.
                # To avoid weird cuts, e.g.:
                # categoric...ore',
                # we need to start the right side with an appropriate newline
                # character so that it renders properly as:
                # categoric...
                # handle_unknown='ignore',
                # so we add [^\n]*\n which matches until the next \n
                regex += r'[^\n]*\n'
                right_lim = re.match(regex, repr_[::-1]).end()

            ellipsis = '...'
            if left_lim + len(ellipsis) < len(repr_) - right_lim:
                # Only add ellipsis if it results in a shorter repr
                repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:]

        return repr_

    def __getstate__(self):
        try:
            state = super().__getstate__()
        except AttributeError:
            state = self.__dict__.copy()

        if type(self).__module__.startswith('sklearn.'):
            return dict(state.items(), _sklearn_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        if type(self).__module__.startswith('sklearn.'):
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    "Trying to unpickle estimator {0} from version {1} when "
                    "using version {2}. This might lead to breaking code or "
                    "invalid results. Use at your own risk.".format(
                        self.__class__.__name__, pickle_version, __version__),
                    UserWarning)
        try:
            super().__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

    def _more_tags(self):
        return _DEFAULT_TAGS

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
            if hasattr(base_class, '_more_tags'):
                # need the if because mixins might not have _more_tags
                # but might do redundant work in estimators
                # (i.e. calling more tags on BaseEstimator multiple times)
                more_tags = base_class._more_tags(self)
                collected_tags.update(more_tags)
        return collected_tags

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            Else, the attribute must already exist and the function checks
            that it is equal to `X.shape[1]`.
        """
        n_features = X.shape[1]

        if reset:
            self.n_features_in_ = n_features
        else:
            if not hasattr(self, 'n_features_in_'):
                raise RuntimeError(
                    "The reset parameter is False but there is no "
                    "n_features_in_ attribute. Is this estimator fitted?"
                )
            if n_features != self.n_features_in_:
                raise ValueError(
                    'X has {} features, but this {} is expecting {} features '
                    'as input.'.format(n_features, self.__class__.__name__,
                                       self.n_features_in_)
                )

    def _validate_data(self, X, y=None, reset=True,
                       validate_separately=False, **check_params):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            The input samples.
        y : array-like of shape (n_samples,), default=None
            The targets. If None, `check_array` is called on `X` and
            `check_X_y` is called otherwise.
        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.
        validate_separately : False or tuple of dicts, default=False
            Only used if y is not None.
            If False, call validate_X_y(). Else, it must be a tuple of kwargs
            to be used for calling check_array() on X and y respectively.
        **check_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array` or
            :func:`sklearn.utils.check_X_y`. Ignored if validate_separately
            is not False.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if `y` is not None.
        """

        if y is None:
            if self._get_tags()['requires_y']:
                raise ValueError(
                    f"This {self.__class__.__name__} estimator "
                    f"requires y to be passed, but the target y is None."
                )
            X = check_array(X, **check_params)
            out = X
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                X = check_array(X, **check_X_params)
                y = check_array(y, **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if check_params.get('ensure_2d', True):
            self._check_n_features(X, reset=reset)

        return out

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] != 'diagram':
            raise AttributeError("_repr_html_ is only defined when the "
                                 "'display' configuration option is set to "
                                 "'diagram'")
        return self._repr_html_inner

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator"""
        output = {"text/plain": repr(self)}
        if get_config()["display"] == 'diagram':
            output["text/html"] = estimator_html_repr(self)
        return output
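    # Illustration (not part of the original file): get_params/set_params use
    # the double-underscore convention to reach parameters of nested
    # estimators, e.g. inside a Pipeline. Sketch, assuming a standard Pipeline:
    #
    #     from sklearn.pipeline import Pipeline
    #     from sklearn.svm import SVC
    #
    #     pipe = Pipeline([('clf', SVC())])
    #     pipe.set_params(clf__C=10)      # routed to the nested SVC
    #     pipe.get_params()['clf__C']     # 10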


class ClassifierMixin:
    """Mixin class for all classifiers in scikit-learn."""

    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        from .metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def _more_tags(self):
        return {'requires_y': True}


class RegressorMixin:
    """Mixin class for all regression estimators in scikit-learn."""
    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples. For some estimators this may be a
            precomputed kernel matrix or a list of generic objects instead,
            shape = (n_samples, n_samples_fitted),
            where n_samples_fitted is the number of
            samples used in the fitting for the estimator.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.

        Notes
        -----
        The R2 score used when calling ``score`` on a regressor uses
        ``multioutput='uniform_average'`` from version 0.23 to keep consistent
        with default value of :func:`~sklearn.metrics.r2_score`.
        This influences the ``score`` method of all the multioutput
        regressors (except for
        :class:`~sklearn.multioutput.MultiOutputRegressor`).
        """

        from .metrics import r2_score
        y_pred = self.predict(X)
        return r2_score(y, y_pred, sample_weight=sample_weight)

    def _more_tags(self):
        return {'requires_y': True}


class ClusterMixin:
    """Mixin class for all cluster estimators in scikit-learn."""
    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None):
        """
        Perform clustering on X and returns cluster labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given clustering algorithm
        self.fit(X)
        return self.labels_


class BiclusterMixin:
    """Mixin class for all bicluster estimators in scikit-learn"""

    @property
    def biclusters_(self):
        """Convenient way to get row and column indicators together.

        Returns the ``rows_`` and ``columns_`` members.
        """
        return self.rows_, self.columns_

    def get_indices(self, i):
        """Row and column indices of the i'th bicluster.

        Only works if ``rows_`` and ``columns_`` attributes exist.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        row_ind : ndarray, dtype=np.intp
            Indices of rows in the dataset that belong to the bicluster.
        col_ind : ndarray, dtype=np.intp
            Indices of columns in the dataset that belong to the bicluster.

        """
        rows = self.rows_[i]
        columns = self.columns_[i]
        return np.nonzero(rows)[0], np.nonzero(columns)[0]

    def get_shape(self, i):
        """Shape of the i'th bicluster.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        shape : tuple (int, int)
            Number of rows and columns (resp.) in the bicluster.
        """
        indices = self.get_indices(i)
        return tuple(len(i) for i in indices)

    def get_submatrix(self, i, data):
        """Return the submatrix corresponding to bicluster `i`.

        Parameters
        ----------
        i : int
            The index of the cluster.
        data : array-like
            The data.

        Returns
        -------
        submatrix : ndarray
            The submatrix corresponding to bicluster i.

        Notes
        -----
        Works with sparse matrices. Only works if ``rows_`` and
        ``columns_`` attributes exist.
        """
        from .utils.validation import check_array
        data = check_array(data, accept_sparse='csr')
        row_ind, col_ind = self.get_indices(i)
        return data[row_ind[:, np.newaxis], col_ind]


class TransformerMixin:
    """Mixin class for all transformers in scikit-learn."""

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)

        y : ndarray of shape (n_samples,), default=None
            Target values.

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray array of shape (n_samples, n_features_new)
            Transformed array.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given clustering algorithm
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)
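    # Illustration (not part of the original file): TransformerMixin supplies
    # fit_transform for free once fit and transform exist, while BaseEstimator
    # supplies get_params/set_params. A minimal transformer sketch:
    #
    #     class Shifter(TransformerMixin, BaseEstimator):
    #         def __init__(self, offset=0.0):
    #             self.offset = offset
    #
    #         def fit(self, X, y=None):
    #             return self
    #
    #         def transform(self, X):
    #             return np.asarray(X) + self.offset
    #
    #     Shifter(offset=2.0).fit_transform([[1.0], [2.0]])  # array([[3.], [4.]])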


class DensityMixin:
    """Mixin class for all density estimators in scikit-learn."""
    _estimator_type = "DensityEstimator"

    def score(self, X, y=None):
        """Return the score of the model on the data X

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        score : float
        """
        pass


class OutlierMixin:
    """Mixin class for all outlier detection estimators in scikit-learn."""
    _estimator_type = "outlier_detector"

    def fit_predict(self, X, y=None):
        """Perform fit on X and returns labels for X.

        Returns -1 for outliers and 1 for inliers.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            1 for inliers, -1 for outliers.
        """
        # override for transductive outlier detectors like LocalOutlierFactor
        return self.fit(X).predict(X)


class MetaEstimatorMixin:
    _required_parameters = ["estimator"]
    """Mixin class for all meta estimators in scikit-learn."""


class MultiOutputMixin:
    """Mixin to mark estimators that support multioutput."""
    def _more_tags(self):
        return {'multioutput': True}


class _UnstableArchMixin:
    """Mark estimators that are non-deterministic on 32bit or PowerPC"""
    def _more_tags(self):
        return {'non_deterministic': (
            _IS_32BIT or platform.machine().startswith(('ppc', 'powerpc')))}


def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "classifier"


def is_regressor(estimator):
    """Return True if the given estimator is (probably) a regressor.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a regressor and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "regressor"


def is_outlier_detector(estimator):
    """Return True if the given estimator is (probably) an outlier detector.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is an outlier detector and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "outlier_detector"
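These helpers only inspect the ``_estimator_type`` string that the mixins define, which is why they work on unfitted instances. A quick sketch; the estimator classes named here are just familiar examples, not dependencies of this module:

from sklearn.base import is_classifier, is_regressor
from sklearn.linear_model import LinearRegression, LogisticRegression

print(is_classifier(LogisticRegression()))   # True
print(is_regressor(LogisticRegression()))    # False
print(is_regressor(LinearRegression()))      # True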
600
venv/Lib/site-packages/sklearn/calibration.py
Normal file
@@ -0,0 +1,600 @@
"""Calibration of predicted probabilities."""
|
||||
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# Balazs Kegl <balazs.kegl@gmail.com>
|
||||
# Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
# Mathieu Blondel <mathieu@mblondel.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from inspect import signature
|
||||
|
||||
from math import log
|
||||
import numpy as np
|
||||
|
||||
from scipy.special import expit
|
||||
from scipy.special import xlogy
|
||||
from scipy.optimize import fmin_bfgs
|
||||
from .preprocessing import LabelEncoder
|
||||
|
||||
from .base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
|
||||
MetaEstimatorMixin)
|
||||
from .preprocessing import label_binarize, LabelBinarizer
|
||||
from .utils import check_array, indexable, column_or_1d
|
||||
from .utils.validation import check_is_fitted, check_consistent_length
|
||||
from .utils.validation import _check_sample_weight
|
||||
from .isotonic import IsotonicRegression
|
||||
from .svm import LinearSVC
|
||||
from .model_selection import check_cv
|
||||
from .utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
class CalibratedClassifierCV(BaseEstimator, ClassifierMixin,
|
||||
MetaEstimatorMixin):
|
||||
"""Probability calibration with isotonic regression or logistic regression.
|
||||
|
||||
The calibration is based on the :term:`decision_function` method of the
|
||||
`base_estimator` if it exists, else on :term:`predict_proba`.
|
||||
|
||||
Read more in the :ref:`User Guide <calibration>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
base_estimator : instance BaseEstimator
|
||||
The classifier whose output need to be calibrated to provide more
|
||||
accurate `predict_proba` outputs.
|
||||
|
||||
method : 'sigmoid' or 'isotonic'
|
||||
The method to use for calibration. Can be 'sigmoid' which
|
||||
corresponds to Platt's method (i.e. a logistic regression model) or
|
||||
'isotonic' which is a non-parametric approach. It is not advised to
|
||||
use isotonic calibration with too few calibration samples
|
||||
``(<<1000)`` since it tends to overfit.
|
||||
|
||||
cv : integer, cross-validation generator, iterable or "prefit", optional
|
||||
Determines the cross-validation splitting strategy.
|
||||
Possible inputs for cv are:
|
||||
|
||||
- None, to use the default 5-fold cross-validation,
|
||||
- integer, to specify the number of folds.
|
||||
- :term:`CV splitter`,
|
||||
- An iterable yielding (train, test) splits as arrays of indices.
|
||||
|
||||
For integer/None inputs, if ``y`` is binary or multiclass,
|
||||
:class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is
|
||||
neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
|
||||
is used.
|
||||
|
||||
Refer :ref:`User Guide <cross_validation>` for the various
|
||||
cross-validation strategies that can be used here.
|
||||
|
||||
If "prefit" is passed, it is assumed that `base_estimator` has been
|
||||
fitted already and all data is used for calibration.
|
||||
|
||||
.. versionchanged:: 0.22
|
||||
``cv`` default value if None changed from 3-fold to 5-fold.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
classes_ : array, shape (n_classes)
|
||||
The class labels.
|
||||
|
||||
calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit")
|
||||
The list of calibrated classifiers, one for each cross-validation fold,
|
||||
which has been fitted on all but the validation fold and calibrated
|
||||
on the validation fold.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Obtaining calibrated probability estimates from decision trees
|
||||
and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001
|
||||
|
||||
.. [2] Transforming Classifier Scores into Accurate Multiclass
|
||||
Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)
|
||||
|
||||
.. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
|
||||
Regularized Likelihood Methods, J. Platt, (1999)
|
||||
|
||||
.. [4] Predicting Good Probabilities with Supervised Learning,
|
||||
A. Niculescu-Mizil & R. Caruana, ICML 2005
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, base_estimator=None, *, method='sigmoid', cv=None):
|
||||
self.base_estimator = base_estimator
|
||||
self.method = method
|
||||
self.cv = cv
|
||||
|
||||
def fit(self, X, y, sample_weight=None):
|
||||
"""Fit the calibrated model
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
Training data.
|
||||
|
||||
y : array-like, shape (n_samples,)
|
||||
Target values.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Sample weights. If None, then samples are equally weighted.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns an instance of self.
|
||||
"""
|
||||
X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo'],
|
||||
force_all_finite=False, allow_nd=True)
|
||||
X, y = indexable(X, y)
|
||||
le = LabelBinarizer().fit(y)
|
||||
self.classes_ = le.classes_
|
||||
|
||||
# Check that each cross-validation fold can have at least one
|
||||
# example per class
|
||||
n_folds = self.cv if isinstance(self.cv, int) \
|
||||
else self.cv.n_folds if hasattr(self.cv, "n_folds") else None
|
||||
if n_folds and \
|
||||
np.any([np.sum(y == class_) < n_folds for class_ in
|
||||
self.classes_]):
|
||||
raise ValueError("Requesting %d-fold cross-validation but provided"
|
||||
" less than %d examples for at least one class."
|
||||
% (n_folds, n_folds))
|
||||
|
||||
self.calibrated_classifiers_ = []
|
||||
if self.base_estimator is None:
|
||||
# we want all classifiers that don't expose a random_state
|
||||
# to be deterministic (and we don't want to expose this one).
|
||||
base_estimator = LinearSVC(random_state=0)
|
||||
else:
|
||||
base_estimator = self.base_estimator
|
||||
|
||||
if self.cv == "prefit":
|
||||
calibrated_classifier = _CalibratedClassifier(
|
||||
base_estimator, method=self.method)
|
||||
calibrated_classifier.fit(X, y, sample_weight)
|
||||
self.calibrated_classifiers_.append(calibrated_classifier)
|
||||
else:
|
||||
cv = check_cv(self.cv, y, classifier=True)
|
||||
fit_parameters = signature(base_estimator.fit).parameters
|
||||
base_estimator_supports_sw = "sample_weight" in fit_parameters
|
||||
|
||||
if sample_weight is not None:
|
||||
sample_weight = _check_sample_weight(sample_weight, X)
|
||||
|
||||
if not base_estimator_supports_sw:
|
||||
estimator_name = type(base_estimator).__name__
|
||||
warnings.warn("Since %s does not support sample_weights, "
|
||||
"sample weights will only be used for the "
|
||||
"calibration itself." % estimator_name)
|
||||
|
||||
for train, test in cv.split(X, y):
|
||||
this_estimator = clone(base_estimator)
|
||||
|
||||
if sample_weight is not None and base_estimator_supports_sw:
|
||||
this_estimator.fit(X[train], y[train],
|
||||
sample_weight=sample_weight[train])
|
||||
else:
|
||||
this_estimator.fit(X[train], y[train])
|
||||
|
||||
calibrated_classifier = _CalibratedClassifier(
|
||||
this_estimator, method=self.method, classes=self.classes_)
|
||||
sw = None if sample_weight is None else sample_weight[test]
|
||||
calibrated_classifier.fit(X[test], y[test], sample_weight=sw)
|
||||
self.calibrated_classifiers_.append(calibrated_classifier)
|
||||
|
||||
return self
|
||||
|
||||
def predict_proba(self, X):
|
||||
"""Posterior probabilities of classification
|
||||
|
||||
This function returns posterior probabilities of classification
|
||||
according to each class on an array of test vectors X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
The samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : array, shape (n_samples, n_classes)
|
||||
The predicted probas.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = check_array(X, accept_sparse=['csc', 'csr', 'coo'],
|
||||
force_all_finite=False)
|
||||
# Compute the arithmetic mean of the predictions of the calibrated
|
||||
# classifiers
|
||||
mean_proba = np.zeros((X.shape[0], len(self.classes_)))
|
||||
for calibrated_classifier in self.calibrated_classifiers_:
|
||||
proba = calibrated_classifier.predict_proba(X)
|
||||
mean_proba += proba
|
||||
|
||||
mean_proba /= len(self.calibrated_classifiers_)
|
||||
|
||||
return mean_proba
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the target of new samples. The predicted class is the
|
||||
class that has the highest probability, and can thus be different
|
||||
from the prediction of the uncalibrated classifier.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
The samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : array, shape (n_samples,)
|
||||
The predicted class.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
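    # Illustration (not part of the original file): with cv="prefit" the base
    # estimator must already be fitted on separate data, and the data passed
    # to fit() here is used purely for calibration. Sketch, assuming held-out
    # splits (X_train/X_calib/X_test are placeholders):
    #
    #     from sklearn.svm import LinearSVC
    #     from sklearn.calibration import CalibratedClassifierCV
    #
    #     base = LinearSVC(random_state=0).fit(X_train, y_train)
    #     calibrated = CalibratedClassifierCV(base, method='sigmoid', cv='prefit')
    #     calibrated.fit(X_calib, y_calib)
    #     proba = calibrated.predict_proba(X_test)   # calibrated probabilities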


class _CalibratedClassifier:
    """Probability calibration with isotonic regression or sigmoid.

    It assumes that base_estimator has already been fit, and trains the
    calibration on the input set of the fit function. Note that this class
    should not be used as an estimator directly. Use CalibratedClassifierCV
    with cv="prefit" instead.

    Parameters
    ----------
    base_estimator : instance BaseEstimator
        The classifier whose output decision function needs to be calibrated
        to offer more accurate predict_proba outputs. No default value since
        it has to be an already fitted estimator.

    method : 'sigmoid' | 'isotonic'
        The method to use for calibration. Can be 'sigmoid' which
        corresponds to Platt's method or 'isotonic' which is a
        non-parametric approach based on isotonic regression.

    classes : array-like, shape (n_classes,), optional
        Contains unique classes used to fit the base estimator.
        if None, then classes is extracted from the given target values
        in fit().

    See also
    --------
    CalibratedClassifierCV

    References
    ----------
    .. [1] Obtaining calibrated probability estimates from decision trees
           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001

    .. [2] Transforming Classifier Scores into Accurate Multiclass
           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)

    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
           Regularized Likelihood Methods, J. Platt, (1999)

    .. [4] Predicting Good Probabilities with Supervised Learning,
           A. Niculescu-Mizil & R. Caruana, ICML 2005
    """
    @_deprecate_positional_args
    def __init__(self, base_estimator, *, method='sigmoid', classes=None):
        self.base_estimator = base_estimator
        self.method = method
        self.classes = classes

    def _preproc(self, X):
        n_classes = len(self.classes_)
        if hasattr(self.base_estimator, "decision_function"):
            df = self.base_estimator.decision_function(X)
            if df.ndim == 1:
                df = df[:, np.newaxis]
        elif hasattr(self.base_estimator, "predict_proba"):
            df = self.base_estimator.predict_proba(X)
            if n_classes == 2:
                df = df[:, 1:]
        else:
            raise RuntimeError('classifier has no decision_function or '
                               'predict_proba method.')

        idx_pos_class = self.label_encoder_.\
            transform(self.base_estimator.classes_)

        return df, idx_pos_class

    def fit(self, X, y, sample_weight=None):
        """Calibrate the fitted model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """

        self.label_encoder_ = LabelEncoder()
        if self.classes is None:
            self.label_encoder_.fit(y)
        else:
            self.label_encoder_.fit(self.classes)

        self.classes_ = self.label_encoder_.classes_
        Y = label_binarize(y, classes=self.classes_)

        df, idx_pos_class = self._preproc(X)
        self.calibrators_ = []

        for k, this_df in zip(idx_pos_class, df.T):
            if self.method == 'isotonic':
                calibrator = IsotonicRegression(out_of_bounds='clip')
            elif self.method == 'sigmoid':
                calibrator = _SigmoidCalibration()
            else:
                raise ValueError('method should be "sigmoid" or '
                                 '"isotonic". Got %s.' % self.method)
            calibrator.fit(this_df, Y[:, k], sample_weight)
            self.calibrators_.append(calibrator)

        return self

    def predict_proba(self, X):
        """Posterior probabilities of classification

        This function returns posterior probabilities of classification
        according to each class on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples, n_classes)
            The predicted probas. Can be exact zeros.
        """
        n_classes = len(self.classes_)
        proba = np.zeros((X.shape[0], n_classes))

        df, idx_pos_class = self._preproc(X)

        for k, this_df, calibrator in \
                zip(idx_pos_class, df.T, self.calibrators_):
            if n_classes == 2:
                k += 1
            proba[:, k] = calibrator.predict(this_df)

        # Normalize the probabilities
        if n_classes == 2:
            proba[:, 0] = 1. - proba[:, 1]
        else:
            proba /= np.sum(proba, axis=1)[:, np.newaxis]

        # XXX : for some reason all probas can be 0
        proba[np.isnan(proba)] = 1. / n_classes

        # Deal with cases where the predicted probability minimally exceeds 1.0
        proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0

        return proba


def _sigmoid_calibration(df, y, sample_weight=None):
    """Probability Calibration with sigmoid method (Platt 2000)

    Parameters
    ----------
    df : ndarray, shape (n_samples,)
        The decision function or predict proba for the samples.

    y : ndarray, shape (n_samples,)
        The targets.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted.

    Returns
    -------
    a : float
        The slope.

    b : float
        The intercept.

    References
    ----------
    Platt, "Probabilistic Outputs for Support Vector Machines"
    """
    df = column_or_1d(df)
    y = column_or_1d(y)

    F = df  # F follows Platt's notations

    # Bayesian priors (see Platt end of section 2.2)
    prior0 = float(np.sum(y <= 0))
    prior1 = y.shape[0] - prior0
    T = np.zeros(y.shape)
    T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
    T[y <= 0] = 1. / (prior0 + 2.)
    T1 = 1. - T

    def objective(AB):
        # From Platt (beginning of Section 2.2)
        P = expit(-(AB[0] * F + AB[1]))
        loss = -(xlogy(T, P) + xlogy(T1, 1. - P))
        if sample_weight is not None:
            return (sample_weight * loss).sum()
        else:
            return loss.sum()

    def grad(AB):
        # gradient of the objective function
        P = expit(-(AB[0] * F + AB[1]))
        TEP_minus_T1P = T - P
        if sample_weight is not None:
            TEP_minus_T1P *= sample_weight
        dA = np.dot(TEP_minus_T1P, F)
        dB = np.sum(TEP_minus_T1P)
        return np.array([dA, dB])

    AB0 = np.array([0., log((prior0 + 1.) / (prior1 + 1.))])
    AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)
    return AB_[0], AB_[1]
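# Note (illustrative, not part of the original file): the returned (a, b) are
# the Platt parameters, i.e. the calibrated probability of the positive class
# for a decision value f is p = 1 / (1 + exp(a * f + b)), which is exactly
# what _SigmoidCalibration.predict below computes via expit(-(a_ * T + b_)).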


class _SigmoidCalibration(RegressorMixin, BaseEstimator):
    """Sigmoid regression model.

    Attributes
    ----------
    a_ : float
        The slope.

    b_ : float
        The intercept.
    """
    def fit(self, X, y, sample_weight=None):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Training data.

        y : array-like, shape (n_samples,)
            Training target.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X = column_or_1d(X)
        y = column_or_1d(y)
        X, y = indexable(X, y)

        self.a_, self.b_ = _sigmoid_calibration(X, y, sample_weight)
        return self

    def predict(self, T):
        """Predict new data by linear interpolation.

        Parameters
        ----------
        T : array-like, shape (n_samples,)
            Data to predict from.

        Returns
        -------
        T_ : array, shape (n_samples,)
            The predicted data.
        """
        T = column_or_1d(T)
        return expit(-(self.a_ * T + self.b_))


@_deprecate_positional_args
def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5,
                      strategy='uniform'):
    """Compute true and predicted probabilities for a calibration curve.

    The method assumes the inputs come from a binary classifier, and
    discretize the [0, 1] interval into bins.

    Calibration curves may also be referred to as reliability diagrams.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True targets.

    y_prob : array-like of shape (n_samples,)
        Probabilities of the positive class.

    normalize : bool, default=False
        Whether y_prob needs to be normalized into the [0, 1] interval, i.e.
        is not a proper probability. If True, the smallest value in y_prob
        is linearly mapped onto 0 and the largest one onto 1.

    n_bins : int, default=5
        Number of bins to discretize the [0, 1] interval. A bigger number
        requires more data. Bins with no samples (i.e. without
        corresponding values in `y_prob`) will not be returned, thus the
        returned arrays may have less than `n_bins` values.

    strategy : {'uniform', 'quantile'}, default='uniform'
        Strategy used to define the widths of the bins.

        uniform
            The bins have identical widths.
        quantile
            The bins have the same number of samples and depend on `y_prob`.

    Returns
    -------
    prob_true : ndarray of shape (n_bins,) or smaller
        The proportion of samples whose class is the positive class, in each
        bin (fraction of positives).

    prob_pred : ndarray of shape (n_bins,) or smaller
        The mean predicted probability in each bin.

    References
    ----------
    Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
    Probabilities With Supervised Learning, in Proceedings of the 22nd
    International Conference on Machine Learning (ICML).
    See section 4 (Qualitative Analysis of Predictions).
    """
    y_true = column_or_1d(y_true)
    y_prob = column_or_1d(y_prob)
    check_consistent_length(y_true, y_prob)

    if normalize:  # Normalize predicted values into interval [0, 1]
        y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())
    elif y_prob.min() < 0 or y_prob.max() > 1:
        raise ValueError("y_prob has values outside [0, 1] and normalize is "
                         "set to False.")

    labels = np.unique(y_true)
    if len(labels) > 2:
        raise ValueError("Only binary classification is supported. "
                         "Provided labels %s." % labels)
    y_true = label_binarize(y_true, classes=labels)[:, 0]

    if strategy == 'quantile':  # Determine bin edges by distribution of data
        quantiles = np.linspace(0, 1, n_bins + 1)
        bins = np.percentile(y_prob, quantiles * 100)
        bins[-1] = bins[-1] + 1e-8
    elif strategy == 'uniform':
        bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    else:
        raise ValueError("Invalid entry to 'strategy' input. Strategy "
                         "must be either 'quantile' or 'uniform'.")

    binids = np.digitize(y_prob, bins) - 1

    bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins))
    bin_true = np.bincount(binids, weights=y_true, minlength=len(bins))
    bin_total = np.bincount(binids, minlength=len(bins))

    nonzero = bin_total != 0
    prob_true = bin_true[nonzero] / bin_total[nonzero]
    prob_pred = bin_sums[nonzero] / bin_total[nonzero]

    return prob_true, prob_pred
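A short sketch of ``calibration_curve`` on toy inputs; with ``n_bins=2`` and uniform bins, each returned entry pairs a bin's observed positive rate with its mean predicted probability:

import numpy as np
from sklearn.calibration import calibration_curve

y_true = np.array([0, 0, 0, 1, 1, 1])
y_prob = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 0.9])

prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=2)
print(prob_true)   # fraction of positives per non-empty bin
print(prob_pred)   # mean predicted probability per non-empty bin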
42
venv/Lib/site-packages/sklearn/cluster/__init__.py
Normal file
@@ -0,0 +1,42 @@
"""
|
||||
The :mod:`sklearn.cluster` module gathers popular unsupervised clustering
|
||||
algorithms.
|
||||
"""
|
||||
|
||||
from ._spectral import spectral_clustering, SpectralClustering
|
||||
from ._mean_shift import (mean_shift, MeanShift,
|
||||
estimate_bandwidth, get_bin_seeds)
|
||||
from ._affinity_propagation import affinity_propagation, AffinityPropagation
|
||||
from ._agglomerative import (ward_tree, AgglomerativeClustering,
|
||||
linkage_tree, FeatureAgglomeration)
|
||||
from ._kmeans import k_means, KMeans, MiniBatchKMeans
|
||||
from ._dbscan import dbscan, DBSCAN
|
||||
from ._optics import (OPTICS, cluster_optics_dbscan, compute_optics_graph,
|
||||
cluster_optics_xi)
|
||||
from ._bicluster import SpectralBiclustering, SpectralCoclustering
|
||||
from ._birch import Birch
|
||||
|
||||
__all__ = ['AffinityPropagation',
|
||||
'AgglomerativeClustering',
|
||||
'Birch',
|
||||
'DBSCAN',
|
||||
'OPTICS',
|
||||
'cluster_optics_dbscan',
|
||||
'cluster_optics_xi',
|
||||
'compute_optics_graph',
|
||||
'KMeans',
|
||||
'FeatureAgglomeration',
|
||||
'MeanShift',
|
||||
'MiniBatchKMeans',
|
||||
'SpectralClustering',
|
||||
'affinity_propagation',
|
||||
'dbscan',
|
||||
'estimate_bandwidth',
|
||||
'get_bin_seeds',
|
||||
'k_means',
|
||||
'linkage_tree',
|
||||
'mean_shift',
|
||||
'spectral_clustering',
|
||||
'ward_tree',
|
||||
'SpectralBiclustering',
|
||||
'SpectralCoclustering']
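All of the estimators re-exported here follow the ``fit``/``labels_`` convention from ``ClusterMixin``; a minimal sketch with two of them:

import numpy as np
from sklearn.cluster import KMeans, DBSCAN

X = np.array([[1, 1], [1, 2], [10, 10], [10, 11]], dtype=float)

print(KMeans(n_clusters=2, random_state=0).fit_predict(X))
print(DBSCAN(eps=2.0, min_samples=2).fit_predict(X))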
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
474
venv/Lib/site-packages/sklearn/cluster/_affinity_propagation.py
Normal file
@@ -0,0 +1,474 @@
"""Affinity Propagation clustering algorithm."""
|
||||
|
||||
# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
|
||||
# Gael Varoquaux gael.varoquaux@normalesup.org
|
||||
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..base import BaseEstimator, ClusterMixin
|
||||
from ..utils import as_float_array, check_array, check_random_state
|
||||
from ..utils.validation import check_is_fitted, _deprecate_positional_args
|
||||
from ..metrics import euclidean_distances
|
||||
from ..metrics import pairwise_distances_argmin
|
||||
|
||||
|
||||
def _equal_similarities_and_preferences(S, preference):
|
||||
def all_equal_preferences():
|
||||
return np.all(preference == preference.flat[0])
|
||||
|
||||
def all_equal_similarities():
|
||||
# Create mask to ignore diagonal of S
|
||||
mask = np.ones(S.shape, dtype=bool)
|
||||
np.fill_diagonal(mask, 0)
|
||||
|
||||
return np.all(S[mask].flat == S[mask].flat[0])
|
||||
|
||||
return all_equal_preferences() and all_equal_similarities()
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def affinity_propagation(S, *, preference=None, convergence_iter=15,
|
||||
max_iter=200, damping=0.5, copy=True, verbose=False,
|
||||
return_n_iter=False, random_state='warn'):
|
||||
"""Perform Affinity Propagation Clustering of data
|
||||
|
||||
Read more in the :ref:`User Guide <affinity_propagation>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
S : array-like, shape (n_samples, n_samples)
|
||||
Matrix of similarities between points
|
||||
|
||||
preference : array-like, shape (n_samples,) or float, optional
|
||||
Preferences for each point - points with larger values of
|
||||
preferences are more likely to be chosen as exemplars. The number of
|
||||
exemplars, i.e. of clusters, is influenced by the input preferences
|
||||
value. If the preferences are not passed as arguments, they will be
|
||||
set to the median of the input similarities (resulting in a moderate
|
||||
number of clusters). For a smaller amount of clusters, this can be set
|
||||
to the minimum value of the similarities.
|
||||
|
||||
convergence_iter : int, optional, default: 15
|
||||
Number of iterations with no change in the number
|
||||
of estimated clusters that stops the convergence.
|
||||
|
||||
max_iter : int, optional, default: 200
|
||||
Maximum number of iterations
|
||||
|
||||
damping : float, optional, default: 0.5
|
||||
Damping factor between 0.5 and 1.
|
||||
|
||||
copy : boolean, optional, default: True
|
||||
If copy is False, the affinity matrix is modified inplace by the
|
||||
algorithm, for memory efficiency
|
||||
|
||||
verbose : boolean, optional, default: False
|
||||
The verbosity level
|
||||
|
||||
return_n_iter : bool, default False
|
||||
Whether or not to return the number of iterations.
|
||||
|
||||
random_state : int or np.random.RandomStateInstance, default: 0
|
||||
Pseudo-random number generator to control the starting state.
|
||||
Use an int for reproducible results across function calls.
|
||||
See the :term:`Glossary <random_state>`.
|
||||
|
||||
.. versionadded:: 0.23
|
||||
this parameter was previously hardcoded as 0.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
cluster_centers_indices : array, shape (n_clusters,)
|
||||
index of clusters centers
|
||||
|
||||
labels : array, shape (n_samples,)
|
||||
cluster labels for each point
|
||||
|
||||
n_iter : int
|
||||
number of iterations run. Returned only if `return_n_iter` is
|
||||
set to True.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
|
||||
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
|
||||
|
||||
When the algorithm does not converge, it returns an empty array as
|
||||
``cluster_center_indices`` and ``-1`` as label for each training sample.
|
||||
|
||||
When all training samples have equal similarities and equal preferences,
|
||||
the assignment of cluster centers and labels depends on the preference.
|
||||
If the preference is smaller than the similarities, a single cluster center
|
||||
and label ``0`` for every sample will be returned. Otherwise, every
|
||||
training sample becomes its own cluster center and is assigned a unique
|
||||
label.
|
||||
|
||||
References
|
||||
----------
|
||||
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
|
||||
Between Data Points", Science Feb. 2007
|
||||
"""
|
||||
S = as_float_array(S, copy=copy)
|
||||
n_samples = S.shape[0]
|
||||
|
||||
if S.shape[0] != S.shape[1]:
|
||||
raise ValueError("S must be a square array (shape=%s)" % repr(S.shape))
|
||||
|
||||
if preference is None:
|
||||
preference = np.median(S)
|
||||
if damping < 0.5 or damping >= 1:
|
||||
raise ValueError('damping must be >= 0.5 and < 1')
|
||||
|
||||
preference = np.array(preference)
|
||||
|
||||
if (n_samples == 1 or
|
||||
_equal_similarities_and_preferences(S, preference)):
|
||||
# It makes no sense to run the algorithm in this case, so return 1 or
|
||||
# n_samples clusters, depending on preferences
|
||||
warnings.warn("All samples have mutually equal similarities. "
|
||||
"Returning arbitrary cluster center(s).")
|
||||
if preference.flat[0] >= S.flat[n_samples - 1]:
|
||||
return ((np.arange(n_samples), np.arange(n_samples), 0)
|
||||
if return_n_iter
|
||||
else (np.arange(n_samples), np.arange(n_samples)))
|
||||
else:
|
||||
return ((np.array([0]), np.array([0] * n_samples), 0)
|
||||
if return_n_iter
|
||||
else (np.array([0]), np.array([0] * n_samples)))
|
||||
|
||||
if random_state == 'warn':
|
||||
warnings.warn(("'random_state' has been introduced in 0.23. "
|
||||
"It will be set to None starting from 0.25 which "
|
||||
"means that results will differ at every function "
|
||||
"call. Set 'random_state' to None to silence this "
|
||||
"warning, or to 0 to keep the behavior of versions "
|
||||
"<0.23."),
|
||||
FutureWarning)
|
||||
random_state = 0
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
# Place preference on the diagonal of S
|
||||
S.flat[::(n_samples + 1)] = preference
|
||||
|
||||
A = np.zeros((n_samples, n_samples))
|
||||
R = np.zeros((n_samples, n_samples)) # Initialize messages
|
||||
# Intermediate results
|
||||
tmp = np.zeros((n_samples, n_samples))
|
||||
|
||||
# Remove degeneracies
|
||||
S += ((np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) *
|
||||
random_state.randn(n_samples, n_samples))
|
||||
|
||||
# Execute parallel affinity propagation updates
|
||||
e = np.zeros((n_samples, convergence_iter))
|
||||
|
||||
ind = np.arange(n_samples)
|
||||
|
||||
for it in range(max_iter):
|
||||
# tmp = A + S; compute responsibilities
|
||||
np.add(A, S, tmp)
|
||||
I = np.argmax(tmp, axis=1)
|
||||
Y = tmp[ind, I] # np.max(A + S, axis=1)
|
||||
tmp[ind, I] = -np.inf
|
||||
Y2 = np.max(tmp, axis=1)
|
||||
|
||||
# tmp = Rnew
|
||||
np.subtract(S, Y[:, None], tmp)
|
||||
tmp[ind, I] = S[ind, I] - Y2
|
||||
|
||||
# Damping
|
||||
tmp *= 1 - damping
|
||||
R *= damping
|
||||
R += tmp
|
||||
|
||||
# tmp = Rp; compute availabilities
|
||||
np.maximum(R, 0, tmp)
|
||||
tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1]
|
||||
|
||||
# tmp = -Anew
|
||||
tmp -= np.sum(tmp, axis=0)
|
||||
dA = np.diag(tmp).copy()
|
||||
tmp.clip(0, np.inf, tmp)
|
||||
tmp.flat[::n_samples + 1] = dA
|
||||
|
||||
# Damping
|
||||
tmp *= 1 - damping
|
||||
A *= damping
|
||||
A -= tmp
|
||||
|
||||
# Check for convergence
|
||||
E = (np.diag(A) + np.diag(R)) > 0
|
||||
e[:, it % convergence_iter] = E
|
||||
K = np.sum(E, axis=0)
|
||||
|
||||
if it >= convergence_iter:
|
||||
se = np.sum(e, axis=1)
|
||||
unconverged = (np.sum((se == convergence_iter) + (se == 0))
|
||||
!= n_samples)
|
||||
if (not unconverged and (K > 0)) or (it == max_iter):
|
||||
never_converged = False
|
||||
if verbose:
|
||||
print("Converged after %d iterations." % it)
|
||||
break
|
||||
else:
|
||||
never_converged = True
|
||||
if verbose:
|
||||
print("Did not converge")
|
||||
|
||||
I = np.flatnonzero(E)
|
||||
K = I.size # Identify exemplars
|
||||
|
||||
if K > 0 and not never_converged:
|
||||
c = np.argmax(S[:, I], axis=1)
|
||||
c[I] = np.arange(K) # Identify clusters
|
||||
# Refine the final set of exemplars and clusters and return results
|
||||
for k in range(K):
|
||||
ii = np.where(c == k)[0]
|
||||
j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
|
||||
I[k] = ii[j]
|
||||
|
||||
c = np.argmax(S[:, I], axis=1)
|
||||
c[I] = np.arange(K)
|
||||
labels = I[c]
|
||||
# Reduce labels to a sorted, gapless, list
|
||||
cluster_centers_indices = np.unique(labels)
|
||||
labels = np.searchsorted(cluster_centers_indices, labels)
|
||||
else:
|
||||
warnings.warn("Affinity propagation did not converge, this model "
|
||||
"will not have any cluster centers.", ConvergenceWarning)
|
||||
labels = np.array([-1] * n_samples)
|
||||
cluster_centers_indices = []
|
||||
|
||||
if return_n_iter:
|
||||
return cluster_centers_indices, labels, it + 1
|
||||
else:
|
||||
return cluster_centers_indices, labels
|
||||
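# --- Illustrative usage sketch (not part of scikit-learn) -------------------
# A minimal call to the ``affinity_propagation`` function defined above, using
# negative squared Euclidean distances as similarities (the same convention as
# the AffinityPropagation estimator further down). The helper name
# ``_demo_affinity_propagation`` is hypothetical and relies on the
# module-level imports (``np``, ``euclidean_distances``).
def _demo_affinity_propagation():
    X = np.array([[0., 0.], [0.1, 0.], [5., 5.], [5.1, 5.]])
    S = -euclidean_distances(X, squared=True)
    # Returns exemplar indices and one cluster label per sample.
    centers, labels = affinity_propagation(S, random_state=0)
    return centers, labels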
|
||||
|
||||
###############################################################################
|
||||
|
||||
class AffinityPropagation(ClusterMixin, BaseEstimator):
|
||||
"""Perform Affinity Propagation Clustering of data.
|
||||
|
||||
Read more in the :ref:`User Guide <affinity_propagation>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
damping : float, default=0.5
|
||||
Damping factor (between 0.5 and 1) is the extent to
which the current value is maintained relative to
incoming values (weighted 1 - damping). This is to
avoid numerical oscillations when updating these
values (messages).
|
||||
|
||||
max_iter : int, default=200
|
||||
Maximum number of iterations.
|
||||
|
||||
convergence_iter : int, default=15
|
||||
Number of iterations with no change in the number
|
||||
of estimated clusters that stops the convergence.
|
||||
|
||||
copy : bool, default=True
|
||||
Make a copy of input data.
|
||||
|
||||
preference : array-like of shape (n_samples,) or float, default=None
|
||||
Preferences for each point - points with larger values of
|
||||
preferences are more likely to be chosen as exemplars. The number
|
||||
of exemplars, i.e. of clusters, is influenced by the input
|
||||
preferences value. If the preferences are not passed as arguments,
|
||||
they will be set to the median of the input similarities.
|
||||
|
||||
affinity : {'euclidean', 'precomputed'}, default='euclidean'
|
||||
Which affinity to use. At the moment 'precomputed' and
'euclidean' are supported. 'euclidean' uses the
negative squared Euclidean distance between points.
|
||||
|
||||
verbose : bool, default=False
|
||||
Whether to be verbose.
|
||||
|
||||
random_state : int, RandomState instance or None, default=0
|
||||
Pseudo-random number generator to control the starting state.
|
||||
Use an int for reproducible results across function calls.
|
||||
See the :term:`Glossary <random_state>`.
|
||||
|
||||
.. versionadded:: 0.23
|
||||
This parameter was previously hardcoded as 0.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
cluster_centers_indices_ : ndarray of shape (n_clusters,)
|
||||
Indices of cluster centers
|
||||
|
||||
cluster_centers_ : ndarray of shape (n_clusters, n_features)
|
||||
Cluster centers (if affinity != ``precomputed``).
|
||||
|
||||
labels_ : ndarray of shape (n_samples,)
|
||||
Labels of each point
|
||||
|
||||
affinity_matrix_ : ndarray of shape (n_samples, n_samples)
|
||||
Stores the affinity matrix used in ``fit``.
|
||||
|
||||
n_iter_ : int
|
||||
Number of iterations taken to converge.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
|
||||
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
|
||||
|
||||
The algorithmic complexity of affinity propagation is quadratic
|
||||
in the number of points.
|
||||
|
||||
When ``fit`` does not converge, ``cluster_centers_`` becomes an empty
|
||||
array and all training samples will be labelled as ``-1``. In addition,
|
||||
``predict`` will then label every sample as ``-1``.
|
||||
|
||||
When all training samples have equal similarities and equal preferences,
|
||||
the assignment of cluster centers and labels depends on the preference.
|
||||
If the preference is smaller than the similarities, ``fit`` will result in
|
||||
a single cluster center and label ``0`` for every sample. Otherwise, every
|
||||
training sample becomes its own cluster center and is assigned a unique
|
||||
label.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
|
||||
Between Data Points", Science Feb. 2007
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import AffinityPropagation
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 2], [1, 4], [1, 0],
|
||||
... [4, 2], [4, 4], [4, 0]])
|
||||
>>> clustering = AffinityPropagation(random_state=5).fit(X)
|
||||
>>> clustering
|
||||
AffinityPropagation(random_state=5)
|
||||
>>> clustering.labels_
|
||||
array([0, 0, 0, 1, 1, 1])
|
||||
>>> clustering.predict([[0, 0], [4, 4]])
|
||||
array([0, 1])
|
||||
>>> clustering.cluster_centers_
|
||||
array([[1, 2],
|
||||
[4, 2]])
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, damping=.5, max_iter=200, convergence_iter=15,
|
||||
copy=True, preference=None, affinity='euclidean',
|
||||
verbose=False, random_state='warn'):
|
||||
|
||||
self.damping = damping
|
||||
self.max_iter = max_iter
|
||||
self.convergence_iter = convergence_iter
|
||||
self.copy = copy
|
||||
self.verbose = verbose
|
||||
self.preference = preference
|
||||
self.affinity = affinity
|
||||
self.random_state = random_state
|
||||
|
||||
@property
|
||||
def _pairwise(self):
|
||||
return self.affinity == "precomputed"
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the clustering from features, or affinity matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like or sparse matrix, shape (n_samples, n_features), or \
|
||||
array-like, shape (n_samples, n_samples)
|
||||
Training instances to cluster, or similarities / affinities between
|
||||
instances if ``affinity='precomputed'``. If a sparse feature matrix
|
||||
is provided, it will be converted into a sparse ``csr_matrix``.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
|
||||
"""
|
||||
if self.affinity == "precomputed":
|
||||
accept_sparse = False
|
||||
else:
|
||||
accept_sparse = 'csr'
|
||||
X = self._validate_data(X, accept_sparse=accept_sparse)
|
||||
if self.affinity == "precomputed":
|
||||
self.affinity_matrix_ = X
|
||||
elif self.affinity == "euclidean":
|
||||
self.affinity_matrix_ = -euclidean_distances(X, squared=True)
|
||||
else:
|
||||
raise ValueError("Affinity must be 'precomputed' or "
|
||||
"'euclidean'. Got %s instead"
|
||||
% str(self.affinity))
|
||||
|
||||
self.cluster_centers_indices_, self.labels_, self.n_iter_ = \
|
||||
affinity_propagation(
|
||||
self.affinity_matrix_, preference=self.preference,
|
||||
max_iter=self.max_iter,
|
||||
convergence_iter=self.convergence_iter, damping=self.damping,
|
||||
copy=self.copy, verbose=self.verbose, return_n_iter=True,
|
||||
random_state=self.random_state)
|
||||
|
||||
if self.affinity != "precomputed":
|
||||
self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the closest cluster each sample in X belongs to.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like or sparse matrix, shape (n_samples, n_features)
|
||||
New data to predict. If a sparse matrix is provided, it will be
|
||||
converted into a sparse ``csr_matrix``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : ndarray, shape (n_samples,)
|
||||
Cluster labels.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = check_array(X)
|
||||
if not hasattr(self, "cluster_centers_"):
|
||||
raise ValueError("Predict method is not supported when "
|
||||
"affinity='precomputed'.")
|
||||
|
||||
if self.cluster_centers_.shape[0] > 0:
|
||||
return pairwise_distances_argmin(X, self.cluster_centers_)
|
||||
else:
|
||||
warnings.warn("This model does not have any cluster centers "
|
||||
"because affinity propagation did not converge. "
|
||||
"Labeling every sample as '-1'.", ConvergenceWarning)
|
||||
return np.array([-1] * X.shape[0])
|
||||
|
||||
def fit_predict(self, X, y=None):
|
||||
"""Fit the clustering from features or affinity matrix, and return
|
||||
cluster labels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like or sparse matrix, shape (n_samples, n_features), or \
|
||||
array-like, shape (n_samples, n_samples)
|
||||
Training instances to cluster, or similarities / affinities between
|
||||
instances if ``affinity='precomputed'``. If a sparse feature matrix
|
||||
is provided, it will be converted into a sparse ``csr_matrix``.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : ndarray, shape (n_samples,)
|
||||
Cluster labels.
|
||||
"""
|
||||
return super().fit_predict(X, y)
|
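# --- Illustrative sketch (not part of scikit-learn) -------------------------
# The ``affinity='precomputed'`` path described in ``fit``: the caller passes
# the similarity matrix directly, so no ``cluster_centers_`` are stored and
# ``predict`` is unavailable. The helper name is hypothetical.
def _demo_precomputed_affinity():
    X = np.array([[0., 0.], [0.1, 0.], [5., 5.], [5.1, 5.]])
    S = -euclidean_distances(X, squared=True)
    ap = AffinityPropagation(affinity='precomputed', random_state=0).fit(S)
    return ap.cluster_centers_indices_, ap.labels_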
1079
venv/Lib/site-packages/sklearn/cluster/_agglomerative.py
Normal file
1079
venv/Lib/site-packages/sklearn/cluster/_agglomerative.py
Normal file
File diff suppressed because it is too large
546
venv/Lib/site-packages/sklearn/cluster/_bicluster.py
Normal file
546
venv/Lib/site-packages/sklearn/cluster/_bicluster.py
Normal file
|
@@ -0,0 +1,546 @@
|
|||
"""Spectral biclustering algorithms."""
|
||||
# Authors : Kemal Eren
|
||||
# License: BSD 3 clause
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy.linalg import norm
|
||||
from scipy.sparse import dia_matrix, issparse
|
||||
from scipy.sparse.linalg import eigsh, svds
|
||||
|
||||
from . import KMeans, MiniBatchKMeans
|
||||
from ..base import BaseEstimator, BiclusterMixin
|
||||
from ..utils import check_random_state
|
||||
|
||||
from ..utils.extmath import (make_nonnegative, randomized_svd,
|
||||
safe_sparse_dot)
|
||||
|
||||
from ..utils.validation import assert_all_finite, _deprecate_positional_args
|
||||
|
||||
|
||||
__all__ = ['SpectralCoclustering',
|
||||
'SpectralBiclustering']
|
||||
|
||||
|
||||
def _scale_normalize(X):
|
||||
"""Normalize ``X`` by scaling rows and columns independently.
|
||||
|
||||
Returns the normalized matrix and the row and column scaling
|
||||
factors.
|
||||
|
||||
"""
|
||||
X = make_nonnegative(X)
|
||||
row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
|
||||
col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
|
||||
row_diag = np.where(np.isnan(row_diag), 0, row_diag)
|
||||
col_diag = np.where(np.isnan(col_diag), 0, col_diag)
|
||||
if issparse(X):
|
||||
n_rows, n_cols = X.shape
|
||||
r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
|
||||
c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
|
||||
an = r * X * c
|
||||
else:
|
||||
an = row_diag[:, np.newaxis] * X * col_diag
|
||||
return an, row_diag, col_diag
|
||||
|
||||
|
||||
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
|
||||
"""Normalize rows and columns of ``X`` simultaneously so that all
|
||||
rows sum to one constant and all columns sum to a different
|
||||
constant.
|
||||
|
||||
"""
|
||||
# According to paper, this can also be done more efficiently with
|
||||
# deviation reduction and balancing algorithms.
|
||||
X = make_nonnegative(X)
|
||||
X_scaled = X
|
||||
for _ in range(max_iter):
|
||||
X_new, _, _ = _scale_normalize(X_scaled)
|
||||
if issparse(X):
|
||||
dist = norm(X_scaled.data - X_new.data)
|
||||
else:
|
||||
dist = norm(X_scaled - X_new)
|
||||
X_scaled = X_new
|
||||
if dist is not None and dist < tol:
|
||||
break
|
||||
return X_scaled
|
||||
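# --- Illustrative sanity check (not part of scikit-learn) -------------------
# Numerically verifies the property stated in the docstring above: after
# bistochastic normalization all row sums are (approximately) one constant and
# all column sums another. The helper name is hypothetical.
def _demo_bistochastic_property():
    rng = np.random.RandomState(0)
    X = rng.rand(6, 4) + 0.1
    Xn = _bistochastic_normalize(X)
    row_spread = np.ptp(Xn.sum(axis=1))   # ~0 when all row sums agree
    col_spread = np.ptp(Xn.sum(axis=0))   # ~0 when all column sums agree
    return row_spread, col_spread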
|
||||
|
||||
def _log_normalize(X):
|
||||
"""Normalize ``X`` according to Kluger's log-interactions scheme."""
|
||||
X = make_nonnegative(X, min_value=1)
|
||||
if issparse(X):
|
||||
raise ValueError("Cannot compute log of a sparse matrix,"
|
||||
" because log(x) diverges to -infinity as x"
|
||||
" goes to 0.")
|
||||
L = np.log(X)
|
||||
row_avg = L.mean(axis=1)[:, np.newaxis]
|
||||
col_avg = L.mean(axis=0)
|
||||
avg = L.mean()
|
||||
return L - row_avg - col_avg + avg
|
||||
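# --- Illustrative sanity check (not part of scikit-learn) -------------------
# Kluger's log-interactions transform above is a double-centering of
# ``log(X)``, so the result should have (numerically) zero row means and zero
# column means. The helper name is hypothetical.
def _demo_log_normalize_centering():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 7) + 1.0
    K = _log_normalize(X)
    return np.allclose(K.mean(axis=1), 0), np.allclose(K.mean(axis=0), 0)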
|
||||
|
||||
class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
|
||||
"""Base class for spectral biclustering."""
|
||||
|
||||
@abstractmethod
|
||||
def __init__(self, n_clusters=3, svd_method="randomized",
|
||||
n_svd_vecs=None, mini_batch=False, init="k-means++",
|
||||
n_init=10, n_jobs='deprecated', random_state=None):
|
||||
self.n_clusters = n_clusters
|
||||
self.svd_method = svd_method
|
||||
self.n_svd_vecs = n_svd_vecs
|
||||
self.mini_batch = mini_batch
|
||||
self.init = init
|
||||
self.n_init = n_init
|
||||
self.n_jobs = n_jobs
|
||||
self.random_state = random_state
|
||||
|
||||
def _check_parameters(self):
|
||||
legal_svd_methods = ('randomized', 'arpack')
|
||||
if self.svd_method not in legal_svd_methods:
|
||||
raise ValueError("Unknown SVD method: '{0}'. svd_method must be"
|
||||
" one of {1}.".format(self.svd_method,
|
||||
legal_svd_methods))
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Creates a biclustering for X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape (n_samples, n_features)
|
||||
|
||||
y : Ignored
|
||||
|
||||
"""
|
||||
if self.n_jobs != 'deprecated':
|
||||
warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
|
||||
" removed in 0.25.", FutureWarning)
|
||||
|
||||
X = self._validate_data(X, accept_sparse='csr', dtype=np.float64)
|
||||
self._check_parameters()
|
||||
self._fit(X)
|
||||
return self
|
||||
|
||||
def _svd(self, array, n_components, n_discard):
|
||||
"""Returns first `n_components` left and right singular
|
||||
vectors u and v, discarding the first `n_discard`.
|
||||
|
||||
"""
|
||||
if self.svd_method == 'randomized':
|
||||
kwargs = {}
|
||||
if self.n_svd_vecs is not None:
|
||||
kwargs['n_oversamples'] = self.n_svd_vecs
|
||||
u, _, vt = randomized_svd(array, n_components,
|
||||
random_state=self.random_state,
|
||||
**kwargs)
|
||||
|
||||
elif self.svd_method == 'arpack':
|
||||
u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
|
||||
if np.any(np.isnan(vt)):
|
||||
# some eigenvalues of A * A.T are negative, causing
|
||||
# sqrt() to be np.nan. This causes some vectors in vt
|
||||
# to be np.nan.
|
||||
A = safe_sparse_dot(array.T, array)
|
||||
random_state = check_random_state(self.random_state)
|
||||
# initialize with [-1,1] as in ARPACK
|
||||
v0 = random_state.uniform(-1, 1, A.shape[0])
|
||||
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
|
||||
vt = v.T
|
||||
if np.any(np.isnan(u)):
|
||||
A = safe_sparse_dot(array, array.T)
|
||||
random_state = check_random_state(self.random_state)
|
||||
# initialize with [-1,1] as in ARPACK
|
||||
v0 = random_state.uniform(-1, 1, A.shape[0])
|
||||
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
|
||||
|
||||
assert_all_finite(u)
|
||||
assert_all_finite(vt)
|
||||
u = u[:, n_discard:]
|
||||
vt = vt[n_discard:]
|
||||
return u, vt.T
|
||||
|
||||
def _k_means(self, data, n_clusters):
|
||||
if self.mini_batch:
|
||||
model = MiniBatchKMeans(n_clusters,
|
||||
init=self.init,
|
||||
n_init=self.n_init,
|
||||
random_state=self.random_state)
|
||||
else:
|
||||
model = KMeans(n_clusters, init=self.init,
|
||||
n_init=self.n_init, n_jobs=self.n_jobs,
|
||||
random_state=self.random_state)
|
||||
model.fit(data)
|
||||
centroid = model.cluster_centers_
|
||||
labels = model.labels_
|
||||
return centroid, labels
|
||||
|
||||
|
||||
class SpectralCoclustering(BaseSpectral):
|
||||
"""Spectral Co-Clustering algorithm (Dhillon, 2001).
|
||||
|
||||
Clusters rows and columns of an array `X` to solve the relaxed
|
||||
normalized cut of the bipartite graph created from `X` as follows:
|
||||
the edge between row vertex `i` and column vertex `j` has weight
|
||||
`X[i, j]`.
|
||||
|
||||
The resulting bicluster structure is block-diagonal, since each
|
||||
row and each column belongs to exactly one bicluster.
|
||||
|
||||
Supports sparse matrices, as long as they are nonnegative.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_coclustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_clusters : int, default=3
|
||||
The number of biclusters to find.
|
||||
|
||||
svd_method : {'randomized', 'arpack'}, default='randomized'
|
||||
Selects the algorithm for finding singular vectors. May be
|
||||
'randomized' or 'arpack'. If 'randomized', use
|
||||
:func:`sklearn.utils.extmath.randomized_svd`, which may be faster
|
||||
for large matrices. If 'arpack', use
|
||||
:func:`scipy.sparse.linalg.svds`, which is more accurate, but
|
||||
possibly slower in some cases.
|
||||
|
||||
n_svd_vecs : int, default=None
|
||||
Number of vectors to use in calculating the SVD. Corresponds
|
||||
to `ncv` when `svd_method=arpack` and `n_oversamples` when
|
||||
`svd_method` is 'randomized'.
|
||||
|
||||
mini_batch : bool, default=False
|
||||
Whether to use mini-batch k-means, which is faster but may get
|
||||
different results.
|
||||
|
||||
init : {'k-means++', 'random'} or ndarray of shape \
|
||||
(n_clusters, n_features), default='k-means++'
|
||||
Method for initialization of k-means algorithm; defaults to
|
||||
'k-means++'.
|
||||
|
||||
n_init : int, default=10
|
||||
Number of random initializations that are tried with the
|
||||
k-means algorithm.
|
||||
|
||||
If mini-batch k-means is used, the best initialization is
|
||||
chosen and the algorithm runs once. Otherwise, the algorithm
|
||||
is run for each initialization and the best solution chosen.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to use for the computation. This works by breaking
|
||||
down the pairwise matrix into n_jobs even slices and computing them in
|
||||
parallel.
|
||||
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
.. deprecated:: 0.23
|
||||
``n_jobs`` was deprecated in version 0.23 and will be removed in
|
||||
0.25.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Used for randomizing the singular value decomposition and the k-means
|
||||
initialization. Use an int to make the randomness deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
rows_ : array-like of shape (n_row_clusters, n_rows)
|
||||
Results of the clustering. `rows[i, r]` is True if
|
||||
cluster `i` contains row `r`. Available only after calling ``fit``.
|
||||
|
||||
columns_ : array-like of shape (n_column_clusters, n_columns)
|
||||
Results of the clustering, like `rows`.
|
||||
|
||||
row_labels_ : array-like of shape (n_rows,)
|
||||
The bicluster label of each row.
|
||||
|
||||
column_labels_ : array-like of shape (n_cols,)
|
||||
The bicluster label of each column.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import SpectralCoclustering
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [2, 1], [1, 0],
|
||||
... [4, 7], [3, 5], [3, 6]])
|
||||
>>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
|
||||
>>> clustering.row_labels_ #doctest: +SKIP
|
||||
array([0, 1, 1, 0, 0, 0], dtype=int32)
|
||||
>>> clustering.column_labels_ #doctest: +SKIP
|
||||
array([0, 0], dtype=int32)
|
||||
>>> clustering
|
||||
SpectralCoclustering(n_clusters=2, random_state=0)
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
* Dhillon, Inderjit S, 2001. `Co-clustering documents and words using
|
||||
bipartite spectral graph partitioning
|
||||
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.140.3011>`__.
|
||||
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_clusters=3, *, svd_method='randomized',
|
||||
n_svd_vecs=None, mini_batch=False, init='k-means++',
|
||||
n_init=10, n_jobs='deprecated', random_state=None):
|
||||
super().__init__(n_clusters,
|
||||
svd_method,
|
||||
n_svd_vecs,
|
||||
mini_batch,
|
||||
init,
|
||||
n_init,
|
||||
n_jobs,
|
||||
random_state)
|
||||
|
||||
def _fit(self, X):
|
||||
normalized_data, row_diag, col_diag = _scale_normalize(X)
|
||||
n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
|
||||
u, v = self._svd(normalized_data, n_sv, n_discard=1)
|
||||
z = np.vstack((row_diag[:, np.newaxis] * u,
|
||||
col_diag[:, np.newaxis] * v))
|
||||
|
||||
_, labels = self._k_means(z, self.n_clusters)
|
||||
|
||||
n_rows = X.shape[0]
|
||||
self.row_labels_ = labels[:n_rows]
|
||||
self.column_labels_ = labels[n_rows:]
|
||||
|
||||
self.rows_ = np.vstack([self.row_labels_ == c
|
||||
for c in range(self.n_clusters)])
|
||||
self.columns_ = np.vstack([self.column_labels_ == c
|
||||
for c in range(self.n_clusters)])
|
||||
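# --- Illustrative end-to-end sketch (not part of scikit-learn) --------------
# Co-cluster a noisy block-diagonal matrix; with two strong diagonal blocks
# the estimator should recover one bicluster per block. The helper name is
# hypothetical.
def _demo_coclustering():
    rng = np.random.RandomState(0)
    X = np.kron(np.array([[5., 1.], [1., 5.]]), np.ones((5, 4)))
    X += 0.1 * rng.rand(*X.shape)
    model = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
    return model.row_labels_, model.column_labels_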
|
||||
|
||||
class SpectralBiclustering(BaseSpectral):
|
||||
"""Spectral biclustering (Kluger, 2003).
|
||||
|
||||
Partitions rows and columns under the assumption that the data has
|
||||
an underlying checkerboard structure. For instance, if there are
|
||||
two row partitions and three column partitions, each row will
|
||||
belong to three biclusters, and each column will belong to two
|
||||
biclusters. The outer product of the corresponding row and column
|
||||
label vectors gives this checkerboard structure.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_biclustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
|
||||
The number of row and column clusters in the checkerboard
|
||||
structure.
|
||||
|
||||
method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
|
||||
Method of normalizing and converting singular vectors into
|
||||
biclusters. May be one of 'scale', 'bistochastic', or 'log'.
|
||||
The authors recommend using 'log'. If the data is sparse,
|
||||
however, log normalization will not work, which is why the
|
||||
default is 'bistochastic'.
|
||||
|
||||
.. warning::
|
||||
if `method='log'`, the data must not be sparse.
|
||||
|
||||
n_components : int, default=6
|
||||
Number of singular vectors to check.
|
||||
|
||||
n_best : int, default=3
|
||||
Number of best singular vectors to which to project the data
|
||||
for clustering.
|
||||
|
||||
svd_method : {'randomized', 'arpack'}, default='randomized'
|
||||
Selects the algorithm for finding singular vectors. May be
|
||||
'randomized' or 'arpack'. If 'randomized', uses
|
||||
:func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
|
||||
for large matrices. If 'arpack', uses
|
||||
`scipy.sparse.linalg.svds`, which is more accurate, but
|
||||
possibly slower in some cases.
|
||||
|
||||
n_svd_vecs : int, default=None
|
||||
Number of vectors to use in calculating the SVD. Corresponds
|
||||
to `ncv` when `svd_method=arpack` and `n_oversamples` when
|
||||
`svd_method` is 'randomized'.
|
||||
|
||||
mini_batch : bool, default=False
|
||||
Whether to use mini-batch k-means, which is faster but may get
|
||||
different results.
|
||||
|
||||
init : {'k-means++', 'random'} or ndarray of (n_clusters, n_features), \
|
||||
default='k-means++'
|
||||
Method for initialization of k-means algorithm; defaults to
|
||||
'k-means++'.
|
||||
|
||||
n_init : int, default=10
|
||||
Number of random initializations that are tried with the
|
||||
k-means algorithm.
|
||||
|
||||
If mini-batch k-means is used, the best initialization is
|
||||
chosen and the algorithm runs once. Otherwise, the algorithm
|
||||
is run for each initialization and the best solution chosen.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to use for the computation. This works by breaking
|
||||
down the pairwise matrix into n_jobs even slices and computing them in
|
||||
parallel.
|
||||
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
.. deprecated:: 0.23
|
||||
``n_jobs`` was deprecated in version 0.23 and will be removed in
|
||||
0.25.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Used for randomizing the singular value decomposition and the k-means
|
||||
initialization. Use an int to make the randomness deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
rows_ : array-like of shape (n_row_clusters, n_rows)
|
||||
Results of the clustering. `rows[i, r]` is True if
|
||||
cluster `i` contains row `r`. Available only after calling ``fit``.
|
||||
|
||||
columns_ : array-like of shape (n_column_clusters, n_columns)
|
||||
Results of the clustering, like `rows`.
|
||||
|
||||
row_labels_ : array-like of shape (n_rows,)
|
||||
Row partition labels.
|
||||
|
||||
column_labels_ : array-like of shape (n_cols,)
|
||||
Column partition labels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import SpectralBiclustering
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [2, 1], [1, 0],
|
||||
... [4, 7], [3, 5], [3, 6]])
|
||||
>>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
|
||||
>>> clustering.row_labels_
|
||||
array([1, 1, 1, 0, 0, 0], dtype=int32)
|
||||
>>> clustering.column_labels_
|
||||
array([0, 1], dtype=int32)
|
||||
>>> clustering
|
||||
SpectralBiclustering(n_clusters=2, random_state=0)
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
* Kluger, Yuval, et. al., 2003. `Spectral biclustering of microarray
|
||||
data: coclustering genes and conditions
|
||||
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.135.1608>`__.
|
||||
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_clusters=3, *, method='bistochastic',
|
||||
n_components=6, n_best=3, svd_method='randomized',
|
||||
n_svd_vecs=None, mini_batch=False, init='k-means++',
|
||||
n_init=10, n_jobs='deprecated', random_state=None):
|
||||
super().__init__(n_clusters,
|
||||
svd_method,
|
||||
n_svd_vecs,
|
||||
mini_batch,
|
||||
init,
|
||||
n_init,
|
||||
n_jobs,
|
||||
random_state)
|
||||
self.method = method
|
||||
self.n_components = n_components
|
||||
self.n_best = n_best
|
||||
|
||||
def _check_parameters(self):
|
||||
super()._check_parameters()
|
||||
legal_methods = ('bistochastic', 'scale', 'log')
|
||||
if self.method not in legal_methods:
|
||||
raise ValueError("Unknown method: '{0}'. method must be"
|
||||
" one of {1}.".format(self.method, legal_methods))
|
||||
try:
|
||||
int(self.n_clusters)
|
||||
except TypeError:
|
||||
try:
|
||||
r, c = self.n_clusters
|
||||
int(r)
|
||||
int(c)
|
||||
except (ValueError, TypeError):
|
||||
raise ValueError("Incorrect parameter n_clusters has value:"
|
||||
" {}. It should either be a single integer"
|
||||
" or an iterable with two integers:"
|
||||
" (n_row_clusters, n_column_clusters)".format(self.n_clusters))
|
||||
if self.n_components < 1:
|
||||
raise ValueError("Parameter n_components must be greater than 0,"
|
||||
" but its value is {}".format(self.n_components))
|
||||
if self.n_best < 1:
|
||||
raise ValueError("Parameter n_best must be greater than 0,"
|
||||
" but its value is {}".format(self.n_best))
|
||||
if self.n_best > self.n_components:
|
||||
raise ValueError("n_best cannot be larger than"
|
||||
" n_components, but {} > {}"
|
||||
"".format(self.n_best, self.n_components))
|
||||
|
||||
def _fit(self, X):
|
||||
n_sv = self.n_components
|
||||
if self.method == 'bistochastic':
|
||||
normalized_data = _bistochastic_normalize(X)
|
||||
n_sv += 1
|
||||
elif self.method == 'scale':
|
||||
normalized_data, _, _ = _scale_normalize(X)
|
||||
n_sv += 1
|
||||
elif self.method == 'log':
|
||||
normalized_data = _log_normalize(X)
|
||||
n_discard = 0 if self.method == 'log' else 1
|
||||
u, v = self._svd(normalized_data, n_sv, n_discard)
|
||||
ut = u.T
|
||||
vt = v.T
|
||||
|
||||
try:
|
||||
n_row_clusters, n_col_clusters = self.n_clusters
|
||||
except TypeError:
|
||||
n_row_clusters = n_col_clusters = self.n_clusters
|
||||
|
||||
best_ut = self._fit_best_piecewise(ut, self.n_best,
|
||||
n_row_clusters)
|
||||
|
||||
best_vt = self._fit_best_piecewise(vt, self.n_best,
|
||||
n_col_clusters)
|
||||
|
||||
self.row_labels_ = self._project_and_cluster(X, best_vt.T,
|
||||
n_row_clusters)
|
||||
|
||||
self.column_labels_ = self._project_and_cluster(X.T, best_ut.T,
|
||||
n_col_clusters)
|
||||
|
||||
self.rows_ = np.vstack([self.row_labels_ == label
|
||||
for label in range(n_row_clusters)
|
||||
for _ in range(n_col_clusters)])
|
||||
self.columns_ = np.vstack([self.column_labels_ == label
|
||||
for _ in range(n_row_clusters)
|
||||
for label in range(n_col_clusters)])
|
||||
|
||||
def _fit_best_piecewise(self, vectors, n_best, n_clusters):
|
||||
"""Find the ``n_best`` vectors that are best approximated by piecewise
|
||||
constant vectors.
|
||||
|
||||
The piecewise vectors are found by k-means; the best is chosen
|
||||
according to Euclidean distance.
|
||||
|
||||
"""
|
||||
def make_piecewise(v):
|
||||
centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
|
||||
return centroid[labels].ravel()
|
||||
piecewise_vectors = np.apply_along_axis(make_piecewise,
|
||||
axis=1, arr=vectors)
|
||||
dists = np.apply_along_axis(norm, axis=1,
|
||||
arr=(vectors - piecewise_vectors))
|
||||
result = vectors[np.argsort(dists)[:n_best]]
|
||||
return result
|
||||
|
||||
def _project_and_cluster(self, data, vectors, n_clusters):
|
||||
"""Project ``data`` to ``vectors`` and cluster the result."""
|
||||
projected = safe_sparse_dot(data, vectors)
|
||||
_, labels = self._k_means(projected, n_clusters)
|
||||
return labels
|
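# --- Illustrative sketch (not part of scikit-learn) -------------------------
# The checkerboard assumption described in the SpectralBiclustering docstring:
# rows are drawn from 2 partitions and columns from 3, so every row belongs to
# 3 biclusters and every column to 2. The helper name is hypothetical.
def _demo_checkerboard():
    rng = np.random.RandomState(0)
    row_part = np.repeat(np.arange(2), 10)        # 20 rows in 2 partitions
    col_part = np.repeat(np.arange(3), 10)        # 30 columns in 3 partitions
    means = rng.uniform(10, 100, size=(2, 3))     # one mean per checkerboard cell
    X = means[row_part][:, col_part] + rng.normal(size=(20, 30))
    model = SpectralBiclustering(n_clusters=(2, 3), random_state=0).fit(X)
    return model.row_labels_, model.column_labels_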
658
venv/Lib/site-packages/sklearn/cluster/_birch.py
Normal file
658
venv/Lib/site-packages/sklearn/cluster/_birch.py
Normal file
|
@@ -0,0 +1,658 @@
|
|||
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# Joel Nothman <joel.nothman@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
import numbers
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from math import sqrt
|
||||
|
||||
from ..metrics import pairwise_distances_argmin
|
||||
from ..metrics.pairwise import euclidean_distances
|
||||
from ..base import TransformerMixin, ClusterMixin, BaseEstimator
|
||||
from ..utils import check_array
|
||||
from ..utils.extmath import row_norms
|
||||
from ..utils.validation import check_is_fitted, _deprecate_positional_args
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from . import AgglomerativeClustering
|
||||
|
||||
|
||||
def _iterate_sparse_X(X):
|
||||
"""This little hack returns a densified row when iterating over a sparse
|
||||
matrix, instead of constructing a sparse matrix for every row, which
is expensive.
|
||||
"""
|
||||
n_samples = X.shape[0]
|
||||
X_indices = X.indices
|
||||
X_data = X.data
|
||||
X_indptr = X.indptr
|
||||
|
||||
for i in range(n_samples):
|
||||
row = np.zeros(X.shape[1])
|
||||
startptr, endptr = X_indptr[i], X_indptr[i + 1]
|
||||
nonzero_indices = X_indices[startptr:endptr]
|
||||
row[nonzero_indices] = X_data[startptr:endptr]
|
||||
yield row
|
||||
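# --- Illustrative sanity check (not part of scikit-learn) -------------------
# Iterating a CSR matrix with the helper above yields one densified row per
# sample, identical to ``toarray()``. The helper name is hypothetical and uses
# the module-level ``sparse`` and ``np`` imports.
def _demo_iterate_sparse_X():
    X = sparse.csr_matrix(np.array([[0., 2., 0.], [1., 0., 3.]]))
    rows = np.vstack(list(_iterate_sparse_X(X)))
    return np.allclose(rows, X.toarray())         # expected: True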
|
||||
|
||||
def _split_node(node, threshold, branching_factor):
|
||||
"""The node has to be split if there is no place for a new subcluster
|
||||
in the node.
|
||||
1. Two empty nodes and two empty subclusters are initialized.
|
||||
2. The pair of most distant subclusters is found.
|
||||
3. The properties of the empty subclusters and nodes are updated
|
||||
according to the nearest distance between the subclusters to the
|
||||
pair of distant subclusters.
|
||||
4. The two nodes are set as children to the two subclusters.
|
||||
"""
|
||||
new_subcluster1 = _CFSubcluster()
|
||||
new_subcluster2 = _CFSubcluster()
|
||||
new_node1 = _CFNode(
|
||||
threshold=threshold, branching_factor=branching_factor,
|
||||
is_leaf=node.is_leaf,
|
||||
n_features=node.n_features)
|
||||
new_node2 = _CFNode(
|
||||
threshold=threshold, branching_factor=branching_factor,
|
||||
is_leaf=node.is_leaf,
|
||||
n_features=node.n_features)
|
||||
new_subcluster1.child_ = new_node1
|
||||
new_subcluster2.child_ = new_node2
|
||||
|
||||
if node.is_leaf:
|
||||
if node.prev_leaf_ is not None:
|
||||
node.prev_leaf_.next_leaf_ = new_node1
|
||||
new_node1.prev_leaf_ = node.prev_leaf_
|
||||
new_node1.next_leaf_ = new_node2
|
||||
new_node2.prev_leaf_ = new_node1
|
||||
new_node2.next_leaf_ = node.next_leaf_
|
||||
if node.next_leaf_ is not None:
|
||||
node.next_leaf_.prev_leaf_ = new_node2
|
||||
|
||||
dist = euclidean_distances(
|
||||
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)
|
||||
n_clusters = dist.shape[0]
|
||||
|
||||
farthest_idx = np.unravel_index(
|
||||
dist.argmax(), (n_clusters, n_clusters))
|
||||
node1_dist, node2_dist = dist[(farthest_idx,)]
|
||||
|
||||
node1_closer = node1_dist < node2_dist
|
||||
for idx, subcluster in enumerate(node.subclusters_):
|
||||
if node1_closer[idx]:
|
||||
new_node1.append_subcluster(subcluster)
|
||||
new_subcluster1.update(subcluster)
|
||||
else:
|
||||
new_node2.append_subcluster(subcluster)
|
||||
new_subcluster2.update(subcluster)
|
||||
return new_subcluster1, new_subcluster2
|
||||
|
||||
|
||||
class _CFNode:
|
||||
"""Each node in a CFTree is called a CFNode.
|
||||
|
||||
The CFNode can have a maximum of branching_factor
|
||||
number of CFSubclusters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : float
|
||||
Threshold needed for a new subcluster to enter a CFSubcluster.
|
||||
|
||||
branching_factor : int
|
||||
Maximum number of CF subclusters in each node.
|
||||
|
||||
is_leaf : bool
|
||||
We need to know if the CFNode is a leaf or not, in order to
|
||||
retrieve the final subclusters.
|
||||
|
||||
n_features : int
|
||||
The number of features.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
subclusters_ : list
|
||||
List of subclusters for a particular CFNode.
|
||||
|
||||
prev_leaf_ : _CFNode
|
||||
Useful only if is_leaf is True.
|
||||
|
||||
next_leaf_ : _CFNode
|
||||
Useful only if is_leaf is True, for retrieving the final
subclusters.
|
||||
|
||||
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
Manipulate ``init_centroids_`` throughout rather than centroids_ since
|
||||
the centroids are just a view of the ``init_centroids_`` .
|
||||
|
||||
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
Manipulate ``init_sq_norm_`` throughout. Similar to ``init_centroids_``.
|
||||
|
||||
centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
View of ``init_centroids_``.
|
||||
|
||||
squared_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
View of ``init_sq_norm_``.
|
||||
|
||||
"""
|
||||
def __init__(self, *, threshold, branching_factor, is_leaf, n_features):
|
||||
self.threshold = threshold
|
||||
self.branching_factor = branching_factor
|
||||
self.is_leaf = is_leaf
|
||||
self.n_features = n_features
|
||||
|
||||
# The list of subclusters, centroids and squared norms
|
||||
# to manipulate throughout.
|
||||
self.subclusters_ = []
|
||||
self.init_centroids_ = np.zeros((branching_factor + 1, n_features))
|
||||
self.init_sq_norm_ = np.zeros((branching_factor + 1))
|
||||
self.squared_norm_ = []
|
||||
self.prev_leaf_ = None
|
||||
self.next_leaf_ = None
|
||||
|
||||
def append_subcluster(self, subcluster):
|
||||
n_samples = len(self.subclusters_)
|
||||
self.subclusters_.append(subcluster)
|
||||
self.init_centroids_[n_samples] = subcluster.centroid_
|
||||
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
|
||||
|
||||
# Keep centroids and squared norm as views. In this way,
# updating init_centroids_ and init_sq_norm_ is sufficient;
# the views pick up the changes automatically.
|
||||
self.centroids_ = self.init_centroids_[:n_samples + 1, :]
|
||||
self.squared_norm_ = self.init_sq_norm_[:n_samples + 1]
|
||||
|
||||
def update_split_subclusters(self, subcluster,
|
||||
new_subcluster1, new_subcluster2):
|
||||
"""Remove a subcluster from a node and update it with the
|
||||
split subclusters.
|
||||
"""
|
||||
ind = self.subclusters_.index(subcluster)
|
||||
self.subclusters_[ind] = new_subcluster1
|
||||
self.init_centroids_[ind] = new_subcluster1.centroid_
|
||||
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
|
||||
self.append_subcluster(new_subcluster2)
|
||||
|
||||
def insert_cf_subcluster(self, subcluster):
|
||||
"""Insert a new subcluster into the node."""
|
||||
if not self.subclusters_:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
threshold = self.threshold
|
||||
branching_factor = self.branching_factor
|
||||
# We need to find the closest subcluster among all the
|
||||
# subclusters so that we can insert our new subcluster.
|
||||
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
|
||||
dist_matrix *= -2.
|
||||
dist_matrix += self.squared_norm_
|
||||
closest_index = np.argmin(dist_matrix)
|
||||
closest_subcluster = self.subclusters_[closest_index]
|
||||
|
||||
# If the subcluster has a child, we need a recursive strategy.
|
||||
if closest_subcluster.child_ is not None:
|
||||
split_child = closest_subcluster.child_.insert_cf_subcluster(
|
||||
subcluster)
|
||||
|
||||
if not split_child:
|
||||
# If it is determined that the child need not be split, we
|
||||
# can just update the closest_subcluster
|
||||
closest_subcluster.update(subcluster)
|
||||
self.init_centroids_[closest_index] = \
|
||||
self.subclusters_[closest_index].centroid_
|
||||
self.init_sq_norm_[closest_index] = \
|
||||
self.subclusters_[closest_index].sq_norm_
|
||||
return False
|
||||
|
||||
# The child node had to be split: redistribute its subclusters
# between two new nodes and add a new subcluster in this parent
# node to accommodate the new child.
|
||||
else:
|
||||
new_subcluster1, new_subcluster2 = _split_node(
|
||||
closest_subcluster.child_, threshold, branching_factor)
|
||||
self.update_split_subclusters(
|
||||
closest_subcluster, new_subcluster1, new_subcluster2)
|
||||
|
||||
if len(self.subclusters_) > self.branching_factor:
|
||||
return True
|
||||
return False
|
||||
|
||||
# good to go!
|
||||
else:
|
||||
merged = closest_subcluster.merge_subcluster(
|
||||
subcluster, self.threshold)
|
||||
if merged:
|
||||
self.init_centroids_[closest_index] = \
|
||||
closest_subcluster.centroid_
|
||||
self.init_sq_norm_[closest_index] = \
|
||||
closest_subcluster.sq_norm_
|
||||
return False
|
||||
|
||||
# not close to any other subclusters, and we still
|
||||
# have space, so add.
|
||||
elif len(self.subclusters_) < self.branching_factor:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
# We do not have enough space, nor is it close enough to any
# other subcluster to merge. We need to split.
|
||||
else:
|
||||
self.append_subcluster(subcluster)
|
||||
return True
|
||||
|
||||
|
||||
class _CFSubcluster:
|
||||
"""Each subcluster in a CFNode is called a CFSubcluster.
|
||||
|
||||
A CFSubcluster can have a CFNode as its child.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
linear_sum : ndarray of shape (n_features,), default=None
|
||||
Sample. This is kept optional to allow initialization of empty
|
||||
subclusters.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
n_samples_ : int
|
||||
Number of samples that belong to each subcluster.
|
||||
|
||||
linear_sum_ : ndarray
|
||||
Linear sum of all the samples in a subcluster. Prevents holding
|
||||
all sample data in memory.
|
||||
|
||||
squared_sum_ : float
|
||||
Sum of the squared l2 norms of all samples belonging to a subcluster.
|
||||
|
||||
centroid_ : ndarray of shape (n_features,)
Centroid of the subcluster. Prevents recomputing the centroid when
|
||||
``CFNode.centroids_`` is called.
|
||||
|
||||
child_ : _CFNode
|
||||
Child Node of the subcluster. Once a given _CFNode is set as the child
|
||||
of the _CFNode, it is set to ``self.child_``.
|
||||
|
||||
sq_norm_ : float
Squared norm of the subcluster's centroid. Used to prevent recomputing when
|
||||
pairwise minimum distances are computed.
|
||||
"""
|
||||
def __init__(self, *, linear_sum=None):
|
||||
if linear_sum is None:
|
||||
self.n_samples_ = 0
|
||||
self.squared_sum_ = 0.0
|
||||
self.centroid_ = self.linear_sum_ = 0
|
||||
else:
|
||||
self.n_samples_ = 1
|
||||
self.centroid_ = self.linear_sum_ = linear_sum
|
||||
self.squared_sum_ = self.sq_norm_ = np.dot(
|
||||
self.linear_sum_, self.linear_sum_)
|
||||
self.child_ = None
|
||||
|
||||
def update(self, subcluster):
|
||||
self.n_samples_ += subcluster.n_samples_
|
||||
self.linear_sum_ += subcluster.linear_sum_
|
||||
self.squared_sum_ += subcluster.squared_sum_
|
||||
self.centroid_ = self.linear_sum_ / self.n_samples_
|
||||
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
|
||||
|
||||
def merge_subcluster(self, nominee_cluster, threshold):
|
||||
"""Check if a cluster is worthy enough to be merged. If
|
||||
yes then merge.
|
||||
"""
|
||||
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
|
||||
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
|
||||
new_n = self.n_samples_ + nominee_cluster.n_samples_
|
||||
new_centroid = (1 / new_n) * new_ls
|
||||
new_norm = np.dot(new_centroid, new_centroid)
|
||||
dot_product = (-2 * new_n) * new_norm
|
||||
sq_radius = (new_ss + dot_product) / new_n + new_norm
|
||||
if sq_radius <= threshold ** 2:
|
||||
(self.n_samples_, self.linear_sum_, self.squared_sum_,
|
||||
self.centroid_, self.sq_norm_) = \
|
||||
new_n, new_ls, new_ss, new_centroid, new_norm
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def radius(self):
|
||||
"""Return radius of the subcluster"""
|
||||
dot_product = -2 * np.dot(self.linear_sum_, self.centroid_)
|
||||
return sqrt(
|
||||
((self.squared_sum_ + dot_product) / self.n_samples_) +
|
||||
self.sq_norm_)
|
||||
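# --- Illustrative sanity check (not part of scikit-learn) -------------------
# The ``radius`` property above equals the root-mean-square distance of the
# subcluster's samples to its centroid, recovered purely from the stored
# (n_samples, linear sum, squared sum) triple. The helper name is hypothetical.
def _demo_subcluster_radius():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 3)
    sc = _CFSubcluster(linear_sum=X[0].copy())    # copy: ``update`` adds in place
    for x in X[1:]:
        sc.update(_CFSubcluster(linear_sum=x))
    brute = np.sqrt(np.mean(np.sum((X - X.mean(axis=0)) ** 2, axis=1)))
    return sc.radius, brute                       # (approximately) equal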
|
||||
|
||||
class Birch(ClusterMixin, TransformerMixin, BaseEstimator):
|
||||
"""Implements the Birch clustering algorithm.
|
||||
|
||||
It is a memory-efficient, online-learning algorithm provided as an
|
||||
alternative to :class:`MiniBatchKMeans`. It constructs a tree
|
||||
data structure with the cluster centroids being read off the leaf.
|
||||
These can be either the final cluster centroids or can be provided as input
|
||||
to another clustering algorithm such as :class:`AgglomerativeClustering`.
|
||||
|
||||
Read more in the :ref:`User Guide <birch>`.
|
||||
|
||||
.. versionadded:: 0.16
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : float, default=0.5
|
||||
The radius of the subcluster obtained by merging a new sample and the
|
||||
closest subcluster should be less than the threshold. Otherwise a new
|
||||
subcluster is started. Setting this value to be very low promotes
|
||||
splitting and vice-versa.
|
||||
|
||||
branching_factor : int, default=50
|
||||
Maximum number of CF subclusters in each node. If a new sample enters
such that the number of subclusters exceeds the branching_factor then
|
||||
that node is split into two nodes with the subclusters redistributed
|
||||
in each. The parent subcluster of that node is removed and two new
|
||||
subclusters are added as parents of the 2 split nodes.
|
||||
|
||||
n_clusters : int, instance of sklearn.cluster model, default=3
|
||||
Number of clusters after the final clustering step, which treats the
|
||||
subclusters from the leaves as new samples.
|
||||
|
||||
- `None` : the final clustering step is not performed and the
|
||||
subclusters are returned as they are.
|
||||
|
||||
- :mod:`sklearn.cluster` Estimator : If a model is provided, the model
|
||||
is fit treating the subclusters as new samples and the initial data
|
||||
is mapped to the label of the closest subcluster.
|
||||
|
||||
- `int` : the model fit is :class:`AgglomerativeClustering` with
|
||||
`n_clusters` set to be equal to the int.
|
||||
|
||||
compute_labels : bool, default=True
|
||||
Whether or not to compute labels for each fit.
|
||||
|
||||
copy : bool, default=True
|
||||
Whether or not to make a copy of the given data. If set to False,
|
||||
the initial data will be overwritten.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
root_ : _CFNode
|
||||
Root of the CFTree.
|
||||
|
||||
dummy_leaf_ : _CFNode
|
||||
Start pointer to all the leaves.
|
||||
|
||||
subcluster_centers_ : ndarray
|
||||
Centroids of all subclusters read directly from the leaves.
|
||||
|
||||
subcluster_labels_ : ndarray
|
||||
Labels assigned to the centroids of the subclusters after
|
||||
they are clustered globally.
|
||||
|
||||
labels_ : ndarray of shape (n_samples,)
|
||||
Array of labels assigned to the input data.
|
||||
If partial_fit is used instead of fit, they are assigned to the
|
||||
last batch of data.
|
||||
|
||||
See Also
|
||||
--------
|
||||
|
||||
MiniBatchKMeans
|
||||
Alternative implementation that does incremental updates
|
||||
of the centers' positions using mini-batches.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The tree data structure consists of nodes with each node consisting of
|
||||
a number of subclusters. The maximum number of subclusters in a node
|
||||
is determined by the branching factor. Each subcluster maintains a
|
||||
linear sum, squared sum and the number of samples in that subcluster.
|
||||
In addition, each subcluster can also have a node as its child, if the
|
||||
subcluster is not a member of a leaf node.
|
||||
|
||||
For a new point entering the root, it is merged with the subcluster closest
|
||||
to it and the linear sum, squared sum and the number of samples of that
|
||||
subcluster are updated. This is done recursively till the properties of
|
||||
the leaf node are updated.
|
||||
|
||||
References
|
||||
----------
|
||||
* Tian Zhang, Raghu Ramakrishnan, Miron Livny
|
||||
BIRCH: An efficient data clustering method for large databases.
|
||||
https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf
|
||||
|
||||
* Roberto Perdisci
|
||||
JBirch - Java implementation of BIRCH clustering algorithm
|
||||
https://code.google.com/archive/p/jbirch
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import Birch
|
||||
>>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
|
||||
>>> brc = Birch(n_clusters=None)
|
||||
>>> brc.fit(X)
|
||||
Birch(n_clusters=None)
|
||||
>>> brc.predict(X)
|
||||
array([0, 0, 0, 1, 1, 1])
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3,
|
||||
compute_labels=True, copy=True):
|
||||
self.threshold = threshold
|
||||
self.branching_factor = branching_factor
|
||||
self.n_clusters = n_clusters
|
||||
self.compute_labels = compute_labels
|
||||
self.copy = copy
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""
|
||||
Build a CF Tree for the input data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Input data.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
Fitted estimator.
|
||||
"""
|
||||
self.fit_, self.partial_fit_ = True, False
|
||||
return self._fit(X)
|
||||
|
||||
def _fit(self, X):
|
||||
X = self._validate_data(X, accept_sparse='csr', copy=self.copy)
|
||||
threshold = self.threshold
|
||||
branching_factor = self.branching_factor
|
||||
|
||||
if branching_factor <= 1:
|
||||
raise ValueError("Branching_factor should be greater than one.")
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
# If partial_fit is called for the first time or fit is called, we
|
||||
# start a new tree.
|
||||
partial_fit = getattr(self, 'partial_fit_')
|
||||
has_root = getattr(self, 'root_', None)
|
||||
if getattr(self, 'fit_') or (partial_fit and not has_root):
|
||||
# The first root is the leaf. Manipulate this object throughout.
|
||||
self.root_ = _CFNode(threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=True,
|
||||
n_features=n_features)
|
||||
|
||||
# To enable getting back subclusters.
|
||||
self.dummy_leaf_ = _CFNode(threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=True, n_features=n_features)
|
||||
self.dummy_leaf_.next_leaf_ = self.root_
|
||||
self.root_.prev_leaf_ = self.dummy_leaf_
|
||||
|
||||
# Cannot vectorize. Enough reason to consider Cython here.
|
||||
if not sparse.issparse(X):
|
||||
iter_func = iter
|
||||
else:
|
||||
iter_func = _iterate_sparse_X
|
||||
|
||||
for sample in iter_func(X):
|
||||
subcluster = _CFSubcluster(linear_sum=sample)
|
||||
split = self.root_.insert_cf_subcluster(subcluster)
|
||||
|
||||
if split:
|
||||
new_subcluster1, new_subcluster2 = _split_node(
|
||||
self.root_, threshold, branching_factor)
|
||||
del self.root_
|
||||
                self.root_ = _CFNode(threshold=threshold,
                                     branching_factor=branching_factor,
                                     is_leaf=False,
                                     n_features=n_features)
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

        centroids = np.concatenate([
            leaf.centroids_ for leaf in self._get_leaves()])
        self.subcluster_centers_ = centroids

        self._global_clustering(X)
        return self

    def _get_leaves(self):
        """
        Retrieve the leaves of the CF Node.

        Returns
        -------
        leaves : list of shape (n_leaves,)
            List of the leaf nodes.
        """
        leaf_ptr = self.dummy_leaf_.next_leaf_
        leaves = []
        while leaf_ptr is not None:
            leaves.append(leaf_ptr)
            leaf_ptr = leaf_ptr.next_leaf_
        return leaves

    def partial_fit(self, X=None, y=None):
        """
        Online learning. Prevents rebuilding of CFTree from scratch.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
            default=None
            Input data. If X is not provided, only the global clustering
            step is done.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        self.partial_fit_, self.fit_ = True, False
        if X is None:
            # Perform just the final global clustering step.
            self._global_clustering()
            return self
        else:
            self._check_fit(X)
            return self._fit(X)

    def _check_fit(self, X):
        check_is_fitted(self)

        if (hasattr(self, 'subcluster_centers_') and
                X.shape[1] != self.subcluster_centers_.shape[1]):
            raise ValueError(
                "Training data and predicted data do "
                "not have same number of features.")

    def predict(self, X):
        """
        Predict data using the ``centroids_`` of subclusters.

        Avoid computation of the row norms of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Labelled data.
        """
        X = check_array(X, accept_sparse='csr')
        self._check_fit(X)
        kwargs = {'Y_norm_squared': self._subcluster_norms}
        return self.subcluster_labels_[
            pairwise_distances_argmin(X,
                                      self.subcluster_centers_,
                                      metric_kwargs=kwargs)
        ]

    def transform(self, X):
        """
        Transform X into subcluster centroids dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
            Transformed data.
        """
        check_is_fitted(self)
        return euclidean_distances(X, self.subcluster_centers_)

    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, numbers.Integral):
            clusterer = AgglomerativeClustering(
                n_clusters=self.n_clusters)
            # There is no need to perform the global clustering step.
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True
        elif (clusterer is not None and not
              hasattr(clusterer, 'fit_predict')):
            raise ValueError("n_clusters should be an instance of "
                             "ClusterMixin or an int")

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(
            self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by Birch is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters), ConvergenceWarning)
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(
                self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self.predict(X)
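

if __name__ == "__main__":
    # Minimal usage sketch (illustrative data, not from the scikit-learn
    # docs): grow the CF tree incrementally with partial_fit, run the
    # global clustering step, then query cluster labels with predict.
    rng = np.random.RandomState(0)
    X_first, X_second = rng.rand(50, 2), rng.rand(50, 2) + 5

    brc = Birch(n_clusters=2, threshold=0.5)
    brc.partial_fit(X_first)    # builds the CF tree from the first batch
    brc.partial_fit(X_second)   # updates the same tree with a second batch
    brc.partial_fit()           # X=None: only the global clustering step
    print(brc.predict(np.array([[0.5, 0.5], [5.5, 5.5]])))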
392
venv/Lib/site-packages/sklearn/cluster/_dbscan.py
Normal file
@@ -0,0 +1,392 @@
# -*- coding: utf-8 -*-
"""
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
"""

# Author: Robert Layton <robertlayton@gmail.com>
#         Joel Nothman <joel.nothman@gmail.com>
#         Lars Buitinck
#
# License: BSD 3 clause

import numpy as np
import warnings
from scipy import sparse

from ..base import BaseEstimator, ClusterMixin
from ..utils.validation import _check_sample_weight, _deprecate_positional_args
from ..neighbors import NearestNeighbors

from ._dbscan_inner import dbscan_inner


@_deprecate_positional_args
def dbscan(X, eps=0.5, *, min_samples=5, metric='minkowski',
           metric_params=None, algorithm='auto', leaf_size=30, p=2,
           sample_weight=None, n_jobs=None):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
        (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit.
        X may be a :term:`sparse graph <sparse graph>`,
        in which case only "nonzero" elements may be considered neighbors.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, default=2
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array-like of shape (n_samples,), default=None
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search. ``None`` means
        1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
        using all processors. See :term:`Glossary <n_jobs>` for more details.
        If precomputed distances are used, parallel execution is not available
        and thus n_jobs will have no effect.

    Returns
    -------
    core_samples : ndarray of shape (n_core_samples,)
        Indices of core samples.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point. Noisy samples are given the label -1.

    See also
    --------
    DBSCAN
        An estimator interface for this clustering algorithm.
    OPTICS
        A similar estimator interface clustering at multiple values of eps. Our
        implementation is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n). It may attract a higher
    memory complexity when querying these nearest neighborhoods, depending
    on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :func:`cluster.optics <sklearn.cluster.optics>` provides a similar
    clustering with lower memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
    ACM Transactions on Database Systems (TODS), 42(3), 19.
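
    Examples
    --------
    A minimal usage sketch (illustrative data, matching the ``DBSCAN``
    estimator example below):

    >>> import numpy as np
    >>> from sklearn.cluster import dbscan
    >>> X = np.array([[1, 2], [2, 2], [2, 3],
    ...               [8, 7], [8, 8], [25, 80]])
    >>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
    >>> labels
    array([ 0,  0,  0,  1,  1, -1])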
"""
|
||||
|
||||
est = DBSCAN(eps=eps, min_samples=min_samples, metric=metric,
|
||||
metric_params=metric_params, algorithm=algorithm,
|
||||
leaf_size=leaf_size, p=p, n_jobs=n_jobs)
|
||||
est.fit(X, sample_weight=sample_weight)
|
||||
return est.core_sample_indices_, est.labels_


class DBSCAN(ClusterMixin, BaseEstimator):
    """Perform DBSCAN clustering from vector array or distance matrix.

    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
    Finds core samples of high density and expands clusters from them.
    Good for data which contains clusters of similar density.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`Glossary <sparse graph>`, in which
        case only "nonzero" elements may be considered neighbors for DBSCAN.

        .. versionadded:: 0.17
           metric *precomputed* to accept precomputed sparse matrix.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, default=None
        The power of the Minkowski metric to be used to calculate distance
        between points.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    core_sample_indices_ : ndarray of shape (n_core_samples,)
        Indices of core samples.

    components_ : ndarray of shape (n_core_samples, n_features)
        Copy of each core sample found by training.

    labels_ : ndarray of shape (n_samples,)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.

    Examples
    --------
    >>> from sklearn.cluster import DBSCAN
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 2], [2, 3],
    ...               [8, 7], [8, 8], [25, 80]])
    >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    >>> clustering.labels_
    array([ 0,  0,  0,  1,  1, -1])
    >>> clustering
    DBSCAN(eps=3, min_samples=2)

    See also
    --------
    OPTICS
        A similar clustering at multiple values of eps. Our implementation
        is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n). It may attract a higher
    memory complexity when querying these nearest neighborhoods, depending
    on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :class:`cluster.OPTICS` provides a similar clustering with lower memory
    usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.
    ACM Transactions on Database Systems (TODS), 42(3), 19.
    """
    @_deprecate_positional_args
    def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean',
                 metric_params=None, algorithm='auto', leaf_size=30, p=None,
                 n_jobs=None):
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self.metric_params = metric_params
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.n_jobs = n_jobs

    def fit(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features, or distance matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at
            least ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        """
        X = self._validate_data(X, accept_sparse='csr')

        if not self.eps > 0.0:
            raise ValueError("eps must be positive.")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # Calculate neighborhood for all samples. This leaves the original
        # point in, which needs to be considered later (i.e. point i is in
        # the neighborhood of point i; while true, this is useless
        # information).
        if self.metric == 'precomputed' and sparse.issparse(X):
            # set the diagonal to explicit values, as a point is its own
            # neighbor
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())  # XXX: modifies X's internals in-place

        neighbors_model = NearestNeighbors(
            radius=self.eps, algorithm=self.algorithm,
            leaf_size=self.leaf_size, metric=self.metric,
            metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X,
                                                         return_distance=False)

        if sample_weight is None:
            n_neighbors = np.array([len(neighbors)
                                    for neighbors in neighborhoods])
        else:
            n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                    for neighbors in neighborhoods])

        # Initially, all samples are noise.
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # A list of all core samples found.
        core_samples = np.asarray(n_neighbors >= self.min_samples,
                                  dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)

        self.core_sample_indices_ = np.where(core_samples)[0]
        self.labels_ = labels

        if len(self.core_sample_indices_):
            # fix for scipy sparse indexing issue
            self.components_ = X[self.core_sample_indices_].copy()
        else:
            # no core samples
            self.components_ = np.empty((0, X.shape[1]))
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features or distance matrix,
        and return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at
            least ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels. Noisy samples are given the label -1.
        """
        self.fit(X, sample_weight=sample_weight)
        return self.labels_
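

if __name__ == "__main__":
    # Minimal sketch (illustrative data) of the memory-saving workflow
    # described in the Notes above: precompute a sparse radius-neighbors
    # graph with mode='distance', then cluster it with metric='precomputed'.
    X_demo = np.array([[1., 2.], [2., 2.], [2., 3.],
                       [8., 7.], [8., 8.], [25., 80.]])
    graph = NearestNeighbors(radius=3).fit(X_demo).radius_neighbors_graph(
        X_demo, mode='distance')
    demo_labels = DBSCAN(eps=3, min_samples=2,
                         metric='precomputed').fit(graph).labels_
    print(demo_labels)  # the isolated last point is expected to be -1 (noise)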
Binary file not shown.
@@ -0,0 +1,77 @@
"""
|
||||
Feature agglomeration. Base classes and functions for performing feature
|
||||
agglomeration.
|
||||
"""
|
||||
# Author: V. Michel, A. Gramfort
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import TransformerMixin
|
||||
from ..utils import check_array
|
||||
from ..utils.validation import check_is_fitted
|
||||
from scipy.sparse import issparse
|
||||
|
||||
###############################################################################
|
||||
# Mixin class for feature agglomeration.
|
||||
|
||||
|
||||
class AgglomerationTransform(TransformerMixin):
|
||||
"""
|
||||
A class for feature agglomeration via the transform interface
|
||||
"""
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
Transform a new matrix using the built clustering
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or (n_samples,)
|
||||
A M by N array of M observations in N dimensions or a length
|
||||
M array of M one-dimensional observations.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Y : array, shape = [n_samples, n_clusters] or [n_clusters]
|
||||
The pooled values for each feature cluster.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
X = check_array(X)
|
||||
if len(self.labels_) != X.shape[1]:
|
||||
raise ValueError("X has a different number of features than "
|
||||
"during fitting.")
|
||||
if self.pooling_func == np.mean and not issparse(X):
|
||||
size = np.bincount(self.labels_)
|
||||
n_samples = X.shape[0]
|
||||
# a fast way to compute the mean of grouped features
|
||||
nX = np.array([np.bincount(self.labels_, X[i, :]) / size
|
||||
for i in range(n_samples)])
|
||||
else:
|
||||
nX = [self.pooling_func(X[:, self.labels_ == l], axis=1)
|
||||
for l in np.unique(self.labels_)]
|
||||
nX = np.array(nX).T
|
||||
return nX
|
||||
|
||||
def inverse_transform(self, Xred):
|
||||
"""
|
||||
Inverse the transformation.
|
||||
Return a vector of size nb_features with the values of Xred assigned
|
||||
to each group of features
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,)
|
||||
The values to be assigned to each cluster of samples
|
||||
|
||||
Returns
|
||||
-------
|
||||
X : array, shape=[n_samples, n_features] or [n_features]
|
||||
A vector of size n_samples with the values of Xred assigned to
|
||||
each of the cluster of samples.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
unil, inverse = np.unique(self.labels_, return_inverse=True)
|
||||
return Xred[..., inverse]
|
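

if __name__ == "__main__":
    # Minimal usage sketch (illustrative data; assumes scikit-learn is
    # installed): FeatureAgglomeration uses this mixin, pooling features
    # cluster-wise with transform and mapping pooled values back to the
    # original feature space with inverse_transform.
    from sklearn.cluster import FeatureAgglomeration

    rng = np.random.RandomState(0)
    X_demo = rng.rand(10, 6)
    agglo = FeatureAgglomeration(n_clusters=2).fit(X_demo)
    X_pooled = agglo.transform(X_demo)              # shape (10, 2)
    X_restored = agglo.inverse_transform(X_pooled)  # shape (10, 6)
    print(X_pooled.shape, X_restored.shape)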
Binary file not shown.
Binary file not shown.
Binary file not shown.
23
venv/Lib/site-packages/sklearn/cluster/_k_means_fast.pxd
Normal file
@@ -0,0 +1,23 @@
# cython: language_level=3


from cython cimport floating
cimport numpy as np


cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil

cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
                                      floating, bint) nogil

cpdef void _relocate_empty_clusters_dense(
    np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
    floating[:, ::1], floating[::1], int[::1])

cpdef void _relocate_empty_clusters_sparse(
    floating[::1], int[::1], int[::1], floating[::1], floating[:, ::1],
    floating[:, ::1], floating[::1], int[::1])

cdef void _average_centers(floating[:, ::1], floating[::1])

cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1])
Binary file not shown.
1916
venv/Lib/site-packages/sklearn/cluster/_kmeans.py
Normal file
File diff suppressed because it is too large
465
venv/Lib/site-packages/sklearn/cluster/_mean_shift.py
Normal file
@@ -0,0 +1,465 @@
"""Mean shift clustering algorithm.
|
||||
|
||||
Mean shift clustering aims to discover *blobs* in a smooth density of
|
||||
samples. It is a centroid based algorithm, which works by updating candidates
|
||||
for centroids to be the mean of the points within a given region. These
|
||||
candidates are then filtered in a post-processing stage to eliminate
|
||||
near-duplicates to form the final set of centroids.
|
||||
|
||||
Seeding is performed using a binning technique for scalability.
|
||||
"""
|
||||
|
||||
# Authors: Conrad Lee <conradlee@gmail.com>
|
||||
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Martino Sorbaro <martino.sorbaro@ed.ac.uk>
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
from collections import defaultdict
|
||||
from ..utils.validation import check_is_fitted, _deprecate_positional_args
|
||||
from ..utils import check_random_state, gen_batches, check_array
|
||||
from ..base import BaseEstimator, ClusterMixin
|
||||
from ..neighbors import NearestNeighbors
|
||||
from ..metrics.pairwise import pairwise_distances_argmin
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0,
|
||||
n_jobs=None):
|
||||
"""Estimate the bandwidth to use with the mean-shift algorithm.
|
||||
|
||||
That this function takes time at least quadratic in n_samples. For large
|
||||
datasets, it's wise to set that parameter to a small value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Input points.
|
||||
|
||||
quantile : float, default=0.3
|
||||
should be between [0, 1]
|
||||
0.5 means that the median of all pairwise distances is used.
|
||||
|
||||
n_samples : int, default=None
|
||||
The number of samples to use. If not given, all samples are used.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
The generator used to randomly select the samples from input points
|
||||
for bandwidth estimation. Use an int to make the randomness
|
||||
deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bandwidth : float
|
||||
The bandwidth parameter.
|
||||
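
    Examples
    --------
    A minimal usage sketch (illustrative data):

    >>> import numpy as np
    >>> from sklearn.cluster import estimate_bandwidth
    >>> X = np.array([[1., 1.], [2., 1.], [1., 0.],
    ...               [4., 7.], [3., 5.], [3., 6.]])
    >>> estimate_bandwidth(X, quantile=0.5) > 0
    True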
"""
|
||||
X = check_array(X)
|
||||
|
||||
random_state = check_random_state(random_state)
|
||||
if n_samples is not None:
|
||||
idx = random_state.permutation(X.shape[0])[:n_samples]
|
||||
X = X[idx]
|
||||
n_neighbors = int(X.shape[0] * quantile)
|
||||
if n_neighbors < 1: # cannot fit NearestNeighbors with n_neighbors = 0
|
||||
n_neighbors = 1
|
||||
nbrs = NearestNeighbors(n_neighbors=n_neighbors,
|
||||
n_jobs=n_jobs)
|
||||
nbrs.fit(X)
|
||||
|
||||
bandwidth = 0.
|
||||
for batch in gen_batches(len(X), 500):
|
||||
d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
|
||||
bandwidth += np.max(d, axis=1).sum()
|
||||
|
||||
return bandwidth / X.shape[0]
|
||||
|
||||
|
||||
# separate function for each seed's iterative loop
|
||||
def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
|
||||
# For each seed, climb gradient until convergence or max_iter
|
||||
bandwidth = nbrs.get_params()['radius']
|
||||
stop_thresh = 1e-3 * bandwidth # when mean has converged
|
||||
completed_iterations = 0
|
||||
while True:
|
||||
# Find mean of points within bandwidth
|
||||
i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
|
||||
return_distance=False)[0]
|
||||
points_within = X[i_nbrs]
|
||||
if len(points_within) == 0:
|
||||
break # Depending on seeding strategy this condition may occur
|
||||
my_old_mean = my_mean # save the old mean
|
||||
my_mean = np.mean(points_within, axis=0)
|
||||
# If converged or at max_iter, adds the cluster
|
||||
if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or
|
||||
completed_iterations == max_iter):
|
||||
break
|
||||
completed_iterations += 1
|
||||
return tuple(my_mean), len(points_within), completed_iterations
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False,
|
||||
min_bin_freq=1, cluster_all=True, max_iter=300,
|
||||
n_jobs=None):
|
||||
"""Perform mean shift clustering of data using a flat kernel.
|
||||
|
||||
Read more in the :ref:`User Guide <mean_shift>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Input data.
|
||||
|
||||
bandwidth : float, default=None
|
||||
Kernel bandwidth.
|
||||
|
||||
If bandwidth is not given, it is determined using a heuristic based on
|
||||
the median of all pairwise distances. This will take quadratic time in
|
||||
the number of samples. The sklearn.cluster.estimate_bandwidth function
|
||||
can be used to do this more efficiently.
|
||||
|
||||
seeds : array-like of shape (n_seeds, n_features) or None
|
||||
Point used as initial kernel locations. If None and bin_seeding=False,
|
||||
each data point is used as a seed. If None and bin_seeding=True,
|
||||
see bin_seeding.
|
||||
|
||||
bin_seeding : boolean, default=False
|
||||
If true, initial kernel locations are not locations of all
|
||||
points, but rather the location of the discretized version of
|
||||
points, where points are binned onto a grid whose coarseness
|
||||
corresponds to the bandwidth. Setting this option to True will speed
|
||||
up the algorithm because fewer seeds will be initialized.
|
||||
Ignored if seeds argument is not None.
|
||||
|
||||
min_bin_freq : int, default=1
|
||||
To speed up the algorithm, accept only those bins with at least
|
||||
min_bin_freq points as seeds.
|
||||
|
||||
cluster_all : bool, default=True
|
||||
If true, then all points are clustered, even those orphans that are
|
||||
not within any kernel. Orphans are assigned to the nearest kernel.
|
||||
If false, then orphans are given cluster label -1.
|
||||
|
||||
max_iter : int, default=300
|
||||
Maximum number of iterations, per seed point before the clustering
|
||||
operation terminates (for that seed point), if has not converged yet.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to use for the computation. This works by computing
|
||||
each of the n_init runs in parallel.
|
||||
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
.. versionadded:: 0.17
|
||||
Parallel Execution using *n_jobs*.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
cluster_centers : array, shape=[n_clusters, n_features]
|
||||
Coordinates of cluster centers.
|
||||
|
||||
labels : array, shape=[n_samples]
|
||||
Cluster labels for each point.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For an example, see :ref:`examples/cluster/plot_mean_shift.py
|
||||
<sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
|
||||
|
||||
"""
|
||||
model = MeanShift(bandwidth=bandwidth, seeds=seeds,
|
||||
min_bin_freq=min_bin_freq,
|
||||
bin_seeding=bin_seeding,
|
||||
cluster_all=cluster_all, n_jobs=n_jobs,
|
||||
max_iter=max_iter).fit(X)
|
||||
return model.cluster_centers_, model.labels_
|
||||
|
||||
|
||||
def get_bin_seeds(X, bin_size, min_bin_freq=1):
|
||||
"""Finds seeds for mean_shift.
|
||||
|
||||
Finds seeds by first binning data onto a grid whose lines are
|
||||
spaced bin_size apart, and then choosing those bins with at least
|
||||
min_bin_freq points.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Input points, the same points that will be used in mean_shift.
|
||||
|
||||
bin_size : float
|
||||
Controls the coarseness of the binning. Smaller values lead
|
||||
to more seeding (which is computationally more expensive). If you're
|
||||
not sure how to set this, set it to the value of the bandwidth used
|
||||
in clustering.mean_shift.
|
||||
|
||||
min_bin_freq : int, default=1
|
||||
Only bins with at least min_bin_freq will be selected as seeds.
|
||||
Raising this value decreases the number of seeds found, which
|
||||
makes mean_shift computationally cheaper.
|
||||
|
||||
Returns
|
||||
-------
|
||||
bin_seeds : array-like of shape (n_samples, n_features)
|
||||
Points used as initial kernel positions in clustering.mean_shift.
|
||||
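
    Examples
    --------
    A minimal usage sketch (illustrative data; imported from this module's
    path):

    >>> import numpy as np
    >>> from sklearn.cluster._mean_shift import get_bin_seeds
    >>> X = np.array([[1., 1.], [1.4, 1.4], [1.8, 1.2],
    ...               [2., 1.], [2.1, 1.1], [0., 0.]])
    >>> len(get_bin_seeds(X, bin_size=1., min_bin_freq=2))
    2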
"""
|
||||
if bin_size == 0:
|
||||
return X
|
||||
|
||||
# Bin points
|
||||
bin_sizes = defaultdict(int)
|
||||
for point in X:
|
||||
binned_point = np.round(point / bin_size)
|
||||
bin_sizes[tuple(binned_point)] += 1
|
||||
|
||||
# Select only those bins as seeds which have enough members
|
||||
bin_seeds = np.array([point for point, freq in bin_sizes.items() if
|
||||
freq >= min_bin_freq], dtype=np.float32)
|
||||
if len(bin_seeds) == len(X):
|
||||
warnings.warn("Binning data failed with provided bin_size=%f,"
|
||||
" using data points as seeds." % bin_size)
|
||||
return X
|
||||
bin_seeds = bin_seeds * bin_size
|
||||
return bin_seeds
|
||||
|
||||
|
||||
class MeanShift(ClusterMixin, BaseEstimator):
|
||||
"""Mean shift clustering using a flat kernel.
|
||||
|
||||
Mean shift clustering aims to discover "blobs" in a smooth density of
|
||||
samples. It is a centroid-based algorithm, which works by updating
|
||||
candidates for centroids to be the mean of the points within a given
|
||||
region. These candidates are then filtered in a post-processing stage to
|
||||
eliminate near-duplicates to form the final set of centroids.
|
||||
|
||||
Seeding is performed using a binning technique for scalability.
|
||||
|
||||
Read more in the :ref:`User Guide <mean_shift>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bandwidth : float, default=None
|
||||
Bandwidth used in the RBF kernel.
|
||||
|
||||
If not given, the bandwidth is estimated using
|
||||
sklearn.cluster.estimate_bandwidth; see the documentation for that
|
||||
function for hints on scalability (see also the Notes, below).
|
||||
|
||||
seeds : array-like of shape (n_samples, n_features), default=None
|
||||
Seeds used to initialize kernels. If not set,
|
||||
the seeds are calculated by clustering.get_bin_seeds
|
||||
with bandwidth as the grid size and default values for
|
||||
other parameters.
|
||||
|
||||
bin_seeding : bool, default=False
|
||||
If true, initial kernel locations are not locations of all
|
||||
points, but rather the location of the discretized version of
|
||||
points, where points are binned onto a grid whose coarseness
|
||||
corresponds to the bandwidth. Setting this option to True will speed
|
||||
up the algorithm because fewer seeds will be initialized.
|
||||
The default value is False.
|
||||
Ignored if seeds argument is not None.
|
||||
|
||||
min_bin_freq : int, default=1
|
||||
To speed up the algorithm, accept only those bins with at least
|
||||
min_bin_freq points as seeds.
|
||||
|
||||
cluster_all : bool, default=True
|
||||
If true, then all points are clustered, even those orphans that are
|
||||
not within any kernel. Orphans are assigned to the nearest kernel.
|
||||
If false, then orphans are given cluster label -1.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of jobs to use for the computation. This works by computing
|
||||
each of the n_init runs in parallel.
|
||||
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
max_iter : int, default=300
|
||||
Maximum number of iterations, per seed point before the clustering
|
||||
operation terminates (for that seed point), if has not converged yet.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
Attributes
|
||||
----------
|
||||
cluster_centers_ : array, [n_clusters, n_features]
|
||||
Coordinates of cluster centers.
|
||||
|
||||
labels_ : array of shape (n_samples,)
|
||||
Labels of each point.
|
||||
|
||||
n_iter_ : int
|
||||
Maximum number of iterations performed on each seed.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import MeanShift
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [2, 1], [1, 0],
|
||||
... [4, 7], [3, 5], [3, 6]])
|
||||
>>> clustering = MeanShift(bandwidth=2).fit(X)
|
||||
>>> clustering.labels_
|
||||
array([1, 1, 1, 0, 0, 0])
|
||||
>>> clustering.predict([[0, 0], [5, 5]])
|
||||
array([1, 0])
|
||||
>>> clustering
|
||||
MeanShift(bandwidth=2)
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
Scalability:
|
||||
|
||||
Because this implementation uses a flat kernel and
|
||||
a Ball Tree to look up members of each kernel, the complexity will tend
|
||||
towards O(T*n*log(n)) in lower dimensions, with n the number of samples
|
||||
and T the number of points. In higher dimensions the complexity will
|
||||
tend towards O(T*n^2).
|
||||
|
||||
Scalability can be boosted by using fewer seeds, for example by using
|
||||
a higher value of min_bin_freq in the get_bin_seeds function.
|
||||
|
||||
Note that the estimate_bandwidth function is much less scalable than the
|
||||
mean shift algorithm and will be the bottleneck if it is used.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
|
||||
feature space analysis". IEEE Transactions on Pattern Analysis and
|
||||
Machine Intelligence. 2002. pp. 603-619.
|
||||
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False,
|
||||
min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300):
|
||||
self.bandwidth = bandwidth
|
||||
self.seeds = seeds
|
||||
self.bin_seeding = bin_seeding
|
||||
self.cluster_all = cluster_all
|
||||
self.min_bin_freq = min_bin_freq
|
||||
self.n_jobs = n_jobs
|
||||
self.max_iter = max_iter
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Perform clustering.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Samples to cluster.
|
||||
|
||||
y : Ignored
|
||||
|
||||
"""
|
||||
X = self._validate_data(X)
|
||||
bandwidth = self.bandwidth
|
||||
if bandwidth is None:
|
||||
bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)
|
||||
elif bandwidth <= 0:
|
||||
raise ValueError("bandwidth needs to be greater than zero or None,"
|
||||
" got %f" % bandwidth)
|
||||
|
||||
seeds = self.seeds
|
||||
if seeds is None:
|
||||
if self.bin_seeding:
|
||||
seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)
|
||||
else:
|
||||
seeds = X
|
||||
n_samples, n_features = X.shape
|
||||
center_intensity_dict = {}
|
||||
|
||||
# We use n_jobs=1 because this will be used in nested calls under
|
||||
# parallel calls to _mean_shift_single_seed so there is no need for
|
||||
# for further parallelism.
|
||||
nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
|
||||
|
||||
# execute iterations on all seeds in parallel
|
||||
all_res = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(_mean_shift_single_seed)
|
||||
(seed, X, nbrs, self.max_iter) for seed in seeds)
|
||||
# copy results in a dictionary
|
||||
for i in range(len(seeds)):
|
||||
if all_res[i][1]: # i.e. len(points_within) > 0
|
||||
center_intensity_dict[all_res[i][0]] = all_res[i][1]
|
||||
|
||||
self.n_iter_ = max([x[2] for x in all_res])
|
||||
|
||||
if not center_intensity_dict:
|
||||
# nothing near seeds
|
||||
raise ValueError("No point was within bandwidth=%f of any seed."
|
||||
" Try a different seeding strategy \
|
||||
or increase the bandwidth."
|
||||
% bandwidth)
|
||||
|
||||
# POST PROCESSING: remove near duplicate points
|
||||
# If the distance between two kernels is less than the bandwidth,
|
||||
# then we have to remove one because it is a duplicate. Remove the
|
||||
# one with fewer points.
|
||||
|
||||
sorted_by_intensity = sorted(center_intensity_dict.items(),
|
||||
key=lambda tup: (tup[1], tup[0]),
|
||||
reverse=True)
|
||||
sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
|
||||
unique = np.ones(len(sorted_centers), dtype=np.bool)
|
||||
nbrs = NearestNeighbors(radius=bandwidth,
|
||||
n_jobs=self.n_jobs).fit(sorted_centers)
|
||||
for i, center in enumerate(sorted_centers):
|
||||
if unique[i]:
|
||||
neighbor_idxs = nbrs.radius_neighbors([center],
|
||||
return_distance=False)[0]
|
||||
unique[neighbor_idxs] = 0
|
||||
unique[i] = 1 # leave the current point as unique
|
||||
cluster_centers = sorted_centers[unique]
|
||||
|
||||
# ASSIGN LABELS: a point belongs to the cluster that it is closest to
|
||||
nbrs = NearestNeighbors(n_neighbors=1,
|
||||
n_jobs=self.n_jobs).fit(cluster_centers)
|
||||
labels = np.zeros(n_samples, dtype=np.int)
|
||||
distances, idxs = nbrs.kneighbors(X)
|
||||
if self.cluster_all:
|
||||
labels = idxs.flatten()
|
||||
else:
|
||||
labels.fill(-1)
|
||||
bool_selector = distances.flatten() <= bandwidth
|
||||
labels[bool_selector] = idxs.flatten()[bool_selector]
|
||||
|
||||
self.cluster_centers_, self.labels_ = cluster_centers, labels
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the closest cluster each sample in X belongs to.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix}, shape=[n_samples, n_features]
|
||||
New data to predict.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : array, shape [n_samples,]
|
||||
Index of the cluster each sample belongs to.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
return pairwise_distances_argmin(X, self.cluster_centers_)
|
928
venv/Lib/site-packages/sklearn/cluster/_optics.py
Normal file
@@ -0,0 +1,928 @@
# -*- coding: utf-8 -*-
"""Ordering Points To Identify the Clustering Structure (OPTICS)

These routines execute the OPTICS algorithm, and implement various
cluster extraction methods of the ordered list.

Authors: Shane Grigsby <refuge@rocktalus.com>
         Adrin Jalali <adrinjalali@gmail.com>
         Erich Schubert <erich@debian.org>
         Hanmin Qin <qinhanmin2005@sina.com>
License: BSD 3 clause
"""

import warnings
import numpy as np

from ..utils import gen_batches, get_chunk_n_rows
from ..utils.validation import _deprecate_positional_args
from ..neighbors import NearestNeighbors
from ..base import BaseEstimator, ClusterMixin
from ..metrics import pairwise_distances


class OPTICS(ClusterMixin, BaseEstimator):
    """Estimate clustering structure from vector array.

    OPTICS (Ordering Points To Identify the Clustering Structure), closely
    related to DBSCAN, finds core samples of high density and expands clusters
    from them [1]_. Unlike DBSCAN, it keeps the cluster hierarchy for a
    variable neighborhood radius. Better suited for usage on large datasets
    than the current sklearn implementation of DBSCAN.

    Clusters are then extracted using a DBSCAN-like method
    (cluster_method = 'dbscan') or an automatic
    technique proposed in [1]_ (cluster_method = 'xi').

    This implementation deviates from the original OPTICS by first performing
    k-nearest-neighborhood searches on all points to identify core sizes, then
    computing only the distances to unprocessed points when constructing the
    cluster order. Note that we do not employ a heap to manage the expansion
    candidates, so the time complexity will be O(n^2).

    Read more in the :ref:`User Guide <optics>`.

    Parameters
    ----------
    min_samples : int > 1 or float between 0 and 1 (default=5)
        The number of samples in a neighborhood for a point to be considered
        as a core point. Also, up and down steep regions can't have more than
        ``min_samples`` consecutive non-steep points. Expressed as an absolute
        number or a fraction of the number of samples (rounded to be at least
        2).

    max_eps : float, optional (default=np.inf)
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. Default value of ``np.inf`` will
        identify clusters across all scales; reducing ``max_eps`` will result
        in shorter run times.

    metric : str or callable, optional (default='minkowski')
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. If metric is
        "precomputed", X is assumed to be a distance matrix and must be square.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    p : int, optional (default=2)
        Parameter for the Minkowski metric from
        :class:`sklearn.metrics.pairwise_distances`. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, optional (default=None)
        Additional keyword arguments for the metric function.

    cluster_method : str, optional (default='xi')
        The extraction method used to extract clusters using the calculated
        reachability and ordering. Possible values are "xi" and "dbscan".

    eps : float, optional (default=None)
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. By default it assumes the same value
        as ``max_eps``.
        Used only when ``cluster_method='dbscan'``.

    xi : float, between 0 and 1, optional (default=0.05)
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.
        Used only when ``cluster_method='xi'``.

    predecessor_correction : bool, optional (default=True)
        Correct clusters according to the predecessors calculated by OPTICS
        [2]_. This parameter has minimal effect on most datasets.
        Used only when ``cluster_method='xi'``.

    min_cluster_size : int > 1 or float between 0 and 1 (default=None)
        Minimum number of samples in an OPTICS cluster, expressed as an
        absolute number or a fraction of the number of samples (rounded to be
        at least 2). If ``None``, the value of ``min_samples`` is used instead.
        Used only when ``cluster_method='xi'``.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method. (default)

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    labels_ : array, shape (n_samples,)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples and points which are not included in a leaf cluster
        of ``cluster_hierarchy_`` are labeled as -1.

    reachability_ : array, shape (n_samples,)
        Reachability distances per sample, indexed by object order. Use
        ``clust.reachability_[clust.ordering_]`` to access in cluster order.

    ordering_ : array, shape (n_samples,)
        The cluster ordered list of sample indices.

    core_distances_ : array, shape (n_samples,)
        Distance at which each sample becomes a core point, indexed by object
        order. Points which will never be core have a distance of inf. Use
        ``clust.core_distances_[clust.ordering_]`` to access in cluster order.

    predecessor_ : array, shape (n_samples,)
        Point that a sample was reached from, indexed by object order.
        Seed points have a predecessor of -1.

    cluster_hierarchy_ : array, shape (n_clusters, 2)
        The list of clusters in the form of ``[start, end]`` in each row, with
        all indices inclusive. The clusters are ordered according to
        ``(end, -start)`` (ascending) so that larger clusters encompassing
        smaller clusters come after those smaller ones. Since ``labels_`` does
        not reflect the hierarchy, usually
        ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also
        note that these indices are of the ``ordering_``, i.e.
        ``X[ordering_][start:end + 1]`` form a cluster.
        Only available when ``cluster_method='xi'``.

    See Also
    --------
    DBSCAN
        A similar clustering for a specified neighborhood radius (eps).
        Our implementation is optimized for runtime.

    References
    ----------
    .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
       and Jörg Sander. "OPTICS: ordering points to identify the clustering
       structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.

    .. [2] Schubert, Erich, Michael Gertz.
       "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
       the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.

    Examples
    --------
    >>> from sklearn.cluster import OPTICS
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 5], [3, 6],
    ...               [8, 7], [8, 8], [7, 3]])
    >>> clustering = OPTICS(min_samples=2).fit(X)
    >>> clustering.labels_
    array([0, 0, 0, 1, 1, 1])
    """
    @_deprecate_positional_args
    def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski',
                 p=2, metric_params=None, cluster_method='xi', eps=None,
                 xi=0.05, predecessor_correction=True, min_cluster_size=None,
                 algorithm='auto', leaf_size=30, n_jobs=None):
        self.max_eps = max_eps
        self.min_samples = min_samples
        self.min_cluster_size = min_cluster_size
        self.algorithm = algorithm
        self.metric = metric
        self.metric_params = metric_params
        self.p = p
        self.leaf_size = leaf_size
        self.cluster_method = cluster_method
        self.eps = eps
        self.xi = xi
        self.predecessor_correction = predecessor_correction
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Perform OPTICS clustering.

        Extracts an ordered list of points and reachability distances, and
        performs initial clustering using ``max_eps`` distance specified at
        OPTICS object instantiation.

        Parameters
        ----------
        X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
            if metric='precomputed'
            A feature array, or array of distances between samples if
            metric='precomputed'.

        y : ignored
            Ignored.

        Returns
        -------
        self : instance of OPTICS
            The instance.
        """
        X = self._validate_data(X, dtype=float)

        if self.cluster_method not in ['dbscan', 'xi']:
            raise ValueError("cluster_method should be one of"
                             " 'dbscan' or 'xi' but is %s" %
                             self.cluster_method)

        (self.ordering_, self.core_distances_, self.reachability_,
         self.predecessor_) = compute_optics_graph(
             X=X, min_samples=self.min_samples, algorithm=self.algorithm,
             leaf_size=self.leaf_size, metric=self.metric,
             metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
             max_eps=self.max_eps)

        # Extract clusters from the calculated orders and reachability
        if self.cluster_method == 'xi':
            labels_, clusters_ = cluster_optics_xi(
                reachability=self.reachability_,
                predecessor=self.predecessor_,
                ordering=self.ordering_,
                min_samples=self.min_samples,
                min_cluster_size=self.min_cluster_size,
                xi=self.xi,
                predecessor_correction=self.predecessor_correction)
            self.cluster_hierarchy_ = clusters_
        elif self.cluster_method == 'dbscan':
            if self.eps is None:
                eps = self.max_eps
            else:
                eps = self.eps

            if eps > self.max_eps:
                raise ValueError('Specify an epsilon smaller than %s. Got %s.'
                                 % (self.max_eps, eps))

            labels_ = cluster_optics_dbscan(
                reachability=self.reachability_,
                core_distances=self.core_distances_,
                ordering=self.ordering_, eps=eps)

        self.labels_ = labels_
        return self


def _validate_size(size, n_samples, param_name):
    if size <= 0 or (size != int(size) and size > 1):
        raise ValueError('%s must be a positive integer '
                         'or a float between 0 and 1. Got %r' %
                         (param_name, size))
    elif size > n_samples:
        raise ValueError('%s must be no greater than the'
                         ' number of samples (%d). Got %d' %
                         (param_name, n_samples, size))


# OPTICS helper functions
def _compute_core_distances_(X, neighbors, min_samples, working_memory):
    """Compute the k-th nearest neighbor of each sample

    Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]
    but with more memory efficiency.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data.
    neighbors : NearestNeighbors instance
        The fitted nearest neighbors estimator.
    working_memory : int, optional
        The sought maximum memory for temporary distance matrix chunks.
        When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.

    Returns
    -------
    core_distances : array, shape (n_samples,)
        Distance at which each sample becomes a core point.
        Points which will never be core have a distance of inf.
    """
    n_samples = X.shape[0]
    core_distances = np.empty(n_samples)
    core_distances.fill(np.nan)

    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples,
                                    max_n_rows=n_samples,
                                    working_memory=working_memory)
    slices = gen_batches(n_samples, chunk_n_rows)
    for sl in slices:
        core_distances[sl] = neighbors.kneighbors(
            X[sl], min_samples)[0][:, -1]
    return core_distances


@_deprecate_positional_args
def compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params,
                         algorithm, leaf_size, n_jobs):
    """Computes the OPTICS reachability graph.

    Read more in the :ref:`User Guide <optics>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
        if metric='precomputed'.
        A feature array, or array of distances between samples if
        metric='precomputed'

    min_samples : int > 1 or float between 0 and 1
        The number of samples in a neighborhood for a point to be considered
        as a core point. Expressed as an absolute number or a fraction of the
        number of samples (rounded to be at least 2).

    max_eps : float, optional (default=np.inf)
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. Default value of ``np.inf`` will
        identify clusters across all scales; reducing ``max_eps`` will result
        in shorter run times.

    metric : string or callable, optional (default='minkowski')
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. If metric is
        "precomputed", X is assumed to be a distance matrix and must be square.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    p : integer, optional (default=2)
        Parameter for the Minkowski metric from
        :class:`sklearn.metrics.pairwise_distances`. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, optional (default=None)
        Additional keyword arguments for the metric function.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method. (default)

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    ordering_ : array, shape (n_samples,)
        The cluster ordered list of sample indices.

    core_distances_ : array, shape (n_samples,)
        Distance at which each sample becomes a core point, indexed by object
        order. Points which will never be core have a distance of inf. Use
        ``clust.core_distances_[clust.ordering_]`` to access in cluster order.

    reachability_ : array, shape (n_samples,)
        Reachability distances per sample, indexed by object order. Use
        ``clust.reachability_[clust.ordering_]`` to access in cluster order.

    predecessor_ : array, shape (n_samples,)
        Point that a sample was reached from, indexed by object order.
        Seed points have a predecessor of -1.

    References
    ----------
    .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
       and Jörg Sander. "OPTICS: ordering points to identify the clustering
       structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
"""
|
||||
n_samples = X.shape[0]
|
||||
_validate_size(min_samples, n_samples, 'min_samples')
|
||||
if min_samples <= 1:
|
||||
min_samples = max(2, int(min_samples * n_samples))
|
||||
|
||||
# Start all points as 'unprocessed' ##
|
||||
reachability_ = np.empty(n_samples)
|
||||
reachability_.fill(np.inf)
|
||||
predecessor_ = np.empty(n_samples, dtype=int)
|
||||
predecessor_.fill(-1)
|
||||
|
||||
nbrs = NearestNeighbors(n_neighbors=min_samples,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size,
|
||||
metric=metric,
|
||||
metric_params=metric_params,
|
||||
p=p,
|
||||
n_jobs=n_jobs)
|
||||
|
||||
nbrs.fit(X)
|
||||
# Here we first do a kNN query for each point; this differs from
|
||||
# the original OPTICS that only used epsilon range queries.
|
||||
# TODO: handle working_memory somehow?
|
||||
core_distances_ = _compute_core_distances_(X=X, neighbors=nbrs,
|
||||
min_samples=min_samples,
|
||||
working_memory=None)
|
||||
# OPTICS puts an upper limit on these, use inf for undefined.
|
||||
core_distances_[core_distances_ > max_eps] = np.inf
|
||||
|
||||
# Main OPTICS loop. Not parallelizable. The order that entries are
|
||||
# written to the 'ordering_' list is important!
|
||||
# Note that this implementation is O(n^2) theoretically, but
|
||||
# supposedly with very low constant factors.
|
||||
processed = np.zeros(X.shape[0], dtype=bool)
|
||||
ordering = np.zeros(X.shape[0], dtype=int)
|
||||
for ordering_idx in range(X.shape[0]):
|
||||
# Choose next based on smallest reachability distance
|
||||
# (And prefer smaller ids on ties, possibly np.inf!)
|
||||
index = np.where(processed == 0)[0]
|
||||
point = index[np.argmin(reachability_[index])]
|
||||
|
||||
processed[point] = True
|
||||
ordering[ordering_idx] = point
|
||||
if core_distances_[point] != np.inf:
|
||||
_set_reach_dist(core_distances_=core_distances_,
|
||||
reachability_=reachability_,
|
||||
predecessor_=predecessor_,
|
||||
point_index=point,
|
||||
processed=processed, X=X, nbrs=nbrs,
|
||||
metric=metric, metric_params=metric_params,
|
||||
p=p, max_eps=max_eps)
|
||||
if np.all(np.isinf(reachability_)):
|
||||
warnings.warn("All reachability values are inf. Set a larger"
|
||||
" max_eps or all data will be considered outliers.",
|
||||
UserWarning)
|
||||
return ordering, core_distances_, reachability_, predecessor_
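# Illustrative usage (a hedged sketch; assumes the public import path
# ``sklearn.cluster.compute_optics_graph``). All arguments are keyword-only:
#
#   >>> import numpy as np
#   >>> from sklearn.cluster import compute_optics_graph
#   >>> X = np.array([[1., 1.], [1.1, 0.9], [0.9, 1.1],
#   ...               [8., 8.], [8.1, 7.9], [7.9, 8.1]])
#   >>> ordering, core_d, reach, pred = compute_optics_graph(
#   ...     X, min_samples=2, max_eps=np.inf, metric='minkowski', p=2,
#   ...     metric_params=None, algorithm='auto', leaf_size=30, n_jobs=None)
#   >>> ordering.shape, reach.shape
#   ((6,), (6,))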
|
||||
|
||||
|
||||
def _set_reach_dist(core_distances_, reachability_, predecessor_,
|
||||
point_index, processed, X, nbrs, metric, metric_params,
|
||||
p, max_eps):
|
||||
P = X[point_index:point_index + 1]
|
||||
# Assume that radius_neighbors is faster without distances
|
||||
# and we don't need all distances, nevertheless, this means
|
||||
# we may be doing some work twice.
|
||||
indices = nbrs.radius_neighbors(P, radius=max_eps,
|
||||
return_distance=False)[0]
|
||||
|
||||
# Getting indices of neighbors that have not been processed
|
||||
unproc = np.compress(~np.take(processed, indices), indices)
|
||||
# Neighbors of current point are already processed.
|
||||
if not unproc.size:
|
||||
return
|
||||
|
||||
# Only compute distances to unprocessed neighbors:
|
||||
if metric == 'precomputed':
|
||||
dists = X[point_index, unproc]
|
||||
else:
|
||||
_params = dict() if metric_params is None else metric_params.copy()
|
||||
if metric == 'minkowski' and 'p' not in _params:
|
||||
# the same logic as neighbors, p is ignored if explicitly set
|
||||
# in the dict params
|
||||
_params['p'] = p
|
||||
dists = pairwise_distances(P, np.take(X, unproc, axis=0),
|
||||
metric=metric, n_jobs=None,
|
||||
**_params).ravel()
|
||||
|
||||
rdists = np.maximum(dists, core_distances_[point_index])
|
||||
improved = np.where(rdists < np.take(reachability_, unproc))
|
||||
reachability_[unproc[improved]] = rdists[improved]
|
||||
predecessor_[unproc[improved]] = point_index
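# Worked example of the update above (illustration only): with
# core_distances_[point_index] = 0.5 and distances [0.3, 0.8] to two
# unprocessed neighbours, the candidate reachabilities are
# np.maximum([0.3, 0.8], 0.5) == [0.5, 0.8]; a neighbour's stored
# reachability (and its predecessor) is rewritten only where this candidate
# is strictly smaller than the value already recorded.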
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
|
||||
"""Performs DBSCAN extraction for an arbitrary epsilon.
|
||||
|
||||
Extracting the clusters runs in linear time. Note that this results in
|
||||
``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with
|
||||
similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
reachability : array, shape (n_samples,)
|
||||
Reachability distances calculated by OPTICS (``reachability_``)
|
||||
|
||||
core_distances : array, shape (n_samples,)
|
||||
Distances at which points become core (``core_distances_``)
|
||||
|
||||
ordering : array, shape (n_samples,)
|
||||
OPTICS ordered point indices (``ordering_``)
|
||||
|
||||
eps : float
|
||||
DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results
|
||||
will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close
|
||||
to one another.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels_ : array, shape (n_samples,)
|
||||
The estimated labels.
|
||||
|
||||
"""
|
||||
n_samples = len(core_distances)
|
||||
labels = np.zeros(n_samples, dtype=int)
|
||||
|
||||
far_reach = reachability > eps
|
||||
near_core = core_distances <= eps
|
||||
labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1
|
||||
labels[far_reach & ~near_core] = -1
|
||||
return labels
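# Illustrative usage (a hedged sketch; assumes the public import path
# ``sklearn.cluster.cluster_optics_dbscan``). With ``eps=1.0`` the jump in
# reachability above 1.0 at position 3 starts a second cluster:
#
#   >>> import numpy as np
#   >>> from sklearn.cluster import cluster_optics_dbscan
#   >>> reach = np.array([np.inf, 0.5, 0.5, 3.0, 0.4, 0.4])
#   >>> core = np.array([0.5, 0.5, 0.6, 0.4, 0.4, 0.5])
#   >>> order = np.arange(6)
#   >>> cluster_optics_dbscan(reachability=reach, core_distances=core,
#   ...                       ordering=order, eps=1.0)
#   array([0, 0, 0, 1, 1, 1])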
|
||||
|
||||
|
||||
def cluster_optics_xi(*, reachability, predecessor, ordering, min_samples,
|
||||
min_cluster_size=None, xi=0.05,
|
||||
predecessor_correction=True):
|
||||
"""Automatically extract clusters according to the Xi-steep method.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
reachability : array, shape (n_samples,)
|
||||
Reachability distances calculated by OPTICS (`reachability_`)
|
||||
|
||||
predecessor : array, shape (n_samples,)
|
||||
Predecessors calculated by OPTICS.
|
||||
|
||||
ordering : array, shape (n_samples,)
|
||||
OPTICS ordered point indices (`ordering_`)
|
||||
|
||||
min_samples : int > 1 or float between 0 and 1
|
||||
The same as the min_samples given to OPTICS. Up and down steep regions
|
||||
can't have more than ``min_samples`` consecutive non-steep points.
|
||||
Expressed as an absolute number or a fraction of the number of samples
|
||||
(rounded to be at least 2).
|
||||
|
||||
min_cluster_size : int > 1 or float between 0 and 1 (default=None)
|
||||
Minimum number of samples in an OPTICS cluster, expressed as an
|
||||
absolute number or a fraction of the number of samples (rounded to be
|
||||
at least 2). If ``None``, the value of ``min_samples`` is used instead.
|
||||
|
||||
xi : float, between 0 and 1, optional (default=0.05)
|
||||
Determines the minimum steepness on the reachability plot that
|
||||
constitutes a cluster boundary. For example, an upwards point in the
|
||||
reachability plot is defined by the ratio from one point to its
|
||||
successor being at most 1-xi.
|
||||
|
||||
predecessor_correction : bool, optional (default=True)
|
||||
Correct clusters based on the calculated predecessors.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : array, shape (n_samples)
|
||||
The labels assigned to samples. Points which are not included
|
||||
in any cluster are labeled as -1.
|
||||
|
||||
clusters : array, shape (n_clusters, 2)
|
||||
The list of clusters in the form of ``[start, end]`` in each row, with
|
||||
all indices inclusive. The clusters are ordered according to ``(end,
|
||||
-start)`` (ascending) so that larger clusters encompassing smaller
|
||||
clusters come after such nested smaller clusters. Since ``labels`` does
|
||||
not reflect the hierarchy, usually ``len(clusters) >
|
||||
len(np.unique(labels))``.
|
||||
"""
|
||||
n_samples = len(reachability)
|
||||
_validate_size(min_samples, n_samples, 'min_samples')
|
||||
if min_samples <= 1:
|
||||
min_samples = max(2, int(min_samples * n_samples))
|
||||
if min_cluster_size is None:
|
||||
min_cluster_size = min_samples
|
||||
_validate_size(min_cluster_size, n_samples, 'min_cluster_size')
|
||||
if min_cluster_size <= 1:
|
||||
min_cluster_size = max(2, int(min_cluster_size * n_samples))
|
||||
|
||||
clusters = _xi_cluster(reachability[ordering], predecessor[ordering],
|
||||
ordering, xi,
|
||||
min_samples, min_cluster_size,
|
||||
predecessor_correction)
|
||||
labels = _extract_xi_labels(ordering, clusters)
|
||||
return labels, clusters
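# Illustrative usage (a hedged sketch; assumes the public import path
# ``sklearn.cluster.cluster_optics_xi``), reusing the arrays produced by the
# ``compute_optics_graph`` example above:
#
#   >>> from sklearn.cluster import cluster_optics_xi
#   >>> labels, clusters = cluster_optics_xi(
#   ...     reachability=reach, predecessor=pred, ordering=ordering,
#   ...     min_samples=2, xi=0.05)
#   >>> labels.shape
#   (6,)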
|
||||
|
||||
|
||||
def _extend_region(steep_point, xward_point, start, min_samples):
|
||||
"""Extend the area until it's maximal.
|
||||
|
||||
It's the same function for both upward and downward regions, depending on
|
||||
the given input parameters. Assuming:
|
||||
|
||||
- steep_{upward/downward}: bool array indicating whether a point is a
|
||||
steep {upward/downward};
|
||||
- upward/downward: bool array indicating whether a point is
|
||||
upward/downward;
|
||||
|
||||
To extend an upward region, ``steep_point=steep_upward`` and
|
||||
``xward_point=downward`` are expected, and to extend a downward region,
|
||||
``steep_point=steep_downward`` and ``xward_point=upward``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
steep_point : bool array, shape (n_samples)
|
||||
True if the point is steep downward (upward).
|
||||
|
||||
xward_point : bool array, shape (n_samples)
|
||||
True if the point is an upward (respectively downward) point.
|
||||
|
||||
start : integer
|
||||
The start of the xward region.
|
||||
|
||||
min_samples : integer
|
||||
The same as the min_samples given to OPTICS. Up and down steep
|
||||
regions can't have more than ``min_samples`` consecutive non-steep
|
||||
points.
|
||||
|
||||
Returns
|
||||
-------
|
||||
index : integer
|
||||
The current index iterating over all the samples, i.e. where we are up
|
||||
to in our search.
|
||||
|
||||
end : integer
|
||||
The end of the region, which can be behind the index. The region
|
||||
includes the ``end`` index.
|
||||
"""
|
||||
n_samples = len(steep_point)
|
||||
non_xward_points = 0
|
||||
index = start
|
||||
end = start
|
||||
# find a maximal area
|
||||
while index < n_samples:
|
||||
if steep_point[index]:
|
||||
non_xward_points = 0
|
||||
end = index
|
||||
elif not xward_point[index]:
|
||||
# it's not a steep point, but still goes up.
|
||||
non_xward_points += 1
|
||||
# region should include no more than min_samples consecutive
|
||||
# non steep xward points.
|
||||
if non_xward_points > min_samples:
|
||||
break
|
||||
else:
|
||||
return end
|
||||
index += 1
|
||||
return end
|
||||
|
||||
|
||||
def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot):
|
||||
"""Update steep down areas (SDAs) using the new maximum in between (mib)
|
||||
value, and the given complement of xi, i.e. ``1 - xi``.
|
||||
"""
|
||||
if np.isinf(mib):
|
||||
return []
|
||||
res = [sda for sda in sdas
|
||||
if mib <= reachability_plot[sda['start']] * xi_complement]
|
||||
for sda in res:
|
||||
sda['mib'] = max(sda['mib'], mib)
|
||||
return res
|
||||
|
||||
|
||||
def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e):
|
||||
"""Correct for predecessors.
|
||||
|
||||
Applies Algorithm 2 of [1]_.
|
||||
|
||||
Input parameters are ordered by the computed OPTICS ordering.
|
||||
|
||||
.. [1] Schubert, Erich, Michael Gertz.
|
||||
"Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
|
||||
the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
|
||||
"""
|
||||
while s < e:
|
||||
if reachability_plot[s] > reachability_plot[e]:
|
||||
return s, e
|
||||
p_e = ordering[predecessor_plot[e]]
|
||||
for i in range(s, e):
|
||||
if p_e == ordering[i]:
|
||||
return s, e
|
||||
e -= 1
|
||||
return None, None
|
||||
|
||||
|
||||
def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
|
||||
min_cluster_size, predecessor_correction):
|
||||
"""Automatically extract clusters according to the Xi-steep method.
|
||||
|
||||
This is roughly an implementation of Figure 19 of the OPTICS paper.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
reachability_plot : array, shape (n_samples)
|
||||
The reachability plot, i.e. reachability ordered according to
|
||||
the calculated ordering, all computed by OPTICS.
|
||||
|
||||
predecessor_plot : array, shape (n_samples)
|
||||
Predecessors ordered according to the calculated ordering.
|
||||
|
||||
xi : float, between 0 and 1
|
||||
Determines the minimum steepness on the reachability plot that
|
||||
constitutes a cluster boundary. For example, an upwards point in the
|
||||
reachability plot is defined by the ratio from one point to its
|
||||
successor being at most 1-xi.
|
||||
|
||||
min_samples : int > 1
|
||||
The same as the min_samples given to OPTICS. Up and down steep regions
|
||||
can't have more than ``min_samples`` consecutive non-steep points.
|
||||
|
||||
min_cluster_size : int > 1
|
||||
Minimum number of samples in an OPTICS cluster.
|
||||
|
||||
predecessor_correction : bool
|
||||
Correct clusters based on the calculated predecessors.
|
||||
|
||||
Returns
|
||||
-------
|
||||
clusters : array, shape (n_clusters, 2)
|
||||
The list of clusters in the form of [start, end] in each row, with all
|
||||
indices inclusive. The clusters are ordered in a way that larger
|
||||
clusters encompassing smaller clusters come after those smaller
|
||||
clusters.
|
||||
"""
|
||||
|
||||
# Our implementation adds an inf to the end of the reachability plot;
|
||||
# this helps to find potential clusters at the end of the
|
||||
# reachability plot even if there's no upward region at the end of it.
|
||||
reachability_plot = np.hstack((reachability_plot, np.inf))
|
||||
|
||||
xi_complement = 1 - xi
|
||||
sdas = [] # steep down areas, introduced in section 4.3.2 of the paper
|
||||
clusters = []
|
||||
index = 0
|
||||
mib = 0. # maximum in between, section 4.3.2
|
||||
|
||||
# Our implementation corrects a mistake in the original
|
||||
# paper, i.e., in Definition 9 steep downward point,
|
||||
# r(p) * (1 - xi) <= r(p + 1) should be
|
||||
# r(p) * (1 - xi) >= r(p + 1)
|
||||
with np.errstate(invalid='ignore'):
|
||||
ratio = reachability_plot[:-1] / reachability_plot[1:]
|
||||
steep_upward = ratio <= xi_complement
|
||||
steep_downward = ratio >= 1 / xi_complement
|
||||
downward = ratio > 1
|
||||
upward = ratio < 1
|
||||
|
||||
# the following loop is almost exactly as in Figure 19 of the paper.
|
||||
# it jumps over the areas which are not either steep down or up areas
|
||||
for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):
|
||||
# just continue if steep_index has been a part of a discovered xward
|
||||
# area.
|
||||
if steep_index < index:
|
||||
continue
|
||||
|
||||
mib = max(mib, np.max(reachability_plot[index:steep_index + 1]))
|
||||
|
||||
# steep downward areas
|
||||
if steep_downward[steep_index]:
|
||||
sdas = _update_filter_sdas(sdas, mib, xi_complement,
|
||||
reachability_plot)
|
||||
D_start = steep_index
|
||||
D_end = _extend_region(steep_downward, upward,
|
||||
D_start, min_samples)
|
||||
D = {'start': D_start, 'end': D_end, 'mib': 0.}
|
||||
sdas.append(D)
|
||||
index = D_end + 1
|
||||
mib = reachability_plot[index]
|
||||
|
||||
# steep upward areas
|
||||
else:
|
||||
sdas = _update_filter_sdas(sdas, mib, xi_complement,
|
||||
reachability_plot)
|
||||
U_start = steep_index
|
||||
U_end = _extend_region(steep_upward, downward, U_start,
|
||||
min_samples)
|
||||
index = U_end + 1
|
||||
mib = reachability_plot[index]
|
||||
|
||||
U_clusters = []
|
||||
for D in sdas:
|
||||
c_start = D['start']
|
||||
c_end = U_end
|
||||
|
||||
# line (**), sc2*
|
||||
if reachability_plot[c_end + 1] * xi_complement < D['mib']:
|
||||
continue
|
||||
|
||||
# Definition 11: criterion 4
|
||||
D_max = reachability_plot[D['start']]
|
||||
if D_max * xi_complement >= reachability_plot[c_end + 1]:
|
||||
# Find the first index from the left side which is almost
|
||||
# at the same level as the end of the detected cluster.
|
||||
while (reachability_plot[c_start + 1] >
|
||||
reachability_plot[c_end + 1]
|
||||
and c_start < D['end']):
|
||||
c_start += 1
|
||||
elif reachability_plot[c_end + 1] * xi_complement >= D_max:
|
||||
# Find the first index from the right side which is almost
|
||||
# at the same level as the beginning of the detected
|
||||
# cluster.
|
||||
# Our implementation corrects a mistake in the original
|
||||
# paper, i.e., in Definition 11 4c, r(x) < r(sD) should be
|
||||
# r(x) > r(sD).
|
||||
while (reachability_plot[c_end - 1] > D_max
|
||||
and c_end > U_start):
|
||||
c_end -= 1
|
||||
|
||||
# predecessor correction
|
||||
if predecessor_correction:
|
||||
c_start, c_end = _correct_predecessor(reachability_plot,
|
||||
predecessor_plot,
|
||||
ordering,
|
||||
c_start,
|
||||
c_end)
|
||||
if c_start is None:
|
||||
continue
|
||||
|
||||
# Definition 11: criterion 3.a
|
||||
if c_end - c_start + 1 < min_cluster_size:
|
||||
continue
|
||||
|
||||
# Definition 11: criterion 1
|
||||
if c_start > D['end']:
|
||||
continue
|
||||
|
||||
# Definition 11: criterion 2
|
||||
if c_end < U_start:
|
||||
continue
|
||||
|
||||
U_clusters.append((c_start, c_end))
|
||||
|
||||
# add smaller clusters first.
|
||||
U_clusters.reverse()
|
||||
clusters.extend(U_clusters)
|
||||
|
||||
return np.array(clusters)
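# Small numeric illustration of the steepness test used above (Definition 9,
# with the sign correction): with ``xi = 0.05`` a point is steep upward when
# its reachability is at most ``1 - xi`` times that of its successor, and
# steep downward when the successor is at most ``1 - xi`` times its own:
#
#   >>> import numpy as np
#   >>> r = np.array([2.0, 1.0, 1.02, 2.5])
#   >>> ratio = r[:-1] / r[1:]
#   >>> ratio <= 0.95          # steep upward
#   array([False, False,  True])
#   >>> ratio >= 1 / 0.95      # steep downward
#   array([ True, False, False])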
|
||||
|
||||
|
||||
def _extract_xi_labels(ordering, clusters):
|
||||
"""Extracts the labels from the clusters returned by `_xi_cluster`.
|
||||
We rely on the fact that clusters are stored
|
||||
with the smaller clusters coming before the larger ones.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ordering : array, shape (n_samples)
|
||||
The ordering of points calculated by OPTICS
|
||||
|
||||
clusters : array, shape (n_clusters, 2)
|
||||
List of clusters i.e. (start, end) tuples,
|
||||
as returned by `_xi_cluster`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : array, shape (n_samples)
|
||||
"""
|
||||
|
||||
labels = np.full(len(ordering), -1, dtype=int)
|
||||
label = 0
|
||||
for c in clusters:
|
||||
if not np.any(labels[c[0]:(c[1] + 1)] != -1):
|
||||
labels[c[0]:(c[1] + 1)] = label
|
||||
label += 1
|
||||
labels[ordering] = labels.copy()
|
||||
return labels
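# Small illustration of the label extraction above (not part of the module):
# with 4 samples visited in the order [2, 0, 1, 3] and a single cluster
# covering the first two positions of that ordering, samples 2 and 0 get
# label 0 and the remaining samples stay noise:
#
#   >>> import numpy as np
#   >>> _extract_xi_labels(np.array([2, 0, 1, 3]), np.array([[0, 1]]))
#   array([ 0, -1,  0, -1])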
|
552
venv/Lib/site-packages/sklearn/cluster/_spectral.py
Normal file
552
venv/Lib/site-packages/sklearn/cluster/_spectral.py
Normal file
|
@ -0,0 +1,552 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""Algorithms for spectral clustering"""
|
||||
|
||||
# Author: Gael Varoquaux gael.varoquaux@normalesup.org
|
||||
# Brian Cheung
|
||||
# Wei LI <kuantkid@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import BaseEstimator, ClusterMixin
|
||||
from ..utils import check_random_state, as_float_array
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
from ..metrics.pairwise import pairwise_kernels
|
||||
from ..neighbors import kneighbors_graph, NearestNeighbors
|
||||
from ..manifold import spectral_embedding
|
||||
from ._kmeans import k_means
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20,
|
||||
random_state=None):
|
||||
"""Search for a partition matrix (clustering) which is closest to the
|
||||
eigenvector embedding.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
vectors : array-like, shape: (n_samples, n_clusters)
|
||||
The embedding space of the samples.
|
||||
|
||||
copy : boolean, optional, default: True
|
||||
Whether to copy vectors, or perform in-place normalization.
|
||||
|
||||
max_svd_restarts : int, optional, default: 30
|
||||
Maximum number of attempts to restart SVD if convergence fails
|
||||
|
||||
n_iter_max : int, optional, default: 20
|
||||
Maximum number of iterations to attempt in rotation and partition
|
||||
matrix search if machine precision convergence is not reached
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Determines random number generation for rotation matrix initialization.
|
||||
Use an int to make the randomness deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : array of integers, shape: n_samples
|
||||
The labels of the clusters.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
- Multiclass spectral clustering, 2003
|
||||
Stella X. Yu, Jianbo Shi
|
||||
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
The eigenvector embedding is used to iteratively search for the
|
||||
closest discrete partition. First, the eigenvector embedding is
|
||||
normalized to the space of partition matrices. An optimal discrete
|
||||
partition matrix closest to this normalized embedding multiplied by
|
||||
an initial rotation is calculated. Fixing this discrete partition
|
||||
matrix, an optimal rotation matrix is calculated. These two
|
||||
calculations are performed until convergence. The discrete partition
|
||||
matrix is returned as the clustering solution. Used in spectral
|
||||
clustering, this method tends to be faster and more robust to random
|
||||
initialization than k-means.
|
||||
|
||||
"""
|
||||
|
||||
from scipy.sparse import csc_matrix
|
||||
from scipy.linalg import LinAlgError
|
||||
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
vectors = as_float_array(vectors, copy=copy)
|
||||
|
||||
eps = np.finfo(float).eps
|
||||
n_samples, n_components = vectors.shape
|
||||
|
||||
# Normalize the eigenvectors to an equal length of a vector of ones.
|
||||
# Reorient the eigenvectors to point in the negative direction with respect
|
||||
# to the first element. This may have to do with constraining the
|
||||
# eigenvectors to lie in a specific quadrant to make the discretization
|
||||
# search easier.
|
||||
norm_ones = np.sqrt(n_samples)
|
||||
for i in range(vectors.shape[1]):
|
||||
vectors[:, i] = (vectors[:, i] / np.linalg.norm(vectors[:, i])) \
|
||||
* norm_ones
|
||||
if vectors[0, i] != 0:
|
||||
vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])
|
||||
|
||||
# Normalize the rows of the eigenvectors. Samples should lie on the unit
|
||||
# hypersphere centered at the origin. This transforms the samples in the
|
||||
# embedding space to the space of partition matrices.
|
||||
vectors = vectors / np.sqrt((vectors ** 2).sum(axis=1))[:, np.newaxis]
|
||||
|
||||
svd_restarts = 0
|
||||
has_converged = False
|
||||
|
||||
# If there is an exception we try to randomize and rerun SVD again
|
||||
# do this max_svd_restarts times.
|
||||
while (svd_restarts < max_svd_restarts) and not has_converged:
|
||||
|
||||
# Initialize first column of rotation matrix with a row of the
|
||||
# eigenvectors
|
||||
rotation = np.zeros((n_components, n_components))
|
||||
rotation[:, 0] = vectors[random_state.randint(n_samples), :].T
|
||||
|
||||
# To initialize the rest of the rotation matrix, find the rows
|
||||
# of the eigenvectors that are as orthogonal to each other as
|
||||
# possible
|
||||
c = np.zeros(n_samples)
|
||||
for j in range(1, n_components):
|
||||
# Accumulate c to ensure row is as orthogonal as possible to
|
||||
# previous picks as well as current one
|
||||
c += np.abs(np.dot(vectors, rotation[:, j - 1]))
|
||||
rotation[:, j] = vectors[c.argmin(), :].T
|
||||
|
||||
last_objective_value = 0.0
|
||||
n_iter = 0
|
||||
|
||||
while not has_converged:
|
||||
n_iter += 1
|
||||
|
||||
t_discrete = np.dot(vectors, rotation)
|
||||
|
||||
labels = t_discrete.argmax(axis=1)
|
||||
vectors_discrete = csc_matrix(
|
||||
(np.ones(len(labels)), (np.arange(0, n_samples), labels)),
|
||||
shape=(n_samples, n_components))
|
||||
|
||||
t_svd = vectors_discrete.T * vectors
|
||||
|
||||
try:
|
||||
U, S, Vh = np.linalg.svd(t_svd)
except LinAlgError:
    # count a restart only when the SVD actually fails to converge
    svd_restarts += 1
|
||||
print("SVD did not converge, randomizing and trying again")
|
||||
break
|
||||
|
||||
ncut_value = 2.0 * (n_samples - S.sum())
|
||||
if ((abs(ncut_value - last_objective_value) < eps) or
|
||||
(n_iter > n_iter_max)):
|
||||
has_converged = True
|
||||
else:
|
||||
# otherwise calculate rotation and continue
|
||||
last_objective_value = ncut_value
|
||||
rotation = np.dot(Vh.T, U.T)
|
||||
|
||||
if not has_converged:
|
||||
raise LinAlgError('SVD did not converge')
|
||||
return labels
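# Illustrative usage (a hedged sketch; ``discretize`` lives in the private
# module ``sklearn.cluster._spectral``, so the import path may change):
#
#   >>> import numpy as np
#   >>> from sklearn.cluster._spectral import discretize
#   >>> rng = np.random.RandomState(0)
#   >>> emb = rng.rand(12, 3)            # toy (n_samples, n_clusters) embedding
#   >>> labels = discretize(emb, random_state=0)
#   >>> labels.shape
#   (12,)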
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def spectral_clustering(affinity, *, n_clusters=8, n_components=None,
|
||||
eigen_solver=None, random_state=None, n_init=10,
|
||||
eigen_tol=0.0, assign_labels='kmeans'):
|
||||
"""Apply clustering to a projection of the normalized Laplacian.
|
||||
|
||||
In practice Spectral Clustering is very useful when the structure of
|
||||
the individual clusters is highly non-convex or more generally when
|
||||
a measure of the center and spread of the cluster is not a suitable
|
||||
description of the complete cluster. For instance, when clusters are
|
||||
nested circles on the 2D plane.
|
||||
|
||||
If affinity is the adjacency matrix of a graph, this method can be
|
||||
used to find normalized graph cuts.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_clustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
|
||||
The affinity matrix describing the relationship of the samples to
|
||||
embed. **Must be symmetric**.
|
||||
|
||||
Possible examples:
|
||||
- adjacency matrix of a graph,
|
||||
- heat kernel of the pairwise distance matrix of the samples,
|
||||
- symmetric k-nearest neighbours connectivity matrix of the samples.
|
||||
|
||||
n_clusters : integer, optional
|
||||
Number of clusters to extract.
|
||||
|
||||
n_components : integer, optional, default is n_clusters
|
||||
Number of eigen vectors to use for the spectral embedding
|
||||
|
||||
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
|
||||
The eigenvalue decomposition strategy to use. AMG requires pyamg
|
||||
to be installed. It can be faster on very large, sparse problems,
|
||||
but may also lead to instabilities
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
A pseudo random number generator used for the initialization of the
|
||||
lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by
|
||||
the K-Means initialization. Use an int to make the randomness
|
||||
deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
n_init : int, optional, default: 10
|
||||
Number of times the k-means algorithm will be run with different
|
||||
centroid seeds. The final results will be the best output of
|
||||
n_init consecutive runs in terms of inertia.
|
||||
|
||||
eigen_tol : float, optional, default: 0.0
|
||||
Stopping criterion for eigendecomposition of the Laplacian matrix
|
||||
when using arpack eigen_solver.
|
||||
|
||||
assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
|
||||
The strategy to use to assign labels in the embedding
|
||||
space. There are two ways to assign labels after the laplacian
|
||||
embedding. k-means can be applied and is a popular choice. But it can
|
||||
also be sensitive to initialization. Discretization is another
|
||||
approach which is less sensitive to random initialization. See
|
||||
the 'Multiclass spectral clustering' paper referenced below for
|
||||
more details on the discretization approach.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : array of integers, shape: n_samples
|
||||
The labels of the clusters.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
- Normalized cuts and image segmentation, 2000
|
||||
Jianbo Shi, Jitendra Malik
|
||||
http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
|
||||
|
||||
- A Tutorial on Spectral Clustering, 2007
|
||||
Ulrike von Luxburg
|
||||
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
|
||||
|
||||
- Multiclass spectral clustering, 2003
|
||||
Stella X. Yu, Jianbo Shi
|
||||
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
|
||||
|
||||
Notes
|
||||
-----
|
||||
The graph should contain only one connected component, otherwise
|
||||
the results make little sense.
|
||||
|
||||
This algorithm solves the normalized cut for k=2: it is a
|
||||
normalized spectral clustering.
|
||||
"""
|
||||
if assign_labels not in ('kmeans', 'discretize'):
|
||||
raise ValueError("The 'assign_labels' parameter should be "
|
||||
"'kmeans' or 'discretize', but '%s' was given"
|
||||
% assign_labels)
|
||||
|
||||
random_state = check_random_state(random_state)
|
||||
n_components = n_clusters if n_components is None else n_components
|
||||
|
||||
# The first eigen vector is constant only for fully connected graphs
|
||||
# and should be kept for spectral clustering (drop_first = False)
|
||||
# See spectral_embedding documentation.
|
||||
maps = spectral_embedding(affinity, n_components=n_components,
|
||||
eigen_solver=eigen_solver,
|
||||
random_state=random_state,
|
||||
eigen_tol=eigen_tol, drop_first=False)
|
||||
|
||||
if assign_labels == 'kmeans':
|
||||
_, labels, _ = k_means(maps, n_clusters, random_state=random_state,
|
||||
n_init=n_init)
|
||||
else:
|
||||
labels = discretize(maps, random_state=random_state)
|
||||
|
||||
return labels
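# Illustrative usage (a hedged sketch): build a symmetric RBF affinity matrix
# and cut it into two groups. ``rbf_kernel`` and ``spectral_clustering`` are
# both public scikit-learn functions:
#
#   >>> import numpy as np
#   >>> from sklearn.metrics.pairwise import rbf_kernel
#   >>> from sklearn.cluster import spectral_clustering
#   >>> X = np.array([[1., 1.], [1.2, 0.9], [0.9, 1.1],
#   ...               [8., 8.], [8.1, 7.9], [7.9, 8.2]])
#   >>> affinity = rbf_kernel(X, gamma=1.0)
#   >>> labels = spectral_clustering(affinity, n_clusters=2, random_state=0)
#   >>> len(np.unique(labels))
#   2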
|
||||
|
||||
|
||||
class SpectralClustering(ClusterMixin, BaseEstimator):
|
||||
"""Apply clustering to a projection of the normalized Laplacian.
|
||||
|
||||
In practice Spectral Clustering is very useful when the structure of
|
||||
the individual clusters is highly non-convex or more generally when
|
||||
a measure of the center and spread of the cluster is not a suitable
|
||||
description of the complete cluster. For instance when clusters are
|
||||
nested circles on the 2D plane.
|
||||
|
||||
If affinity is the adjacency matrix of a graph, this method can be
|
||||
used to find normalized graph cuts.
|
||||
|
||||
When calling ``fit``, an affinity matrix is constructed using either
|
||||
a kernel function such as the Gaussian (aka RBF) kernel of the euclidean
|
||||
distance ``d(X, X)``::
|
||||
|
||||
np.exp(-gamma * d(X,X) ** 2)
|
||||
|
||||
or a k-nearest neighbors connectivity matrix.
|
||||
|
||||
Alternatively, using ``precomputed``, a user-provided affinity
|
||||
matrix can be used.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_clustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_clusters : integer, optional
|
||||
The dimension of the projection subspace.
|
||||
|
||||
eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
|
||||
The eigenvalue decomposition strategy to use. AMG requires pyamg
|
||||
to be installed. It can be faster on very large, sparse problems,
|
||||
but may also lead to instabilities.
|
||||
|
||||
n_components : integer, optional, default=n_clusters
|
||||
Number of eigen vectors to use for the spectral embedding
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
A pseudo random number generator used for the initialization of the
|
||||
lobpcg eigen vectors decomposition when ``eigen_solver='amg'`` and by
|
||||
the K-Means initialization. Use an int to make the randomness
|
||||
deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
n_init : int, optional, default: 10
|
||||
Number of times the k-means algorithm will be run with different
|
||||
centroid seeds. The final results will be the best output of
|
||||
n_init consecutive runs in terms of inertia.
|
||||
|
||||
gamma : float, default=1.0
|
||||
Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
|
||||
Ignored for ``affinity='nearest_neighbors'``.
|
||||
|
||||
affinity : string or callable, default 'rbf'
|
||||
How to construct the affinity matrix.
|
||||
- 'nearest_neighbors' : construct the affinity matrix by computing a
|
||||
graph of nearest neighbors.
|
||||
- 'rbf' : construct the affinity matrix using a radial basis function
|
||||
(RBF) kernel.
|
||||
- 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
|
||||
- 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
|
||||
of precomputed nearest neighbors, and constructs the affinity matrix
|
||||
by selecting the ``n_neighbors`` nearest neighbors.
|
||||
- one of the kernels supported by
|
||||
:func:`~sklearn.metrics.pairwise_kernels`.
|
||||
|
||||
Only kernels that produce similarity scores (non-negative values that
|
||||
increase with similarity) should be used. This property is not checked
|
||||
by the clustering algorithm.
|
||||
|
||||
n_neighbors : integer
|
||||
Number of neighbors to use when constructing the affinity matrix using
|
||||
the nearest neighbors method. Ignored for ``affinity='rbf'``.
|
||||
|
||||
eigen_tol : float, optional, default: 0.0
|
||||
Stopping criterion for eigendecomposition of the Laplacian matrix
|
||||
when ``eigen_solver='arpack'``.
|
||||
|
||||
assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
|
||||
The strategy to use to assign labels in the embedding
|
||||
space. There are two ways to assign labels after the laplacian
|
||||
embedding. k-means can be applied and is a popular choice. But it can
|
||||
also be sensitive to initialization. Discretization is another approach
|
||||
which is less sensitive to random initialization.
|
||||
|
||||
degree : float, default=3
|
||||
Degree of the polynomial kernel. Ignored by other kernels.
|
||||
|
||||
coef0 : float, default=1
|
||||
Zero coefficient for polynomial and sigmoid kernels.
|
||||
Ignored by other kernels.
|
||||
|
||||
kernel_params : dictionary of string to any, optional
|
||||
Parameters (keyword arguments) and values for kernel passed as
|
||||
callable object. Ignored by other kernels.
|
||||
|
||||
n_jobs : int or None, optional (default=None)
|
||||
The number of parallel jobs to run.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
affinity_matrix_ : array-like, shape (n_samples, n_samples)
|
||||
Affinity matrix used for clustering. Available only after calling
|
||||
``fit``.
|
||||
|
||||
labels_ : array, shape (n_samples,)
|
||||
Labels of each point
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import SpectralClustering
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [2, 1], [1, 0],
|
||||
... [4, 7], [3, 5], [3, 6]])
|
||||
>>> clustering = SpectralClustering(n_clusters=2,
|
||||
... assign_labels="discretize",
|
||||
... random_state=0).fit(X)
|
||||
>>> clustering.labels_
|
||||
array([1, 1, 1, 0, 0, 0])
|
||||
>>> clustering
|
||||
SpectralClustering(assign_labels='discretize', n_clusters=2,
|
||||
random_state=0)
|
||||
|
||||
Notes
|
||||
-----
|
||||
If what you have is a distance matrix,
|
||||
for which 0 means identical elements and high values mean
|
||||
very dissimilar elements, it can be transformed into a
|
||||
similarity matrix that is well suited for the algorithm by
|
||||
applying the Gaussian (RBF, heat) kernel::
|
||||
|
||||
np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
|
||||
|
||||
Where ``delta`` is a free parameter representing the width of the Gaussian
|
||||
kernel.
|
||||
|
||||
Another alternative is to take a symmetric version of the k
|
||||
nearest neighbors connectivity matrix of the points.
|
||||
|
||||
If the pyamg package is installed, it is used: this greatly
|
||||
speeds up computation.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
- Normalized cuts and image segmentation, 2000
|
||||
Jianbo Shi, Jitendra Malik
|
||||
http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
|
||||
|
||||
- A Tutorial on Spectral Clustering, 2007
|
||||
Ulrike von Luxburg
|
||||
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
|
||||
|
||||
- Multiclass spectral clustering, 2003
|
||||
Stella X. Yu, Jianbo Shi
|
||||
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None,
|
||||
random_state=None, n_init=10, gamma=1., affinity='rbf',
|
||||
n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans',
|
||||
degree=3, coef0=1, kernel_params=None, n_jobs=None):
|
||||
self.n_clusters = n_clusters
|
||||
self.eigen_solver = eigen_solver
|
||||
self.n_components = n_components
|
||||
self.random_state = random_state
|
||||
self.n_init = n_init
|
||||
self.gamma = gamma
|
||||
self.affinity = affinity
|
||||
self.n_neighbors = n_neighbors
|
||||
self.eigen_tol = eigen_tol
|
||||
self.assign_labels = assign_labels
|
||||
self.degree = degree
|
||||
self.coef0 = coef0
|
||||
self.kernel_params = kernel_params
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Perform spectral clustering from features, or affinity matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like or sparse matrix, shape (n_samples, n_features), or \
|
||||
array-like, shape (n_samples, n_samples)
|
||||
Training instances to cluster, or similarities / affinities between
|
||||
instances if ``affinity='precomputed'``. If a sparse matrix is
|
||||
provided in a format other than ``csr_matrix``, ``csc_matrix``,
|
||||
or ``coo_matrix``, it will be converted into a sparse
|
||||
``csr_matrix``.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
|
||||
"""
|
||||
X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
|
||||
dtype=np.float64, ensure_min_samples=2)
|
||||
allow_squared = self.affinity in ["precomputed",
|
||||
"precomputed_nearest_neighbors"]
|
||||
if X.shape[0] == X.shape[1] and not allow_squared:
|
||||
warnings.warn("The spectral clustering API has changed. ``fit``"
|
||||
"now constructs an affinity matrix from data. To use"
|
||||
" a custom affinity matrix, "
|
||||
"set ``affinity=precomputed``.")
|
||||
|
||||
if self.affinity == 'nearest_neighbors':
|
||||
connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors,
|
||||
include_self=True,
|
||||
n_jobs=self.n_jobs)
|
||||
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
|
||||
elif self.affinity == 'precomputed_nearest_neighbors':
|
||||
estimator = NearestNeighbors(n_neighbors=self.n_neighbors,
|
||||
n_jobs=self.n_jobs,
|
||||
metric="precomputed").fit(X)
|
||||
connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')
|
||||
self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
|
||||
elif self.affinity == 'precomputed':
|
||||
self.affinity_matrix_ = X
|
||||
else:
|
||||
params = self.kernel_params
|
||||
if params is None:
|
||||
params = {}
|
||||
if not callable(self.affinity):
|
||||
params['gamma'] = self.gamma
|
||||
params['degree'] = self.degree
|
||||
params['coef0'] = self.coef0
|
||||
self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity,
|
||||
filter_params=True,
|
||||
**params)
|
||||
|
||||
random_state = check_random_state(self.random_state)
|
||||
self.labels_ = spectral_clustering(self.affinity_matrix_,
|
||||
n_clusters=self.n_clusters,
|
||||
n_components=self.n_components,
|
||||
eigen_solver=self.eigen_solver,
|
||||
random_state=random_state,
|
||||
n_init=self.n_init,
|
||||
eigen_tol=self.eigen_tol,
|
||||
assign_labels=self.assign_labels)
|
||||
return self
|
||||
|
||||
def fit_predict(self, X, y=None):
|
||||
"""Perform spectral clustering from features, or affinity matrix,
|
||||
and return cluster labels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like or sparse matrix, shape (n_samples, n_features), or \
|
||||
array-like, shape (n_samples, n_samples)
|
||||
Training instances to cluster, or similarities / affinities between
|
||||
instances if ``affinity='precomputed'``. If a sparse matrix is
|
||||
provided in a format other than ``csr_matrix``, ``csc_matrix``,
|
||||
or ``coo_matrix``, it will be converted into a sparse
|
||||
``csr_matrix``.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : ndarray, shape (n_samples,)
|
||||
Cluster labels.
|
||||
"""
|
||||
return super().fit_predict(X, y)
|
||||
|
||||
@property
|
||||
def _pairwise(self):
|
||||
return self.affinity in ["precomputed",
|
||||
"precomputed_nearest_neighbors"]
|
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _affinity_propagation # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.affinity_propagation_'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_affinity_propagation, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/bicluster.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/bicluster.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _bicluster # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.bicluster'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_bicluster, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/birch.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/birch.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _birch # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.birch'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_birch, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/dbscan_.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/dbscan_.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _dbscan # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.dbscan_'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_dbscan, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/hierarchical.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/hierarchical.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _agglomerative # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.hierarchical'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_agglomerative, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/k_means_.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/k_means_.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _kmeans # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.k_means_'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_kmeans, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/mean_shift_.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/mean_shift_.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _mean_shift # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.mean_shift_'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_mean_shift, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/cluster/optics_.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/optics_.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _optics # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.optics_'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_optics, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
50
venv/Lib/site-packages/sklearn/cluster/setup.py
Normal file
50
venv/Lib/site-packages/sklearn/cluster/setup.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# License: BSD 3 clause
|
||||
import os
|
||||
|
||||
import numpy
|
||||
|
||||
|
||||
def configuration(parent_package='', top_path=None):
|
||||
from numpy.distutils.misc_util import Configuration
|
||||
|
||||
libraries = []
|
||||
if os.name == 'posix':
|
||||
libraries.append('m')
|
||||
|
||||
config = Configuration('cluster', parent_package, top_path)
|
||||
|
||||
config.add_extension('_dbscan_inner',
|
||||
sources=['_dbscan_inner.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
language="c++")
|
||||
|
||||
config.add_extension('_hierarchical_fast',
|
||||
sources=['_hierarchical_fast.pyx'],
|
||||
language="c++",
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_extension('_k_means_fast',
|
||||
sources=['_k_means_fast.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_extension('_k_means_lloyd',
|
||||
sources=['_k_means_lloyd.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_extension('_k_means_elkan',
|
||||
sources=['_k_means_elkan.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_subpackage('tests')
|
||||
|
||||
return config
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from numpy.distutils.core import setup
|
||||
setup(**configuration(top_path='').todict())
|
18
venv/Lib/site-packages/sklearn/cluster/spectral.py
Normal file
18
venv/Lib/site-packages/sklearn/cluster/spectral.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _spectral # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.cluster.spectral'
|
||||
correct_import_path = 'sklearn.cluster'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_spectral, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
0
venv/Lib/site-packages/sklearn/cluster/tests/__init__.py
Normal file
0
venv/Lib/site-packages/sklearn/cluster/tests/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.