928 lines
37 KiB
Python
928 lines
37 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""Ordering Points To Identify the Clustering Structure (OPTICS)
|
||
|
||
These routines execute the OPTICS algorithm, and implement various
|
||
cluster extraction methods of the ordered list.
|
||
|
||
Authors: Shane Grigsby <refuge@rocktalus.com>
|
||
Adrin Jalali <adrinjalali@gmail.com>
|
||
Erich Schubert <erich@debian.org>
|
||
Hanmin Qin <qinhanmin2005@sina.com>
|
||
License: BSD 3 clause
|
||
"""
|
||
|
||
import warnings
|
||
import numpy as np
|
||
|
||
from ..utils import gen_batches, get_chunk_n_rows
|
||
from ..utils.validation import _deprecate_positional_args
|
||
from ..neighbors import NearestNeighbors
|
||
from ..base import BaseEstimator, ClusterMixin
|
||
from ..metrics import pairwise_distances
|
||
|
||
|
||
class OPTICS(ClusterMixin, BaseEstimator):
    """Estimate clustering structure from vector array.

    OPTICS (Ordering Points To Identify the Clustering Structure), closely
    related to DBSCAN, finds core sample of high density and expands clusters
    from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
    neighborhood radius. Better suited for usage on large datasets than the
    current sklearn implementation of DBSCAN.

    Clusters are then extracted using a DBSCAN-like method
    (cluster_method = 'dbscan') or an automatic
    technique proposed in [1]_ (cluster_method = 'xi').

    This implementation deviates from the original OPTICS by first performing
    k-nearest-neighborhood searches on all points to identify core sizes, then
    computing only the distances to unprocessed points when constructing the
    cluster order. Note that we do not employ a heap to manage the expansion
    candidates, so the time complexity will be O(n^2).

    Read more in the :ref:`User Guide <optics>`.

    Parameters
    ----------
    min_samples : int > 1 or float between 0 and 1 (default=5)
        The number of samples in a neighborhood for a point to be considered as
        a core point. Also, up and down steep regions can't have more than
        ``min_samples`` consecutive non-steep points. Expressed as an absolute
        number or a fraction of the number of samples (rounded to be at least
        2).

    max_eps : float, optional (default=np.inf)
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. Default value of ``np.inf`` will
        identify clusters across all scales; reducing ``max_eps`` will result
        in shorter run times.

    metric : str or callable, optional (default='minkowski')
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. If metric is
        "precomputed", X is assumed to be a distance matrix and must be square.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    p : int, optional (default=2)
        Parameter for the Minkowski metric from
        :class:`sklearn.metrics.pairwise_distances`. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict, optional (default=None)
        Additional keyword arguments for the metric function.

    cluster_method : str, optional (default='xi')
        The extraction method used to extract clusters using the calculated
        reachability and ordering. Possible values are "xi" and "dbscan".

    eps : float, optional (default=None)
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. By default it assumes the same value
        as ``max_eps``.
        Used only when ``cluster_method='dbscan'``.

    xi : float, between 0 and 1, optional (default=0.05)
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.
        Used only when ``cluster_method='xi'``.

    predecessor_correction : bool, optional (default=True)
        Correct clusters according to the predecessors calculated by OPTICS
        [2]_. This parameter has minimal effect on most datasets.
        Used only when ``cluster_method='xi'``.

    min_cluster_size : int > 1 or float between 0 and 1 (default=None)
        Minimum number of samples in an OPTICS cluster, expressed as an
        absolute number or a fraction of the number of samples (rounded to be
        at least 2). If ``None``, the value of ``min_samples`` is used instead.
        Used only when ``cluster_method='xi'``.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method. (default)

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default=30)
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    n_jobs : int or None, optional (default=None)
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    labels_ : array, shape (n_samples,)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples and points which are not included in a leaf cluster
        of ``cluster_hierarchy_`` are labeled as -1.

    reachability_ : array, shape (n_samples,)
        Reachability distances per sample, indexed by object order. Use
        ``clust.reachability_[clust.ordering_]`` to access in cluster order.

    ordering_ : array, shape (n_samples,)
        The cluster ordered list of sample indices.

    core_distances_ : array, shape (n_samples,)
        Distance at which each sample becomes a core point, indexed by object
        order. Points which will never be core have a distance of inf. Use
        ``clust.core_distances_[clust.ordering_]`` to access in cluster order.

    predecessor_ : array, shape (n_samples,)
        Point that a sample was reached from, indexed by object order.
        Seed points have a predecessor of -1.

    cluster_hierarchy_ : array, shape (n_clusters, 2)
        The list of clusters in the form of ``[start, end]`` in each row, with
        all indices inclusive. The clusters are ordered according to
        ``(end, -start)`` (ascending) so that larger clusters encompassing
        smaller clusters come after those smaller ones. Since ``labels_`` does
        not reflect the hierarchy, usually
        ``len(cluster_hierarchy_) > len(np.unique(optics.labels_))``. Please
        also note that these indices are of the ``ordering_``, i.e.
        ``X[ordering_][start:end + 1]`` form a cluster.
        Only available when ``cluster_method='xi'``.

    See Also
    --------
    DBSCAN
        A similar clustering for a specified neighborhood radius (eps).
        Our implementation is optimized for runtime.

    References
    ----------
    .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
       and Jörg Sander. "OPTICS: ordering points to identify the clustering
       structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.

    .. [2] Schubert, Erich, Michael Gertz.
       "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
       the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.

    Examples
    --------
    >>> from sklearn.cluster import OPTICS
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 5], [3, 6],
    ...               [8, 7], [8, 8], [7, 3]])
    >>> clustering = OPTICS(min_samples=2).fit(X)
    >>> clustering.labels_
    array([0, 0, 0, 1, 1, 1])
    """
    @_deprecate_positional_args
    def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski',
                 p=2, metric_params=None, cluster_method='xi', eps=None,
                 xi=0.05, predecessor_correction=True, min_cluster_size=None,
                 algorithm='auto', leaf_size=30, n_jobs=None):
        # Hyper-parameters are stored untouched; all validation is deferred
        # to ``fit`` and the helper functions it calls.
        self.max_eps = max_eps
        self.min_samples = min_samples
        self.min_cluster_size = min_cluster_size
        self.algorithm = algorithm
        self.metric = metric
        self.metric_params = metric_params
        self.p = p
        self.leaf_size = leaf_size
        self.cluster_method = cluster_method
        self.eps = eps
        self.xi = xi
        self.predecessor_correction = predecessor_correction
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Perform OPTICS clustering.

        Extracts an ordered list of points and reachability distances, and
        performs initial clustering using ``max_eps`` distance specified at
        OPTICS object instantiation.

        Parameters
        ----------
        X : array, shape (n_samples, n_features), or (n_samples, n_samples)  \
if metric='precomputed'
            A feature array, or array of distances between samples if
            metric='precomputed'.

        y : ignored
            Ignored.

        Returns
        -------
        self : instance of OPTICS
            The instance.

        Raises
        ------
        ValueError
            If ``cluster_method`` is neither 'dbscan' nor 'xi', or if the
            DBSCAN extraction ``eps`` is larger than ``max_eps``.
        """
        # Use the builtin ``float`` (float64): the ``np.float`` alias was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        X = self._validate_data(X, dtype=float)

        if self.cluster_method not in ['dbscan', 'xi']:
            raise ValueError("cluster_method should be one of"
                             " 'dbscan' or 'xi' but is %s" %
                             self.cluster_method)

        (self.ordering_, self.core_distances_, self.reachability_,
         self.predecessor_) = compute_optics_graph(
             X=X, min_samples=self.min_samples, algorithm=self.algorithm,
             leaf_size=self.leaf_size, metric=self.metric,
             metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
             max_eps=self.max_eps)

        # Extract clusters from the calculated orders and reachability
        if self.cluster_method == 'xi':
            labels_, clusters_ = cluster_optics_xi(
                reachability=self.reachability_,
                predecessor=self.predecessor_,
                ordering=self.ordering_,
                min_samples=self.min_samples,
                min_cluster_size=self.min_cluster_size,
                xi=self.xi,
                predecessor_correction=self.predecessor_correction)
            self.cluster_hierarchy_ = clusters_
        elif self.cluster_method == 'dbscan':
            # ``eps`` falls back to ``max_eps`` when it was not given.
            eps = self.max_eps if self.eps is None else self.eps

            # Reachability beyond ``max_eps`` was never computed, so a
            # larger extraction radius cannot be honoured.
            if eps > self.max_eps:
                raise ValueError('Specify an epsilon smaller than %s. Got %s.'
                                 % (self.max_eps, eps))

            labels_ = cluster_optics_dbscan(
                reachability=self.reachability_,
                core_distances=self.core_distances_,
                ordering=self.ordering_, eps=eps)

        self.labels_ = labels_
        return self
|
||
|
||
|
||
def _validate_size(size, n_samples, param_name):
|
||
if size <= 0 or (size !=
|
||
int(size)
|
||
and size > 1):
|
||
raise ValueError('%s must be a positive integer '
|
||
'or a float between 0 and 1. Got %r' %
|
||
(param_name, size))
|
||
elif size > n_samples:
|
||
raise ValueError('%s must be no greater than the'
|
||
' number of samples (%d). Got %d' %
|
||
(param_name, n_samples, size))
|
||
|
||
|
||
# OPTICS helper functions
|
||
def _compute_core_distances_(X, neighbors, min_samples, working_memory):
    """Compute the distance to the ``min_samples``-th neighbor of each sample.

    Equivalent to ``neighbors.kneighbors(X, min_samples)[0][:, -1]`` but the
    query is issued in row batches so that only a chunk of the neighbor
    results is held in memory at a time.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data.

    neighbors : NearestNeighbors instance
        The fitted nearest neighbors estimator.

    min_samples : int
        Number of neighbors requested from ``neighbors.kneighbors``; the
        distance to the last returned neighbor is kept.

    working_memory : int or None
        Forwarded to ``get_chunk_n_rows`` as the sought maximum memory for
        temporary chunks. The caller in this file passes ``None``.

    Returns
    -------
    core_distances : array, shape (n_samples,)
        Distance to the ``min_samples``-th neighbor of every sample.
    """
    n_samples = X.shape[0]
    # Pre-filled with NaN so that any row left unwritten stays recognisable.
    core_distances = np.full(n_samples, np.nan)

    # NOTE(review): 16 bytes per neighbor presumably accounts for one
    # float64 distance plus one int64 index - confirm.
    rows_per_chunk = get_chunk_n_rows(row_bytes=16 * min_samples,
                                      max_n_rows=n_samples,
                                      working_memory=working_memory)

    for batch in gen_batches(n_samples, rows_per_chunk):
        knn_result = neighbors.kneighbors(X[batch], min_samples)
        # First element holds the distances; the last column is the
        # farthest of the requested neighbors.
        core_distances[batch] = knn_result[0][:, -1]
    return core_distances
|
||
|
||
|
||
@_deprecate_positional_args
def compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params,
                         algorithm, leaf_size, n_jobs):
    """Computes the OPTICS reachability graph.

    Read more in the :ref:`User Guide <optics>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features), or (n_samples, n_samples)  \
if metric='precomputed'.
        A feature array, or array of distances between samples if
        metric='precomputed'

    min_samples : int > 1 or float between 0 and 1
        The number of samples in a neighborhood for a point to be considered
        as a core point. Expressed as an absolute number or a fraction of the
        number of samples (rounded to be at least 2).

    max_eps : float
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. A value of ``np.inf`` will
        identify clusters across all scales; reducing ``max_eps`` will result
        in shorter run times.

    metric : string or callable
        Metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string. If metric is
        "precomputed", X is assumed to be a distance matrix and must be square.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
          'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
          'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    p : integer
        Parameter for the Minkowski metric from
        :class:`sklearn.metrics.pairwise_distances`. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric_params : dict or None
        Additional keyword arguments for the metric function.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int
        Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
        affect the speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    n_jobs : int or None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    ordering_ : array, shape (n_samples,)
        The cluster ordered list of sample indices.

    core_distances_ : array, shape (n_samples,)
        Distance at which each sample becomes a core point, indexed by object
        order. Points which will never be core have a distance of inf. Use
        ``clust.core_distances_[clust.ordering_]`` to access in cluster order.

    reachability_ : array, shape (n_samples,)
        Reachability distances per sample, indexed by object order. Use
        ``clust.reachability_[clust.ordering_]`` to access in cluster order.

    predecessor_ : array, shape (n_samples,)
        Point that a sample was reached from, indexed by object order.
        Seed points have a predecessor of -1.

    References
    ----------
    .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,
       and Jörg Sander. "OPTICS: ordering points to identify the clustering
       structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
    """
    n_samples = X.shape[0]
    _validate_size(min_samples, n_samples, 'min_samples')
    if min_samples <= 1:
        # Values up to 1 are a fraction of the data set, floored at 2.
        min_samples = max(2, int(min_samples * n_samples))

    # Every point starts 'unprocessed': unreachable and without predecessor.
    reachability_ = np.full(n_samples, np.inf)
    predecessor_ = np.full(n_samples, -1, dtype=int)

    nbrs = NearestNeighbors(n_neighbors=min_samples,
                            algorithm=algorithm,
                            leaf_size=leaf_size,
                            metric=metric,
                            metric_params=metric_params,
                            p=p,
                            n_jobs=n_jobs)
    nbrs.fit(X)

    # A kNN query per point comes first; this differs from the original
    # OPTICS, which only used epsilon range queries.
    # TODO: handle working_memory somehow?
    core_distances_ = _compute_core_distances_(X=X, neighbors=nbrs,
                                               min_samples=min_samples,
                                               working_memory=None)
    # OPTICS caps core distances at max_eps; inf marks 'undefined'.
    core_distances_[core_distances_ > max_eps] = np.inf

    # Main OPTICS loop. Not parallelizable. The order in which entries are
    # written to 'ordering' is important!
    # Note that this implementation is O(n^2) theoretically, but
    # supposedly with very low constant factors.
    processed = np.zeros(n_samples, dtype=bool)
    ordering = np.zeros(n_samples, dtype=int)
    for position in range(n_samples):
        # Next point: smallest reachability among the unprocessed ones.
        # argmin returns the first minimum, so ties (possibly at np.inf)
        # resolve towards the smaller sample id.
        candidates = np.flatnonzero(~processed)
        point = candidates[np.argmin(reachability_[candidates])]

        processed[point] = True
        ordering[position] = point

        if core_distances_[point] == np.inf:
            # Not a core point: it cannot update any reachability.
            continue
        _set_reach_dist(core_distances_=core_distances_,
                        reachability_=reachability_,
                        predecessor_=predecessor_,
                        point_index=point,
                        processed=processed, X=X, nbrs=nbrs,
                        metric=metric, metric_params=metric_params,
                        p=p, max_eps=max_eps)

    if np.isinf(reachability_).all():
        warnings.warn("All reachability values are inf. Set a larger"
                      " max_eps or all data will be considered outliers.",
                      UserWarning)
    return ordering, core_distances_, reachability_, predecessor_
|
||
|
||
|
||
def _set_reach_dist(core_distances_, reachability_, predecessor_,
|
||
point_index, processed, X, nbrs, metric, metric_params,
|
||
p, max_eps):
|
||
P = X[point_index:point_index + 1]
|
||
# Assume that radius_neighbors is faster without distances
|
||
# and we don't need all distances, nevertheless, this means
|
||
# we may be doing some work twice.
|
||
indices = nbrs.radius_neighbors(P, radius=max_eps,
|
||
return_distance=False)[0]
|
||
|
||
# Getting indices of neighbors that have not been processed
|
||
unproc = np.compress(~np.take(processed, indices), indices)
|
||
# Neighbors of current point are already processed.
|
||
if not unproc.size:
|
||
return
|
||
|
||
# Only compute distances to unprocessed neighbors:
|
||
if metric == 'precomputed':
|
||
dists = X[point_index, unproc]
|
||
else:
|
||
_params = dict() if metric_params is None else metric_params.copy()
|
||
if metric == 'minkowski' and 'p' not in _params:
|
||
# the same logic as neighbors, p is ignored if explicitly set
|
||
# in the dict params
|
||
_params['p'] = p
|
||
dists = pairwise_distances(P, np.take(X, unproc, axis=0),
|
||
metric=metric, n_jobs=None,
|
||
**_params).ravel()
|
||
|
||
rdists = np.maximum(dists, core_distances_[point_index])
|
||
improved = np.where(rdists < np.take(reachability_, unproc))
|
||
reachability_[unproc[improved]] = rdists[improved]
|
||
predecessor_[unproc[improved]] = point_index
|
||
|
||
|
||
@_deprecate_positional_args
def cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):
    """Performs DBSCAN extraction for an arbitrary epsilon.

    Extracting the clusters runs in linear time. Note that this results in
    ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with
    similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.

    Parameters
    ----------
    reachability : array, shape (n_samples,)
        Reachability distances calculated by OPTICS (``reachability_``)

    core_distances : array, shape (n_samples,)
        Distances at which points become core (``core_distances_``)

    ordering : array, shape (n_samples,)
        OPTICS ordered point indices (``ordering_``)

    eps : float
        DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results
        will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close
        to one another.

    Returns
    -------
    labels_ : array, shape (n_samples,)
        The estimated labels.

    """
    not_reachable = reachability > eps
    is_core = core_distances <= eps

    # A core point that is not reachable within eps opens a new cluster.
    # Counting those openings along the ordering numbers the clusters.
    opens_cluster = not_reachable & is_core
    labels = np.zeros(len(core_distances), dtype=int)
    labels[ordering] = np.cumsum(opens_cluster[ordering]) - 1

    # Neither reachable nor core within eps: noise.
    labels[not_reachable & ~is_core] = -1
    return labels
|
||
|
||
|
||
def cluster_optics_xi(*, reachability, predecessor, ordering, min_samples,
                      min_cluster_size=None, xi=0.05,
                      predecessor_correction=True):
    """Automatically extract clusters according to the Xi-steep method.

    Parameters
    ----------
    reachability : array, shape (n_samples,)
        Reachability distances calculated by OPTICS (`reachability_`)

    predecessor : array, shape (n_samples,)
        Predecessors calculated by OPTICS.

    ordering : array, shape (n_samples,)
        OPTICS ordered point indices (`ordering_`)

    min_samples : int > 1 or float between 0 and 1
        The same as the min_samples given to OPTICS. Up and down steep regions
        can't have more than ``min_samples`` consecutive non-steep points.
        Expressed as an absolute number or a fraction of the number of samples
        (rounded to be at least 2).

    min_cluster_size : int > 1 or float between 0 and 1 (default=None)
        Minimum number of samples in an OPTICS cluster, expressed as an
        absolute number or a fraction of the number of samples (rounded to be
        at least 2). If ``None``, the value of ``min_samples`` is used instead.

    xi : float, between 0 and 1, optional (default=0.05)
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.

    predecessor_correction : bool, optional (default=True)
        Correct clusters based on the calculated predecessors.

    Returns
    -------
    labels : array, shape (n_samples)
        The labels assigned to samples. Points which are not included
        in any cluster are labeled as -1.

    clusters : array, shape (n_clusters, 2)
        The list of clusters in the form of ``[start, end]`` in each row, with
        all indices inclusive. The clusters are ordered according to ``(end,
        -start)`` (ascending) so that larger clusters encompassing smaller
        clusters come after such nested smaller clusters. Since ``labels`` does
        not reflect the hierarchy, usually ``len(clusters) >
        len(np.unique(labels))``.
    """
    n_samples = len(reachability)

    def _as_count(value, name):
        # Validate, then turn a fraction (<= 1) into an absolute count >= 2.
        _validate_size(value, n_samples, name)
        if value <= 1:
            return max(2, int(value * n_samples))
        return value

    min_samples = _as_count(min_samples, 'min_samples')
    if min_cluster_size is None:
        # Falls back to the already resolved min_samples.
        min_cluster_size = min_samples
    min_cluster_size = _as_count(min_cluster_size, 'min_cluster_size')

    # The extraction works on the arrays re-arranged in cluster order.
    reachability_plot = reachability[ordering]
    predecessor_plot = predecessor[ordering]
    clusters = _xi_cluster(reachability_plot, predecessor_plot, ordering, xi,
                           min_samples, min_cluster_size,
                           predecessor_correction)
    labels = _extract_xi_labels(ordering, clusters)
    return labels, clusters
|
||
|
||
|
||
def _extend_region(steep_point, xward_point, start, min_samples):
|
||
"""Extend the area until it's maximal.
|
||
|
||
It's the same function for both upward and downward reagions, depending on
|
||
the given input parameters. Assuming:
|
||
|
||
- steep_{upward/downward}: bool array indicating whether a point is a
|
||
steep {upward/downward};
|
||
- upward/downward: bool array indicating whether a point is
|
||
upward/downward;
|
||
|
||
To extend an upward reagion, ``steep_point=steep_upward`` and
|
||
``xward_point=downward`` are expected, and to extend a downward region,
|
||
``steep_point=steep_downward`` and ``xward_point=upward``.
|
||
|
||
Parameters
|
||
----------
|
||
steep_point : bool array, shape (n_samples)
|
||
True if the point is steep downward (upward).
|
||
|
||
xward_point : bool array, shape (n_samples)
|
||
True if the point is an upward (respectively downward) point.
|
||
|
||
start : integer
|
||
The start of the xward region.
|
||
|
||
min_samples : integer
|
||
The same as the min_samples given to OPTICS. Up and down steep
|
||
regions can't have more then ``min_samples`` consecutive non-steep
|
||
points.
|
||
|
||
Returns
|
||
-------
|
||
index : integer
|
||
The current index iterating over all the samples, i.e. where we are up
|
||
to in our search.
|
||
|
||
end : integer
|
||
The end of the region, which can be behind the index. The region
|
||
includes the ``end`` index.
|
||
"""
|
||
n_samples = len(steep_point)
|
||
non_xward_points = 0
|
||
index = start
|
||
end = start
|
||
# find a maximal area
|
||
while index < n_samples:
|
||
if steep_point[index]:
|
||
non_xward_points = 0
|
||
end = index
|
||
elif not xward_point[index]:
|
||
# it's not a steep point, but still goes up.
|
||
non_xward_points += 1
|
||
# region should include no more than min_samples consecutive
|
||
# non steep xward points.
|
||
if non_xward_points > min_samples:
|
||
break
|
||
else:
|
||
return end
|
||
index += 1
|
||
return end
|
||
|
||
|
||
def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot):
|
||
"""Update steep down areas (SDAs) using the new maximum in between (mib)
|
||
value, and the given complement of xi, i.e. ``1 - xi``.
|
||
"""
|
||
if np.isinf(mib):
|
||
return []
|
||
res = [sda for sda in sdas
|
||
if mib <= reachability_plot[sda['start']] * xi_complement]
|
||
for sda in res:
|
||
sda['mib'] = max(sda['mib'], mib)
|
||
return res
|
||
|
||
|
||
def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e):
|
||
"""Correct for predecessors.
|
||
|
||
Applies Algorithm 2 of [1]_.
|
||
|
||
Input parameters are ordered by the computer OPTICS ordering.
|
||
|
||
.. [1] Schubert, Erich, Michael Gertz.
|
||
"Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of
|
||
the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329.
|
||
"""
|
||
while s < e:
|
||
if reachability_plot[s] > reachability_plot[e]:
|
||
return s, e
|
||
p_e = ordering[predecessor_plot[e]]
|
||
for i in range(s, e):
|
||
if p_e == ordering[i]:
|
||
return s, e
|
||
e -= 1
|
||
return None, None
|
||
|
||
|
||
def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
                min_cluster_size, predecessor_correction):
    """Automatically extract clusters according to the Xi-steep method.

    This is roughly an implementation of Figure 19 of the OPTICS paper.

    Parameters
    ----------
    reachability_plot : array, shape (n_samples)
        The reachability plot, i.e. reachability ordered according to
        the calculated ordering, all computed by OPTICS.

    predecessor_plot : array, shape (n_samples)
        Predecessors ordered according to the calculated ordering.

    ordering : array, shape (n_samples)
        OPTICS ordered point indices; only forwarded to the predecessor
        correction step.

    xi : float, between 0 and 1
        Determines the minimum steepness on the reachability plot that
        constitutes a cluster boundary. For example, an upwards point in the
        reachability plot is defined by the ratio from one point to its
        successor being at most 1-xi.

    min_samples : int > 1
        The same as the min_samples given to OPTICS. Up and down steep regions
        can't have more than ``min_samples`` consecutive non-steep points.

    min_cluster_size : int > 1
        Minimum number of samples in an OPTICS cluster.

    predecessor_correction : bool
        Correct clusters based on the calculated predecessors.

    Returns
    -------
    clusters : array, shape (n_clusters, 2)
        The list of clusters in the form of [start, end] in each row, with all
        indices inclusive. The clusters are ordered in a way that larger
        clusters encompassing smaller clusters come after those smaller
        clusters.
    """

    # Our implementation adds an inf to the end of reachability plot
    # this helps to find potential clusters at the end of the
    # reachability plot even if there's no upward region at the end of it.
    reachability_plot = np.hstack((reachability_plot, np.inf))

    xi_complement = 1 - xi
    sdas = []  # steep down areas, introduced in section 4.3.2 of the paper
    clusters = []
    index = 0
    mib = 0.  # maximum in between, section 4.3.2

    # Our implementation corrects a mistake in the original
    # paper, i.e., in Definition 9 steep downward point,
    # r(p) * (1 - xi) <= r(p + 1) should be
    # r(p) * (1 - xi) >= r(p + 1)
    # 'invalid' warnings (e.g. from inf / inf) are silenced for the ratio
    # and the comparisons derived from it.
    with np.errstate(invalid='ignore'):
        ratio = reachability_plot[:-1] / reachability_plot[1:]
        steep_upward = ratio <= xi_complement
        steep_downward = ratio >= 1 / xi_complement
        downward = ratio > 1
        upward = ratio < 1

    # the following loop is almost exactly as Figure 19 of the paper.
    # it jumps over the areas which are not either steep down or up areas
    for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):
        # just continue if steep_index has been a part of a discovered xward
        # area.
        if steep_index < index:
            continue

        # Account for everything between the end of the previous area and
        # this steep point.
        mib = max(mib, np.max(reachability_plot[index:steep_index + 1]))

        # steep downward areas
        if steep_downward[steep_index]:
            sdas = _update_filter_sdas(sdas, mib, xi_complement,
                                       reachability_plot)
            D_start = steep_index
            D_end = _extend_region(steep_downward, upward,
                                   D_start, min_samples)
            D = {'start': D_start, 'end': D_end, 'mib': 0.}
            sdas.append(D)
            # Resume the scan right after the area and restart the running
            # maximum from that position.
            index = D_end + 1
            mib = reachability_plot[index]

        # steep upward areas
        else:
            sdas = _update_filter_sdas(sdas, mib, xi_complement,
                                       reachability_plot)
            U_start = steep_index
            U_end = _extend_region(steep_upward, downward, U_start,
                                   min_samples)
            index = U_end + 1
            mib = reachability_plot[index]

            # Pair this upward area with every remaining steep down area
            # and keep the combinations that form a valid cluster.
            U_clusters = []
            for D in sdas:
                c_start = D['start']
                c_end = U_end

                # line (**), sc2*
                if reachability_plot[c_end + 1] * xi_complement < D['mib']:
                    continue

                # Definition 11: criterion 4
                D_max = reachability_plot[D['start']]
                if D_max * xi_complement >= reachability_plot[c_end + 1]:
                    # Find the first index from the left side which is almost
                    # at the same level as the end of the detected cluster.
                    while (reachability_plot[c_start + 1] >
                           reachability_plot[c_end + 1]
                           and c_start < D['end']):
                        c_start += 1
                elif reachability_plot[c_end + 1] * xi_complement >= D_max:
                    # Find the first index from the right side which is almost
                    # at the same level as the beginning of the detected
                    # cluster.
                    # Our implementation corrects a mistake in the original
                    # paper, i.e., in Definition 11 4c, r(x) < r(sD) should be
                    # r(x) > r(sD).
                    while (reachability_plot[c_end - 1] > D_max
                           and c_end > U_start):
                        c_end -= 1

                # predecessor correction
                if predecessor_correction:
                    c_start, c_end = _correct_predecessor(reachability_plot,
                                                          predecessor_plot,
                                                          ordering,
                                                          c_start,
                                                          c_end)
                # The correction signals a rejected candidate with None.
                if c_start is None:
                    continue

                # Definition 11: criterion 3.a
                if c_end - c_start + 1 < min_cluster_size:
                    continue

                # Definition 11: criterion 1
                if c_start > D['end']:
                    continue

                # Definition 11: criterion 2
                if c_end < U_start:
                    continue

                U_clusters.append((c_start, c_end))

            # add smaller clusters first.
            U_clusters.reverse()
            clusters.extend(U_clusters)

    return np.array(clusters)
|
||
|
||
|
||
def _extract_xi_labels(ordering, clusters):
|
||
"""Extracts the labels from the clusters returned by `_xi_cluster`.
|
||
We rely on the fact that clusters are stored
|
||
with the smaller clusters coming before the larger ones.
|
||
|
||
Parameters
|
||
----------
|
||
ordering : array, shape (n_samples)
|
||
The ordering of points calculated by OPTICS
|
||
|
||
clusters : array, shape (n_clusters, 2)
|
||
List of clusters i.e. (start, end) tuples,
|
||
as returned by `_xi_cluster`.
|
||
|
||
Returns
|
||
-------
|
||
labels : array, shape (n_samples)
|
||
"""
|
||
|
||
labels = np.full(len(ordering), -1, dtype=int)
|
||
label = 0
|
||
for c in clusters:
|
||
if not np.any(labels[c[0]:(c[1] + 1)] != -1):
|
||
labels[c[0]:(c[1] + 1)] = label
|
||
label += 1
|
||
labels[ordering] = labels.copy()
|
||
return labels
|