1773 lines
60 KiB
Python
1773 lines
60 KiB
Python
"""
|
|
Generate samples of synthetic data sets.
|
|
"""
|
|
|
|
# Authors: B. Thirion, G. Varoquaux, A. Gramfort, V. Michel, O. Grisel,
|
|
# G. Louppe, J. Nothman
|
|
# License: BSD 3 clause
|
|
|
|
import numbers
|
|
import array
|
|
from collections.abc import Iterable
|
|
|
|
import numpy as np
|
|
from scipy import linalg
|
|
import scipy.sparse as sp
|
|
|
|
from ..preprocessing import MultiLabelBinarizer
|
|
from ..utils import check_array, check_random_state
|
|
from ..utils import shuffle as util_shuffle
|
|
from ..utils.random import sample_without_replacement
|
|
from ..utils.validation import _deprecate_positional_args
|
|
|
|
|
|
def _generate_hypercube(samples, dimensions, rng):
    """Return binary rows of length ``dimensions`` (hypercube vertices).

    For ``dimensions <= 30`` the rows are the binary expansions of integers
    obtained from ``sample_without_replacement(2 ** dimensions, samples)``
    (presumably distinct integers, as the helper's name suggests). For larger
    ``dimensions`` the 30 right-most columns are produced that way and the
    remaining leading columns are filled with independent random bits.

    Parameters
    ----------
    samples : int
        Number of rows to generate.

    dimensions : int
        Number of binary columns per row.

    rng : RandomState instance
        Random number generator used for every draw.

    Returns
    -------
    out : ndarray of shape (samples, dimensions)
        Array of 0/1 values (``uint8`` when ``dimensions <= 30``).
    """
    if dimensions > 30:
        # Draw the random leading bits first, then recurse for the 30
        # trailing columns; this order fixes how the RNG stream is consumed.
        leading_bits = rng.randint(2, size=(samples, dimensions - 30))
        trailing_bits = _generate_hypercube(samples, 30, rng)
        return np.hstack([leading_bits, trailing_bits])

    vertex_ids = sample_without_replacement(2 ** dimensions, samples,
                                            random_state=rng)
    # Big-endian 32-bit layout so that unpackbits yields the most
    # significant bit first for every integer.
    big_endian_ids = vertex_ids.astype(dtype='>u4', copy=False)
    bit_rows = np.unpackbits(big_endian_ids.view('>u1')).reshape((-1, 32))
    # Only the lowest `dimensions` bits carry information.
    return bit_rows[:, -dimensions:]
|
|
|
|
|
|
@_deprecate_positional_args
def make_classification(n_samples=100, n_features=20, *, n_informative=2,
                        n_redundant=2, n_repeated=0, n_classes=2,
                        n_clusters_per_class=2, weights=None, flip_y=0.01,
                        class_sep=1.0, hypercube=True, shift=0.0, scale=1.0,
                        shuffle=True, random_state=None):
    """Generate a random n-class classification problem.

    This initially creates clusters of points normally distributed (std=1)
    about vertices of an ``n_informative``-dimensional hypercube with sides of
    length ``2*class_sep`` and assigns an equal number of clusters to each
    class. It introduces interdependence between these features and adds
    various types of further noise to the data.

    Without shuffling, ``X`` horizontally stacks features in the following
    order: the primary ``n_informative`` features, followed by ``n_redundant``
    linear combinations of the informative features, followed by ``n_repeated``
    duplicates, drawn randomly with replacement from the informative and
    redundant features. The remaining features are filled with random noise.
    Thus, without shuffling, all useful features are contained in the columns
    ``X[:, :n_informative + n_redundant + n_repeated]``.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=20)
        The total number of features. These comprise ``n_informative``
        informative features, ``n_redundant`` redundant features,
        ``n_repeated`` duplicated features and
        ``n_features-n_informative-n_redundant-n_repeated`` useless features
        drawn at random.

    n_informative : int, optional (default=2)
        The number of informative features. Each class is composed of a number
        of gaussian clusters each located around the vertices of a hypercube
        in a subspace of dimension ``n_informative``. For each cluster,
        informative features are drawn independently from N(0, 1) and then
        randomly linearly combined within each cluster in order to add
        covariance. The clusters are then placed on the vertices of the
        hypercube.

    n_redundant : int, optional (default=2)
        The number of redundant features. These features are generated as
        random linear combinations of the informative features.

    n_repeated : int, optional (default=0)
        The number of duplicated features, drawn randomly from the informative
        and the redundant features.

    n_classes : int, optional (default=2)
        The number of classes (or labels) of the classification problem.

    n_clusters_per_class : int, optional (default=2)
        The number of clusters per class.

    weights : array-like of shape (n_classes,) or (n_classes - 1,), optional
        (default=None)
        The proportions of samples assigned to each class. If None, then
        classes are balanced. Note that if ``len(weights) == n_classes - 1``,
        then the last class weight is automatically inferred.
        The output always has ``n_samples`` rows: if the sum of ``weights``
        exceeds 1, the per-cluster counts overflow the output and the
        trailing clusters are truncated.

    flip_y : float, optional (default=0.01)
        The fraction of samples whose class is assigned randomly. Larger
        values introduce noise in the labels and make the classification
        task harder. Note that the default setting flip_y > 0 might lead
        to less than n_classes in y in some cases.

    class_sep : float, optional (default=1.0)
        The factor multiplying the hypercube size. Larger values spread
        out the clusters/classes and make the classification task easier.

    hypercube : boolean, optional (default=True)
        If True, the clusters are put on the vertices of a hypercube. If
        False, the clusters are put on the vertices of a random polytope.

    shift : float, array of shape [n_features] or None, optional (default=0.0)
        Shift features by the specified value. If None, then features
        are shifted by a random value drawn in [-class_sep, class_sep].

    scale : float, array of shape [n_features] or None, optional (default=1.0)
        Multiply features by the specified value. If None, then features
        are scaled by a random value drawn in [1, 101). Note that scaling
        happens after shifting.

    shuffle : boolean, optional (default=True)
        Shuffle the samples and the features.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for class membership of each sample.

    Raises
    ------
    ValueError
        If the informative, redundant and repeated features outnumber
        ``n_features``, if ``n_classes * n_clusters_per_class`` exceeds
        ``2 ** n_informative``, or if ``weights`` has an incompatible length.

    Notes
    -----
    The algorithm is adapted from Guyon [1] and was designed to generate
    the "Madelon" dataset.

    References
    ----------
    .. [1] I. Guyon, "Design of experiments for the NIPS 2003 variable
           selection benchmark", 2003.

    See also
    --------
    make_blobs: simplified variant
    make_multilabel_classification: unrelated generator for multilabel tasks
    """
    generator = check_random_state(random_state)

    # Count features, clusters and samples
    if n_informative + n_redundant + n_repeated > n_features:
        raise ValueError("Number of informative, redundant and repeated "
                         "features must sum to less than the number of total"
                         " features")
    # Use log2 to avoid overflow errors
    if n_informative < np.log2(n_classes * n_clusters_per_class):
        msg = "n_classes({}) * n_clusters_per_class({}) must be"
        msg += " smaller or equal 2**n_informative({})={}"
        raise ValueError(msg.format(n_classes, n_clusters_per_class,
                                    n_informative, 2**n_informative))

    if weights is not None:
        if len(weights) not in [n_classes, n_classes - 1]:
            raise ValueError("Weights specified but incompatible with number "
                             "of classes.")
        if len(weights) == n_classes - 1:
            # Infer the weight of the last class so that weights sum to one.
            if isinstance(weights, list):
                weights = weights + [1.0 - sum(weights)]
            else:
                weights = np.resize(weights, n_classes)
                weights[-1] = 1.0 - sum(weights[:-1])
    else:
        weights = [1.0 / n_classes] * n_classes

    n_useless = n_features - n_informative - n_redundant - n_repeated
    n_clusters = n_classes * n_clusters_per_class

    # Distribute samples among clusters by weight
    n_samples_per_cluster = [
        int(n_samples * weights[k % n_classes] / n_clusters_per_class)
        for k in range(n_clusters)]

    # Hand out the samples lost to integer truncation, one per cluster.
    for i in range(n_samples - sum(n_samples_per_cluster)):
        n_samples_per_cluster[i % n_clusters] += 1

    # Initialize X and y
    X = np.zeros((n_samples, n_features))
    # The builtin ``int`` is the dtype the removed ``np.int`` alias stood for.
    y = np.zeros(n_samples, dtype=int)

    # Build the polytope whose vertices become cluster centroids
    centroids = _generate_hypercube(n_clusters, n_informative,
                                    generator).astype(float, copy=False)
    centroids *= 2 * class_sep
    centroids -= class_sep
    if not hypercube:
        # Randomly shrink each vertex and each axis: random polytope.
        centroids *= generator.rand(n_clusters, 1)
        centroids *= generator.rand(1, n_informative)

    # Initially draw informative features from the standard normal
    X[:, :n_informative] = generator.randn(n_samples, n_informative)

    # Create each cluster; a variant of make_blobs
    stop = 0
    for k, centroid in enumerate(centroids):
        start, stop = stop, stop + n_samples_per_cluster[k]
        y[start:stop] = k % n_classes  # assign labels
        X_k = X[start:stop, :n_informative]  # slice a view of the cluster

        A = 2 * generator.rand(n_informative, n_informative) - 1
        X_k[...] = np.dot(X_k, A)  # introduce random covariance

        X_k += centroid  # shift the cluster to a vertex

    # Create redundant features
    if n_redundant > 0:
        B = 2 * generator.rand(n_informative, n_redundant) - 1
        X[:, n_informative:n_informative + n_redundant] = \
            np.dot(X[:, :n_informative], B)

    # Repeat some features
    if n_repeated > 0:
        n = n_informative + n_redundant
        # Adding 0.5 before truncation rounds to the nearest column index.
        indices = ((n - 1) * generator.rand(n_repeated) + 0.5).astype(np.intp)
        X[:, n:n + n_repeated] = X[:, indices]

    # Fill useless features
    if n_useless > 0:
        X[:, -n_useless:] = generator.randn(n_samples, n_useless)

    # Randomly replace labels
    if flip_y >= 0.0:
        flip_mask = generator.rand(n_samples) < flip_y
        y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum())

    # Randomly shift and scale
    if shift is None:
        shift = (2 * generator.rand(n_features) - 1) * class_sep
    X += shift

    if scale is None:
        scale = 1 + 100 * generator.rand(n_features)
    X *= scale

    if shuffle:
        # Randomly permute samples
        X, y = util_shuffle(X, y, random_state=generator)

        # Randomly permute features
        indices = np.arange(n_features)
        generator.shuffle(indices)
        X[:, :] = X[:, indices]

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_multilabel_classification(n_samples=100, n_features=20, *,
                                   n_classes=5,
                                   n_labels=2, length=50, allow_unlabeled=True,
                                   sparse=False, return_indicator='dense',
                                   return_distributions=False,
                                   random_state=None):
    """Generate a random multilabel classification problem.

    For each sample, the generative process is:
        - pick the number of labels: n ~ Poisson(n_labels)
        - n times, choose a class c: c ~ Multinomial(theta)
        - pick the document length: k ~ Poisson(length)
        - k times, choose a word: w ~ Multinomial(theta_c)

    In the above process, rejection sampling is used to make sure that
    n is never more than `n_classes` (and never zero when
    ``allow_unlabeled`` is False), and that the document length
    is never zero. Likewise, we reject classes which have already been chosen.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=20)
        The total number of features.

    n_classes : int, optional (default=5)
        The number of classes of the classification problem.

    n_labels : int, optional (default=2)
        The average number of labels per instance. More precisely, the number
        of labels per sample is drawn from a Poisson distribution with
        ``n_labels`` as its expected value, but samples are bounded (using
        rejection sampling) by ``n_classes``, and must be nonzero if
        ``allow_unlabeled`` is False. Must itself be nonzero when
        ``allow_unlabeled`` is False.

    length : int, optional (default=50)
        The sum of the features (number of words if documents) is drawn from
        a Poisson distribution with this expected value.

    allow_unlabeled : bool, optional (default=True)
        If ``True``, some instances might not belong to any class.

    sparse : bool, optional (default=False)
        If ``True``, return a sparse feature matrix

        .. versionadded:: 0.17
           parameter to allow *sparse* output.

    return_indicator : 'dense' (default) | 'sparse' | False
        If ``dense`` return ``Y`` in the dense binary indicator format. If
        ``'sparse'`` return ``Y`` in the sparse binary indicator format.
        ``False`` returns a list of lists of labels.

    return_distributions : bool, optional (default=False)
        If ``True``, return the prior class probability and conditional
        probabilities of features given classes, from which the data was
        drawn.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    Y : array or sparse CSR matrix of shape [n_samples, n_classes]
        The label sets.

    p_c : array, shape [n_classes]
        The probability of each class being drawn. Only returned if
        ``return_distributions=True``.

    p_w_c : array, shape [n_features, n_classes]
        The probability of each feature being drawn given each class.
        Only returned if ``return_distributions=True``.

    Raises
    ------
    ValueError
        If ``n_classes`` or ``length`` is smaller than 1, if ``n_labels`` is
        0 while ``allow_unlabeled`` is False, or if ``return_indicator`` is
        not one of the accepted values.
    """
    if n_classes < 1:
        raise ValueError(
            "'n_classes' should be an integer greater than 0. Got {} instead."
            .format(n_classes)
        )
    if length < 1:
        raise ValueError(
            "'length' should be an integer greater than 0. Got {} instead."
            .format(length)
        )
    if not allow_unlabeled and n_labels == 0:
        # Poisson(0) only ever yields 0, so the rejection sampling below
        # could never produce a non-empty label set and would loop forever.
        raise ValueError(
            "'n_labels' should be greater than 0 when 'allow_unlabeled' is "
            "False. Got {} instead.".format(n_labels)
        )

    generator = check_random_state(random_state)
    # Class prior P(c) and per-class word distributions P(w | c).
    p_c = generator.rand(n_classes)
    p_c /= p_c.sum()
    cumulative_p_c = np.cumsum(p_c)
    p_w_c = generator.rand(n_features, n_classes)
    p_w_c /= np.sum(p_w_c, axis=0)

    def sample_example():
        """Draw one sample: its word indices and its list of labels."""
        _, n_classes = p_w_c.shape

        # pick a nonzero number of labels per document by rejection sampling
        y_size = n_classes + 1
        while (not allow_unlabeled and y_size == 0) or y_size > n_classes:
            y_size = generator.poisson(n_labels)

        # pick n classes
        y = set()
        while len(y) != y_size:
            # pick a class with probability P(c)
            c = np.searchsorted(cumulative_p_c,
                                generator.rand(y_size - len(y)))
            y.update(c)
        y = list(y)

        # pick a non-zero document length by rejection sampling
        n_words = 0
        while n_words == 0:
            n_words = generator.poisson(length)

        # generate a document of length n_words
        if len(y) == 0:
            # if sample does not belong to any class, generate noise word
            words = generator.randint(n_features, size=n_words)
            return words, y

        # sample words with replacement from selected classes
        cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum()
        cumulative_p_w_sample /= cumulative_p_w_sample[-1]
        words = np.searchsorted(cumulative_p_w_sample, generator.rand(n_words))
        return words, y

    # Accumulate the CSR structure of X incrementally, one row per sample.
    X_indices = array.array('i')
    X_indptr = array.array('i', [0])
    Y = []
    for i in range(n_samples):
        words, y = sample_example()
        X_indices.extend(words)
        X_indptr.append(len(X_indices))
        Y.append(y)
    X_data = np.ones(len(X_indices), dtype=np.float64)
    X = sp.csr_matrix((X_data, X_indices, X_indptr),
                      shape=(n_samples, n_features))
    # Repeated words in a document are merged into counts.
    X.sum_duplicates()
    if not sparse:
        X = X.toarray()

    # return_indicator can be True due to backward compatibility
    if return_indicator in (True, 'sparse', 'dense'):
        lb = MultiLabelBinarizer(sparse_output=(return_indicator == 'sparse'))
        Y = lb.fit([range(n_classes)]).transform(Y)
    elif return_indicator is not False:
        raise ValueError("return_indicator must be either 'sparse', 'dense' "
                         'or False.')
    if return_distributions:
        return X, Y, p_c, p_w_c
    return X, Y
|
|
|
|
|
|
@_deprecate_positional_args
def make_hastie_10_2(n_samples=12000, *, random_state=None):
    """Generate the binary classification data of Hastie et al. 2009,
    Example 10.2.

    The ten features are standard independent Gaussian and
    the target ``y`` is defined by::

      y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=12000)
        The number of samples.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, 10]
        The input samples.

    y : array of shape [n_samples]
        The output values, ``1.0`` or ``-1.0`` stored as float64.

    References
    ----------
    .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
           Learning Ed. 2", Springer, 2009.

    See also
    --------
    make_gaussian_quantiles: a generalization of this dataset approach
    """
    rng = check_random_state(random_state)

    X = rng.normal(size=(n_samples, 10))
    # A sample is positive when its squared norm exceeds the 9.34 threshold.
    squared_norm = (X ** 2.0).sum(axis=1)
    y = np.where(squared_norm > 9.34, 1.0, -1.0)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_regression(n_samples=100, n_features=100, *, n_informative=10,
                    n_targets=1, bias=0.0, effective_rank=None,
                    tail_strength=0.5, noise=0.0, shuffle=True, coef=False,
                    random_state=None):
    """Generate a random regression problem.

    The input set can either be well conditioned (by default) or have a low
    rank-fat tail singular profile. See :func:`make_low_rank_matrix` for
    more details.

    The output is generated by applying a (potentially biased) random linear
    regression model with `n_informative` nonzero regressors to the previously
    generated input and some gaussian centered noise with some adjustable
    scale.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=100)
        The number of features.

    n_informative : int, optional (default=10)
        The number of informative features, i.e., the number of features used
        to build the linear model used to generate the output. Values larger
        than ``n_features`` are capped at ``n_features``.

    n_targets : int, optional (default=1)
        The number of regression targets, i.e., the dimension of the y output
        vector associated with a sample. By default, the output is a scalar.

    bias : float, optional (default=0.0)
        The bias term in the underlying linear model.

    effective_rank : int or None, optional (default=None)
        if not None:
            The approximate number of singular vectors required to explain most
            of the input data by linear combinations. Using this kind of
            singular spectrum in the input allows the generator to reproduce
            the correlations often observed in practice.
        if None:
            The input set is well conditioned, centered and gaussian with
            unit variance.

    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile if `effective_rank` is not None.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    shuffle : boolean, optional (default=True)
        Shuffle the samples and the features.

    coef : boolean, optional (default=False)
        If True, the coefficients of the underlying linear model are returned.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The input samples.

    y : array of shape [n_samples] or [n_samples, n_targets]
        The output values.

    coef : array of shape [n_features] or [n_features, n_targets], optional
        The coefficient of the underlying linear model. It is returned only if
        coef is True.
    """
    n_informative = min(n_features, n_informative)
    generator = check_random_state(random_state)

    if effective_rank is not None:
        # Low rank input with a fat tail of singular values
        X = make_low_rank_matrix(n_samples=n_samples,
                                 n_features=n_features,
                                 effective_rank=effective_rank,
                                 tail_strength=tail_strength,
                                 random_state=generator)
    else:
        # Well conditioned input: independent standard normal entries
        X = generator.randn(n_samples, n_features)

    # Ground truth linear model: only the first n_informative rows are
    # nonzero, so the remaining features carry no information about y and
    # should be discarded by sparsifying regularizers (L1, elastic net).
    informative_coef = 100 * generator.rand(n_informative, n_targets)
    ground_truth = np.zeros((n_features, n_targets))
    ground_truth[:n_informative, :] = informative_coef

    y = np.dot(X, ground_truth) + bias

    # Gaussian noise on the targets, only when requested
    if noise > 0.0:
        y += generator.normal(scale=noise, size=y.shape)

    if shuffle:
        # Permute the rows first, then the columns; the coefficients follow
        # the column permutation so they stay aligned with X.
        X, y = util_shuffle(X, y, random_state=generator)

        feature_order = np.arange(n_features)
        generator.shuffle(feature_order)
        X[:, :] = X[:, feature_order]
        ground_truth = ground_truth[feature_order]

    y = np.squeeze(y)

    if not coef:
        return X, y

    return X, y, np.squeeze(ground_truth)
|
|
|
|
|
|
@_deprecate_positional_args
def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None,
                 factor=.8):
    """Make a large circle containing a smaller circle in 2d.

    A simple toy dataset to visualize clustering and classification
    algorithms.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int or two-element tuple, optional (default=100)
        If int, it is the total number of points generated.
        For odd numbers, the inner circle will have one point more than the
        outer circle.
        If two-element tuple, number of points in outer circle and inner
        circle.

    shuffle : bool, optional (default=True)
        Whether to shuffle the samples.

    noise : double or None (default=None)
        Standard deviation of Gaussian noise added to the data.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset shuffling and noise.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    factor : 0 < double < 1 (default=.8)
        Scale factor between inner and outer circle.

    Returns
    -------
    X : array of shape [n_samples, 2]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels (0 or 1) for class membership of each sample.
    """
    if factor < 0 or factor >= 1:
        raise ValueError("'factor' has to be between 0 and 1.")

    if not isinstance(n_samples, numbers.Integral):
        try:
            n_samples_out, n_samples_in = n_samples
        except ValueError:
            raise ValueError('`n_samples` can be either an int or '
                             'a two-element tuple.')
    else:
        # The inner circle receives the extra point when n_samples is odd.
        n_samples_out = n_samples // 2
        n_samples_in = n_samples - n_samples_out

    generator = check_random_state(random_state)

    # endpoint=False keeps the first and last point of a circle distinct
    angles_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False)
    angles_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False)

    # Outer circle has unit radius; the inner one is shrunk by `factor`.
    x_coords = np.append(np.cos(angles_out), np.cos(angles_in) * factor)
    y_coords = np.append(np.sin(angles_out), np.sin(angles_in) * factor)
    X = np.vstack([x_coords, y_coords]).T

    labels_out = np.zeros(n_samples_out, dtype=np.intp)
    labels_in = np.ones(n_samples_in, dtype=np.intp)
    y = np.hstack([labels_out, labels_in])

    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

    if noise is not None:
        X += generator.normal(scale=noise, size=X.shape)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None):
    """Make two interleaving half circles.

    A simple toy dataset to visualize clustering and classification
    algorithms. Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int or two-element tuple, optional (default=100)
        If int, the total number of points generated.
        If two-element tuple, number of points in each of two moons.

    shuffle : bool, optional (default=True)
        Whether to shuffle the samples.

    noise : double or None (default=None)
        Standard deviation of Gaussian noise added to the data.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset shuffling and noise.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, 2]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels (0 or 1) for class membership of each sample.
    """
    if not isinstance(n_samples, numbers.Integral):
        try:
            n_samples_out, n_samples_in = n_samples
        except ValueError:
            raise ValueError('`n_samples` can be either an int or '
                             'a two-element tuple.')
    else:
        # The second moon receives the extra point when n_samples is odd.
        n_samples_out = n_samples // 2
        n_samples_in = n_samples - n_samples_out

    generator = check_random_state(random_state)

    # Both moons sweep the half turn [0, pi], endpoints included.
    angles_out = np.linspace(0, np.pi, n_samples_out)
    angles_in = np.linspace(0, np.pi, n_samples_in)

    # First moon: upper unit half circle centred on the origin.
    outer_circ_x = np.cos(angles_out)
    outer_circ_y = np.sin(angles_out)
    # Second moon: mirrored half circle, shifted right by 1 and down by 0.5.
    inner_circ_x = 1 - np.cos(angles_in)
    inner_circ_y = 1 - np.sin(angles_in) - .5

    x_coords = np.append(outer_circ_x, inner_circ_x)
    y_coords = np.append(outer_circ_y, inner_circ_y)
    X = np.vstack([x_coords, y_coords]).T

    labels_out = np.zeros(n_samples_out, dtype=np.intp)
    labels_in = np.ones(n_samples_in, dtype=np.intp)
    y = np.hstack([labels_out, labels_in])

    if shuffle:
        X, y = util_shuffle(X, y, random_state=generator)

    if noise is not None:
        X += generator.normal(scale=noise, size=X.shape)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0,
               center_box=(-10.0, 10.0), shuffle=True, random_state=None,
               return_centers=False):
    """Generate isotropic Gaussian blobs for clustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int or array-like, optional (default=100)
        If int, it is the total number of points equally divided among
        clusters.
        If array-like, each element of the sequence indicates
        the number of samples per cluster.

        .. versionchanged:: v0.20
            one can now pass an array-like to the ``n_samples`` parameter

    n_features : int, optional (default=2)
        The number of features for each sample. Ignored when ``centers`` is
        given as an array: the number of columns of ``centers`` is used.

    centers : int or array of shape [n_centers, n_features], optional
        (default=None)
        The number of centers to generate, or the fixed center locations.
        If n_samples is an int and centers is None, 3 centers are generated.
        If n_samples is array-like, centers must be
        either None or an array of length equal to the length of n_samples.

    cluster_std : float or sequence of floats, optional (default=1.0)
        The standard deviation of the clusters.

    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
        The bounding box for each cluster center when centers are
        generated at random.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    return_centers : bool, optional (default=False)
        If True, then return the centers of each cluster

        .. versionadded:: 0.23

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for cluster membership of each sample.

    centers : array, shape [n_centers, n_features]
        The centers of each cluster. Only returned if
        ``return_centers=True``.

    Raises
    ------
    ValueError
        If ``n_samples`` is array-like and ``centers`` is neither None nor an
        array-like of the same length, or if ``cluster_std`` is a sequence
        whose length differs from the number of centers.

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
    ...                   random_state=0)
    >>> print(X.shape)
    (10, 2)
    >>> y
    array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
    >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,
    ...                   random_state=0)
    >>> print(X.shape)
    (10, 2)
    >>> y
    array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])

    See also
    --------
    make_classification: a more intricate variant
    """
    generator = check_random_state(random_state)

    if isinstance(n_samples, numbers.Integral):
        # Set n_centers by looking at centers arg
        if centers is None:
            centers = 3

        if isinstance(centers, numbers.Integral):
            n_centers = centers
            centers = generator.uniform(center_box[0], center_box[1],
                                        size=(n_centers, n_features))

        else:
            centers = check_array(centers)
            n_features = centers.shape[1]
            n_centers = centers.shape[0]

    else:
        # Set n_centers by looking at [n_samples] arg
        n_centers = len(n_samples)
        if centers is None:
            centers = generator.uniform(center_box[0], center_box[1],
                                        size=(n_centers, n_features))
        try:
            n_given_centers = len(centers)
        except TypeError as exc:
            raise ValueError("Parameter `centers` must be array-like. "
                             "Got {!r} instead".format(centers)) from exc
        # Explicit comparison instead of ``assert`` so that the validation
        # still runs when Python is started with -O.
        if n_given_centers != n_centers:
            raise ValueError("Length of `n_samples` not consistent"
                             " with number of centers. Got n_samples = {} "
                             "and centers = {}".format(n_samples, centers))
        centers = check_array(centers)
        n_features = centers.shape[1]

    # stds: if cluster_std is given as list, it must be consistent
    # with the n_centers
    if (hasattr(cluster_std, "__len__") and len(cluster_std) != n_centers):
        raise ValueError("Length of `clusters_std` not consistent with "
                         "number of centers. Got centers = {} "
                         "and cluster_std = {}".format(centers, cluster_std))

    if isinstance(cluster_std, numbers.Real):
        # Broadcast a scalar standard deviation to one value per center.
        cluster_std = np.full(len(centers), cluster_std)

    X = []
    y = []

    if isinstance(n_samples, Iterable):
        n_samples_per_center = n_samples
    else:
        n_samples_per_center = [int(n_samples // n_centers)] * n_centers

        # Give the remainder of the integer division to the first centers.
        for i in range(n_samples % n_centers):
            n_samples_per_center[i] += 1

    for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
        X.append(generator.normal(loc=centers[i], scale=std,
                                  size=(n, n_features)))
        y += [i] * n

    X = np.concatenate(X)
    y = np.array(y)

    if shuffle:
        total_n_samples = np.sum(n_samples)
        indices = np.arange(total_n_samples)
        generator.shuffle(indices)
        X = X[indices]
        y = y[indices]

    if return_centers:
        return X, y, centers
    else:
        return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_friedman1(n_samples=100, n_features=10, *, noise=0.0,
                   random_state=None):
    """Generate the "Friedman #1" regression problem

    This dataset is described in Friedman [1] and Breiman [2].

    Every column of `X` is drawn independently and uniformly from [0, 1].
    The target is then computed as::

        y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2
               + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).

    Only the first 5 of the `n_features` columns enter the formula; the
    other columns are independent of `y`. Consequently `n_features` must be
    >= 5.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=10)
        The number of features. Should be at least 5.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset noise. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    Raises
    ------
    ValueError
        If `n_features` is smaller than 5.

    References
    ----------
    .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
           of Statistics 19 (1), pages 1-67, 1991.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
           pages 123-140, 1996.
    """
    if n_features < 5:
        raise ValueError("n_features must be at least five.")

    rng = check_random_state(random_state)

    # Draw the design matrix first, then the noise, so the random stream is
    # consumed in a fixed order.
    X = rng.rand(n_samples, n_features)
    signal = (10 * np.sin(np.pi * X[:, 0] * X[:, 1])
              + 20 * (X[:, 2] - 0.5) ** 2
              + 10 * X[:, 3]
              + 5 * X[:, 4])
    y = signal + noise * rng.randn(n_samples)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_friedman2(n_samples=100, *, noise=0.0, random_state=None):
    """Generate the "Friedman #2" regression problem

    This dataset is described in Friedman [1] and Breiman [2].

    `X` has 4 independent columns, each uniformly distributed on its own
    interval::

        0 <= X[:, 0] <= 100,
        40 * pi <= X[:, 1] <= 560 * pi,
        0 <= X[:, 2] <= 1,
        1 <= X[:, 3] <= 11.

    The target is computed as::

        y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2]
               - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset noise. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, 4]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
           of Statistics 19 (1), pages 1-67, 1991.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
           pages 123-140, 1996.
    """
    rng = check_random_state(random_state)

    # Map the unit-interval draws onto the per-column intervals in one
    # broadcast step: column k becomes u * width[k] + lower[k].
    width = np.array([100, 520 * np.pi, 1, 10])
    lower = np.array([0, 40 * np.pi, 0, 1])
    X = rng.rand(n_samples, 4) * width + lower

    x0, x1, x2, x3 = X.T
    inner = x1 * x2 - 1 / (x1 * x3)
    y = (x0 ** 2 + inner ** 2) ** 0.5 + noise * rng.randn(n_samples)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_friedman3(n_samples=100, *, noise=0.0, random_state=None):
    """Generate the "Friedman #3" regression problem

    This dataset is described in Friedman [1] and Breiman [2].

    `X` has 4 independent columns, each uniformly distributed on its own
    interval::

        0 <= X[:, 0] <= 100,
        40 * pi <= X[:, 1] <= 560 * pi,
        0 <= X[:, 2] <= 1,
        1 <= X[:, 3] <= 11.

    The target is computed as::

        y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3]))
               / X[:, 0]) + noise * N(0, 1).

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise applied to the output.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset noise. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, 4]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] J. Friedman, "Multivariate adaptive regression splines", The Annals
           of Statistics 19 (1), pages 1-67, 1991.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning 24,
           pages 123-140, 1996.
    """
    rng = check_random_state(random_state)

    # Map the unit-interval draws onto the per-column intervals in one
    # broadcast step: column k becomes u * width[k] + lower[k].
    width = np.array([100, 520 * np.pi, 1, 10])
    lower = np.array([0, 40 * np.pi, 0, 1])
    X = rng.rand(n_samples, 4) * width + lower

    x0, x1, x2, x3 = X.T
    numerator = x1 * x2 - 1 / (x1 * x3)
    y = np.arctan(numerator / x0) + noise * rng.randn(n_samples)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10,
                         tail_strength=0.5, random_state=None):
    """Generate a mostly low rank matrix with bell-shaped singular values

    Most of the variance can be explained by a bell-shaped curve of width
    effective_rank: the low rank part of the singular values profile is::

        (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)

    The remaining singular values' tail is fat, decreasing as::

        tail_strength * exp(-0.1 * i / effective_rank).

    The low rank part of the profile can be considered the structured
    signal part of the data while the tail can be considered the noisy
    part of the data that cannot be summarized by a low number of linear
    components (singular vectors).

    This kind of singular profiles is often seen in practice, for instance:
     - gray level pictures of faces
     - TF-IDF vectors of text documents crawled from the web

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=100)
        The number of features.

    effective_rank : int, optional (default=10)
        The approximate number of singular vectors required to explain most of
        the data by linear combinations.

    tail_strength : float between 0.0 and 1.0, optional (default=0.5)
        The relative importance of the fat noisy tail of the singular values
        profile.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The matrix.
    """
    generator = check_random_state(random_state)
    n = min(n_samples, n_features)

    # Random (ortho normal) vectors
    u, _ = linalg.qr(generator.randn(n_samples, n), mode='economic')
    v, _ = linalg.qr(generator.randn(n_features, n), mode='economic')

    # Index of the singular values
    singular_ind = np.arange(n, dtype=np.float64)

    # Build the singular profile by assembling signal and noise components
    low_rank = ((1 - tail_strength) *
                np.exp(-1.0 * (singular_ind / effective_rank) ** 2))
    tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank)
    s = low_rank + tail

    # Multiplying by a diagonal matrix only rescales the columns of ``u``,
    # so broadcast the 1-D profile instead of materialising an (n, n)
    # matrix and running a full matrix product against it.
    return np.dot(u * s, v.T)
|
|
|
|
|
|
@_deprecate_positional_args
def make_sparse_coded_signal(n_samples, *, n_components, n_features,
                             n_nonzero_coefs, random_state=None):
    """Generate a signal as a sparse combination of dictionary elements.

    Returns a matrix Y = DX, such as D is (n_features, n_components),
    X is (n_components, n_samples) and each column of X has exactly
    n_nonzero_coefs non-zero elements.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int
        number of samples to generate

    n_components : int
        number of components in the dictionary

    n_features : int
        number of features of the dataset to generate

    n_nonzero_coefs : int
        number of active (non-zero) coefficients in each sample

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    data : array of shape [n_features, n_samples]
        The encoded signal (Y).

    dictionary : array of shape [n_features, n_components]
        The dictionary with normalized components (D).

    code : array of shape [n_components, n_samples]
        The sparse code such that each column of this matrix has exactly
        n_nonzero_coefs non-zero items (X).

    Notes
    -----
    Each returned array is passed through ``np.squeeze``, so any axis of
    length one is dropped from the shapes listed above. The three arrays
    are returned as a tuple.
    """
    generator = check_random_state(random_state)

    # generate dictionary: random atoms rescaled to unit Euclidean norm
    D = generator.randn(n_features, n_components)
    D /= np.sqrt(np.sum((D ** 2), axis=0))

    # generate code: for every sample pick the active atoms by shuffling the
    # component indices and keeping the first n_nonzero_coefs of them
    X = np.zeros((n_components, n_samples))
    for i in range(n_samples):
        idx = np.arange(n_components)
        generator.shuffle(idx)
        idx = idx[:n_nonzero_coefs]
        X[idx, i] = generator.randn(n_nonzero_coefs)

    # encode signal
    Y = np.dot(D, X)

    # Return a real tuple rather than a lazy ``map`` object, which could be
    # consumed only once and supported neither indexing nor ``len``.
    return tuple(np.squeeze(arr) for arr in (Y, D, X))
|
|
|
|
|
|
@_deprecate_positional_args
def make_sparse_uncorrelated(n_samples=100, n_features=10, *,
                             random_state=None):
    """Generate a random regression problem with sparse uncorrelated design

    This dataset is described in Celeux et al [1] as::

        X ~ N(0, 1)
        y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]

    Only the first 4 features are informative. The remaining features are
    useless.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of samples.

    n_features : int, optional (default=10)
        The number of features.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The input samples.

    y : array of shape [n_samples]
        The output values.

    References
    ----------
    .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,
           "Regularization in regression: comparing Bayesian and frequentist
           methods in a poorly informative situation", 2009.
    """
    rng = check_random_state(random_state)

    X = rng.normal(loc=0, scale=1, size=(n_samples, n_features))

    # The target is itself a normal draw (unit scale) centred on the linear
    # combination of the first four columns.
    target_mean = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]
    unit_scale = np.ones(n_samples)
    y = rng.normal(loc=target_mean, scale=unit_scale)

    return X, y
|
|
|
|
|
|
@_deprecate_positional_args
def make_spd_matrix(n_dim, *, random_state=None):
    """Generate a random symmetric, positive-definite matrix.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_dim : int
        The matrix dimension.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_dim, n_dim]
        The random symmetric, positive-definite matrix.

    See also
    --------
    make_sparse_spd_matrix
    """
    rng = check_random_state(random_state)

    # Take the singular vectors of the Gram matrix of a random square matrix.
    seed_matrix = rng.rand(n_dim, n_dim)
    gram = np.dot(seed_matrix.T, seed_matrix)
    U, _, Vt = linalg.svd(gram)

    # Note: the scalar 1.0 is added to every entry of the diagonal matrix,
    # not only to its diagonal.
    core = 1.0 + np.diag(rng.rand(n_dim))

    return np.dot(np.dot(U, core), Vt)
|
|
|
|
|
|
@_deprecate_positional_args
def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False,
                           smallest_coef=.1, largest_coef=.9,
                           random_state=None):
    """Generate a sparse symmetric definite positive matrix.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    dim : integer, optional (default=1)
        The size of the random matrix to generate.

    alpha : float between 0 and 1, optional (default=0.95)
        The probability that a coefficient is zero (see notes). Larger values
        enforce more sparsity.

    norm_diag : boolean, optional (default=False)
        Whether to normalize the output matrix to make the leading diagonal
        elements all 1

    smallest_coef : float between 0 and 1, optional (default=0.1)
        The value of the smallest coefficient.

    largest_coef : float between 0 and 1, optional (default=0.9)
        The value of the largest coefficient.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    prec : array of shape (dim, dim)
        The generated matrix, returned as a dense array.

    Notes
    -----
    The sparsity is actually imposed on the cholesky factor of the matrix.
    Thus alpha does not translate directly into the filling fraction of
    the matrix itself.

    See also
    --------
    make_spd_matrix
    """
    rng = check_random_state(random_state)

    # Off-diagonal part of the factor: uniform draws below alpha are zeroed,
    # the surviving entries are redrawn in [smallest_coef, largest_coef).
    offdiag = rng.rand(dim, dim)
    offdiag[offdiag < alpha] = 0
    survivors = offdiag > alpha
    spread = largest_coef - smallest_coef
    offdiag[survivors] = smallest_coef + spread * rng.rand(np.sum(survivors))
    offdiag = np.tril(offdiag, k=-1)

    # Permute the lines: we don't want to have asymmetries in the final
    # SPD matrix
    order = rng.permutation(dim)
    offdiag = offdiag[order].T[order]

    factor = -np.eye(dim)
    factor += offdiag
    prec = np.dot(factor.T, factor)

    if norm_diag:
        # Scale columns, then rows, by 1 / sqrt(diagonal).
        inv_sqrt_diag = 1. / np.sqrt(np.diag(prec))
        prec *= inv_sqrt_diag[np.newaxis, :]
        prec *= inv_sqrt_diag[:, np.newaxis]

    return prec
|
|
|
|
|
|
@_deprecate_positional_args
def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None):
    """Generate a swiss roll dataset.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of sample points on the swiss roll.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, 3]
        The points.

    t : array of shape [n_samples]
        The univariate position of the sample according to the main dimension
        of the points in the manifold.

    Notes
    -----
    The algorithm is from Marsland [1].

    References
    ----------
    .. [1] S. Marsland, "Machine Learning: An Algorithmic Perspective",
           Chapter 10, 2009.
           http://seat.massey.ac.nz/personal/s.r.marsland/Code/10/lle.py
    """
    rng = check_random_state(random_state)

    # Random draws happen in a fixed order: roll parameter, second
    # coordinate, then the additive noise.
    t = 1.5 * np.pi * (1 + 2 * rng.rand(1, n_samples))
    second_coord = 21 * rng.rand(1, n_samples)

    # Rows are the three coordinates; transpose at the end to get one point
    # per row.
    coords = np.concatenate((t * np.cos(t), second_coord, t * np.sin(t)))
    coords += noise * rng.randn(3, n_samples)

    return coords.T, np.squeeze(t)
|
|
|
|
|
|
@_deprecate_positional_args
def make_s_curve(n_samples=100, *, noise=0.0, random_state=None):
    """Generate an S curve dataset.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        The number of sample points on the S curve.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, 3]
        The points.

    t : array of shape [n_samples]
        The univariate position of the sample according to the main dimension
        of the points in the manifold.
    """
    rng = check_random_state(random_state)

    # Random draws happen in a fixed order: curve parameter, second
    # coordinate, then the additive noise.
    t = 3 * np.pi * (rng.rand(1, n_samples) - 0.5)
    second_coord = 2.0 * rng.rand(1, n_samples)

    # Rows are the three coordinates; transpose at the end to get one point
    # per row.
    coords = np.concatenate(
        (np.sin(t), second_coord, np.sign(t) * (np.cos(t) - 1)))
    coords += noise * rng.randn(3, n_samples)

    return coords.T, np.squeeze(t)
|
|
|
|
|
|
@_deprecate_positional_args
def make_gaussian_quantiles(*, mean=None, cov=1., n_samples=100,
                            n_features=2, n_classes=3,
                            shuffle=True, random_state=None):
    r"""Generate isotropic Gaussian and label samples by quantile

    This classification dataset is constructed by taking a multi-dimensional
    standard normal distribution and defining classes separated by nested
    concentric multi-dimensional spheres such that roughly equal numbers of
    samples are in each class (quantiles of the :math:`\chi^2` distribution).

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    mean : array of shape [n_features], optional (default=None)
        The mean of the multi-dimensional normal distribution.
        If None then use the origin (0, 0, ...).

    cov : float, optional (default=1.)
        The covariance matrix will be this value times the unit matrix. This
        dataset only produces symmetric normal distributions.

    n_samples : int, optional (default=100)
        The total number of points equally divided among classes.

    n_features : int, optional (default=2)
        The number of features for each sample.

    n_classes : int, optional (default=3)
        The number of classes

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for quantile membership of each sample.

    Raises
    ------
    ValueError
        If `n_samples` is smaller than `n_classes`.

    Notes
    -----
    The dataset is from Zhu et al [1].

    References
    ----------
    .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.

    """
    if n_samples < n_classes:
        raise ValueError("n_samples must be at least n_classes")

    rng = check_random_state(random_state)

    mean = np.zeros(n_features) if mean is None else np.array(mean)

    # Build multivariate normal distribution
    covariance = cov * np.identity(n_features)
    X = rng.multivariate_normal(mean, covariance, (n_samples,))

    # Order the samples by squared distance from the mean
    sq_dist = np.sum((X - mean[np.newaxis, :]) ** 2, axis=1)
    X = X[np.argsort(sq_dist), :]

    # Label by quantile: ``step`` samples per class, with any leftover
    # samples assigned to the last (outermost) class.
    step = n_samples // n_classes
    leftover = n_samples - step * n_classes
    y = np.hstack([np.repeat(np.arange(n_classes), step),
                   np.repeat(n_classes - 1, leftover)])

    if shuffle:
        X, y = util_shuffle(X, y, random_state=rng)

    return X, y
|
|
|
|
|
|
def _shuffle(data, random_state=None):
    """Permute the rows and the columns of a 2-D array independently.

    Parameters
    ----------
    data : array of shape (n_rows, n_cols)
        The array to shuffle.

    random_state : int, RandomState instance, default=None
        Passed to ``check_random_state`` to obtain the generator that draws
        both permutations.

    Returns
    -------
    result : array of shape (n_rows, n_cols)
        `data` with rows reordered by `row_idx` and columns by `col_idx`.

    row_idx : array of shape (n_rows,)
        The row permutation that was applied.

    col_idx : array of shape (n_cols,)
        The column permutation that was applied.
    """
    rng = check_random_state(random_state)
    n_rows, n_cols = data.shape

    # Rows are drawn before columns; the order fixes the random stream.
    row_idx = rng.permutation(n_rows)
    col_idx = rng.permutation(n_cols)

    shuffled = data[row_idx]
    shuffled = shuffled[:, col_idx]
    return shuffled, row_idx, col_idx
|
|
|
|
|
|
@_deprecate_positional_args
def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10,
                    maxval=100, shuffle=True, random_state=None):
    """Generate an array with constant block diagonal structure for
    biclustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    shape : iterable (n_rows, n_cols)
        The shape of the result.

    n_clusters : integer
        The number of biclusters.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    minval : int, optional (default=10)
        Minimum value of a bicluster.

    maxval : int, optional (default=100)
        Maximum value of a bicluster.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape `shape`
        The generated array.

    rows : array of shape (n_clusters, X.shape[0],)
        The indicators for cluster membership of each row.

    cols : array of shape (n_clusters, X.shape[1],)
        The indicators for cluster membership of each column.

    References
    ----------

    .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and
        words using bipartite spectral graph partitioning. In Proceedings
        of the seventh ACM SIGKDD international conference on Knowledge
        discovery and data mining (pp. 269-274). ACM.

    See also
    --------
    make_checkerboard
    """
    rng = check_random_state(random_state)
    n_rows, n_cols = shape

    # One constant value per bicluster.
    block_values = rng.uniform(minval, maxval, n_clusters)

    # row and column clusters of approximately equal sizes
    equal_probs = np.repeat(1.0 / n_clusters, n_clusters)
    row_sizes = rng.multinomial(n_rows, equal_probs)
    col_sizes = rng.multinomial(n_cols, equal_probs)

    row_labels = np.hstack([np.repeat(label, size)
                            for label, size in enumerate(row_sizes)])
    col_labels = np.hstack([np.repeat(label, size)
                            for label, size in enumerate(col_sizes)])

    # Fill each diagonal block (row cluster k x column cluster k).
    result = np.zeros(shape, dtype=np.float64)
    for label, value in enumerate(block_values):
        block = np.outer(row_labels == label, col_labels == label)
        result[block] += value

    if noise > 0:
        result += rng.normal(scale=noise, size=result.shape)

    if shuffle:
        # The caller's ``random_state`` argument (not the local generator)
        # is what gets forwarded to ``_shuffle``.
        result, row_idx, col_idx = _shuffle(result, random_state)
        row_labels = row_labels[row_idx]
        col_labels = col_labels[col_idx]

    cluster_ids = range(n_clusters)
    rows = np.vstack([row_labels == cluster for cluster in cluster_ids])
    cols = np.vstack([col_labels == cluster for cluster in cluster_ids])

    return result, rows, cols
|
|
|
|
|
|
@_deprecate_positional_args
def make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10,
                      maxval=100, shuffle=True, random_state=None):
    """Generate an array with block checkerboard structure for
    biclustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    shape : iterable (n_rows, n_cols)
        The shape of the result.

    n_clusters : integer or iterable (n_row_clusters, n_column_clusters)
        The number of row and column clusters.

    noise : float, optional (default=0.0)
        The standard deviation of the gaussian noise.

    minval : int, optional (default=10)
        Minimum value of a bicluster.

    maxval : int, optional (default=100)
        Maximum value of a bicluster.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    X : array of shape `shape`
        The generated array.

    rows : array of shape (n_row_clusters * n_column_clusters, X.shape[0],)
        The indicators for cluster membership of each row, one indicator
        per (row cluster, column cluster) pair.

    cols : array of shape (n_row_clusters * n_column_clusters, X.shape[1],)
        The indicators for cluster membership of each column, one indicator
        per (row cluster, column cluster) pair.

    References
    ----------

    .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).
        Spectral biclustering of microarray data: coclustering genes
        and conditions. Genome research, 13(4), 703-716.

    See also
    --------
    make_biclusters
    """
    rng = check_random_state(random_state)

    # A sized ``n_clusters`` gives separate row/column counts; a scalar is
    # used for both.
    if not hasattr(n_clusters, "__len__"):
        n_row_clusters = n_col_clusters = n_clusters
    else:
        n_row_clusters, n_col_clusters = n_clusters

    # row and column clusters of approximately equal sizes
    n_rows, n_cols = shape
    row_probs = np.repeat(1.0 / n_row_clusters, n_row_clusters)
    row_sizes = rng.multinomial(n_rows, row_probs)
    col_probs = np.repeat(1.0 / n_col_clusters, n_col_clusters)
    col_sizes = rng.multinomial(n_cols, col_probs)

    row_labels = np.hstack([np.repeat(label, size)
                            for label, size in enumerate(row_sizes)])
    col_labels = np.hstack([np.repeat(label, size)
                            for label, size in enumerate(col_sizes)])

    # Every (row cluster, column cluster) block receives its own uniform
    # draw, visited in row-major order.
    result = np.zeros(shape, dtype=np.float64)
    for row_cluster in range(n_row_clusters):
        in_row_cluster = row_labels == row_cluster
        for col_cluster in range(n_col_clusters):
            block = np.outer(in_row_cluster, col_labels == col_cluster)
            result[block] += rng.uniform(minval, maxval)

    if noise > 0:
        result += rng.normal(scale=noise, size=result.shape)

    if shuffle:
        # The caller's ``random_state`` argument (not the local generator)
        # is what gets forwarded to ``_shuffle``.
        result, row_idx, col_idx = _shuffle(result, random_state)
        row_labels = row_labels[row_idx]
        col_labels = col_labels[col_idx]

    # One indicator pair per block, in the same row-major block order.
    row_indicators = []
    col_indicators = []
    for row_cluster in range(n_row_clusters):
        for col_cluster in range(n_col_clusters):
            row_indicators.append(row_labels == row_cluster)
            col_indicators.append(col_labels == col_cluster)
    rows = np.vstack(row_indicators)
    cols = np.vstack(col_indicators)

    return result, rows, cols
|