Uploaded Test files

parent f584ad9d97
commit 2e81cb7d99

16627 changed files with 2065359 additions and 102444 deletions
13  venv/Lib/site-packages/sklearn/feature_extraction/__init__.py  Normal file

@@ -0,0 +1,13 @@
"""
The :mod:`sklearn.feature_extraction` module deals with feature extraction
from raw data. It currently includes methods to extract features from text and
images.
"""

from ._dict_vectorizer import DictVectorizer
from ._hash import FeatureHasher
from .image import img_to_graph, grid_to_graph
from . import text

__all__ = ['DictVectorizer', 'image', 'img_to_graph', 'grid_to_graph', 'text',
           'FeatureHasher']
Binary files not shown.
364  venv/Lib/site-packages/sklearn/feature_extraction/_dict_vectorizer.py  Normal file

@@ -0,0 +1,364 @@
# Authors: Lars Buitinck
#          Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause

from array import array
from collections.abc import Mapping
from operator import itemgetter

import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array, tosequence
from ..utils.validation import _deprecate_positional_args


def _tosequence(X):
    """Turn X into a sequence or ndarray, avoiding a copy if possible."""
    if isinstance(X, Mapping):  # single sample
        return [X]
    else:
        return tosequence(X)


class DictVectorizer(TransformerMixin, BaseEstimator):
    """Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int, the DictVectorizer can be
    followed by :class:`sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.
    separator : str, default="="
        Separator string used when constructing new features for one-hot
        coding.
    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.
    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be
        sorted when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == \
        [{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])

    See also
    --------
    FeatureHasher : performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : handles nominal/categorical
        features encoded as columns of arbitrary data types.
    """
    @_deprecate_positional_args
    def __init__(self, *, dtype=np.float64, separator="=", sparse=True,
                 sort=True):
        self.dtype = dtype
        self.separator = separator
        self.sparse = sparse
        self.sort = sort

    def fit(self, X, y=None):
        """Learn a list of feature name -> indices mappings.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).
        y : (ignored)

        Returns
        -------
        self
        """
        feature_names = []
        vocab = {}

        for x in X:
            for f, v in x.items():
                if isinstance(v, str):
                    f = "%s%s%s" % (f, self.separator, v)
                if f not in vocab:
                    feature_names.append(f)
                    vocab[f] = len(vocab)

        if self.sort:
            feature_names.sort()
            vocab = {f: i for i, f in enumerate(feature_names)}

        self.feature_names_ = feature_names
        self.vocabulary_ = vocab

        return self

    def _transform(self, X, fitting):
        # Sanity check: Python's array has no way of explicitly requesting the
        # signed 32-bit integers that scipy.sparse needs, so we use the next
        # best thing: typecode "i" (int). However, if that gives larger or
        # smaller integers than 32-bit ones, np.frombuffer screws up.
        assert array("i").itemsize == 4, (
            "sizeof(int) != 4 on your platform; please report this at"
            " https://github.com/scikit-learn/scikit-learn/issues and"
            " include the output from platform.platform() in your bug report")

        dtype = self.dtype
        if fitting:
            feature_names = []
            vocab = {}
        else:
            feature_names = self.feature_names_
            vocab = self.vocabulary_

        # Process everything as sparse regardless of setting
        X = [X] if isinstance(X, Mapping) else X

        indices = array("i")
        indptr = [0]
        # XXX we could change values to an array.array as well, but it
        # would require (heuristic) conversion of dtype to typecode...
        values = []

        # collect all the possible feature names and build sparse matrix at
        # same time
        for x in X:
            for f, v in x.items():
                if isinstance(v, str):
                    f = "%s%s%s" % (f, self.separator, v)
                    v = 1
                if f in vocab:
                    indices.append(vocab[f])
                    values.append(dtype(v))
                else:
                    if fitting:
                        feature_names.append(f)
                        vocab[f] = len(vocab)
                        indices.append(vocab[f])
                        values.append(dtype(v))

            indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError("Sample sequence X is empty.")

        indices = np.frombuffer(indices, dtype=np.intc)
        shape = (len(indptr) - 1, len(vocab))

        result_matrix = sp.csr_matrix((values, indices, indptr),
                                      shape=shape, dtype=dtype)

        # Sort everything if asked
        if fitting and self.sort:
            feature_names.sort()
            map_index = np.empty(len(feature_names), dtype=np.int32)
            for new_val, f in enumerate(feature_names):
                map_index[new_val] = vocab[f]
                vocab[f] = new_val
            result_matrix = result_matrix[:, map_index]

        if self.sparse:
            result_matrix.sort_indices()
        else:
            result_matrix = result_matrix.toarray()

        if fitting:
            self.feature_names_ = feature_names
            self.vocabulary_ = vocab

        return result_matrix

    def fit_transform(self, X, y=None):
        """Learn a list of feature name -> indices mappings and transform X.

        Like fit(X) followed by transform(X), but does not require
        materializing X in memory.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).
        y : (ignored)

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        """
        return self._transform(X, fitting=True)

    def inverse_transform(self, X, dict_type=dict):
        """Transform array or sparse matrix X back to feature mappings.

        X must have been produced by this DictVectorizer's transform or
        fit_transform method; it may only have passed through transformers
        that preserve the number of features and their order.

        In the case of one-hot/one-of-K coding, the constructed feature
        names and values are returned rather than the original ones.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Sample matrix.
        dict_type : type, default=dict
            Constructor for feature mappings. Must conform to the
            collections.Mapping API.

        Returns
        -------
        D : list of dict_type objects of shape (n_samples,)
            Feature mappings for the samples in X.
        """
        # COO matrix is not subscriptable
        X = check_array(X, accept_sparse=['csr', 'csc'])
        n_samples = X.shape[0]

        names = self.feature_names_
        dicts = [dict_type() for _ in range(n_samples)]

        if sp.issparse(X):
            for i, j in zip(*X.nonzero()):
                dicts[i][names[j]] = X[i, j]
        else:
            for i, d in enumerate(dicts):
                for j, v in enumerate(X[i, :]):
                    if v != 0:
                        d[names[j]] = X[i, j]

        return dicts

    def transform(self, X):
        """Transform feature->value dicts to array or sparse matrix.

        Named features not encountered during fit or fit_transform will be
        silently ignored.

        Parameters
        ----------
        X : Mapping or iterable over Mappings of shape (n_samples,)
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
        """
        if self.sparse:
            return self._transform(X, fitting=False)

        else:
            dtype = self.dtype
            vocab = self.vocabulary_
            X = _tosequence(X)
            Xa = np.zeros((len(X), len(vocab)), dtype=dtype)

            for i, x in enumerate(X):
                for f, v in x.items():
                    if isinstance(v, str):
                        f = "%s%s%s" % (f, self.separator, v)
                        v = 1
                    try:
                        Xa[i, vocab[f]] = dtype(v)
                    except KeyError:
                        pass

            return Xa

    def get_feature_names(self):
        """Returns a list of feature names, ordered by their indices.

        If one-of-K coding is applied to categorical features, this will
        include the constructed feature names but not the original ones.
        """
        return self.feature_names_

    def restrict(self, support, indices=False):
        """Restrict the features to those in support using feature selection.

        This function modifies the estimator in-place.

        Parameters
        ----------
        support : array-like
            Boolean mask or list of indices (as returned by the get_support
            member of feature selectors).
        indices : bool, default=False
            Whether support is a list of indices.

        Returns
        -------
        self

        Examples
        --------
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> from sklearn.feature_selection import SelectKBest, chi2
        >>> v = DictVectorizer()
        >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
        >>> X = v.fit_transform(D)
        >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
        >>> v.get_feature_names()
        ['bar', 'baz', 'foo']
        >>> v.restrict(support.get_support())
        DictVectorizer()
        >>> v.get_feature_names()
        ['bar', 'foo']
        """
        if not indices:
            support = np.where(support)[0]

        names = self.feature_names_
        new_vocab = {}
        for i in support:
            new_vocab[names[i]] = len(new_vocab)

        self.vocabulary_ = new_vocab
        self.feature_names_ = [f for f, i in sorted(new_vocab.items(),
                                                    key=itemgetter(1))]

        return self

    def _more_tags(self):
        return {'X_types': ["dict"]}
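A minimal usage sketch of the DictVectorizer API added above; the measurement dicts are invented for illustration:

# Sketch: one-hot coding of string values vs. pass-through of numeric values.
from sklearn.feature_extraction import DictVectorizer

measurements = [
    {'city': 'Dubai', 'temperature': 33.0},   # string value -> becomes "city=Dubai"
    {'city': 'London', 'temperature': 12.0},  # numeric value -> stored as-is
]

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(measurements)
print(vec.get_feature_names())   # ['city=Dubai', 'city=London', 'temperature']
print(vec.inverse_transform(X))  # round-trips the constructed names and values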
173  venv/Lib/site-packages/sklearn/feature_extraction/_hash.py  Normal file

@@ -0,0 +1,173 @@
# Author: Lars Buitinck
# License: BSD 3 clause

import numbers

import numpy as np
import scipy.sparse as sp

from ..utils import IS_PYPY
from ..utils.validation import _deprecate_positional_args
from ..base import BaseEstimator, TransformerMixin

if not IS_PYPY:
    from ._hashing_fast import transform as _hashing_transform
else:
    def _hashing_transform(*args, **kwargs):
        raise NotImplementedError(
                'FeatureHasher is not compatible with PyPy (see '
                'https://github.com/scikit-learn/scikit-learn/issues/11540 '
                'for the status updates).')


def _iteritems(d):
    """Like d.iteritems, but accepts any collections.Mapping."""
    return d.iteritems() if hasattr(d, "iteritems") else d.items()


class FeatureHasher(TransformerMixin, BaseEstimator):
    """Implements feature hashing, aka the hashing trick.

    This class turns sequences of symbolic feature names (strings) into
    scipy.sparse matrices, using a hash function to compute the matrix column
    corresponding to a name. The hash function employed is the signed 32-bit
    version of Murmurhash3.

    Feature names of type byte string are used as-is. Unicode strings are
    converted to UTF-8 first, but no Unicode normalization is done.
    Feature values must be (finite) numbers.

    This class is a low-memory alternative to DictVectorizer and
    CountVectorizer, intended for large-scale (online) learning and situations
    where memory is tight, e.g. when running prediction code on embedded
    devices.

    Read more in the :ref:`User Guide <feature_hashing>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    n_features : int, default=2**20
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.
    input_type : {"dict", "pair", "string"}, default="dict"
        Either "dict" (the default) to accept dictionaries over
        (feature_name, value); "pair" to accept pairs of (feature_name, value);
        or "string" to accept single strings.
        feature_name should be a string, while value should be a number.
        In the case of "string", a value of 1 is implied.
        The feature_name is hashed to find the appropriate column for the
        feature. The value's sign might be flipped in the output (but see
        non_negative, below).
    dtype : numpy dtype, default=np.float64
        The type of feature values. Passed to scipy.sparse matrix constructors
        as the dtype argument. Do not set this to bool, np.boolean or any
        unsigned integer type.
    alternate_sign : bool, default=True
        When True, an alternating sign is added to the features as to
        approximately conserve the inner product in the hashed space even for
        small n_features. This approach is similar to sparse random projection.

        .. versionchanged:: 0.19
            ``alternate_sign`` replaces the now deprecated ``non_negative``
            parameter.

    Examples
    --------
    >>> from sklearn.feature_extraction import FeatureHasher
    >>> h = FeatureHasher(n_features=10)
    >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
    >>> f = h.transform(D)
    >>> f.toarray()
    array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
           [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])

    See also
    --------
    DictVectorizer : vectorizes string-valued features using a hash table.
    sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features.
    """
    @_deprecate_positional_args
    def __init__(self, n_features=(2 ** 20), *, input_type="dict",
                 dtype=np.float64, alternate_sign=True):
        self._validate_params(n_features, input_type)

        self.dtype = dtype
        self.input_type = input_type
        self.n_features = n_features
        self.alternate_sign = alternate_sign

    @staticmethod
    def _validate_params(n_features, input_type):
        # strangely, np.int16 instances are not instances of Integral,
        # while np.int64 instances are...
        if not isinstance(n_features, numbers.Integral):
            raise TypeError("n_features must be integral, got %r (%s)."
                            % (n_features, type(n_features)))
        elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1:
            raise ValueError("Invalid number of features (%d)." % n_features)

        if input_type not in ("dict", "pair", "string"):
            raise ValueError("input_type must be 'dict', 'pair' or 'string',"
                             " got %r." % input_type)

    def fit(self, X=None, y=None):
        """No-op.

        This method doesn't do anything. It exists purely for compatibility
        with the scikit-learn transformer API.

        Parameters
        ----------
        X : ndarray

        Returns
        -------
        self : FeatureHasher

        """
        # repeat input validation for grid search (which calls set_params)
        self._validate_params(self.n_features, self.input_type)
        return self

    def transform(self, raw_X):
        """Transform a sequence of instances to a scipy.sparse matrix.

        Parameters
        ----------
        raw_X : iterable over iterable over raw features, length = n_samples
            Samples. Each sample must be an iterable (e.g., a list or tuple)
            containing/generating feature names (and optionally values, see
            the input_type constructor argument) which will be hashed.
            raw_X need not support the len function, so it can be the result
            of a generator; n_samples is determined on the fly.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Feature matrix, for use with estimators or further transformers.

        """
        raw_X = iter(raw_X)
        if self.input_type == "dict":
            raw_X = (_iteritems(d) for d in raw_X)
        elif self.input_type == "string":
            raw_X = (((f, 1) for f in x) for x in raw_X)
        indices, indptr, values = \
            _hashing_transform(raw_X, self.n_features, self.dtype,
                               self.alternate_sign, seed=0)
        n_samples = indptr.shape[0] - 1

        if n_samples == 0:
            raise ValueError("Cannot vectorize empty sequence.")

        X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
                          shape=(n_samples, self.n_features))
        X.sum_duplicates()  # also sorts the indices

        return X

    def _more_tags(self):
        return {'X_types': [self.input_type]}
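A quick sketch of the stateless hashing API above (the token lists are made up): no vocabulary is fitted or stored, so transform can be called directly.

# Sketch: hashing string tokens straight into a fixed-width sparse matrix.
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=2 ** 8, input_type='string')
X = hasher.transform([['cat', 'dog', 'cat'], ['fish']])  # no fit step needed
print(X.shape)  # (2, 256): columns are anonymous hash buckets, not named features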
Binary file not shown.
45  venv/Lib/site-packages/sklearn/feature_extraction/_stop_words.py  Normal file

@@ -0,0 +1,45 @@
# This list of English stop words is taken from the "Glasgow Information
# Retrieval Group". The original list can be found at
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])
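The frozenset above backs O(1) membership tests and the text vectorizers' built-in English list; a short sketch (the sample sentence is illustrative):

# Sketch: membership test and the vectorizer hook that consumes this list.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer

print('afterwards' in ENGLISH_STOP_WORDS)    # True
vec = CountVectorizer(stop_words='english')  # selects this built-in list
print(vec.fit_transform(['the cat sat']).shape)  # "the" is dropped -> 2 features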
18  venv/Lib/site-packages/sklearn/feature_extraction/dict_vectorizer.py  Normal file

@@ -0,0 +1,18 @@

# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _dict_vectorizer  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.feature_extraction.dict_vectorizer'
correct_import_path = 'sklearn.feature_extraction'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)


def __getattr__(name):
    return getattr(_dict_vectorizer, name)

if not sys.version_info >= (3, 7):
    Pep562(__name__)
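The generated shim above relies on module-level __getattr__ (PEP 562, native from Python 3.7; emulated by the vendored Pep562 on older versions). A generic, runnable sketch of the pattern, with math standing in for the relocated implementation module:

# deprecated_module.py -- hypothetical shim forwarding to a new location.
import math  # stand-in for the module that really holds the implementation
import warnings


def __getattr__(name):  # called only for attributes not found normally
    warnings.warn("deprecated_module is deprecated; import from the new "
                  "location instead", FutureWarning)
    return getattr(math, name)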
18  venv/Lib/site-packages/sklearn/feature_extraction/hashing.py  Normal file

@@ -0,0 +1,18 @@

# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _hash  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.feature_extraction.hashing'
correct_import_path = 'sklearn.feature_extraction'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)


def __getattr__(name):
    return getattr(_hash, name)

if not sys.version_info >= (3, 7):
    Pep562(__name__)
591  venv/Lib/site-packages/sklearn/feature_extraction/image.py  Normal file

@@ -0,0 +1,591 @@
"""
The :mod:`sklearn.feature_extraction.image` submodule gathers utilities to
extract features from images.
"""

# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
#          Gael Varoquaux <gael.varoquaux@normalesup.org>
#          Olivier Grisel
#          Vlad Niculae
# License: BSD 3 clause

from itertools import product
import numbers
import numpy as np
from scipy import sparse
from numpy.lib.stride_tricks import as_strided

from ..utils import check_array, check_random_state, deprecated
from ..utils.validation import _deprecate_positional_args
from ..base import BaseEstimator

__all__ = ['PatchExtractor',
           'extract_patches_2d',
           'grid_to_graph',
           'img_to_graph',
           'reconstruct_from_patches_2d']

###############################################################################
# From an image to a graph


def _make_edges_3d(n_x, n_y, n_z=1):
    """Returns a list of edges for a 3D image.

    Parameters
    ----------
    n_x : int
        The size of the grid in the x direction.
    n_y : int
        The size of the grid in the y direction.
    n_z : int, default=1
        The size of the grid in the z direction, defaults to 1
    """
    vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z))
    edges_deep = np.vstack((vertices[:, :, :-1].ravel(),
                            vertices[:, :, 1:].ravel()))
    edges_right = np.vstack((vertices[:, :-1].ravel(),
                             vertices[:, 1:].ravel()))
    edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel()))
    edges = np.hstack((edges_deep, edges_right, edges_down))
    return edges


def _compute_gradient_3d(edges, img):
    _, n_y, n_z = img.shape
    gradient = np.abs(img[edges[0] // (n_y * n_z),
                          (edges[0] % (n_y * n_z)) // n_z,
                          (edges[0] % (n_y * n_z)) % n_z] -
                      img[edges[1] // (n_y * n_z),
                          (edges[1] % (n_y * n_z)) // n_z,
                          (edges[1] % (n_y * n_z)) % n_z])
    return gradient


# XXX: Why mask the image after computing the weights?

def _mask_edges_weights(mask, edges, weights=None):
    """Apply a mask to edges (weighted or not)"""
    inds = np.arange(mask.size)
    inds = inds[mask.ravel()]
    ind_mask = np.logical_and(np.in1d(edges[0], inds),
                              np.in1d(edges[1], inds))
    edges = edges[:, ind_mask]
    if weights is not None:
        weights = weights[ind_mask]
    if len(edges.ravel()):
        maxval = edges.max()
    else:
        maxval = 0
    order = np.searchsorted(np.unique(edges.ravel()), np.arange(maxval + 1))
    edges = order[edges]
    if weights is None:
        return edges
    else:
        return edges, weights


def _to_graph(n_x, n_y, n_z, mask=None, img=None,
              return_as=sparse.coo_matrix, dtype=None):
    """Auxiliary function for img_to_graph and grid_to_graph
    """
    edges = _make_edges_3d(n_x, n_y, n_z)

    if dtype is None:
        if img is None:
            dtype = np.int
        else:
            dtype = img.dtype

    if img is not None:
        img = np.atleast_3d(img)
        weights = _compute_gradient_3d(edges, img)
        if mask is not None:
            edges, weights = _mask_edges_weights(mask, edges, weights)
            diag = img.squeeze()[mask]
        else:
            diag = img.ravel()
        n_voxels = diag.size
    else:
        if mask is not None:
            mask = mask.astype(dtype=np.bool, copy=False)
            mask = np.asarray(mask, dtype=np.bool)
            edges = _mask_edges_weights(mask, edges)
            n_voxels = np.sum(mask)
        else:
            n_voxels = n_x * n_y * n_z
        weights = np.ones(edges.shape[1], dtype=dtype)
        diag = np.ones(n_voxels, dtype=dtype)

    diag_idx = np.arange(n_voxels)
    i_idx = np.hstack((edges[0], edges[1]))
    j_idx = np.hstack((edges[1], edges[0]))
    graph = sparse.coo_matrix((np.hstack((weights, weights, diag)),
                              (np.hstack((i_idx, diag_idx)),
                               np.hstack((j_idx, diag_idx)))),
                              (n_voxels, n_voxels),
                              dtype=dtype)
    if return_as is np.ndarray:
        return graph.toarray()
    return return_as(graph)


@_deprecate_positional_args
def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None):
    """Graph of the pixel-to-pixel gradient connections

    Edges are weighted with the gradient values.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    img : ndarray of shape (height, width) or (height, width, channel)
        2D or 3D image.
    mask : ndarray of shape (height, width) or \
            (height, width, channel), dtype=bool, default=None
        An optional mask of the image, to consider only part of the
        pixels.
    return_as : np.ndarray or a sparse matrix class, \
            default=sparse.coo_matrix
        The class to use to build the returned adjacency matrix.
    dtype : dtype, default=None
        The data of the returned sparse matrix. By default it is the
        dtype of img

    Notes
    -----
    For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was
    handled by returning a dense np.matrix instance. Going forward, np.ndarray
    returns an np.ndarray, as expected.

    For compatibility, user code relying on this method should wrap its
    calls in ``np.asarray`` to avoid type issues.
    """
    img = np.atleast_3d(img)
    n_x, n_y, n_z = img.shape
    return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype)


@_deprecate_positional_args
def grid_to_graph(n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix,
                  dtype=np.int):
    """Graph of the pixel-to-pixel connections

    Edges exist if 2 voxels are connected.

    Parameters
    ----------
    n_x : int
        Dimension in x axis
    n_y : int
        Dimension in y axis
    n_z : int, default=1
        Dimension in z axis
    mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None
        An optional mask of the image, to consider only part of the
        pixels.
    return_as : np.ndarray or a sparse matrix class, \
            default=sparse.coo_matrix
        The class to use to build the returned adjacency matrix.
    dtype : dtype, default=int
        The data of the returned sparse matrix. By default it is int

    Notes
    -----
    For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was
    handled by returning a dense np.matrix instance. Going forward, np.ndarray
    returns an np.ndarray, as expected.

    For compatibility, user code relying on this method should wrap its
    calls in ``np.asarray`` to avoid type issues.
    """
    return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as,
                     dtype=dtype)


###############################################################################
# From an image to a set of small image patches

def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):
    """Compute the number of patches that will be extracted in an image.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    i_h : int
        The image height
    i_w : int
        The image width
    p_h : int
        The height of a patch
    p_w : int
        The width of a patch
    max_patches : int or float, default=None
        The maximum number of patches to extract. If max_patches is a float
        between 0 and 1, it is taken to be a proportion of the total number
        of patches.
    """
    n_h = i_h - p_h + 1
    n_w = i_w - p_w + 1
    all_patches = n_h * n_w

    if max_patches:
        if (isinstance(max_patches, (numbers.Integral))
                and max_patches < all_patches):
            return max_patches
        elif (isinstance(max_patches, (numbers.Integral))
              and max_patches >= all_patches):
            return all_patches
        elif (isinstance(max_patches, (numbers.Real))
                and 0 < max_patches < 1):
            return int(max_patches * all_patches)
        else:
            raise ValueError("Invalid value for max_patches: %r" % max_patches)
    else:
        return all_patches


def _extract_patches(arr, patch_shape=8, extraction_step=1):
    """Extracts patches of any n-dimensional array in place using strides.

    Given an n-dimensional array it will return a 2n-dimensional array with
    the first n dimensions indexing patch position and the last n indexing
    the patch content. This operation is immediate (O(1)). A reshape
    performed on the first n dimensions will cause numpy to copy data, leading
    to a list of extracted patches.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    arr : ndarray
        n-dimensional array of which patches are to be extracted

    patch_shape : int or tuple of length arr.ndim, default=8
        Indicates the shape of the patches to be extracted. If an
        integer is given, the shape will be a hypercube of
        sidelength given by its value.

    extraction_step : int or tuple of length arr.ndim, default=1
        Indicates step size at which extraction shall be performed.
        If integer is given, then the step is uniform in all dimensions.


    Returns
    -------
    patches : strided ndarray
        2n-dimensional array indexing patches on first n dimensions and
        containing patches on the last n dimensions. These dimensions
        are fake, but this way no data is copied. A simple reshape invokes
        a copying operation to obtain a list of patches:
        result.reshape([-1] + list(patch_shape))
    """

    arr_ndim = arr.ndim

    if isinstance(patch_shape, numbers.Number):
        patch_shape = tuple([patch_shape] * arr_ndim)
    if isinstance(extraction_step, numbers.Number):
        extraction_step = tuple([extraction_step] * arr_ndim)

    patch_strides = arr.strides

    slices = tuple(slice(None, None, st) for st in extraction_step)
    indexing_strides = arr[slices].strides

    patch_indices_shape = ((np.array(arr.shape) - np.array(patch_shape)) //
                           np.array(extraction_step)) + 1

    shape = tuple(list(patch_indices_shape) + list(patch_shape))
    strides = tuple(list(indexing_strides) + list(patch_strides))

    patches = as_strided(arr, shape=shape, strides=strides)
    return patches


@deprecated("The function feature_extraction.image.extract_patches has been "
            "deprecated in 0.22 and will be removed in 0.24.")
def extract_patches(arr, patch_shape=8, extraction_step=1):
    """Extracts patches of any n-dimensional array in place using strides.

    Given an n-dimensional array it will return a 2n-dimensional array with
    the first n dimensions indexing patch position and the last n indexing
    the patch content. This operation is immediate (O(1)). A reshape
    performed on the first n dimensions will cause numpy to copy data, leading
    to a list of extracted patches.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    arr : ndarray
        n-dimensional array of which patches are to be extracted

    patch_shape : int or tuple of length arr.ndim, default=8
        Indicates the shape of the patches to be extracted. If an
        integer is given, the shape will be a hypercube of
        sidelength given by its value.

    extraction_step : int or tuple of length arr.ndim, default=1
        Indicates step size at which extraction shall be performed.
        If integer is given, then the step is uniform in all dimensions.


    Returns
    -------
    patches : strided ndarray
        2n-dimensional array indexing patches on first n dimensions and
        containing patches on the last n dimensions. These dimensions
        are fake, but this way no data is copied. A simple reshape invokes
        a copying operation to obtain a list of patches:
        result.reshape([-1] + list(patch_shape))
    """
    return _extract_patches(arr, patch_shape=patch_shape,
                            extraction_step=extraction_step)


@_deprecate_positional_args
def extract_patches_2d(image, patch_size, *, max_patches=None,
                       random_state=None):
    """Reshape a 2D image into a collection of patches

    The resulting patches are allocated in a dedicated array.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    image : ndarray of shape (image_height, image_width) or \
            (image_height, image_width, n_channels)
        The original image data. For color images, the last dimension specifies
        the channel: a RGB image would have `n_channels=3`.

    patch_size : tuple of int (patch_height, patch_width)
        The dimensions of one patch.

    max_patches : int or float, default=None
        The maximum number of patches to extract. If `max_patches` is a float
        between 0 and 1, it is taken to be a proportion of the total number
        of patches.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used for random sampling when
        `max_patches` is not None. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    patches : array of shape (n_patches, patch_height, patch_width) or \
            (n_patches, patch_height, patch_width, n_channels)
        The collection of patches extracted from the image, where `n_patches`
        is either `max_patches` or the total number of patches that can be
        extracted.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_image
    >>> from sklearn.feature_extraction import image
    >>> # Use the array data from the first image in this dataset:
    >>> one_image = load_sample_image("china.jpg")
    >>> print('Image shape: {}'.format(one_image.shape))
    Image shape: (427, 640, 3)
    >>> patches = image.extract_patches_2d(one_image, (2, 2))
    >>> print('Patches shape: {}'.format(patches.shape))
    Patches shape: (272214, 2, 2, 3)
    >>> # Here are just two of these patches:
    >>> print(patches[1])
    [[[174 201 231]
      [174 201 231]]
     [[173 200 230]
      [173 200 230]]]
    >>> print(patches[800])
    [[[187 214 243]
      [188 215 244]]
     [[187 214 243]
      [188 215 244]]]
    """
    i_h, i_w = image.shape[:2]
    p_h, p_w = patch_size

    if p_h > i_h:
        raise ValueError("Height of the patch should be less than the height"
                         " of the image.")

    if p_w > i_w:
        raise ValueError("Width of the patch should be less than the width"
                         " of the image.")

    image = check_array(image, allow_nd=True)
    image = image.reshape((i_h, i_w, -1))
    n_colors = image.shape[-1]

    extracted_patches = _extract_patches(image,
                                         patch_shape=(p_h, p_w, n_colors),
                                         extraction_step=1)

    n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)
    if max_patches:
        rng = check_random_state(random_state)
        i_s = rng.randint(i_h - p_h + 1, size=n_patches)
        j_s = rng.randint(i_w - p_w + 1, size=n_patches)
        patches = extracted_patches[i_s, j_s, 0]
    else:
        patches = extracted_patches

    patches = patches.reshape(-1, p_h, p_w, n_colors)
    # remove the color dimension if useless
    if patches.shape[-1] == 1:
        return patches.reshape((n_patches, p_h, p_w))
    else:
        return patches


def reconstruct_from_patches_2d(patches, image_size):
    """Reconstruct the image from all of its patches.

    Patches are assumed to overlap and the image is constructed by filling in
    the patches from left to right, top to bottom, averaging the overlapping
    regions.

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    Parameters
    ----------
    patches : ndarray of shape (n_patches, patch_height, patch_width) or \
            (n_patches, patch_height, patch_width, n_channels)
        The complete set of patches. If the patches contain colour information,
        channels are indexed along the last dimension: RGB patches would
        have `n_channels=3`.

    image_size : tuple of int (image_height, image_width) or \
            (image_height, image_width, n_channels)
        The size of the image that will be reconstructed.

    Returns
    -------
    image : ndarray of shape image_size
        The reconstructed image.
    """
    i_h, i_w = image_size[:2]
    p_h, p_w = patches.shape[1:3]
    img = np.zeros(image_size)
    # compute the dimensions of the patches array
    n_h = i_h - p_h + 1
    n_w = i_w - p_w + 1
    for p, (i, j) in zip(patches, product(range(n_h), range(n_w))):
        img[i:i + p_h, j:j + p_w] += p

    for i in range(i_h):
        for j in range(i_w):
            # divide by the amount of overlap
            # XXX: is this the most efficient way? memory-wise yes, cpu wise?
            img[i, j] /= float(min(i + 1, p_h, i_h - i) *
                               min(j + 1, p_w, i_w - j))
    return img


class PatchExtractor(BaseEstimator):
    """Extracts patches from a collection of images

    Read more in the :ref:`User Guide <image_feature_extraction>`.

    .. versionadded:: 0.9

    Parameters
    ----------
    patch_size : tuple of int (patch_height, patch_width)
        The dimensions of one patch.

    max_patches : int or float, default=None
        The maximum number of patches per image to extract. If max_patches is a
        float in (0, 1), it is taken to mean a proportion of the total number
        of patches.

    random_state : int, RandomState instance, default=None
        Determines the random number generator used for random sampling when
        `max_patches` is not None. Use an int to make the randomness
        deterministic.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> from sklearn.datasets import load_sample_images
    >>> from sklearn.feature_extraction import image
    >>> # Use the array data from the second image in this dataset:
    >>> X = load_sample_images().images[1]
    >>> print('Image shape: {}'.format(X.shape))
    Image shape: (427, 640, 3)
    >>> pe = image.PatchExtractor(patch_size=(2, 2))
    >>> pe_fit = pe.fit(X)
    >>> pe_trans = pe.transform(X)
    >>> print('Patches shape: {}'.format(pe_trans.shape))
    Patches shape: (545706, 2, 2)
    """
    @_deprecate_positional_args
    def __init__(self, *, patch_size=None, max_patches=None,
                 random_state=None):
        self.patch_size = patch_size
        self.max_patches = max_patches
        self.random_state = random_state

    def fit(self, X, y=None):
        """Do nothing and return the estimator unchanged.

        This method is just there to implement the usual API and hence
        work in pipelines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        """
        return self

    def transform(self, X):
        """Transforms the image samples in X into a matrix of patch data.

        Parameters
        ----------
        X : ndarray of shape (n_samples, image_height, image_width) or \
            (n_samples, image_height, image_width, n_channels)
            Array of images from which to extract patches. For color images,
            the last dimension specifies the channel: a RGB image would have
            `n_channels=3`.

        Returns
        -------
        patches : array of shape (n_patches, patch_height, patch_width) or \
            (n_patches, patch_height, patch_width, n_channels)
            The collection of patches extracted from the images, where
            `n_patches` is either `n_samples * max_patches` or the total
            number of patches that can be extracted.
        """
        self.random_state = check_random_state(self.random_state)
        n_images, i_h, i_w = X.shape[:3]
        X = np.reshape(X, (n_images, i_h, i_w, -1))
        n_channels = X.shape[-1]
        if self.patch_size is None:
            patch_size = i_h // 10, i_w // 10
        else:
            patch_size = self.patch_size

        # compute the dimensions of the patches array
        p_h, p_w = patch_size
        n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, self.max_patches)
        patches_shape = (n_images * n_patches,) + patch_size
        if n_channels > 1:
            patches_shape += (n_channels,)

        # extract the patches
        patches = np.empty(patches_shape)
        for ii, image in enumerate(X):
            patches[ii * n_patches:(ii + 1) * n_patches] = extract_patches_2d(
                image, patch_size, max_patches=self.max_patches,
                random_state=self.random_state)
        return patches

    def _more_tags(self):
        return {'X_types': ['3darray']}
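A round-trip sketch of the two patch helpers defined above; the random 8x8 image is illustrative only:

# Sketch: extract every overlapping 3x3 patch, then average them back.
import numpy as np
from sklearn.feature_extraction.image import (extract_patches_2d,
                                              reconstruct_from_patches_2d)

img = np.random.RandomState(0).rand(8, 8)
patches = extract_patches_2d(img, (3, 3))               # (36, 3, 3) patches
rebuilt = reconstruct_from_patches_2d(patches, (8, 8))  # average the overlaps
print(np.allclose(img, rebuilt))  # True: exact when all patches are kept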
21  venv/Lib/site-packages/sklearn/feature_extraction/setup.py  Normal file

@@ -0,0 +1,21 @@
import os
import platform


def configuration(parent_package='', top_path=None):
    import numpy
    from numpy.distutils.misc_util import Configuration

    config = Configuration('feature_extraction', parent_package, top_path)
    libraries = []
    if os.name == 'posix':
        libraries.append('m')

    if platform.python_implementation() != 'PyPy':
        config.add_extension('_hashing_fast',
                             sources=['_hashing_fast.pyx'],
                             include_dirs=[numpy.get_include()],
                             libraries=libraries)
    config.add_subpackage("tests")

    return config
18  venv/Lib/site-packages/sklearn/feature_extraction/stop_words.py  Normal file

@@ -0,0 +1,18 @@

# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _stop_words  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.feature_extraction.stop_words'
correct_import_path = 'sklearn.feature_extraction.text'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)


def __getattr__(name):
    return getattr(_stop_words, name)

if not sys.version_info >= (3, 7):
    Pep562(__name__)
Binary files not shown.
121  venv/Lib/site-packages/sklearn/feature_extraction/tests/test_dict_vectorizer.py  Normal file

@@ -0,0 +1,121 @@
# Authors: Lars Buitinck
#          Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause

from random import Random
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal

import pytest

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2


@pytest.mark.parametrize('sparse', (True, False))
@pytest.mark.parametrize('dtype', (int, np.float32, np.int16))
@pytest.mark.parametrize('sort', (True, False))
@pytest.mark.parametrize('iterable', (True, False))
def test_dictvectorizer(sparse, dtype, sort, iterable):
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(iter(D) if iterable else D)

    assert sp.issparse(X) == sparse
    assert X.shape == (3, 5)
    assert X.sum() == 14
    assert v.inverse_transform(X) == D

    if sparse:
        # CSR matrices can't be compared for equality
        assert_array_equal(X.A, v.transform(iter(D) if iterable
                                            else D).A)
    else:
        assert_array_equal(X, v.transform(iter(D) if iterable
                                          else D))

    if sort:
        assert (v.feature_names_ ==
                sorted(v.feature_names_))


def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert v.get_feature_names() == ["useful1", "useful2"]


def test_one_of_k():
    D_in = [{"version": "1", "ham": 2},
            {"version": "2", "spam": .3},
            {"version=3": True, "spam": -1}]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    assert X.shape == (3, 5)

    D_out = v.inverse_transform(X)
    assert D_out[0] == {"version=1": 1, "ham": 2}

    names = v.get_feature_names()
    assert "version=2" in names
    assert "version" not in names


def test_unseen_or_no_features():
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as e:
            assert "empty" in str(e)


def test_deterministic_vocabulary():
    # Generate equal dictionaries with different memory layouts
    items = [("%03d" % i, i) for i in range(1000)]
    rng = Random(42)
    d_sorted = dict(items)
    rng.shuffle(items)
    d_shuffled = dict(items)

    # check that the memory layout does not impact the resulting vocabulary
    v_1 = DictVectorizer().fit([d_sorted])
    v_2 = DictVectorizer().fit([d_shuffled])

    assert v_1.vocabulary_ == v_2.vocabulary_


def test_n_features_in():
    # For vectorizers, n_features_in_ does not make sense and does not exist.
    dv = DictVectorizer()
    assert not hasattr(dv, 'n_features_in_')
    d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    dv.fit(d)
    assert not hasattr(dv, 'n_features_in_')
|
@ -0,0 +1,171 @@
|
|||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_equal
|
||||
import pytest
|
||||
|
||||
from sklearn.feature_extraction import FeatureHasher
|
||||
from sklearn.utils._testing import (ignore_warnings,
|
||||
fails_if_pypy)
|
||||
|
||||
pytestmark = fails_if_pypy
|
||||
|
||||
|
||||
def test_feature_hasher_dicts():
|
||||
h = FeatureHasher(n_features=16)
|
||||
assert "dict" == h.input_type
|
||||
|
||||
raw_X = [{"foo": "bar", "dada": 42, "tzara": 37},
|
||||
{"foo": "baz", "gaga": "string1"}]
|
||||
X1 = FeatureHasher(n_features=16).transform(raw_X)
|
||||
gen = (iter(d.items()) for d in raw_X)
|
||||
X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
|
||||
assert_array_equal(X1.toarray(), X2.toarray())
|
||||
|
||||
|
||||
def test_feature_hasher_strings():
|
||||
# mix byte and Unicode strings; note that "foo" is a duplicate in row 0
|
||||
raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
|
||||
["bar".encode("ascii"), "baz", "quux"]]
|
||||
|
||||
for lg_n_features in (7, 9, 11, 16, 22):
|
||||
n_features = 2 ** lg_n_features
|
||||
|
||||
it = (x for x in raw_X) # iterable
|
||||
|
||||
h = FeatureHasher(n_features=n_features, input_type="string",
|
||||
alternate_sign=False)
|
||||
X = h.transform(it)
|
||||
|
||||
assert X.shape[0] == len(raw_X)
|
||||
assert X.shape[1] == n_features
|
||||
|
||||
assert X[0].sum() == 4
|
||||
assert X[1].sum() == 3
|
||||
|
||||
assert X.nnz == 6
|
||||
|
||||
|
||||
def test_hashing_transform_seed():
|
||||
# check the influence of the seed when computing the hashes
|
||||
# import is here to avoid importing on pypy
|
||||
from sklearn.feature_extraction._hashing_fast import (
|
||||
transform as _hashing_transform)
|
||||
raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
|
||||
["bar".encode("ascii"), "baz", "quux"]]
|
||||
|
||||
raw_X_ = (((f, 1) for f in x) for x in raw_X)
|
||||
indices, indptr, _ = _hashing_transform(raw_X_, 2 ** 7, str,
|
||||
False)
|
||||
|
||||
raw_X_ = (((f, 1) for f in x) for x in raw_X)
|
||||
indices_0, indptr_0, _ = _hashing_transform(raw_X_, 2 ** 7, str,
|
||||
False, seed=0)
|
||||
assert_array_equal(indices, indices_0)
|
||||
assert_array_equal(indptr, indptr_0)
|
||||
|
||||
raw_X_ = (((f, 1) for f in x) for x in raw_X)
|
||||
indices_1, _, _ = _hashing_transform(raw_X_, 2 ** 7, str,
|
||||
False, seed=1)
|
||||
with pytest.raises(AssertionError):
|
||||
assert_array_equal(indices, indices_1)
|
||||
|
||||
|
||||
def test_feature_hasher_pairs():
|
||||
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": 2},
|
||||
{"baz": 3, "quux": 4, "foo": -1}])
|
||||
h = FeatureHasher(n_features=16, input_type="pair")
|
||||
x1, x2 = h.transform(raw_X).toarray()
|
||||
x1_nz = sorted(np.abs(x1[x1 != 0]))
|
||||
x2_nz = sorted(np.abs(x2[x2 != 0]))
|
||||
assert [1, 2] == x1_nz
|
||||
assert [1, 3, 4] == x2_nz
|
||||
|
||||
|
||||
def test_feature_hasher_pairs_with_string_values():
|
||||
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
|
||||
{"baz": "abc", "quux": 4, "foo": -1}])
|
||||
h = FeatureHasher(n_features=16, input_type="pair")
|
||||
x1, x2 = h.transform(raw_X).toarray()
|
||||
x1_nz = sorted(np.abs(x1[x1 != 0]))
|
||||
x2_nz = sorted(np.abs(x2[x2 != 0]))
|
||||
assert [1, 1] == x1_nz
|
||||
assert [1, 1, 4] == x2_nz
|
||||
|
||||
raw_X = (iter(d.items()) for d in [{"bax": "abc"},
|
||||
{"bax": "abc"}])
|
||||
x1, x2 = h.transform(raw_X).toarray()
|
||||
x1_nz = np.abs(x1[x1 != 0])
|
||||
x2_nz = np.abs(x2[x2 != 0])
|
||||
assert [1] == x1_nz
|
||||
assert [1] == x2_nz
|
||||
assert_array_equal(x1, x2)
|
||||
|
||||
|
||||
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))


def test_hasher_invalid_input():
    with pytest.raises(ValueError):
        FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        FeatureHasher(n_features=0)
    with pytest.raises(TypeError):
        FeatureHasher(n_features='ham')

    h = FeatureHasher(n_features=np.uint16(2 ** 6))
    with pytest.raises(ValueError):
        h.transform([])
    with pytest.raises(Exception):
        h.transform([[5.5]])
    with pytest.raises(Exception):
        h.transform([[None]])


def test_hasher_set_params():
    # Test delayed input validation in fit (useful for grid search).
    hasher = FeatureHasher()
    hasher.set_params(n_features=np.inf)
    with pytest.raises(TypeError):
        hasher.fit()


def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    X = FeatureHasher().transform([{'foo': 0}])
    assert X.data.shape == (0,)


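# Complementary sketch of the zeros test above: a zero-valued feature is
# skipped entirely while a nonzero one is materialized, so only true
# nonzeros reach the CSR buffers.
def test_hasher_zeros_vs_nonzeros_sketch():
    X = FeatureHasher().transform([{'foo': 0, 'bar': 1}])
    assert X.data.shape == (1,)

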
@ignore_warnings(category=FutureWarning)
def test_hasher_alternate_sign():
    X = [list("Thequickbrownfoxjumped")]

    Xt = FeatureHasher(alternate_sign=True,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() < 0 and Xt.data.max() > 0

    Xt = FeatureHasher(alternate_sign=False,
                       input_type='string').fit_transform(X)
    assert Xt.data.min() > 0


def test_hash_collisions():
    X = [list("Thequickbrownfoxjumped")]

    Xt = FeatureHasher(alternate_sign=True, n_features=1,
                       input_type='string').fit_transform(X)
    # check that some of the hashed tokens are added
    # with an opposite sign and cancel out
    assert abs(Xt.data[0]) < len(X[0])

    Xt = FeatureHasher(alternate_sign=False, n_features=1,
                       input_type='string').fit_transform(X)
    assert Xt.data[0] == len(X[0])


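# The contrast between the two asserts above is the point of alternate_sign:
# with a single bucket, signed hashing lets colliding tokens partially cancel
# (magnitude below the token count), while unsigned hashing accumulates them
# to exactly len(X[0]).

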
@ -0,0 +1,344 @@
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
#          Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause

import numpy as np
import scipy as sp
from scipy import ndimage
from scipy.sparse.csgraph import connected_components
import pytest

from sklearn.feature_extraction.image import (
    img_to_graph, grid_to_graph, extract_patches_2d,
    reconstruct_from_patches_2d, PatchExtractor, _extract_patches,
    extract_patches)
from sklearn.utils._testing import ignore_warnings


def test_img_to_graph():
    x, y = np.mgrid[:4, :4] - 10
    grad_x = img_to_graph(x)
    grad_y = img_to_graph(y)
    assert grad_x.nnz == grad_y.nnz
    # Negative elements are the diagonal: the elements of the original
    # image. Positive elements are the gradient values; they should be
    # identical in grad_x and grad_y
    np.testing.assert_array_equal(grad_x.data[grad_x.data > 0],
                                  grad_y.data[grad_y.data > 0])


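# Structure relied on above (sketch): img_to_graph stores one diagonal entry
# per pixel (the pixel value) plus one entry per direction of each
# 4-connected neighbour pair (the gradient). For a 2x2 image that is
# 4 + 2 * 4 = 12 stored values, assuming no zero pixel values.
def test_img_to_graph_nnz_sketch():
    img = np.arange(1, 5).reshape(2, 2)
    graph = img_to_graph(img)
    assert graph.shape == (4, 4)
    assert graph.nnz == 12

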
def test_grid_to_graph():
    # Checking that the function works with graphs containing no edges
    size = 2
    roi_size = 1
    # Generating two convex parts with one vertex
    # Thus, edges will be empty in _to_graph
    mask = np.zeros((size, size), dtype=bool)
    mask[0:roi_size, 0:roi_size] = True
    mask[-roi_size:, -roi_size:] = True
    mask = mask.reshape(size ** 2)
    A = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)
    assert connected_components(A)[0] == 2

    # Checking that the function works regardless of the mask dtype
    mask = np.ones((size, size), dtype=np.int16)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask)
    assert connected_components(A)[0] == 1

    # Checking dtype of the graph
    mask = np.ones((size, size))
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=bool)
    assert A.dtype == bool
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=int)
    assert A.dtype == int
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask,
                      dtype=np.float64)
    assert A.dtype == np.float64


@ignore_warnings(category=DeprecationWarning)  # scipy deprecation inside face
def test_connect_regions():
    try:
        face = sp.face(gray=True)
    except AttributeError:
        # Newer versions of scipy have face in misc
        from scipy import misc
        face = misc.face(gray=True)
    # subsample by 4 to reduce run time
    face = face[::4, ::4]
    for thr in (50, 150):
        mask = face > thr
        graph = img_to_graph(face, mask=mask)
        assert ndimage.label(mask)[1] == connected_components(graph)[0]


@ignore_warnings(category=DeprecationWarning)  # scipy deprecation inside face
def test_connect_regions_with_grid():
    try:
        face = sp.face(gray=True)
    except AttributeError:
        # Newer versions of scipy have face in misc
        from scipy import misc
        face = misc.face(gray=True)

    # subsample by 4 to reduce run time
    face = face[::4, ::4]

    mask = face > 50
    graph = grid_to_graph(*face.shape, mask=mask)
    assert ndimage.label(mask)[1] == connected_components(graph)[0]

    mask = face > 150
    graph = grid_to_graph(*face.shape, mask=mask, dtype=None)
    assert ndimage.label(mask)[1] == connected_components(graph)[0]


def _downsampled_face():
    try:
        face = sp.face(gray=True)
    except AttributeError:
        # Newer versions of scipy have face in misc
        from scipy import misc
        face = misc.face(gray=True)
    face = face.astype(np.float32)
    # sum 2x2 blocks twice, i.e. downsample by a factor of 4 in each
    # dimension; the division by 16 below turns the block sums into means
    face = (face[::2, ::2] + face[1::2, ::2] + face[::2, 1::2]
            + face[1::2, 1::2])
    face = (face[::2, ::2] + face[1::2, ::2] + face[::2, 1::2]
            + face[1::2, 1::2])
    face = face.astype(np.float32)
    face /= 16.0
    return face


def _orange_face(face=None):
    face = _downsampled_face() if face is None else face
    face_color = np.zeros(face.shape + (3,))
    face_color[:, :, 0] = 256 - face
    face_color[:, :, 1] = 256 - face / 2
    face_color[:, :, 2] = 256 - face / 4
    return face_color


def _make_images(face=None):
    face = _downsampled_face() if face is None else face
    # make a collection of faces
    images = np.zeros((3,) + face.shape)
    images[0] = face
    images[1] = face + 1
    images[2] = face + 2
    return images


downsampled_face = _downsampled_face()
orange_face = _orange_face(downsampled_face)
face_collection = _make_images(downsampled_face)


def test_extract_patches_all():
    face = downsampled_face
    i_h, i_w = face.shape
    p_h, p_w = 16, 16
    expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
    patches = extract_patches_2d(face, (p_h, p_w))
    assert patches.shape == (expected_n_patches, p_h, p_w)


def test_extract_patches_all_color():
    face = orange_face
    i_h, i_w = face.shape[:2]
    p_h, p_w = 16, 16
    expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
    patches = extract_patches_2d(face, (p_h, p_w))
    assert patches.shape == (expected_n_patches, p_h, p_w, 3)


def test_extract_patches_all_rect():
    face = downsampled_face
    face = face[:, 32:97]
    i_h, i_w = face.shape
    p_h, p_w = 16, 12
    expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)

    patches = extract_patches_2d(face, (p_h, p_w))
    assert patches.shape == (expected_n_patches, p_h, p_w)


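# The count used in all three tests above is the number of positions a
# p_h x p_w window can take when sliding one pixel at a time:
# (i_h - p_h + 1) * (i_w - p_w + 1). A tiny worked case:
def test_extract_patches_count_formula_sketch():
    image = np.arange(5 * 4).reshape(5, 4)
    patches = extract_patches_2d(image, (3, 2))
    assert patches.shape == ((5 - 3 + 1) * (4 - 2 + 1), 3, 2)  # 9 patches

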
def test_extract_patches_max_patches():
    face = downsampled_face
    i_h, i_w = face.shape
    p_h, p_w = 16, 16

    patches = extract_patches_2d(face, (p_h, p_w), max_patches=100)
    assert patches.shape == (100, p_h, p_w)

    expected_n_patches = int(0.5 * (i_h - p_h + 1) * (i_w - p_w + 1))
    patches = extract_patches_2d(face, (p_h, p_w), max_patches=0.5)
    assert patches.shape == (expected_n_patches, p_h, p_w)

    with pytest.raises(ValueError):
        extract_patches_2d(face, (p_h, p_w), max_patches=2.0)
    with pytest.raises(ValueError):
        extract_patches_2d(face, (p_h, p_w), max_patches=-1.0)


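# As exercised above, max_patches is read two ways: an int caps the absolute
# number of randomly sampled patches, a float in (0, 1) caps the fraction of
# all possible patches, and anything else raises ValueError.

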
def test_extract_patch_same_size_image():
    face = downsampled_face
    # Request patches of the same size as image
    # Should return just the single patch a.k.a. the image
    patches = extract_patches_2d(face, face.shape, max_patches=2)
    assert patches.shape[0] == 1


def test_extract_patches_less_than_max_patches():
    face = downsampled_face
    i_h, i_w = face.shape
    p_h, p_w = 3 * i_h // 4, 3 * i_w // 4
    # this is 3185
    expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)

    patches = extract_patches_2d(face, (p_h, p_w), max_patches=4000)
    assert patches.shape == (expected_n_patches, p_h, p_w)


def test_reconstruct_patches_perfect():
    face = downsampled_face
    p_h, p_w = 16, 16

    patches = extract_patches_2d(face, (p_h, p_w))
    face_reconstructed = reconstruct_from_patches_2d(patches, face.shape)
    np.testing.assert_array_almost_equal(face, face_reconstructed)


def test_reconstruct_patches_perfect_color():
    face = orange_face
    p_h, p_w = 16, 16

    patches = extract_patches_2d(face, (p_h, p_w))
    face_reconstructed = reconstruct_from_patches_2d(patches, face.shape)
    np.testing.assert_array_almost_equal(face, face_reconstructed)


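# Reconstruction is exact above because reconstruct_from_patches_2d averages
# all overlapping patch contributions at each pixel, and patches cut from a
# single image agree wherever they overlap. A smaller self-contained case:
def test_reconstruct_small_image_sketch():
    image = np.arange(36, dtype=np.float64).reshape(6, 6)
    patches = extract_patches_2d(image, (3, 3))
    np.testing.assert_array_almost_equal(
        image, reconstruct_from_patches_2d(patches, image.shape))

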
def test_patch_extractor_fit():
    faces = face_collection
    extr = PatchExtractor(patch_size=(8, 8), max_patches=100, random_state=0)
    assert extr == extr.fit(faces)


def test_patch_extractor_max_patches():
    faces = face_collection
    i_h, i_w = faces.shape[1:3]
    p_h, p_w = 8, 8

    max_patches = 100
    expected_n_patches = len(faces) * max_patches
    extr = PatchExtractor(patch_size=(p_h, p_w), max_patches=max_patches,
                          random_state=0)
    patches = extr.transform(faces)
    assert patches.shape == (expected_n_patches, p_h, p_w)

    max_patches = 0.5
    expected_n_patches = len(faces) * int((i_h - p_h + 1) * (i_w - p_w + 1)
                                          * max_patches)
    extr = PatchExtractor(patch_size=(p_h, p_w), max_patches=max_patches,
                          random_state=0)
    patches = extr.transform(faces)
    assert patches.shape == (expected_n_patches, p_h, p_w)


def test_patch_extractor_max_patches_default():
    faces = face_collection
    extr = PatchExtractor(max_patches=100, random_state=0)
    patches = extr.transform(faces)
    assert patches.shape == (len(faces) * 100, 19, 25)


def test_patch_extractor_all_patches():
    faces = face_collection
    i_h, i_w = faces.shape[1:3]
    p_h, p_w = 8, 8
    expected_n_patches = len(faces) * (i_h - p_h + 1) * (i_w - p_w + 1)
    extr = PatchExtractor(patch_size=(p_h, p_w), random_state=0)
    patches = extr.transform(faces)
    assert patches.shape == (expected_n_patches, p_h, p_w)


def test_patch_extractor_color():
    faces = _make_images(orange_face)
    i_h, i_w = faces.shape[1:3]
    p_h, p_w = 8, 8
    expected_n_patches = len(faces) * (i_h - p_h + 1) * (i_w - p_w + 1)
    extr = PatchExtractor(patch_size=(p_h, p_w), random_state=0)
    patches = extr.transform(faces)
    assert patches.shape == (expected_n_patches, p_h, p_w, 3)


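# The (19, 25) in the default test above appears to follow from
# PatchExtractor falling back to one tenth of each image dimension when
# patch_size is not given: the downsampled face is 192 x 256, and
# 192 // 10 == 19, 256 // 10 == 25.

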
def test_extract_patches_strided():

    image_shapes_1D = [(10,), (10,), (11,), (10,)]
    patch_sizes_1D = [(1,), (2,), (3,), (8,)]
    patch_steps_1D = [(1,), (1,), (4,), (2,)]

    expected_views_1D = [(10,), (9,), (3,), (2,)]
    last_patch_1D = [(10,), (8,), (8,), (2,)]

    image_shapes_2D = [(10, 20), (10, 20), (10, 20), (11, 20)]
    patch_sizes_2D = [(2, 2), (10, 10), (10, 11), (6, 6)]
    patch_steps_2D = [(5, 5), (3, 10), (3, 4), (4, 2)]

    expected_views_2D = [(2, 4), (1, 2), (1, 3), (2, 8)]
    last_patch_2D = [(5, 15), (0, 10), (0, 8), (4, 14)]

    image_shapes_3D = [(5, 4, 3), (3, 3, 3), (7, 8, 9), (7, 8, 9)]
    patch_sizes_3D = [(2, 2, 3), (2, 2, 2), (1, 7, 3), (1, 3, 3)]
    patch_steps_3D = [(1, 2, 10), (1, 1, 1), (2, 1, 3), (3, 3, 4)]

    expected_views_3D = [(4, 2, 1), (2, 2, 2), (4, 2, 3), (3, 2, 2)]
    last_patch_3D = [(3, 2, 0), (1, 1, 1), (6, 1, 6), (6, 3, 4)]

    image_shapes = image_shapes_1D + image_shapes_2D + image_shapes_3D
    patch_sizes = patch_sizes_1D + patch_sizes_2D + patch_sizes_3D
    patch_steps = patch_steps_1D + patch_steps_2D + patch_steps_3D
    expected_views = expected_views_1D + expected_views_2D + expected_views_3D
    last_patches = last_patch_1D + last_patch_2D + last_patch_3D

    for (image_shape, patch_size, patch_step, expected_view,
         last_patch) in zip(image_shapes, patch_sizes, patch_steps,
                            expected_views, last_patches):
        image = np.arange(np.prod(image_shape)).reshape(image_shape)
        patches = _extract_patches(image, patch_shape=patch_size,
                                   extraction_step=patch_step)

        ndim = len(image_shape)

        assert patches.shape[:ndim] == expected_view
        last_patch_slices = tuple(slice(i, i + j, None) for i, j in
                                  zip(last_patch, patch_size))
        assert (patches[(-1, None, None) * ndim] ==
                image[last_patch_slices].squeeze()).all()


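# With a step, the per-dimension patch count above generalises to
# (image_size - patch_size) // step + 1; e.g. the 1D row with image 11,
# patch 3, step 4 gives (11 - 3) // 4 + 1 == 3 views. A direct check:
def test_extract_patches_strided_formula_sketch():
    patches = _extract_patches(np.arange(11), patch_shape=(3,),
                               extraction_step=(4,))
    assert patches.shape[:1] == ((11 - 3) // 4 + 1,)

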
def test_extract_patches_square():
    # test same patch size for all dimensions
    face = downsampled_face
    i_h, i_w = face.shape
    p = 8
    expected_n_patches = ((i_h - p + 1), (i_w - p + 1))
    patches = _extract_patches(face, patch_shape=p)
    assert patches.shape == (expected_n_patches[0],
                             expected_n_patches[1], p, p)


def test_width_patch():
    # width and height of the patch should be less than the image
    x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    with pytest.raises(ValueError):
        extract_patches_2d(x, (4, 1))
    with pytest.raises(ValueError):
        extract_patches_2d(x, (1, 4))


# TODO: Remove in 0.24
def test_extract_patches_deprecated():
    msg = ("The function feature_extraction.image.extract_patches has been "
           "deprecated in 0.22 and will be removed in 0.24.")
    with pytest.warns(FutureWarning, match=msg):
        extract_patches(downsampled_face)

1374 venv/Lib/site-packages/sklearn/feature_extraction/tests/test_text.py (new file; diff suppressed because it is too large)
1884 venv/Lib/site-packages/sklearn/feature_extraction/text.py (new file; diff suppressed because it is too large)