Uploaded Test files

This commit is contained in:
Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

View file

@ -0,0 +1,13 @@
"""
The :mod:`sklearn.feature_extraction` module deals with feature extraction
from raw data. It currently includes methods to extract features from text and
images.
"""
from ._dict_vectorizer import DictVectorizer
from ._hash import FeatureHasher
from .image import img_to_graph, grid_to_graph
from . import text
__all__ = ['DictVectorizer', 'image', 'img_to_graph', 'grid_to_graph', 'text',
'FeatureHasher']

View file

@ -0,0 +1,364 @@
# Authors: Lars Buitinck
# Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause
from array import array
from collections.abc import Mapping
from operator import itemgetter
import numpy as np
import scipy.sparse as sp
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array, tosequence
from ..utils.validation import _deprecate_positional_args
def _tosequence(X):
"""Turn X into a sequence or ndarray, avoiding a copy if possible."""
if isinstance(X, Mapping): # single sample
return [X]
else:
return tosequence(X)
class DictVectorizer(TransformerMixin, BaseEstimator):
"""Transforms lists of feature-value mappings to vectors.
This transformer turns lists of mappings (dict-like objects) of feature
names to feature values into Numpy arrays or scipy.sparse matrices for use
with scikit-learn estimators.
When feature values are strings, this transformer will do a binary one-hot
(aka one-of-K) coding: one boolean-valued feature is constructed for each
of the possible string values that the feature can take on. For instance,
a feature "f" that can take on the values "ham" and "spam" will become two
features in the output, one signifying "f=ham", the other "f=spam".
However, note that this transformer will only do a binary one-hot encoding
when feature values are of type string. If categorical features are
represented as numeric values such as int, the DictVectorizer can be
followed by :class:`sklearn.preprocessing.OneHotEncoder` to complete
binary one-hot encoding.
Features that do not occur in a sample (mapping) will have a zero value
in the resulting array/matrix.
Read more in the :ref:`User Guide <dict_feature_extraction>`.
Parameters
----------
dtype : dtype, default=np.float64
The type of feature values. Passed to Numpy array/scipy.sparse matrix
constructors as the dtype argument.
separator : str, default="="
Separator string used when constructing new features for one-hot
coding.
sparse : bool, default=True
Whether transform should produce scipy.sparse matrices.
sort : bool, default=True
Whether ``feature_names_`` and ``vocabulary_`` should be
sorted when fitting.
Attributes
----------
vocabulary_ : dict
A dictionary mapping feature names to feature indices.
feature_names_ : list
A list of length n_features containing the feature names (e.g., "f=ham"
and "f=spam").
Examples
--------
>>> from sklearn.feature_extraction import DictVectorizer
>>> v = DictVectorizer(sparse=False)
>>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
>>> X = v.fit_transform(D)
>>> X
array([[2., 0., 1.],
[0., 1., 3.]])
>>> v.inverse_transform(X) == \
[{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]
True
>>> v.transform({'foo': 4, 'unseen_feature': 3})
array([[0., 0., 4.]])
See also
--------
FeatureHasher : performs vectorization using only a hash function.
sklearn.preprocessing.OrdinalEncoder : handles nominal/categorical
features encoded as columns of arbitrary data types.
"""
@_deprecate_positional_args
def __init__(self, *, dtype=np.float64, separator="=", sparse=True,
sort=True):
self.dtype = dtype
self.separator = separator
self.sparse = sparse
self.sort = sort
def fit(self, X, y=None):
"""Learn a list of feature name -> indices mappings.
Parameters
----------
X : Mapping or iterable over Mappings
Dict(s) or Mapping(s) from feature names (arbitrary Python
objects) to feature values (strings or convertible to dtype).
y : (ignored)
Returns
-------
self
"""
feature_names = []
vocab = {}
for x in X:
for f, v in x.items():
if isinstance(v, str):
f = "%s%s%s" % (f, self.separator, v)
if f not in vocab:
feature_names.append(f)
vocab[f] = len(vocab)
if self.sort:
feature_names.sort()
vocab = {f: i for i, f in enumerate(feature_names)}
self.feature_names_ = feature_names
self.vocabulary_ = vocab
return self
def _transform(self, X, fitting):
# Sanity check: Python's array has no way of explicitly requesting the
# signed 32-bit integers that scipy.sparse needs, so we use the next
# best thing: typecode "i" (int). However, if that gives larger or
# smaller integers than 32-bit ones, np.frombuffer screws up.
assert array("i").itemsize == 4, (
"sizeof(int) != 4 on your platform; please report this at"
" https://github.com/scikit-learn/scikit-learn/issues and"
" include the output from platform.platform() in your bug report")
dtype = self.dtype
if fitting:
feature_names = []
vocab = {}
else:
feature_names = self.feature_names_
vocab = self.vocabulary_
# Process everything as sparse regardless of setting
X = [X] if isinstance(X, Mapping) else X
indices = array("i")
indptr = [0]
# XXX we could change values to an array.array as well, but it
# would require (heuristic) conversion of dtype to typecode...
values = []
# collect all the possible feature names and build sparse matrix at
# same time
for x in X:
for f, v in x.items():
if isinstance(v, str):
f = "%s%s%s" % (f, self.separator, v)
v = 1
if f in vocab:
indices.append(vocab[f])
values.append(dtype(v))
else:
if fitting:
feature_names.append(f)
vocab[f] = len(vocab)
indices.append(vocab[f])
values.append(dtype(v))
indptr.append(len(indices))
if len(indptr) == 1:
raise ValueError("Sample sequence X is empty.")
indices = np.frombuffer(indices, dtype=np.intc)
shape = (len(indptr) - 1, len(vocab))
result_matrix = sp.csr_matrix((values, indices, indptr),
shape=shape, dtype=dtype)
# Sort everything if asked
if fitting and self.sort:
feature_names.sort()
map_index = np.empty(len(feature_names), dtype=np.int32)
for new_val, f in enumerate(feature_names):
map_index[new_val] = vocab[f]
vocab[f] = new_val
result_matrix = result_matrix[:, map_index]
if self.sparse:
result_matrix.sort_indices()
else:
result_matrix = result_matrix.toarray()
if fitting:
self.feature_names_ = feature_names
self.vocabulary_ = vocab
return result_matrix
def fit_transform(self, X, y=None):
"""Learn a list of feature name -> indices mappings and transform X.
Like fit(X) followed by transform(X), but does not require
materializing X in memory.
Parameters
----------
X : Mapping or iterable over Mappings
Dict(s) or Mapping(s) from feature names (arbitrary Python
objects) to feature values (strings or convertible to dtype).
y : (ignored)
Returns
-------
Xa : {array, sparse matrix}
Feature vectors; always 2-d.
"""
return self._transform(X, fitting=True)
def inverse_transform(self, X, dict_type=dict):
"""Transform array or sparse matrix X back to feature mappings.
X must have been produced by this DictVectorizer's transform or
fit_transform method; it may only have passed through transformers
that preserve the number of features and their order.
In the case of one-hot/one-of-K coding, the constructed feature
names and values are returned rather than the original ones.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Sample matrix.
dict_type : type, default=dict
Constructor for feature mappings. Must conform to the
collections.Mapping API.
Returns
-------
D : list of dict_type objects of shape (n_samples,)
Feature mappings for the samples in X.
"""
# COO matrix is not subscriptable
X = check_array(X, accept_sparse=['csr', 'csc'])
n_samples = X.shape[0]
names = self.feature_names_
dicts = [dict_type() for _ in range(n_samples)]
if sp.issparse(X):
for i, j in zip(*X.nonzero()):
dicts[i][names[j]] = X[i, j]
else:
for i, d in enumerate(dicts):
for j, v in enumerate(X[i, :]):
if v != 0:
d[names[j]] = X[i, j]
return dicts
def transform(self, X):
"""Transform feature->value dicts to array or sparse matrix.
Named features not encountered during fit or fit_transform will be
silently ignored.
Parameters
----------
X : Mapping or iterable over Mappings of shape (n_samples,)
Dict(s) or Mapping(s) from feature names (arbitrary Python
objects) to feature values (strings or convertible to dtype).
Returns
-------
Xa : {array, sparse matrix}
Feature vectors; always 2-d.
"""
if self.sparse:
return self._transform(X, fitting=False)
else:
dtype = self.dtype
vocab = self.vocabulary_
X = _tosequence(X)
Xa = np.zeros((len(X), len(vocab)), dtype=dtype)
for i, x in enumerate(X):
for f, v in x.items():
if isinstance(v, str):
f = "%s%s%s" % (f, self.separator, v)
v = 1
try:
Xa[i, vocab[f]] = dtype(v)
except KeyError:
pass
return Xa
def get_feature_names(self):
"""Returns a list of feature names, ordered by their indices.
If one-of-K coding is applied to categorical features, this will
include the constructed feature names but not the original ones.
"""
return self.feature_names_
def restrict(self, support, indices=False):
"""Restrict the features to those in support using feature selection.
This function modifies the estimator in-place.
Parameters
----------
support : array-like
Boolean mask or list of indices (as returned by the get_support
member of feature selectors).
indices : bool, default=False
Whether support is a list of indices.
Returns
-------
self
Examples
--------
>>> from sklearn.feature_extraction import DictVectorizer
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> v = DictVectorizer()
>>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
>>> X = v.fit_transform(D)
>>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
>>> v.get_feature_names()
['bar', 'baz', 'foo']
>>> v.restrict(support.get_support())
DictVectorizer()
>>> v.get_feature_names()
['bar', 'foo']
"""
if not indices:
support = np.where(support)[0]
names = self.feature_names_
new_vocab = {}
for i in support:
new_vocab[names[i]] = len(new_vocab)
self.vocabulary_ = new_vocab
self.feature_names_ = [f for f, i in sorted(new_vocab.items(),
key=itemgetter(1))]
return self
def _more_tags(self):
return {'X_types': ["dict"]}

View file

@ -0,0 +1,173 @@
# Author: Lars Buitinck
# License: BSD 3 clause
import numbers
import numpy as np
import scipy.sparse as sp
from ..utils import IS_PYPY
from ..utils.validation import _deprecate_positional_args
from ..base import BaseEstimator, TransformerMixin
if not IS_PYPY:
from ._hashing_fast import transform as _hashing_transform
else:
def _hashing_transform(*args, **kwargs):
raise NotImplementedError(
'FeatureHasher is not compatible with PyPy (see '
'https://github.com/scikit-learn/scikit-learn/issues/11540 '
'for the status updates).')
def _iteritems(d):
"""Like d.iteritems, but accepts any collections.Mapping."""
return d.iteritems() if hasattr(d, "iteritems") else d.items()
class FeatureHasher(TransformerMixin, BaseEstimator):
"""Implements feature hashing, aka the hashing trick.
This class turns sequences of symbolic feature names (strings) into
scipy.sparse matrices, using a hash function to compute the matrix column
corresponding to a name. The hash function employed is the signed 32-bit
version of Murmurhash3.
Feature names of type byte string are used as-is. Unicode strings are
converted to UTF-8 first, but no Unicode normalization is done.
Feature values must be (finite) numbers.
This class is a low-memory alternative to DictVectorizer and
CountVectorizer, intended for large-scale (online) learning and situations
where memory is tight, e.g. when running prediction code on embedded
devices.
Read more in the :ref:`User Guide <feature_hashing>`.
.. versionadded:: 0.13
Parameters
----------
n_features : int, default=2**20
The number of features (columns) in the output matrices. Small numbers
of features are likely to cause hash collisions, but large numbers
will cause larger coefficient dimensions in linear learners.
input_type : {"dict", "pair"}, default="dict"
Either "dict" (the default) to accept dictionaries over
(feature_name, value); "pair" to accept pairs of (feature_name, value);
or "string" to accept single strings.
feature_name should be a string, while value should be a number.
In the case of "string", a value of 1 is implied.
The feature_name is hashed to find the appropriate column for the
feature. The value's sign might be flipped in the output (but see
non_negative, below).
dtype : numpy dtype, default=np.float64
The type of feature values. Passed to scipy.sparse matrix constructors
as the dtype argument. Do not set this to bool, np.boolean or any
unsigned integer type.
alternate_sign : bool, default=True
When True, an alternating sign is added to the features as to
approximately conserve the inner product in the hashed space even for
small n_features. This approach is similar to sparse random projection.
.. versionchanged:: 0.19
``alternate_sign`` replaces the now deprecated ``non_negative``
parameter.
Examples
--------
>>> from sklearn.feature_extraction import FeatureHasher
>>> h = FeatureHasher(n_features=10)
>>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
>>> f = h.transform(D)
>>> f.toarray()
array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.],
[ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]])
See also
--------
DictVectorizer : vectorizes string-valued features using a hash table.
sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features.
"""
@_deprecate_positional_args
def __init__(self, n_features=(2 ** 20), *, input_type="dict",
dtype=np.float64, alternate_sign=True):
self._validate_params(n_features, input_type)
self.dtype = dtype
self.input_type = input_type
self.n_features = n_features
self.alternate_sign = alternate_sign
@staticmethod
def _validate_params(n_features, input_type):
# strangely, np.int16 instances are not instances of Integral,
# while np.int64 instances are...
if not isinstance(n_features, numbers.Integral):
raise TypeError("n_features must be integral, got %r (%s)."
% (n_features, type(n_features)))
elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1:
raise ValueError("Invalid number of features (%d)." % n_features)
if input_type not in ("dict", "pair", "string"):
raise ValueError("input_type must be 'dict', 'pair' or 'string',"
" got %r." % input_type)
def fit(self, X=None, y=None):
"""No-op.
This method doesn't do anything. It exists purely for compatibility
with the scikit-learn transformer API.
Parameters
----------
X : ndarray
Returns
-------
self : FeatureHasher
"""
# repeat input validation for grid search (which calls set_params)
self._validate_params(self.n_features, self.input_type)
return self
def transform(self, raw_X):
"""Transform a sequence of instances to a scipy.sparse matrix.
Parameters
----------
raw_X : iterable over iterable over raw features, length = n_samples
Samples. Each sample must be iterable an (e.g., a list or tuple)
containing/generating feature names (and optionally values, see
the input_type constructor argument) which will be hashed.
raw_X need not support the len function, so it can be the result
of a generator; n_samples is determined on the fly.
Returns
-------
X : sparse matrix of shape (n_samples, n_features)
Feature matrix, for use with estimators or further transformers.
"""
raw_X = iter(raw_X)
if self.input_type == "dict":
raw_X = (_iteritems(d) for d in raw_X)
elif self.input_type == "string":
raw_X = (((f, 1) for f in x) for x in raw_X)
indices, indptr, values = \
_hashing_transform(raw_X, self.n_features, self.dtype,
self.alternate_sign, seed=0)
n_samples = indptr.shape[0] - 1
if n_samples == 0:
raise ValueError("Cannot vectorize empty sequence.")
X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
shape=(n_samples, self.n_features))
X.sum_duplicates() # also sorts the indices
return X
def _more_tags(self):
return {'X_types': [self.input_type]}

View file

@ -0,0 +1,45 @@
# This list of English stop words is taken from the "Glasgow Information
# Retrieval Group". The original list can be found at
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
ENGLISH_STOP_WORDS = frozenset([
"a", "about", "above", "across", "after", "afterwards", "again", "against",
"all", "almost", "alone", "along", "already", "also", "although", "always",
"am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
"any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
"around", "as", "at", "back", "be", "became", "because", "become",
"becomes", "becoming", "been", "before", "beforehand", "behind", "being",
"below", "beside", "besides", "between", "beyond", "bill", "both",
"bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
"could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
"down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
"elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
"everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
"find", "fire", "first", "five", "for", "former", "formerly", "forty",
"found", "four", "from", "front", "full", "further", "get", "give", "go",
"had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
"hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
"how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
"interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
"latterly", "least", "less", "ltd", "made", "many", "may", "me",
"meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
"move", "much", "must", "my", "myself", "name", "namely", "neither",
"never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
"nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
"ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
"please", "put", "rather", "re", "same", "see", "seem", "seemed",
"seeming", "seems", "serious", "several", "she", "should", "show", "side",
"since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
"something", "sometime", "sometimes", "somewhere", "still", "such",
"system", "take", "ten", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby",
"therefore", "therein", "thereupon", "these", "they", "thick", "thin",
"third", "this", "those", "though", "three", "through", "throughout",
"thru", "thus", "to", "together", "too", "top", "toward", "towards",
"twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
"very", "via", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
"who", "whoever", "whole", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _dict_vectorizer # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.feature_extraction.dict_vectorizer'
correct_import_path = 'sklearn.feature_extraction'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_dict_vectorizer, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _hash # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.feature_extraction.hashing'
correct_import_path = 'sklearn.feature_extraction'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_hash, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,591 @@
"""
The :mod:`sklearn.feature_extraction.image` submodule gathers utilities to
extract features from images.
"""
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Olivier Grisel
# Vlad Niculae
# License: BSD 3 clause
from itertools import product
import numbers
import numpy as np
from scipy import sparse
from numpy.lib.stride_tricks import as_strided
from ..utils import check_array, check_random_state, deprecated
from ..utils.validation import _deprecate_positional_args
from ..base import BaseEstimator
__all__ = ['PatchExtractor',
'extract_patches_2d',
'grid_to_graph',
'img_to_graph',
'reconstruct_from_patches_2d']
###############################################################################
# From an image to a graph
def _make_edges_3d(n_x, n_y, n_z=1):
"""Returns a list of edges for a 3D image.
Parameters
----------
n_x : int
The size of the grid in the x direction.
n_y : int
The size of the grid in the y direction.
n_z : integer, default=1
The size of the grid in the z direction, defaults to 1
"""
vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z))
edges_deep = np.vstack((vertices[:, :, :-1].ravel(),
vertices[:, :, 1:].ravel()))
edges_right = np.vstack((vertices[:, :-1].ravel(),
vertices[:, 1:].ravel()))
edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel()))
edges = np.hstack((edges_deep, edges_right, edges_down))
return edges
def _compute_gradient_3d(edges, img):
_, n_y, n_z = img.shape
gradient = np.abs(img[edges[0] // (n_y * n_z),
(edges[0] % (n_y * n_z)) // n_z,
(edges[0] % (n_y * n_z)) % n_z] -
img[edges[1] // (n_y * n_z),
(edges[1] % (n_y * n_z)) // n_z,
(edges[1] % (n_y * n_z)) % n_z])
return gradient
# XXX: Why mask the image after computing the weights?
def _mask_edges_weights(mask, edges, weights=None):
"""Apply a mask to edges (weighted or not)"""
inds = np.arange(mask.size)
inds = inds[mask.ravel()]
ind_mask = np.logical_and(np.in1d(edges[0], inds),
np.in1d(edges[1], inds))
edges = edges[:, ind_mask]
if weights is not None:
weights = weights[ind_mask]
if len(edges.ravel()):
maxval = edges.max()
else:
maxval = 0
order = np.searchsorted(np.unique(edges.ravel()), np.arange(maxval + 1))
edges = order[edges]
if weights is None:
return edges
else:
return edges, weights
def _to_graph(n_x, n_y, n_z, mask=None, img=None,
return_as=sparse.coo_matrix, dtype=None):
"""Auxiliary function for img_to_graph and grid_to_graph
"""
edges = _make_edges_3d(n_x, n_y, n_z)
if dtype is None:
if img is None:
dtype = np.int
else:
dtype = img.dtype
if img is not None:
img = np.atleast_3d(img)
weights = _compute_gradient_3d(edges, img)
if mask is not None:
edges, weights = _mask_edges_weights(mask, edges, weights)
diag = img.squeeze()[mask]
else:
diag = img.ravel()
n_voxels = diag.size
else:
if mask is not None:
mask = mask.astype(dtype=np.bool, copy=False)
mask = np.asarray(mask, dtype=np.bool)
edges = _mask_edges_weights(mask, edges)
n_voxels = np.sum(mask)
else:
n_voxels = n_x * n_y * n_z
weights = np.ones(edges.shape[1], dtype=dtype)
diag = np.ones(n_voxels, dtype=dtype)
diag_idx = np.arange(n_voxels)
i_idx = np.hstack((edges[0], edges[1]))
j_idx = np.hstack((edges[1], edges[0]))
graph = sparse.coo_matrix((np.hstack((weights, weights, diag)),
(np.hstack((i_idx, diag_idx)),
np.hstack((j_idx, diag_idx)))),
(n_voxels, n_voxels),
dtype=dtype)
if return_as is np.ndarray:
return graph.toarray()
return return_as(graph)
@_deprecate_positional_args
def img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None):
"""Graph of the pixel-to-pixel gradient connections
Edges are weighted with the gradient values.
Read more in the :ref:`User Guide <image_feature_extraction>`.
Parameters
----------
img : ndarray of shape (height, width) or (height, width, channel)
2D or 3D image.
mask : ndarray of shape (height, width) or \
(height, width, channel), dtype=bool, default=None
An optional mask of the image, to consider only part of the
pixels.
return_as : np.ndarray or a sparse matrix class, \
default=sparse.coo_matrix
The class to use to build the returned adjacency matrix.
dtype : dtype, default=None
The data of the returned sparse matrix. By default it is the
dtype of img
Notes
-----
For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was
handled by returning a dense np.matrix instance. Going forward, np.ndarray
returns an np.ndarray, as expected.
For compatibility, user code relying on this method should wrap its
calls in ``np.asarray`` to avoid type issues.
"""
img = np.atleast_3d(img)
n_x, n_y, n_z = img.shape
return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype)
@_deprecate_positional_args
def grid_to_graph(n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix,
dtype=np.int):
"""Graph of the pixel-to-pixel connections
Edges exist if 2 voxels are connected.
Parameters
----------
n_x : int
Dimension in x axis
n_y : int
Dimension in y axis
n_z : int, default=1
Dimension in z axis
mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None
An optional mask of the image, to consider only part of the
pixels.
return_as : np.ndarray or a sparse matrix class, \
default=sparse.coo_matrix
The class to use to build the returned adjacency matrix.
dtype : dtype, default=int
The data of the returned sparse matrix. By default it is int
Notes
-----
For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was
handled by returning a dense np.matrix instance. Going forward, np.ndarray
returns an np.ndarray, as expected.
For compatibility, user code relying on this method should wrap its
calls in ``np.asarray`` to avoid type issues.
"""
return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as,
dtype=dtype)
###############################################################################
# From an image to a set of small image patches
def _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):
"""Compute the number of patches that will be extracted in an image.
Read more in the :ref:`User Guide <image_feature_extraction>`.
Parameters
----------
i_h : int
The image height
i_w : int
The image with
p_h : int
The height of a patch
p_w : int
The width of a patch
max_patches : int or float, default=None
The maximum number of patches to extract. If max_patches is a float
between 0 and 1, it is taken to be a proportion of the total number
of patches.
"""
n_h = i_h - p_h + 1
n_w = i_w - p_w + 1
all_patches = n_h * n_w
if max_patches:
if (isinstance(max_patches, (numbers.Integral))
and max_patches < all_patches):
return max_patches
elif (isinstance(max_patches, (numbers.Integral))
and max_patches >= all_patches):
return all_patches
elif (isinstance(max_patches, (numbers.Real))
and 0 < max_patches < 1):
return int(max_patches * all_patches)
else:
raise ValueError("Invalid value for max_patches: %r" % max_patches)
else:
return all_patches
def _extract_patches(arr, patch_shape=8, extraction_step=1):
"""Extracts patches of any n-dimensional array in place using strides.
Given an n-dimensional array it will return a 2n-dimensional array with
the first n dimensions indexing patch position and the last n indexing
the patch content. This operation is immediate (O(1)). A reshape
performed on the first n dimensions will cause numpy to copy data, leading
to a list of extracted patches.
Read more in the :ref:`User Guide <image_feature_extraction>`.
Parameters
----------
arr : ndarray
n-dimensional array of which patches are to be extracted
patch_shape : int or tuple of length arr.ndim.default=8
Indicates the shape of the patches to be extracted. If an
integer is given, the shape will be a hypercube of
sidelength given by its value.
extraction_step : int or tuple of length arr.ndim, default=1
Indicates step size at which extraction shall be performed.
If integer is given, then the step is uniform in all dimensions.
Returns
-------
patches : strided ndarray
2n-dimensional array indexing patches on first n dimensions and
containing patches on the last n dimensions. These dimensions
are fake, but this way no data is copied. A simple reshape invokes
a copying operation to obtain a list of patches:
result.reshape([-1] + list(patch_shape))
"""
arr_ndim = arr.ndim
if isinstance(patch_shape, numbers.Number):
patch_shape = tuple([patch_shape] * arr_ndim)
if isinstance(extraction_step, numbers.Number):
extraction_step = tuple([extraction_step] * arr_ndim)
patch_strides = arr.strides
slices = tuple(slice(None, None, st) for st in extraction_step)
indexing_strides = arr[slices].strides
patch_indices_shape = ((np.array(arr.shape) - np.array(patch_shape)) //
np.array(extraction_step)) + 1
shape = tuple(list(patch_indices_shape) + list(patch_shape))
strides = tuple(list(indexing_strides) + list(patch_strides))
patches = as_strided(arr, shape=shape, strides=strides)
return patches
@deprecated("The function feature_extraction.image.extract_patches has been "
"deprecated in 0.22 and will be removed in 0.24.")
def extract_patches(arr, patch_shape=8, extraction_step=1):
"""Extracts patches of any n-dimensional array in place using strides.
Given an n-dimensional array it will return a 2n-dimensional array with
the first n dimensions indexing patch position and the last n indexing
the patch content. This operation is immediate (O(1)). A reshape
performed on the first n dimensions will cause numpy to copy data, leading
to a list of extracted patches.
Read more in the :ref:`User Guide <image_feature_extraction>`.
Parameters
----------
arr : ndarray
n-dimensional array of which patches are to be extracted
patch_shape : int or tuple of length arr.ndim, default=8
Indicates the shape of the patches to be extracted. If an
integer is given, the shape will be a hypercube of
sidelength given by its value.
extraction_step : int or tuple of length arr.ndim, default=1
Indicates step size at which extraction shall be performed.
If integer is given, then the step is uniform in all dimensions.
Returns
-------
patches : strided ndarray
2n-dimensional array indexing patches on first n dimensions and
containing patches on the last n dimensions. These dimensions
are fake, but this way no data is copied. A simple reshape invokes
a copying operation to obtain a list of patches:
result.reshape([-1] + list(patch_shape))
"""
return _extract_patches(arr, patch_shape=patch_shape,
extraction_step=extraction_step)
@_deprecate_positional_args
def extract_patches_2d(image, patch_size, *, max_patches=None,
random_state=None):
"""Reshape a 2D image into a collection of patches
The resulting patches are allocated in a dedicated array.
Read more in the :ref:`User Guide <image_feature_extraction>`.
Parameters
----------
image : ndarray of shape (image_height, image_width) or \
(image_height, image_width, n_channels)
The original image data. For color images, the last dimension specifies
the channel: a RGB image would have `n_channels=3`.
patch_size : tuple of int (patch_height, patch_width)
The dimensions of one patch.
max_patches : int or float, default=None
The maximum number of patches to extract. If `max_patches` is a float
between 0 and 1, it is taken to be a proportion of the total number
of patches.
random_state : int, RandomState instance, default=None
Determines the random number generator used for random sampling when
`max_patches` is not None. Use an int to make the randomness
deterministic.
See :term:`Glossary <random_state>`.
Returns
-------
patches : array of shape (n_patches, patch_height, patch_width) or \
(n_patches, patch_height, patch_width, n_channels)
The collection of patches extracted from the image, where `n_patches`
is either `max_patches` or the total number of patches that can be
extracted.
Examples
--------
>>> from sklearn.datasets import load_sample_image
>>> from sklearn.feature_extraction import image
>>> # Use the array data from the first image in this dataset:
>>> one_image = load_sample_image("china.jpg")
>>> print('Image shape: {}'.format(one_image.shape))
Image shape: (427, 640, 3)
>>> patches = image.extract_patches_2d(one_image, (2, 2))
>>> print('Patches shape: {}'.format(patches.shape))
Patches shape: (272214, 2, 2, 3)
>>> # Here are just two of these patches:
>>> print(patches[1])
[[[174 201 231]
[174 201 231]]
[[173 200 230]
[173 200 230]]]
>>> print(patches[800])
[[[187 214 243]
[188 215 244]]
[[187 214 243]
[188 215 244]]]
"""
i_h, i_w = image.shape[:2]
p_h, p_w = patch_size
if p_h > i_h:
raise ValueError("Height of the patch should be less than the height"
" of the image.")
if p_w > i_w:
raise ValueError("Width of the patch should be less than the width"
" of the image.")
image = check_array(image, allow_nd=True)
image = image.reshape((i_h, i_w, -1))
n_colors = image.shape[-1]
extracted_patches = _extract_patches(image,
patch_shape=(p_h, p_w, n_colors),
extraction_step=1)
n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)
if max_patches:
rng = check_random_state(random_state)
i_s = rng.randint(i_h - p_h + 1, size=n_patches)
j_s = rng.randint(i_w - p_w + 1, size=n_patches)
patches = extracted_patches[i_s, j_s, 0]
else:
patches = extracted_patches
patches = patches.reshape(-1, p_h, p_w, n_colors)
# remove the color dimension if useless
if patches.shape[-1] == 1:
return patches.reshape((n_patches, p_h, p_w))
else:
return patches
def reconstruct_from_patches_2d(patches, image_size):
"""Reconstruct the image from all of its patches.
Patches are assumed to overlap and the image is constructed by filling in
the patches from left to right, top to bottom, averaging the overlapping
regions.
Read more in the :ref:`User Guide <image_feature_extraction>`.
Parameters
----------
patches : ndarray of shape (n_patches, patch_height, patch_width) or \
(n_patches, patch_height, patch_width, n_channels)
The complete set of patches. If the patches contain colour information,
channels are indexed along the last dimension: RGB patches would
have `n_channels=3`.
image_size : tuple of int (image_height, image_width) or \
(image_height, image_width, n_channels)
The size of the image that will be reconstructed.
Returns
-------
image : ndarray of shape image_size
The reconstructed image.
"""
i_h, i_w = image_size[:2]
p_h, p_w = patches.shape[1:3]
img = np.zeros(image_size)
# compute the dimensions of the patches array
n_h = i_h - p_h + 1
n_w = i_w - p_w + 1
for p, (i, j) in zip(patches, product(range(n_h), range(n_w))):
img[i:i + p_h, j:j + p_w] += p
for i in range(i_h):
for j in range(i_w):
# divide by the amount of overlap
# XXX: is this the most efficient way? memory-wise yes, cpu wise?
img[i, j] /= float(min(i + 1, p_h, i_h - i) *
min(j + 1, p_w, i_w - j))
return img
class PatchExtractor(BaseEstimator):
"""Extracts patches from a collection of images
Read more in the :ref:`User Guide <image_feature_extraction>`.
.. versionadded:: 0.9
Parameters
----------
patch_size : tuple of int (patch_height, patch_width)
The dimensions of one patch.
max_patches : int or float, default=None
The maximum number of patches per image to extract. If max_patches is a
float in (0, 1), it is taken to mean a proportion of the total number
of patches.
random_state : int, RandomState instance, default=None
Determines the random number generator used for random sampling when
`max_patches` is not None. Use an int to make the randomness
deterministic.
See :term:`Glossary <random_state>`.
Examples
--------
>>> from sklearn.datasets import load_sample_images
>>> from sklearn.feature_extraction import image
>>> # Use the array data from the second image in this dataset:
>>> X = load_sample_images().images[1]
>>> print('Image shape: {}'.format(X.shape))
Image shape: (427, 640, 3)
>>> pe = image.PatchExtractor(patch_size=(2, 2))
>>> pe_fit = pe.fit(X)
>>> pe_trans = pe.transform(X)
>>> print('Patches shape: {}'.format(pe_trans.shape))
Patches shape: (545706, 2, 2)
"""
@_deprecate_positional_args
def __init__(self, *, patch_size=None, max_patches=None,
random_state=None):
self.patch_size = patch_size
self.max_patches = max_patches
self.random_state = random_state
def fit(self, X, y=None):
"""Do nothing and return the estimator unchanged.
This method is just there to implement the usual API and hence
work in pipelines.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data.
"""
return self
def transform(self, X):
"""Transforms the image samples in X into a matrix of patch data.
Parameters
----------
X : ndarray of shape (n_samples, image_height, image_width) or \
(n_samples, image_height, image_width, n_channels)
Array of images from which to extract patches. For color images,
the last dimension specifies the channel: a RGB image would have
`n_channels=3`.
Returns
-------
patches : array of shape (n_patches, patch_height, patch_width) or \
(n_patches, patch_height, patch_width, n_channels)
The collection of patches extracted from the images, where
`n_patches` is either `n_samples * max_patches` or the total
number of patches that can be extracted.
"""
self.random_state = check_random_state(self.random_state)
n_images, i_h, i_w = X.shape[:3]
X = np.reshape(X, (n_images, i_h, i_w, -1))
n_channels = X.shape[-1]
if self.patch_size is None:
patch_size = i_h // 10, i_w // 10
else:
patch_size = self.patch_size
# compute the dimensions of the patches array
p_h, p_w = patch_size
n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, self.max_patches)
patches_shape = (n_images * n_patches,) + patch_size
if n_channels > 1:
patches_shape += (n_channels,)
# extract the patches
patches = np.empty(patches_shape)
for ii, image in enumerate(X):
patches[ii * n_patches:(ii + 1) * n_patches] = extract_patches_2d(
image, patch_size, max_patches=self.max_patches,
random_state=self.random_state)
return patches
def _more_tags(self):
return {'X_types': ['3darray']}

View file

@ -0,0 +1,21 @@
import os
import platform
def configuration(parent_package='', top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
config = Configuration('feature_extraction', parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
if platform.python_implementation() != 'PyPy':
config.add_extension('_hashing_fast',
sources=['_hashing_fast.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage("tests")
return config

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _stop_words # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.feature_extraction.stop_words'
correct_import_path = 'sklearn.feature_extraction.text'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_stop_words, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,121 @@
# Authors: Lars Buitinck
# Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause
from random import Random
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal
import pytest
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
@pytest.mark.parametrize('sparse', (True, False))
@pytest.mark.parametrize('dtype', (int, np.float32, np.int16))
@pytest.mark.parametrize('sort', (True, False))
@pytest.mark.parametrize('iterable', (True, False))
def test_dictvectorizer(sparse, dtype, sort, iterable):
D = [{"foo": 1, "bar": 3},
{"bar": 4, "baz": 2},
{"bar": 1, "quux": 1, "quuux": 2}]
v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
X = v.fit_transform(iter(D) if iterable else D)
assert sp.issparse(X) == sparse
assert X.shape == (3, 5)
assert X.sum() == 14
assert v.inverse_transform(X) == D
if sparse:
# CSR matrices can't be compared for equality
assert_array_equal(X.A, v.transform(iter(D) if iterable
else D).A)
else:
assert_array_equal(X, v.transform(iter(D) if iterable
else D))
if sort:
assert (v.feature_names_ ==
sorted(v.feature_names_))
def test_feature_selection():
# make two feature dicts with two useful features and a bunch of useless
# ones, in terms of chi2
d1 = dict([("useless%d" % i, 10) for i in range(20)],
useful1=1, useful2=20)
d2 = dict([("useless%d" % i, 10) for i in range(20)],
useful1=20, useful2=1)
for indices in (True, False):
v = DictVectorizer().fit([d1, d2])
X = v.transform([d1, d2])
sel = SelectKBest(chi2, k=2).fit(X, [0, 1])
v.restrict(sel.get_support(indices=indices), indices=indices)
assert v.get_feature_names() == ["useful1", "useful2"]
def test_one_of_k():
D_in = [{"version": "1", "ham": 2},
{"version": "2", "spam": .3},
{"version=3": True, "spam": -1}]
v = DictVectorizer()
X = v.fit_transform(D_in)
assert X.shape == (3, 5)
D_out = v.inverse_transform(X)
assert D_out[0] == {"version=1": 1, "ham": 2}
names = v.get_feature_names()
assert "version=2" in names
assert "version" not in names
def test_unseen_or_no_features():
D = [{"camelot": 0, "spamalot": 1}]
for sparse in [True, False]:
v = DictVectorizer(sparse=sparse).fit(D)
X = v.transform({"push the pram a lot": 2})
if sparse:
X = X.toarray()
assert_array_equal(X, np.zeros((1, 2)))
X = v.transform({})
if sparse:
X = X.toarray()
assert_array_equal(X, np.zeros((1, 2)))
try:
v.transform([])
except ValueError as e:
assert "empty" in str(e)
def test_deterministic_vocabulary():
# Generate equal dictionaries with different memory layouts
items = [("%03d" % i, i) for i in range(1000)]
rng = Random(42)
d_sorted = dict(items)
rng.shuffle(items)
d_shuffled = dict(items)
# check that the memory layout does not impact the resulting vocabulary
v_1 = DictVectorizer().fit([d_sorted])
v_2 = DictVectorizer().fit([d_shuffled])
assert v_1.vocabulary_ == v_2.vocabulary_
def test_n_features_in():
# For vectorizers, n_features_in_ does not make sense and does not exist.
dv = DictVectorizer()
assert not hasattr(dv, 'n_features_in_')
d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
dv.fit(d)
assert not hasattr(dv, 'n_features_in_')

View file

@ -0,0 +1,171 @@
import numpy as np
from numpy.testing import assert_array_equal
import pytest
from sklearn.feature_extraction import FeatureHasher
from sklearn.utils._testing import (ignore_warnings,
fails_if_pypy)
pytestmark = fails_if_pypy
def test_feature_hasher_dicts():
h = FeatureHasher(n_features=16)
assert "dict" == h.input_type
raw_X = [{"foo": "bar", "dada": 42, "tzara": 37},
{"foo": "baz", "gaga": "string1"}]
X1 = FeatureHasher(n_features=16).transform(raw_X)
gen = (iter(d.items()) for d in raw_X)
X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
assert_array_equal(X1.toarray(), X2.toarray())
def test_feature_hasher_strings():
# mix byte and Unicode strings; note that "foo" is a duplicate in row 0
raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
["bar".encode("ascii"), "baz", "quux"]]
for lg_n_features in (7, 9, 11, 16, 22):
n_features = 2 ** lg_n_features
it = (x for x in raw_X) # iterable
h = FeatureHasher(n_features=n_features, input_type="string",
alternate_sign=False)
X = h.transform(it)
assert X.shape[0] == len(raw_X)
assert X.shape[1] == n_features
assert X[0].sum() == 4
assert X[1].sum() == 3
assert X.nnz == 6
def test_hashing_transform_seed():
# check the influence of the seed when computing the hashes
# import is here to avoid importing on pypy
from sklearn.feature_extraction._hashing_fast import (
transform as _hashing_transform)
raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
["bar".encode("ascii"), "baz", "quux"]]
raw_X_ = (((f, 1) for f in x) for x in raw_X)
indices, indptr, _ = _hashing_transform(raw_X_, 2 ** 7, str,
False)
raw_X_ = (((f, 1) for f in x) for x in raw_X)
indices_0, indptr_0, _ = _hashing_transform(raw_X_, 2 ** 7, str,
False, seed=0)
assert_array_equal(indices, indices_0)
assert_array_equal(indptr, indptr_0)
raw_X_ = (((f, 1) for f in x) for x in raw_X)
indices_1, _, _ = _hashing_transform(raw_X_, 2 ** 7, str,
False, seed=1)
with pytest.raises(AssertionError):
assert_array_equal(indices, indices_1)
def test_feature_hasher_pairs():
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": 2},
{"baz": 3, "quux": 4, "foo": -1}])
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert [1, 2] == x1_nz
assert [1, 3, 4] == x2_nz
def test_feature_hasher_pairs_with_string_values():
raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
{"baz": "abc", "quux": 4, "foo": -1}])
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert [1, 1] == x1_nz
assert [1, 1, 4] == x2_nz
raw_X = (iter(d.items()) for d in [{"bax": "abc"},
{"bax": "abc"}])
x1, x2 = h.transform(raw_X).toarray()
x1_nz = np.abs(x1[x1 != 0])
x2_nz = np.abs(x2[x2 != 0])
assert [1] == x1_nz
assert [1] == x2_nz
assert_array_equal(x1, x2)
def test_hash_empty_input():
n_features = 16
raw_X = [[], (), iter(range(0))]
h = FeatureHasher(n_features=n_features, input_type="string")
X = h.transform(raw_X)
assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
def test_hasher_invalid_input():
with pytest.raises(ValueError):
FeatureHasher(input_type="gobbledygook")
with pytest.raises(ValueError):
FeatureHasher(n_features=-1)
with pytest.raises(ValueError):
FeatureHasher(n_features=0)
with pytest.raises(TypeError):
FeatureHasher(n_features='ham')
h = FeatureHasher(n_features=np.uint16(2 ** 6))
with pytest.raises(ValueError):
h.transform([])
with pytest.raises(Exception):
h.transform([[5.5]])
with pytest.raises(Exception):
h.transform([[None]])
def test_hasher_set_params():
# Test delayed input validation in fit (useful for grid search).
hasher = FeatureHasher()
hasher.set_params(n_features=np.inf)
with pytest.raises(TypeError):
hasher.fit()
def test_hasher_zeros():
# Assert that no zeros are materialized in the output.
X = FeatureHasher().transform([{'foo': 0}])
assert X.data.shape == (0,)
@ignore_warnings(category=FutureWarning)
def test_hasher_alternate_sign():
X = [list("Thequickbrownfoxjumped")]
Xt = FeatureHasher(alternate_sign=True,
input_type='string').fit_transform(X)
assert Xt.data.min() < 0 and Xt.data.max() > 0
Xt = FeatureHasher(alternate_sign=False,
input_type='string').fit_transform(X)
assert Xt.data.min() > 0
def test_hash_collisions():
X = [list("Thequickbrownfoxjumped")]
Xt = FeatureHasher(alternate_sign=True, n_features=1,
input_type='string').fit_transform(X)
# check that some of the hashed tokens are added
# with an opposite sign and cancel out
assert abs(Xt.data[0]) < len(X[0])
Xt = FeatureHasher(alternate_sign=False, n_features=1,
input_type='string').fit_transform(X)
assert Xt.data[0] == len(X[0])

View file

@ -0,0 +1,344 @@
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
import numpy as np
import scipy as sp
from scipy import ndimage
from scipy.sparse.csgraph import connected_components
import pytest
from sklearn.feature_extraction.image import (
img_to_graph, grid_to_graph, extract_patches_2d,
reconstruct_from_patches_2d, PatchExtractor, _extract_patches,
extract_patches)
from sklearn.utils._testing import ignore_warnings
def test_img_to_graph():
x, y = np.mgrid[:4, :4] - 10
grad_x = img_to_graph(x)
grad_y = img_to_graph(y)
assert grad_x.nnz == grad_y.nnz
# Negative elements are the diagonal: the elements of the original
# image. Positive elements are the values of the gradient, they
# should all be equal on grad_x and grad_y
np.testing.assert_array_equal(grad_x.data[grad_x.data > 0],
grad_y.data[grad_y.data > 0])
def test_grid_to_graph():
# Checking that the function works with graphs containing no edges
size = 2
roi_size = 1
# Generating two convex parts with one vertex
# Thus, edges will be empty in _to_graph
mask = np.zeros((size, size), dtype=np.bool)
mask[0:roi_size, 0:roi_size] = True
mask[-roi_size:, -roi_size:] = True
mask = mask.reshape(size ** 2)
A = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)
assert connected_components(A)[0] == 2
# Checking that the function works whatever the type of mask is
mask = np.ones((size, size), dtype=np.int16)
A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask)
assert connected_components(A)[0] == 1
# Checking dtype of the graph
mask = np.ones((size, size))
A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.bool)
assert A.dtype == np.bool
A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.int)
assert A.dtype == np.int
A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask,
dtype=np.float64)
assert A.dtype == np.float64
@ignore_warnings(category=DeprecationWarning) # scipy deprecation inside face
def test_connect_regions():
try:
face = sp.face(gray=True)
except AttributeError:
# Newer versions of scipy have face in misc
from scipy import misc
face = misc.face(gray=True)
# subsample by 4 to reduce run time
face = face[::4, ::4]
for thr in (50, 150):
mask = face > thr
graph = img_to_graph(face, mask=mask)
assert ndimage.label(mask)[1] == connected_components(graph)[0]
@ignore_warnings(category=DeprecationWarning) # scipy deprecation inside face
def test_connect_regions_with_grid():
try:
face = sp.face(gray=True)
except AttributeError:
# Newer versions of scipy have face in misc
from scipy import misc
face = misc.face(gray=True)
# subsample by 4 to reduce run time
face = face[::4, ::4]
mask = face > 50
graph = grid_to_graph(*face.shape, mask=mask)
assert ndimage.label(mask)[1] == connected_components(graph)[0]
mask = face > 150
graph = grid_to_graph(*face.shape, mask=mask, dtype=None)
assert ndimage.label(mask)[1] == connected_components(graph)[0]
def _downsampled_face():
try:
face = sp.face(gray=True)
except AttributeError:
# Newer versions of scipy have face in misc
from scipy import misc
face = misc.face(gray=True)
face = face.astype(np.float32)
face = (face[::2, ::2] + face[1::2, ::2] + face[::2, 1::2]
+ face[1::2, 1::2])
face = (face[::2, ::2] + face[1::2, ::2] + face[::2, 1::2]
+ face[1::2, 1::2])
face = face.astype(np.float32)
face /= 16.0
return face
def _orange_face(face=None):
face = _downsampled_face() if face is None else face
face_color = np.zeros(face.shape + (3,))
face_color[:, :, 0] = 256 - face
face_color[:, :, 1] = 256 - face / 2
face_color[:, :, 2] = 256 - face / 4
return face_color
def _make_images(face=None):
face = _downsampled_face() if face is None else face
# make a collection of faces
images = np.zeros((3,) + face.shape)
images[0] = face
images[1] = face + 1
images[2] = face + 2
return images
downsampled_face = _downsampled_face()
orange_face = _orange_face(downsampled_face)
face_collection = _make_images(downsampled_face)
def test_extract_patches_all():
face = downsampled_face
i_h, i_w = face.shape
p_h, p_w = 16, 16
expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
patches = extract_patches_2d(face, (p_h, p_w))
assert patches.shape == (expected_n_patches, p_h, p_w)
def test_extract_patches_all_color():
face = orange_face
i_h, i_w = face.shape[:2]
p_h, p_w = 16, 16
expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
patches = extract_patches_2d(face, (p_h, p_w))
assert patches.shape == (expected_n_patches, p_h, p_w, 3)
def test_extract_patches_all_rect():
face = downsampled_face
face = face[:, 32:97]
i_h, i_w = face.shape
p_h, p_w = 16, 12
expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
patches = extract_patches_2d(face, (p_h, p_w))
assert patches.shape == (expected_n_patches, p_h, p_w)
def test_extract_patches_max_patches():
face = downsampled_face
i_h, i_w = face.shape
p_h, p_w = 16, 16
patches = extract_patches_2d(face, (p_h, p_w), max_patches=100)
assert patches.shape == (100, p_h, p_w)
expected_n_patches = int(0.5 * (i_h - p_h + 1) * (i_w - p_w + 1))
patches = extract_patches_2d(face, (p_h, p_w), max_patches=0.5)
assert patches.shape == (expected_n_patches, p_h, p_w)
with pytest.raises(ValueError):
extract_patches_2d(face, (p_h, p_w), max_patches=2.0)
with pytest.raises(ValueError):
extract_patches_2d(face, (p_h, p_w), max_patches=-1.0)
def test_extract_patch_same_size_image():
face = downsampled_face
# Request patches of the same size as image
# Should return just the single patch a.k.a. the image
patches = extract_patches_2d(face, face.shape, max_patches=2)
assert patches.shape[0] == 1
def test_extract_patches_less_than_max_patches():
face = downsampled_face
i_h, i_w = face.shape
p_h, p_w = 3 * i_h // 4, 3 * i_w // 4
# this is 3185
expected_n_patches = (i_h - p_h + 1) * (i_w - p_w + 1)
patches = extract_patches_2d(face, (p_h, p_w), max_patches=4000)
assert patches.shape == (expected_n_patches, p_h, p_w)
def test_reconstruct_patches_perfect():
face = downsampled_face
p_h, p_w = 16, 16
patches = extract_patches_2d(face, (p_h, p_w))
face_reconstructed = reconstruct_from_patches_2d(patches, face.shape)
np.testing.assert_array_almost_equal(face, face_reconstructed)
def test_reconstruct_patches_perfect_color():
face = orange_face
p_h, p_w = 16, 16
patches = extract_patches_2d(face, (p_h, p_w))
face_reconstructed = reconstruct_from_patches_2d(patches, face.shape)
np.testing.assert_array_almost_equal(face, face_reconstructed)
def test_patch_extractor_fit():
faces = face_collection
extr = PatchExtractor(patch_size=(8, 8), max_patches=100, random_state=0)
assert extr == extr.fit(faces)
def test_patch_extractor_max_patches():
faces = face_collection
i_h, i_w = faces.shape[1:3]
p_h, p_w = 8, 8
max_patches = 100
expected_n_patches = len(faces) * max_patches
extr = PatchExtractor(patch_size=(p_h, p_w), max_patches=max_patches,
random_state=0)
patches = extr.transform(faces)
assert patches.shape == (expected_n_patches, p_h, p_w)
max_patches = 0.5
expected_n_patches = len(faces) * int((i_h - p_h + 1) * (i_w - p_w + 1)
* max_patches)
extr = PatchExtractor(patch_size=(p_h, p_w), max_patches=max_patches,
random_state=0)
patches = extr.transform(faces)
assert patches.shape == (expected_n_patches, p_h, p_w)
def test_patch_extractor_max_patches_default():
faces = face_collection
extr = PatchExtractor(max_patches=100, random_state=0)
patches = extr.transform(faces)
assert patches.shape == (len(faces) * 100, 19, 25)
def test_patch_extractor_all_patches():
faces = face_collection
i_h, i_w = faces.shape[1:3]
p_h, p_w = 8, 8
expected_n_patches = len(faces) * (i_h - p_h + 1) * (i_w - p_w + 1)
extr = PatchExtractor(patch_size=(p_h, p_w), random_state=0)
patches = extr.transform(faces)
assert patches.shape == (expected_n_patches, p_h, p_w)
def test_patch_extractor_color():
faces = _make_images(orange_face)
i_h, i_w = faces.shape[1:3]
p_h, p_w = 8, 8
expected_n_patches = len(faces) * (i_h - p_h + 1) * (i_w - p_w + 1)
extr = PatchExtractor(patch_size=(p_h, p_w), random_state=0)
patches = extr.transform(faces)
assert patches.shape == (expected_n_patches, p_h, p_w, 3)
def test_extract_patches_strided():
image_shapes_1D = [(10,), (10,), (11,), (10,)]
patch_sizes_1D = [(1,), (2,), (3,), (8,)]
patch_steps_1D = [(1,), (1,), (4,), (2,)]
expected_views_1D = [(10,), (9,), (3,), (2,)]
last_patch_1D = [(10,), (8,), (8,), (2,)]
image_shapes_2D = [(10, 20), (10, 20), (10, 20), (11, 20)]
patch_sizes_2D = [(2, 2), (10, 10), (10, 11), (6, 6)]
patch_steps_2D = [(5, 5), (3, 10), (3, 4), (4, 2)]
expected_views_2D = [(2, 4), (1, 2), (1, 3), (2, 8)]
last_patch_2D = [(5, 15), (0, 10), (0, 8), (4, 14)]
image_shapes_3D = [(5, 4, 3), (3, 3, 3), (7, 8, 9), (7, 8, 9)]
patch_sizes_3D = [(2, 2, 3), (2, 2, 2), (1, 7, 3), (1, 3, 3)]
patch_steps_3D = [(1, 2, 10), (1, 1, 1), (2, 1, 3), (3, 3, 4)]
expected_views_3D = [(4, 2, 1), (2, 2, 2), (4, 2, 3), (3, 2, 2)]
last_patch_3D = [(3, 2, 0), (1, 1, 1), (6, 1, 6), (6, 3, 4)]
image_shapes = image_shapes_1D + image_shapes_2D + image_shapes_3D
patch_sizes = patch_sizes_1D + patch_sizes_2D + patch_sizes_3D
patch_steps = patch_steps_1D + patch_steps_2D + patch_steps_3D
expected_views = expected_views_1D + expected_views_2D + expected_views_3D
last_patches = last_patch_1D + last_patch_2D + last_patch_3D
for (image_shape, patch_size, patch_step, expected_view,
last_patch) in zip(image_shapes, patch_sizes, patch_steps,
expected_views, last_patches):
image = np.arange(np.prod(image_shape)).reshape(image_shape)
patches = _extract_patches(image, patch_shape=patch_size,
extraction_step=patch_step)
ndim = len(image_shape)
assert patches.shape[:ndim] == expected_view
last_patch_slices = tuple(slice(i, i + j, None) for i, j in
zip(last_patch, patch_size))
assert (patches[(-1, None, None) * ndim] ==
image[last_patch_slices].squeeze()).all()
def test_extract_patches_square():
# test same patch size for all dimensions
face = downsampled_face
i_h, i_w = face.shape
p = 8
expected_n_patches = ((i_h - p + 1), (i_w - p + 1))
patches = _extract_patches(face, patch_shape=p)
assert patches.shape == (expected_n_patches[0],
expected_n_patches[1], p, p)
def test_width_patch():
# width and height of the patch should be less than the image
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
with pytest.raises(ValueError):
extract_patches_2d(x, (4, 1))
with pytest.raises(ValueError):
extract_patches_2d(x, (1, 4))
# TODO: Remove in 0.24
def test_extract_patches_deprecated():
msg = ("The function feature_extraction.image.extract_patches has been "
"deprecated in 0.22 and will be removed in 0.24.")
with pytest.warns(FutureWarning, match=msg):
extract_patches(downsampled_face)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff