Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

File diff suppressed because it is too large.


@@ -0,0 +1,41 @@
from cython cimport floating
cpdef enum BLAS_Order:
RowMajor # C contiguous
ColMajor # Fortran contiguous
cpdef enum BLAS_Trans:
NoTrans = 110 # correspond to 'n'
Trans = 116 # correspond to 't'
# BLAS Level 1 ################################################################
cdef floating _dot(int, floating*, int, floating*, int) nogil
cdef floating _asum(int, floating*, int) nogil
cdef void _axpy(int, floating, floating*, int, floating*, int) nogil
cdef floating _nrm2(int, floating*, int) nogil
cdef void _copy(int, floating*, int, floating*, int) nogil
cdef void _scal(int, floating, floating*, int) nogil
cdef void _rotg(floating*, floating*, floating*, floating*) nogil
cdef void _rot(int, floating*, int, floating*, int, floating, floating) nogil
# BLAS Level 2 ################################################################
cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, floating*, int,
floating*, int, floating, floating*, int) nogil
cdef void _ger(BLAS_Order, int, int, floating, floating*, int, floating*, int,
floating*, int) nogil
# BLAS Level 3 ################################################################
cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating,
floating*, int, floating*, int, floating, floating*,
int) nogil
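
These declarations are thin typed wrappers around standard BLAS routines (in scikit-learn they are backed by scipy.linalg.cython_blas); the enum values 110 and 116 are the ASCII codes of 'n' and 't'. As a rough NumPy-level sketch of what the Level 1 _dot wrapper computes (an illustration only, not the Cython call itself):

import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([4.0, 5.0, 6.0])
# _dot(n, x_ptr, incx, y_ptr, incy) returns the same inner product as:
np.dot(x, y)  # 32.0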


@@ -0,0 +1,311 @@
from contextlib import closing
from contextlib import suppress
from io import StringIO
import uuid
import html
from sklearn import config_context
class _VisualBlock:
"""HTML Representation of Estimator
Parameters
----------
kind : {'serial', 'parallel', 'single'}
kind of HTML block
estimators : list of estimators or `_VisualBlock`s or a single estimator
If kind != 'single', then `estimators` is a list of
estimators.
If kind == 'single', then `estimators` is a single estimator.
names : list of str
If kind != 'single', then `names` corresponds to estimators.
If kind == 'single', then `names` is a single string corresponding to
the single estimator.
name_details : list of str, str, or None, default=None
If kind != 'single', then `name_details` corresponds to `names`.
If kind == 'single', then `name_details` is a single string
corresponding to the single estimator.
dash_wrapped : bool, default=True
If true, the wrapped HTML element will be wrapped with a dashed border.
Only active when kind != 'single'.
"""
def __init__(self, kind, estimators, *, names=None, name_details=None,
dash_wrapped=True):
self.kind = kind
self.estimators = estimators
self.dash_wrapped = dash_wrapped
if self.kind in ('parallel', 'serial'):
if names is None:
names = (None, ) * len(estimators)
if name_details is None:
name_details = (None, ) * len(estimators)
self.names = names
self.name_details = name_details
def _sk_visual_block_(self):
return self
def _write_label_html(out, name, name_details,
outer_class="sk-label-container",
inner_class="sk-label",
checked=False):
"""Write labeled html with or without a dropdown with named details"""
out.write(f'<div class="{outer_class}">'
f'<div class="{inner_class} sk-toggleable">')
name = html.escape(name)
if name_details is not None:
checked_str = 'checked' if checked else ''
est_id = uuid.uuid4()
out.write(f'<input class="sk-toggleable__control sk-hidden--visually" '
f'id="{est_id}" type="checkbox" {checked_str}>'
f'<label class="sk-toggleable__label" for="{est_id}">'
f'{name}</label>'
f'<div class="sk-toggleable__content"><pre>{name_details}'
f'</pre></div>')
else:
out.write(f'<label>{name}</label>')
out.write('</div></div>') # outer_class inner_class
def _get_visual_block(estimator):
"""Generate information about how to display an estimator.
"""
with suppress(AttributeError):
return estimator._sk_visual_block_()
if isinstance(estimator, str):
return _VisualBlock('single', estimator,
names=estimator, name_details=estimator)
elif estimator is None:
return _VisualBlock('single', estimator,
names='None', name_details='None')
# check if estimator looks like a meta estimator that wraps estimators
if hasattr(estimator, 'get_params'):
estimators = []
for key, value in estimator.get_params().items():
# Only look at the estimators in the first layer
if '__' not in key and hasattr(value, 'get_params'):
estimators.append(value)
if len(estimators):
return _VisualBlock('parallel', estimators, names=None)
return _VisualBlock('single', estimator,
names=estimator.__class__.__name__,
name_details=str(estimator))
def _write_estimator_html(out, estimator, estimator_label,
estimator_label_details, first_call=False):
"""Write estimator to html in serial, parallel, or by itself (single).
"""
if first_call:
est_block = _get_visual_block(estimator)
else:
with config_context(print_changed_only=True):
est_block = _get_visual_block(estimator)
if est_block.kind in ('serial', 'parallel'):
dashed_wrapped = first_call or est_block.dash_wrapped
dash_cls = " sk-dashed-wrapped" if dashed_wrapped else ""
out.write(f'<div class="sk-item{dash_cls}">')
if estimator_label:
_write_label_html(out, estimator_label, estimator_label_details)
kind = est_block.kind
out.write(f'<div class="sk-{kind}">')
est_infos = zip(est_block.estimators, est_block.names,
est_block.name_details)
for est, name, name_details in est_infos:
if kind == 'serial':
_write_estimator_html(out, est, name, name_details)
else: # parallel
out.write('<div class="sk-parallel-item">')
# wrap element in a serial _VisualBlock
serial_block = _VisualBlock('serial', [est],
dash_wrapped=False)
_write_estimator_html(out, serial_block, name, name_details)
out.write('</div>') # sk-parallel-item
out.write('</div></div>')
elif est_block.kind == 'single':
_write_label_html(out, est_block.names, est_block.name_details,
outer_class="sk-item", inner_class="sk-estimator",
checked=first_call)
_STYLE = """
div.sk-top-container {
color: black;
background-color: white;
}
div.sk-toggleable {
background-color: white;
}
label.sk-toggleable__label {
cursor: pointer;
display: block;
width: 100%;
margin-bottom: 0;
padding: 0.2em 0.3em;
box-sizing: border-box;
text-align: center;
}
div.sk-toggleable__content {
max-height: 0;
max-width: 0;
overflow: hidden;
text-align: left;
background-color: #f0f8ff;
}
div.sk-toggleable__content pre {
margin: 0.2em;
color: black;
border-radius: 0.25em;
background-color: #f0f8ff;
}
input.sk-toggleable__control:checked~div.sk-toggleable__content {
max-height: 200px;
max-width: 100%;
overflow: auto;
}
div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {
background-color: #d4ebff;
}
div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {
background-color: #d4ebff;
}
input.sk-hidden--visually {
border: 0;
clip: rect(1px 1px 1px 1px);
clip: rect(1px, 1px, 1px, 1px);
height: 1px;
margin: -1px;
overflow: hidden;
padding: 0;
position: absolute;
width: 1px;
}
div.sk-estimator {
font-family: monospace;
background-color: #f0f8ff;
margin: 0.25em 0.25em;
border: 1px dotted black;
border-radius: 0.25em;
box-sizing: border-box;
}
div.sk-estimator:hover {
background-color: #d4ebff;
}
div.sk-parallel-item::after {
content: "";
width: 100%;
border-bottom: 1px solid gray;
flex-grow: 1;
}
div.sk-label:hover label.sk-toggleable__label {
background-color: #d4ebff;
}
div.sk-serial::before {
content: "";
position: absolute;
border-left: 1px solid gray;
box-sizing: border-box;
top: 2em;
bottom: 0;
left: 50%;
}
div.sk-serial {
display: flex;
flex-direction: column;
align-items: center;
background-color: white;
}
div.sk-item {
z-index: 1;
}
div.sk-parallel {
display: flex;
align-items: stretch;
justify-content: center;
background-color: white;
}
div.sk-parallel-item {
display: flex;
flex-direction: column;
position: relative;
background-color: white;
}
div.sk-parallel-item:first-child::after {
align-self: flex-end;
width: 50%;
}
div.sk-parallel-item:last-child::after {
align-self: flex-start;
width: 50%;
}
div.sk-parallel-item:only-child::after {
width: 0;
}
div.sk-dashed-wrapped {
border: 1px dashed gray;
margin: 0.2em;
box-sizing: border-box;
padding-bottom: 0.1em;
background-color: white;
position: relative;
}
div.sk-label label {
font-family: monospace;
font-weight: bold;
background-color: white;
display: inline-block;
line-height: 1.2em;
}
div.sk-label-container {
position: relative;
z-index: 2;
text-align: center;
}
div.sk-container {
display: inline-block;
position: relative;
}
""".replace(' ', '').replace('\n', '') # noqa
def estimator_html_repr(estimator):
"""Build a HTML representation of an estimator.
Read more in the :ref:`User Guide <visualizing_composite_estimators>`.
Parameters
----------
estimator : estimator object
The estimator to visualize.
Returns
-------
html: str
HTML representation of estimator.
"""
with closing(StringIO()) as out:
out.write(f'<style>{_STYLE}</style>'
f'<div class="sk-top-container"><div class="sk-container">')
_write_estimator_html(out, estimator, estimator.__class__.__name__,
str(estimator), first_call=True)
out.write('</div></div>')
html_output = out.getvalue()
return html_output
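
A minimal usage sketch; the returned string embeds the CSS defined in _STYLE followed by the nested container divs:

from sklearn.linear_model import LogisticRegression

html_str = estimator_html_repr(LogisticRegression(C=10))
assert html_str.startswith('<style>')      # inline CSS comes first
assert 'sk-top-container' in html_str      # outermost wrapper div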


@@ -0,0 +1,22 @@
# Author: Gael Varoquaux
# License: BSD
"""
Uses C++ map containers for fast dict-like behavior with keys being
integers and values being floats.
"""
from libcpp.map cimport map as cpp_map
# Import the C-level symbols of numpy
cimport numpy as np
ctypedef np.float64_t DTYPE_t
ctypedef np.intp_t ITYPE_t
###############################################################################
# An object to be used in Python
cdef class IntFloatDict:
cdef cpp_map[ITYPE_t, DTYPE_t] my_map
cdef _to_arrays(self, ITYPE_t [:] keys, DTYPE_t [:] values)
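
A usage sketch from Python, assuming the compiled extension is importable as sklearn.utils._fast_dict and that the constructor takes parallel key/value arrays (as in scikit-learn):

import numpy as np
from sklearn.utils._fast_dict import IntFloatDict

keys = np.array([1, 5, 7], dtype=np.intp)
values = np.array([0.5, 1.5, 2.5], dtype=np.float64)
d = IntFloatDict(keys, values)  # backed by a C++ std::map
d[5]                            # -> 1.5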


@@ -0,0 +1,19 @@
import warnings as _warnings
with _warnings.catch_warnings():
_warnings.simplefilter("ignore")
# joblib imports may raise DeprecationWarning on certain Python
# versions
import joblib
from joblib import logger
from joblib import dump, load
from joblib import __version__
from joblib import effective_n_jobs
from joblib import hash
from joblib import cpu_count, Parallel, Memory, delayed
from joblib import parallel_backend, register_parallel_backend
__all__ = ["parallel_backend", "register_parallel_backend", "cpu_count",
"Parallel", "Memory", "delayed", "effective_n_jobs", "hash",
"logger", "dump", "load", "joblib", "__version__"]


@@ -0,0 +1,19 @@
import numpy as np
from . import is_scalar_nan
from .fixes import _object_dtype_isnan
def _get_mask(X, value_to_mask):
"""Compute the boolean mask X == value_to_mask."""
if is_scalar_nan(value_to_mask):
if X.dtype.kind == "f":
return np.isnan(X)
elif X.dtype.kind in ("i", "u"):
# can't have NaNs in integer array.
return np.zeros(X.shape, dtype=bool)
else:
# np.isnan does not work on object dtypes.
return _object_dtype_isnan(X)
else:
return X == value_to_mask
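
A short example of the two branches (NaN handling versus plain equality) using the function above:

import numpy as np

X = np.array([[1.0, np.nan], [3.0, 4.0]])
_get_mask(X, np.nan)  # array([[False,  True], [False, False]])
_get_mask(X, 3.0)     # array([[False, False], [ True, False]])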


@@ -0,0 +1,166 @@
import numpy as np
from ..base import BaseEstimator, ClassifierMixin
from .validation import _num_samples, check_array
class ArraySlicingWrapper:
"""
Parameters
----------
array
"""
def __init__(self, array):
self.array = array
def __getitem__(self, aslice):
return MockDataFrame(self.array[aslice])
class MockDataFrame:
"""
Parameters
----------
array
"""
# have shape and length but don't support indexing.
def __init__(self, array):
self.array = array
self.values = array
self.shape = array.shape
self.ndim = array.ndim
# ugly hack to make iloc work.
self.iloc = ArraySlicingWrapper(array)
def __len__(self):
return len(self.array)
def __array__(self, dtype=None):
# Pandas data frames also are array-like: we want to make sure that
# input validation in cross-validation does not try to call that
# method.
return self.array
def __eq__(self, other):
return MockDataFrame(self.array == other.array)
def __ne__(self, other):
return not self == other
class CheckingClassifier(ClassifierMixin, BaseEstimator):
"""Dummy classifier to test pipelining and meta-estimators.
Checks some property of X and y in fit / predict.
This allows testing whether pipelines / cross-validation or meta-estimators
changed the input.
Parameters
----------
check_y
check_X
foo_param
expected_fit_params
Attributes
----------
classes_
"""
def __init__(self, check_y=None, check_X=None, foo_param=0,
expected_fit_params=None):
self.check_y = check_y
self.check_X = check_X
self.foo_param = foo_param
self.expected_fit_params = expected_fit_params
def fit(self, X, y, **fit_params):
"""
Fit classifier
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and
n_features is the number of features.
y : array-like of shape (n_samples, n_output) or (n_samples,), optional
Target relative to X for classification or regression;
None for unsupervised learning.
**fit_params : dict of string -> object
Parameters passed to the ``fit`` method of the estimator
"""
assert len(X) == len(y)
if self.check_X is not None:
assert self.check_X(X)
if self.check_y is not None:
assert self.check_y(y)
self.n_features_in_ = len(X)
self.classes_ = np.unique(check_array(y, ensure_2d=False,
allow_nd=True))
if self.expected_fit_params:
missing = set(self.expected_fit_params) - set(fit_params)
assert len(missing) == 0, 'Expected fit parameter(s) %s not ' \
'seen.' % list(missing)
for key, value in fit_params.items():
assert len(value) == len(X), (
'Fit parameter %s has length %d; '
'expected %d.'
% (key, len(value), len(X)))
return self
def predict(self, T):
"""
Parameters
----------
T : indexable, length n_samples
"""
if self.check_X is not None:
assert self.check_X(T)
return self.classes_[np.zeros(_num_samples(T), dtype=int)]
def score(self, X=None, Y=None):
"""
Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data, where n_samples is the number of samples and
n_features is the number of features.
Y : array-like of shape (n_samples, n_output) or (n_samples,), optional
Target relative to X for classification or regression;
None for unsupervised learning.
"""
if self.foo_param > 1:
score = 1.
else:
score = 0.
return score
def _more_tags(self):
return {'_skip_test': True, 'X_types': ['1dlabel']}
class NoSampleWeightWrapper(BaseEstimator):
"""Wrap estimator which will not expose `sample_weight`.
Parameters
----------
est : estimator, default=None
The estimator to wrap.
"""
def __init__(self, est=None):
self.est = est
def fit(self, X, y):
return self.est.fit(X, y)
def predict(self, X):
return self.est.predict(X)
def predict_proba(self, X):
return self.est.predict_proba(X)
def _more_tags(self):
return {'_skip_test': True}
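
A usage sketch for CheckingClassifier: the check_X/check_y callables let a test assert properties of the data that actually reaches fit and predict, while predict always returns the first class:

import numpy as np

X = np.ones((10, 3))
y = np.arange(10) % 2
clf = CheckingClassifier(check_X=lambda X_in: X_in.shape == (10, 3))
clf.fit(X, y)
clf.predict(X)  # array of ten zeros (the first class in classes_)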


@@ -0,0 +1,438 @@
"""This module contains the _EstimatorPrettyPrinter class used in
BaseEstimator.__repr__ for pretty-printing estimators"""
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018 Python Software Foundation;
# All Rights Reserved
# Authors: Fred L. Drake, Jr. <fdrake@acm.org> (built-in CPython pprint module)
# Nicolas Hug (scikit-learn specific changes)
# License: PSF License version 2 (see below)
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
# --------------------------------------------
# 1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"),
# and the Individual or Organization ("Licensee") accessing and otherwise
# using this software ("Python") in source or binary form and its associated
# documentation.
# 2. Subject to the terms and conditions of this License Agreement, PSF hereby
# grants Licensee a nonexclusive, royalty-free, world-wide license to
# reproduce, analyze, test, perform and/or display publicly, prepare
# derivative works, distribute, and otherwise use Python alone or in any
# derivative version, provided, however, that PSF's License Agreement and
# PSF's notice of copyright, i.e., "Copyright (c) 2001, 2002, 2003, 2004,
# 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
# 2017, 2018 Python Software Foundation; All Rights Reserved" are retained in
# Python alone or in any derivative version prepared by Licensee.
# 3. In the event Licensee prepares a derivative work that is based on or
# incorporates Python or any part thereof, and wants to make the derivative
# work available to others as provided herein, then Licensee hereby agrees to
# include in any such work a brief summary of the changes made to Python.
# 4. PSF is making Python available to Licensee on an "AS IS" basis. PSF MAKES
# NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT
# NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF
# MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
# PYTHON WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
# 5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON FOR ANY
# INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF
# MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, OR ANY DERIVATIVE
# THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
# 6. This License Agreement will automatically terminate upon a material
# breach of its terms and conditions.
# 7. Nothing in this License Agreement shall be deemed to create any
# relationship of agency, partnership, or joint venture between PSF and
# Licensee. This License Agreement does not grant permission to use PSF
# trademarks or trade name in a trademark sense to endorse or promote products
# or services of Licensee, or any third party.
# 8. By copying, installing or otherwise using Python, Licensee agrees to be
# bound by the terms and conditions of this License Agreement.
# Brief summary of changes to original code:
# - "compact" parameter is supported for dicts, not just lists or tuples
# - estimators have a custom handler, they're not just treated as objects
# - long sequences (lists, tuples, dict items) with more than N elements are
# shortened using ellipsis (', ...') at the end.
from inspect import signature
import pprint
from collections import OrderedDict
from ..base import BaseEstimator
from .._config import get_config
from . import is_scalar_nan
class KeyValTuple(tuple):
"""Dummy class for correctly rendering key-value tuples from dicts."""
def __repr__(self):
# needed for _dispatch[tuple.__repr__] not to be overridden
return super().__repr__()
class KeyValTupleParam(KeyValTuple):
"""Dummy class for correctly rendering key-value tuples from parameters."""
pass
def _changed_params(estimator):
"""Return dict (param_name: value) of parameters that were given to
estimator with non-default values."""
params = estimator.get_params(deep=False)
filtered_params = {}
init_func = getattr(estimator.__init__, 'deprecated_original',
estimator.__init__)
init_params = signature(init_func).parameters
init_params = {name: param.default for name, param in init_params.items()}
for k, v in params.items():
if (k not in init_params or ( # happens if k is part of a **kwargs
repr(v) != repr(init_params[k]) and
not (is_scalar_nan(init_params[k]) and is_scalar_nan(v)))):
filtered_params[k] = v
return filtered_params
class _EstimatorPrettyPrinter(pprint.PrettyPrinter):
"""Pretty Printer class for estimator objects.
This extends the pprint.PrettyPrinter class, because:
- we need estimators to be printed with their parameters, e.g.
Estimator(param1=value1, ...) which is not supported by default.
- the 'compact' parameter of PrettyPrinter is ignored for dicts, which
may lead to very long representations that we want to avoid.
Quick overview of pprint.PrettyPrinter (see also
https://stackoverflow.com/questions/49565047/pprint-with-hex-numbers):
- the entry point is the _format() method which calls format() (overridden
here)
- format() directly calls _safe_repr() for a first try at rendering the
object
- _safe_repr formats the whole object recursively, only calling itself,
not caring about line length or anything
- back to _format(), if the output string is too long, _format() then calls
the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on
the type of the object. This is where the line length and the compact
parameters are taken into account.
- those _pprint_TYPE() methods will internally use the format() method for
rendering the nested objects of an object (e.g. the elements of a list)
In the end, everything has to be implemented twice: in _safe_repr and in
the custom _pprint_TYPE methods. Unfortunately PrettyPrinter is really not
straightforward to extend (especially when we want a compact output), so
the code is a bit convoluted.
This class overrides:
- format() to support the changed_only parameter
- _safe_repr to support printing of estimators (for when they fit on a
single line)
- _format_dict_items so that dict are correctly 'compacted'
- _format_items so that ellipsis is used on long lists and tuples
When estimators cannot be printed on a single line, the builtin _format()
will call _pprint_estimator() because it was registered to do so (see
_dispatch[BaseEstimator.__repr__] = _pprint_estimator).
Both _format_dict_items() and _pprint_estimator() use the
_format_params_or_dict_items() method that will format parameters and
key-value pairs respecting the compact parameter. This method needs another
subroutine _pprint_key_val_tuple() used when a parameter or a key-value
pair is too long to fit on a single line. This subroutine is called in
_format() and is registered as well in the _dispatch dict (just like
_pprint_estimator). We had to create the two classes KeyValTuple and
KeyValTupleParam for this.
"""
def __init__(self, indent=1, width=80, depth=None, stream=None, *,
compact=False, indent_at_name=True,
n_max_elements_to_show=None):
super().__init__(indent, width, depth, stream, compact=compact)
self._indent_at_name = indent_at_name
if self._indent_at_name:
self._indent_per_level = 1 # ignore indent param
self._changed_only = get_config()['print_changed_only']
# Max number of elements in a list, dict, tuple until we start using
# ellipsis. This also affects the number of arguments of an estimator
# (they are treated as dicts)
self.n_max_elements_to_show = n_max_elements_to_show
def format(self, object, context, maxlevels, level):
return _safe_repr(object, context, maxlevels, level,
changed_only=self._changed_only)
def _pprint_estimator(self, object, stream, indent, allowance, context,
level):
stream.write(object.__class__.__name__ + '(')
if self._indent_at_name:
indent += len(object.__class__.__name__)
if self._changed_only:
params = _changed_params(object)
else:
params = object.get_params(deep=False)
params = OrderedDict((name, val)
for (name, val) in sorted(params.items()))
self._format_params(params.items(), stream, indent, allowance + 1,
context, level)
stream.write(')')
def _format_dict_items(self, items, stream, indent, allowance, context,
level):
return self._format_params_or_dict_items(
items, stream, indent, allowance, context, level, is_dict=True)
def _format_params(self, items, stream, indent, allowance, context, level):
return self._format_params_or_dict_items(
items, stream, indent, allowance, context, level, is_dict=False)
def _format_params_or_dict_items(self, object, stream, indent, allowance,
context, level, is_dict):
"""Format dict items or parameters respecting the compact=True
parameter. For some reason, the builtin rendering of dict items doesn't
respect compact=True and will use one line per key-value if all cannot
fit in a single line.
Dict items will be rendered as <'key': value> while params will be
rendered as <key=value>. The implementation is mostly copy/pasting from
the builtin _format_items().
This also adds ellipsis if the number of items is greater than
self.n_max_elements_to_show.
"""
write = stream.write
indent += self._indent_per_level
delimnl = ',\n' + ' ' * indent
delim = ''
width = max_width = self._width - indent + 1
it = iter(object)
try:
next_ent = next(it)
except StopIteration:
return
last = False
n_items = 0
while not last:
if n_items == self.n_max_elements_to_show:
write(', ...')
break
n_items += 1
ent = next_ent
try:
next_ent = next(it)
except StopIteration:
last = True
max_width -= allowance
width -= allowance
if self._compact:
k, v = ent
krepr = self._repr(k, context, level)
vrepr = self._repr(v, context, level)
if not is_dict:
krepr = krepr.strip("'")
middle = ': ' if is_dict else '='
rep = krepr + middle + vrepr
w = len(rep) + 2
if width < w:
width = max_width
if delim:
delim = delimnl
if width >= w:
width -= w
write(delim)
delim = ', '
write(rep)
continue
write(delim)
delim = delimnl
class_ = KeyValTuple if is_dict else KeyValTupleParam
self._format(class_(ent), stream, indent,
allowance if last else 1, context, level)
def _format_items(self, items, stream, indent, allowance, context, level):
"""Format the items of an iterable (list, tuple...). Same as the
built-in _format_items, with support for ellipsis if the number of
elements is greater than self.n_max_elements_to_show.
"""
write = stream.write
indent += self._indent_per_level
if self._indent_per_level > 1:
write((self._indent_per_level - 1) * ' ')
delimnl = ',\n' + ' ' * indent
delim = ''
width = max_width = self._width - indent + 1
it = iter(items)
try:
next_ent = next(it)
except StopIteration:
return
last = False
n_items = 0
while not last:
if n_items == self.n_max_elements_to_show:
write(', ...')
break
n_items += 1
ent = next_ent
try:
next_ent = next(it)
except StopIteration:
last = True
max_width -= allowance
width -= allowance
if self._compact:
rep = self._repr(ent, context, level)
w = len(rep) + 2
if width < w:
width = max_width
if delim:
delim = delimnl
if width >= w:
width -= w
write(delim)
delim = ', '
write(rep)
continue
write(delim)
delim = delimnl
self._format(ent, stream, indent,
allowance if last else 1, context, level)
def _pprint_key_val_tuple(self, object, stream, indent, allowance, context,
level):
"""Pretty printing for key-value tuples from dict or parameters."""
k, v = object
rep = self._repr(k, context, level)
if isinstance(object, KeyValTupleParam):
rep = rep.strip("'")
middle = '='
else:
middle = ': '
stream.write(rep)
stream.write(middle)
self._format(v, stream, indent + len(rep) + len(middle), allowance,
context, level)
# Note: need to copy _dispatch to prevent instances of the builtin
# PrettyPrinter class to call methods of _EstimatorPrettyPrinter (see issue
# 12906)
# mypy error: "Type[PrettyPrinter]" has no attribute "_dispatch"
_dispatch = pprint.PrettyPrinter._dispatch.copy() # type: ignore
_dispatch[BaseEstimator.__repr__] = _pprint_estimator
_dispatch[KeyValTuple.__repr__] = _pprint_key_val_tuple
def _safe_repr(object, context, maxlevels, level, changed_only=False):
"""Same as the builtin _safe_repr, with added support for Estimator
objects."""
typ = type(object)
if typ in pprint._builtin_scalars:
return repr(object), True, False
r = getattr(typ, "__repr__", None)
if issubclass(typ, dict) and r is dict.__repr__:
if not object:
return "{}", True, False
objid = id(object)
if maxlevels and level >= maxlevels:
return "{...}", False, objid in context
if objid in context:
return pprint._recursion(object), False, True
context[objid] = 1
readable = True
recursive = False
components = []
append = components.append
level += 1
saferepr = _safe_repr
items = sorted(object.items(), key=pprint._safe_tuple)
for k, v in items:
krepr, kreadable, krecur = saferepr(
k, context, maxlevels, level, changed_only=changed_only)
vrepr, vreadable, vrecur = saferepr(
v, context, maxlevels, level, changed_only=changed_only)
append("%s: %s" % (krepr, vrepr))
readable = readable and kreadable and vreadable
if krecur or vrecur:
recursive = True
del context[objid]
return "{%s}" % ", ".join(components), readable, recursive
if (issubclass(typ, list) and r is list.__repr__) or \
(issubclass(typ, tuple) and r is tuple.__repr__):
if issubclass(typ, list):
if not object:
return "[]", True, False
format = "[%s]"
elif len(object) == 1:
format = "(%s,)"
else:
if not object:
return "()", True, False
format = "(%s)"
objid = id(object)
if maxlevels and level >= maxlevels:
return format % "...", False, objid in context
if objid in context:
return pprint._recursion(object), False, True
context[objid] = 1
readable = True
recursive = False
components = []
append = components.append
level += 1
for o in object:
orepr, oreadable, orecur = _safe_repr(
o, context, maxlevels, level, changed_only=changed_only)
append(orepr)
if not oreadable:
readable = False
if orecur:
recursive = True
del context[objid]
return format % ", ".join(components), readable, recursive
if issubclass(typ, BaseEstimator):
objid = id(object)
if maxlevels and level >= maxlevels:
return "{...}", False, objid in context
if objid in context:
return pprint._recursion(object), False, True
context[objid] = 1
readable = True
recursive = False
if changed_only:
params = _changed_params(object)
else:
params = object.get_params(deep=False)
components = []
append = components.append
level += 1
saferepr = _safe_repr
items = sorted(params.items(), key=pprint._safe_tuple)
for k, v in items:
krepr, kreadable, krecur = saferepr(
k, context, maxlevels, level, changed_only=changed_only)
vrepr, vreadable, vrecur = saferepr(
v, context, maxlevels, level, changed_only=changed_only)
append("%s=%s" % (krepr.strip("'"), vrepr))
readable = readable and kreadable and vreadable
if krecur or vrecur:
recursive = True
del context[objid]
return ("%s(%s)" % (typ.__name__, ", ".join(components)), readable,
recursive)
rep = repr(object)
return rep, (rep and not rep.startswith('<')), False
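
A usage sketch of the pretty printer; pformat is inherited from pprint.PrettyPrinter, and this mirrors how BaseEstimator.__repr__ uses the class (the exact output depends on the print_changed_only configuration):

from sklearn.linear_model import LogisticRegression

pp = _EstimatorPrettyPrinter(compact=True, indent=1,
                             indent_at_name=True,
                             n_max_elements_to_show=30)
pp.pformat(LogisticRegression(C=10))
# with print_changed_only=True this renders as 'LogisticRegression(C=10)'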


@@ -0,0 +1,44 @@
# Authors: Arnaud Joly
#
# License: BSD 3 clause
import numpy as np
cimport numpy as np
ctypedef np.npy_uint32 UINT32_t
cdef inline UINT32_t DEFAULT_SEED = 1
cdef enum:
# Max value for our rand_r replacement (near the bottom).
# We don't use RAND_MAX because it's different across platforms and
# particularly tiny on Windows/MSVC.
RAND_R_MAX = 0x7FFFFFFF
cpdef sample_without_replacement(np.int_t n_population,
np.int_t n_samples,
method=*,
random_state=*)
# rand_r replacement using a 32bit XorShift generator
# See http://www.jstatsoft.org/v08/i14/paper for details
cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil:
"""Generate a pseudo-random np.uint32 from a np.uint32 seed"""
# seed shouldn't ever be 0.
if (seed[0] == 0): seed[0] = DEFAULT_SEED
seed[0] ^= <UINT32_t>(seed[0] << 13)
seed[0] ^= <UINT32_t>(seed[0] >> 17)
seed[0] ^= <UINT32_t>(seed[0] << 5)
# Note: we must be careful with the final line cast to np.uint32 so that
# the function behaves consistently across platforms.
#
# The following cast might yield different results on different platforms:
# wrong_cast = <UINT32_t> RAND_R_MAX + 1
#
# We can use:
# good_cast = <UINT32_t>(RAND_R_MAX + 1)
# or:
# cdef np.uint32_t another_good_cast = <UINT32_t>RAND_R_MAX + 1
return seed[0] % <UINT32_t>(RAND_R_MAX + 1)
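
For illustration only, a pure-Python mirror of the XorShift update above (the real routine mutates a C uint32 in place):

def our_rand_r_py(seed):
    """Return (random_value, new_seed), mimicking our_rand_r."""
    RAND_R_MAX = 0x7FFFFFFF
    if seed == 0:
        seed = 1                         # DEFAULT_SEED guard, as above
    seed ^= (seed << 13) & 0xFFFFFFFF    # emulate 32-bit wraparound
    seed ^= seed >> 17
    seed ^= (seed << 5) & 0xFFFFFFFF
    return seed % (RAND_R_MAX + 1), seed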


@@ -0,0 +1,116 @@
#------------------------------------------------------------------------------
"""
Dataset abstractions for sequential data access.
WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
"""
cimport numpy as np
# SequentialDataset and its two concrete subclasses are (optionally randomized)
# iterators over the rows of a matrix X and corresponding target values y.
cdef class SequentialDataset64:
cdef int current_index
cdef np.ndarray index
cdef int *index_data_ptr
cdef Py_ssize_t n_samples
cdef np.uint32_t seed
cdef void shuffle(self, np.uint32_t seed) nogil
cdef int _get_next_index(self) nogil
cdef int _get_random_index(self) nogil
cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr,
int *nnz, double *y, double *sample_weight,
int current_index) nogil
cdef void next(self, double **x_data_ptr, int **x_ind_ptr,
int *nnz, double *y, double *sample_weight) nogil
cdef int random(self, double **x_data_ptr, int **x_ind_ptr,
int *nnz, double *y, double *sample_weight) nogil
cdef class ArrayDataset64(SequentialDataset64):
cdef np.ndarray X
cdef np.ndarray Y
cdef np.ndarray sample_weights
cdef Py_ssize_t n_features
cdef np.npy_intp X_stride
cdef double *X_data_ptr
cdef double *Y_data_ptr
cdef np.ndarray feature_indices
cdef int *feature_indices_ptr
cdef double *sample_weight_data
cdef class CSRDataset64(SequentialDataset64):
cdef np.ndarray X_data
cdef np.ndarray X_indptr
cdef np.ndarray X_indices
cdef np.ndarray Y
cdef np.ndarray sample_weights
cdef double *X_data_ptr
cdef int *X_indptr_ptr
cdef int *X_indices_ptr
cdef double *Y_data_ptr
cdef double *sample_weight_data
#------------------------------------------------------------------------------
"""
Dataset abstractions for sequential data access.
WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
"""
cimport numpy as np
# SequentialDataset and its two concrete subclasses are (optionally randomized)
# iterators over the rows of a matrix X and corresponding target values y.
cdef class SequentialDataset32:
cdef int current_index
cdef np.ndarray index
cdef int *index_data_ptr
cdef Py_ssize_t n_samples
cdef np.uint32_t seed
cdef void shuffle(self, np.uint32_t seed) nogil
cdef int _get_next_index(self) nogil
cdef int _get_random_index(self) nogil
cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr,
int *nnz, float *y, float *sample_weight,
int current_index) nogil
cdef void next(self, float **x_data_ptr, int **x_ind_ptr,
int *nnz, float *y, float *sample_weight) nogil
cdef int random(self, float **x_data_ptr, int **x_ind_ptr,
int *nnz, float *y, float *sample_weight) nogil
cdef class ArrayDataset32(SequentialDataset32):
cdef np.ndarray X
cdef np.ndarray Y
cdef np.ndarray sample_weights
cdef Py_ssize_t n_features
cdef np.npy_intp X_stride
cdef float *X_data_ptr
cdef float *Y_data_ptr
cdef np.ndarray feature_indices
cdef int *feature_indices_ptr
cdef float *sample_weight_data
cdef class CSRDataset32(SequentialDataset32):
cdef np.ndarray X_data
cdef np.ndarray X_indptr
cdef np.ndarray X_indices
cdef np.ndarray Y
cdef np.ndarray sample_weights
cdef float *X_data_ptr
cdef int *X_indptr_ptr
cdef int *X_indices_ptr
cdef float *Y_data_ptr
cdef float *sample_weight_data


@@ -0,0 +1,94 @@
"""
Utility methods to print system info for debugging
adapted from :func:`pandas.show_versions`
"""
# License: BSD 3 clause
import platform
import sys
import importlib
from ._openmp_helpers import _openmp_parallelism_enabled
def _get_sys_info():
"""System information
Returns
-------
sys_info : dict
system and Python version information
"""
python = sys.version.replace('\n', ' ')
blob = [
("python", python),
('executable', sys.executable),
("machine", platform.platform()),
]
return dict(blob)
def _get_deps_info():
"""Overview of the installed version of main dependencies
Returns
-------
deps_info: dict
version information on relevant Python libraries
"""
deps = [
"pip",
"setuptools",
"sklearn",
"numpy",
"scipy",
"Cython",
"pandas",
"matplotlib",
"joblib",
"threadpoolctl"
]
def get_version(module):
return module.__version__
deps_info = {}
for modname in deps:
try:
if modname in sys.modules:
mod = sys.modules[modname]
else:
mod = importlib.import_module(modname)
ver = get_version(mod)
deps_info[modname] = ver
except ImportError:
deps_info[modname] = None
return deps_info
def show_versions():
"""Print useful debugging information"
.. versionadded:: 0.20
"""
sys_info = _get_sys_info()
deps_info = _get_deps_info()
print('\nSystem:')
for k, stat in sys_info.items():
print("{k:>10}: {stat}".format(k=k, stat=stat))
print('\nPython dependencies:')
for k, stat in deps_info.items():
print("{k:>13}: {stat}".format(k=k, stat=stat))
print("\n{k}: {stat}".format(k="Built with OpenMP",
stat=_openmp_parallelism_enabled()))
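
Typical invocation; the function only prints and returns None:

from sklearn import show_versions

show_versions()  # prints the System, Python dependencies and OpenMP sections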


@@ -0,0 +1,873 @@
"""Testing utilities."""
# Copyright (c) 2011, 2012
# Authors: Pietro Berkes,
# Andreas Muller
# Mathieu Blondel
# Olivier Grisel
# Arnaud Joly
# Denis Engemann
# Giorgio Patrini
# Thierry Guillemot
# License: BSD 3 clause
import os
import os.path as op
import inspect
import pkgutil
import warnings
import sys
import functools
import tempfile
from subprocess import check_output, STDOUT, CalledProcessError
from subprocess import TimeoutExpired
import scipy as sp
from functools import wraps
from operator import itemgetter
from inspect import signature
import shutil
import atexit
import unittest
from unittest import TestCase
# WindowsError only exist on Windows
try:
WindowsError
except NameError:
WindowsError = None
from numpy.testing import assert_allclose
from numpy.testing import assert_almost_equal
from numpy.testing import assert_approx_equal
from numpy.testing import assert_array_equal
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_less
import numpy as np
import joblib
import sklearn
from sklearn.base import (BaseEstimator, ClassifierMixin, ClusterMixin,
RegressorMixin, TransformerMixin)
from sklearn.utils import deprecated, IS_PYPY, _IS_32BIT
__all__ = ["assert_equal", "assert_not_equal", "assert_raises",
"assert_raises_regexp",
"assert_almost_equal", "assert_array_equal",
"assert_array_almost_equal", "assert_array_less",
"assert_less", "assert_less_equal",
"assert_greater", "assert_greater_equal",
"assert_approx_equal", "assert_allclose",
"assert_run_python_script", "SkipTest", "all_estimators"]
_dummy = TestCase('__init__')
deprecation_message = (
'This helper is deprecated in version 0.22 and will be removed in version '
'0.24. Please use "assert" instead'
)
assert_equal = deprecated(deprecation_message)(_dummy.assertEqual)
assert_not_equal = deprecated(deprecation_message)(_dummy.assertNotEqual)
assert_raises = _dummy.assertRaises
SkipTest = unittest.case.SkipTest
assert_dict_equal = _dummy.assertDictEqual
assert_in = deprecated(deprecation_message)(_dummy.assertIn)
assert_not_in = deprecated(deprecation_message)(_dummy.assertNotIn)
assert_less = deprecated(deprecation_message)(_dummy.assertLess)
assert_greater = deprecated(deprecation_message)(_dummy.assertGreater)
assert_less_equal = deprecated(deprecation_message)(_dummy.assertLessEqual)
assert_greater_equal = deprecated(deprecation_message)(
_dummy.assertGreaterEqual)
assert_raises_regex = _dummy.assertRaisesRegex
# assert_raises_regexp is deprecated in Python 3.4 in favor of
# assert_raises_regex but lets keep the backward compat in scikit-learn with
# the old name for now
assert_raises_regexp = assert_raises_regex
def assert_warns(warning_class, func, *args, **kw):
"""Test that a certain warning occurs.
Parameters
----------
warning_class : the warning class
The class to test for, e.g. UserWarning.
func : callable
Callable object to trigger warnings.
*args : the positional arguments to `func`.
**kw : the keyword arguments to `func`
Returns
-------
result : the return value of `func`
"""
with warnings.catch_warnings(record=True) as w:
# Cause all warnings to always be triggered.
warnings.simplefilter("always")
# Trigger a warning.
result = func(*args, **kw)
if hasattr(np, 'FutureWarning'):
# Filter out numpy-specific warnings in numpy >= 1.9
w = [e for e in w
if e.category is not np.VisibleDeprecationWarning]
# Verify some things
if not len(w) > 0:
raise AssertionError("No warning raised when calling %s"
% func.__name__)
found = any(warning.category is warning_class for warning in w)
if not found:
raise AssertionError("%s did not give warning: %s( is %s)"
% (func.__name__, warning_class, w))
return result
def assert_warns_message(warning_class, message, func, *args, **kw):
# very important to avoid uncontrolled state propagation
"""Test that a certain warning occurs and with a certain message.
Parameters
----------
warning_class : the warning class
The class to test for, e.g. UserWarning.
message : str | callable
The message or a substring of the message to test for. If callable,
it takes a string as the argument and will trigger an AssertionError
if the callable returns `False`.
func : callable
Callable object to trigger warnings.
*args : the positional arguments to `func`.
**kw : the keyword arguments to `func`.
Returns
-------
result : the return value of `func`
"""
with warnings.catch_warnings(record=True) as w:
# Cause all warnings to always be triggered.
warnings.simplefilter("always")
if hasattr(np, 'FutureWarning'):
# Let's not catch the numpy internal DeprecationWarnings
warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
# Trigger a warning.
result = func(*args, **kw)
# Verify some things
if not len(w) > 0:
raise AssertionError("No warning raised when calling %s"
% func.__name__)
found = [issubclass(warning.category, warning_class) for warning in w]
if not any(found):
raise AssertionError("No warning raised for %s with class "
"%s"
% (func.__name__, warning_class))
message_found = False
# Check the message of all warnings that belong to warning_class
for index in [i for i, x in enumerate(found) if x]:
# substring will match, the entire message with typo won't
msg = w[index].message # For Python 3 compatibility
msg = str(msg.args[0] if hasattr(msg, 'args') else msg)
if callable(message): # add support for certain tests
check_in_message = message
else:
def check_in_message(msg): return message in msg
if check_in_message(msg):
message_found = True
break
if not message_found:
raise AssertionError("Did not receive the message you expected "
"('%s') for <%s>, got: '%s'"
% (message, func.__name__, msg))
return result
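# Usage sketch for the two helpers above (illustration): trigger a warning and
# check its category, and for assert_warns_message also a message substring.
def _noisy():
    warnings.warn("this API is deprecated", FutureWarning)
    return 42

assert_warns(FutureWarning, _noisy)                        # returns 42
assert_warns_message(FutureWarning, "deprecated", _noisy)  # substring check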
def assert_warns_div0(func, *args, **kw):
"""Assume that numpy's warning for divide by zero is raised
Handles the case of platforms that do not support warning on divide by zero
Parameters
----------
func
*args
**kw
"""
with np.errstate(divide='warn', invalid='warn'):
try:
assert_warns(RuntimeWarning, np.divide, 1, np.zeros(1))
except AssertionError:
# This platform does not report numpy divide by zeros
return func(*args, **kw)
return assert_warns_message(RuntimeWarning,
'invalid value encountered',
func, *args, **kw)
# To remove when we support numpy 1.7
def assert_no_warnings(func, *args, **kw):
"""
Parameters
----------
func
*args
**kw
"""
# very important to avoid uncontrolled state propagation
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter('always')
result = func(*args, **kw)
if hasattr(np, 'FutureWarning'):
# Filter out numpy-specific warnings in numpy >= 1.9
w = [e for e in w
if e.category is not np.VisibleDeprecationWarning]
if len(w) > 0:
raise AssertionError("Got warnings when calling %s: [%s]"
% (func.__name__,
', '.join(str(warning) for warning in w)))
return result
def ignore_warnings(obj=None, category=Warning):
"""Context manager and decorator to ignore warnings.
Note: Using this (in both variants) will clear all warnings
from all python modules loaded. In case you need to test
cross-module-warning-logging, this is not your tool of choice.
Parameters
----------
obj : callable or None
callable where you want to ignore the warnings.
category : warning class, defaults to Warning.
The category to filter. If Warning, all categories will be muted.
Examples
--------
>>> with ignore_warnings():
... warnings.warn('buhuhuhu')
>>> def nasty_warn():
... warnings.warn('buhuhuhu')
... print(42)
>>> ignore_warnings(nasty_warn)()
42
"""
if isinstance(obj, type) and issubclass(obj, Warning):
# Avoid common pitfall of passing category as the first positional
# argument, which results in the test not being run
warning_name = obj.__name__
raise ValueError(
"'obj' should be a callable where you want to ignore warnings. "
"You passed a warning class instead: 'obj={warning_name}'. "
"If you want to pass a warning class to ignore_warnings, "
"you should use 'category={warning_name}'".format(
warning_name=warning_name))
elif callable(obj):
return _IgnoreWarnings(category=category)(obj)
else:
return _IgnoreWarnings(category=category)
class _IgnoreWarnings:
"""Improved and simplified Python warnings context manager and decorator.
This class allows the user to ignore the warnings raised by a function.
Copied from Python 2.7.5 and modified as required.
Parameters
----------
category : tuple of warning class, default to Warning
The category to filter. By default, all the categories will be muted.
"""
def __init__(self, category):
self._record = True
self._module = sys.modules['warnings']
self._entered = False
self.log = []
self.category = category
def __call__(self, fn):
"""Decorator to catch and hide warnings without visual nesting."""
@wraps(fn)
def wrapper(*args, **kwargs):
with warnings.catch_warnings():
warnings.simplefilter("ignore", self.category)
return fn(*args, **kwargs)
return wrapper
def __repr__(self):
args = []
if self._record:
args.append("record=True")
if self._module is not sys.modules['warnings']:
args.append("module=%r" % self._module)
name = type(self).__name__
return "%s(%s)" % (name, ", ".join(args))
def __enter__(self):
if self._entered:
raise RuntimeError("Cannot enter %r twice" % self)
self._entered = True
self._filters = self._module.filters
self._module.filters = self._filters[:]
self._showwarning = self._module.showwarning
warnings.simplefilter("ignore", self.category)
def __exit__(self, *exc_info):
if not self._entered:
raise RuntimeError("Cannot exit %r without entering first" % self)
self._module.filters = self._filters
self._module.showwarning = self._showwarning
self.log[:] = []
def assert_raise_message(exceptions, message, function, *args, **kwargs):
"""Helper function to test the message raised in an exception.
Given an exception, a callable to raise the exception, and
a message string, tests that the correct exception is raised and
that the message is a substring of the error thrown. Used to test
that the specific message thrown during an exception is correct.
Parameters
----------
exceptions : exception or tuple of exception
An Exception object.
message : str
The error message or a substring of the error message.
function : callable
Callable object to raise error.
*args : the positional arguments to `function`.
**kwargs : the keyword arguments to `function`.
"""
try:
function(*args, **kwargs)
except exceptions as e:
error_message = str(e)
if message not in error_message:
raise AssertionError("Error message does not include the expected"
" string: %r. Observed error message: %r" %
(message, error_message))
else:
# concatenate exception names
if isinstance(exceptions, tuple):
names = " or ".join(e.__name__ for e in exceptions)
else:
names = exceptions.__name__
raise AssertionError("%s not raised by %s" %
(names, function.__name__))
def assert_allclose_dense_sparse(x, y, rtol=1e-07, atol=1e-9, err_msg=''):
"""Assert allclose for sparse and dense data.
Both x and y need to be either sparse or dense, they
can't be mixed.
Parameters
----------
x : array-like or sparse matrix
First array to compare.
y : array-like or sparse matrix
Second array to compare.
rtol : float, optional
relative tolerance; see numpy.allclose
atol : float, optional
absolute tolerance; see numpy.allclose. Note that the default here is
more tolerant than the default for numpy.testing.assert_allclose, where
atol=0.
err_msg : string, default=''
Error message to raise.
"""
if sp.sparse.issparse(x) and sp.sparse.issparse(y):
x = x.tocsr()
y = y.tocsr()
x.sum_duplicates()
y.sum_duplicates()
assert_array_equal(x.indices, y.indices, err_msg=err_msg)
assert_array_equal(x.indptr, y.indptr, err_msg=err_msg)
assert_allclose(x.data, y.data, rtol=rtol, atol=atol, err_msg=err_msg)
elif not sp.sparse.issparse(x) and not sp.sparse.issparse(y):
# both dense
assert_allclose(x, y, rtol=rtol, atol=atol, err_msg=err_msg)
else:
raise ValueError("Can only compare two sparse matrices,"
" not a sparse matrix and an array.")
# TODO: Remove in 0.24. This class is now in utils.__init__.
def all_estimators(type_filter=None):
"""Get a list of all estimators from sklearn.
This function crawls the module and gets all classes that inherit
from BaseEstimator. Classes that are defined in test-modules are not
included.
By default meta_estimators such as GridSearchCV are also not included.
Parameters
----------
type_filter : string, list of string, or None, default=None
Which kind of estimators should be returned. If None, no filter is
applied and all estimators are returned. Possible values are
'classifier', 'regressor', 'cluster' and 'transformer' to get
estimators only of these specific types, or a list of these to
get the estimators that fit at least one of the types.
Returns
-------
estimators : list of tuples
List of (name, class), where ``name`` is the class name as string
and ``class`` is the actual type of the class.
"""
def is_abstract(c):
if not hasattr(c, '__abstractmethods__'):
return False
if not len(c.__abstractmethods__):
return False
return True
all_classes = []
# get parent folder
path = sklearn.__path__
for importer, modname, ispkg in pkgutil.walk_packages(
path=path, prefix='sklearn.', onerror=lambda x: None):
if ".tests." in modname or "externals" in modname:
continue
if IS_PYPY and ('_svmlight_format_io' in modname or
'feature_extraction._hashing_fast' in modname):
continue
# Ignore deprecation warnings triggered at import time.
with ignore_warnings(category=FutureWarning):
module = __import__(modname, fromlist="dummy")
classes = inspect.getmembers(module, inspect.isclass)
all_classes.extend(classes)
all_classes = set(all_classes)
estimators = [c for c in all_classes
if (issubclass(c[1], BaseEstimator) and
c[0] != 'BaseEstimator')]
# get rid of abstract base classes
estimators = [c for c in estimators if not is_abstract(c[1])]
if type_filter is not None:
if not isinstance(type_filter, list):
type_filter = [type_filter]
else:
type_filter = list(type_filter) # copy
filtered_estimators = []
filters = {'classifier': ClassifierMixin,
'regressor': RegressorMixin,
'transformer': TransformerMixin,
'cluster': ClusterMixin}
for name, mixin in filters.items():
if name in type_filter:
type_filter.remove(name)
filtered_estimators.extend([est for est in estimators
if issubclass(est[1], mixin)])
estimators = filtered_estimators
if type_filter:
raise ValueError("Parameter type_filter must be 'classifier', "
"'regressor', 'transformer', 'cluster' or "
"None, got"
" %s." % repr(type_filter))
# drop duplicates, sort for reproducibility
# itemgetter is used to ensure the sort does not extend to the 2nd item of
# the tuple
return sorted(set(estimators), key=itemgetter(0))
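# Usage sketch for all_estimators(): restrict the crawl to classifiers and
# keep only the class names (the exact list depends on the installed version).
classifier_names = [name for name, cls in
                    all_estimators(type_filter='classifier')]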
def set_random_state(estimator, random_state=0):
"""Set random state of an estimator if it has the `random_state` param.
Parameters
----------
estimator : object
The estimator
random_state : int, RandomState instance or None, optional, default=0
Pseudo random number generator state.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
"""
if "random_state" in estimator.get_params():
estimator.set_params(random_state=random_state)
try:
import pytest
skip_if_32bit = pytest.mark.skipif(_IS_32BIT,
reason='skipped on 32bit platforms')
skip_travis = pytest.mark.skipif(os.environ.get('TRAVIS') == 'true',
reason='skip on travis')
fails_if_pypy = pytest.mark.xfail(IS_PYPY,
reason='not compatible with PyPy')
skip_if_no_parallel = pytest.mark.skipif(not joblib.parallel.mp,
reason="joblib is in serial mode")
# Decorator for tests involving both BLAS calls and multiprocessing.
#
# Under POSIX (e.g. Linux or OSX), using multiprocessing in conjunction
# with some implementation of BLAS (or other libraries that manage an
# internal posix thread pool) can cause a crash or a freeze of the Python
# process.
#
# In practice all known packaged distributions (from Linux distros or
# Anaconda) of BLAS under Linux seem to be safe, so this problem seems
# to only impact OSX users.
#
# This wrapper makes it possible to skip tests that can possibly cause
# this crash under OS X.
#
# Under Python 3.4+ it is possible to use the `forkserver` start method
# for multiprocessing to avoid this issue. However it can cause pickling
# errors on interactively defined functions. It is therefore not enabled by
# default.
if_safe_multiprocessing_with_blas = pytest.mark.skipif(
sys.platform == 'darwin',
reason="Possible multi-process bug with some BLAS")
except ImportError:
pass
def check_skip_network():
if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)):
raise SkipTest("Text tutorial requires large dataset download")
def _delete_folder(folder_path, warn=False):
"""Utility function to cleanup a temporary folder if still existing.
Copy from joblib.pool (for independence).
"""
try:
if os.path.exists(folder_path):
# This can fail under windows,
# but will succeed when called by atexit
shutil.rmtree(folder_path)
except WindowsError:
if warn:
warnings.warn("Could not delete temporary folder %s" % folder_path)
class TempMemmap:
"""
Parameters
----------
data
mmap_mode
"""
def __init__(self, data, mmap_mode='r'):
self.mmap_mode = mmap_mode
self.data = data
def __enter__(self):
data_read_only, self.temp_folder = create_memmap_backed_data(
self.data, mmap_mode=self.mmap_mode, return_folder=True)
return data_read_only
def __exit__(self, exc_type, exc_val, exc_tb):
_delete_folder(self.temp_folder)
def create_memmap_backed_data(data, mmap_mode='r', return_folder=False):
"""
Parameters
----------
data
mmap_mode
return_folder
"""
temp_folder = tempfile.mkdtemp(prefix='sklearn_testing_')
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
filename = op.join(temp_folder, 'data.pkl')
joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
result = (memmap_backed_data if not return_folder
else (memmap_backed_data, temp_folder))
return result
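# Usage sketch: both TempMemmap and create_memmap_backed_data hand back
# read-only, memory-mapped copies of the input, which is useful for testing
# estimators against read-only data.
with TempMemmap(np.arange(5)) as data_ro:
    assert not data_ro.flags.writeable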
# Utils to test docstrings
def _get_args(function, varargs=False):
"""Helper to get function arguments"""
try:
params = signature(function).parameters
except ValueError:
# Error on builtin C function
return []
args = [key for key, param in params.items()
if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)]
if varargs:
varargs = [param.name for param in params.values()
if param.kind == param.VAR_POSITIONAL]
if len(varargs) == 0:
varargs = None
return args, varargs
else:
return args
def _get_func_name(func):
"""Get function full name
Parameters
----------
func : callable
The function object.
Returns
-------
name : str
The function name.
"""
parts = []
module = inspect.getmodule(func)
if module:
parts.append(module.__name__)
qualname = func.__qualname__
if qualname != func.__name__:
parts.append(qualname[:qualname.find('.')])
parts.append(func.__name__)
return '.'.join(parts)
def check_docstring_parameters(func, doc=None, ignore=None):
"""Helper to check docstring
Parameters
----------
func : callable
The function object to test.
doc : str, optional (default: None)
Docstring if it is passed manually to the test.
ignore : None | list
Parameters to ignore.
Returns
-------
incorrect : list
A list of string describing the incorrect results.
"""
from numpydoc import docscrape
incorrect = []
ignore = [] if ignore is None else ignore
func_name = _get_func_name(func)
if (not func_name.startswith('sklearn.') or
func_name.startswith('sklearn.externals')):
return incorrect
# Don't check docstring for property-functions
if inspect.isdatadescriptor(func):
return incorrect
# Don't check docstring for setup / teardown pytest functions
if func_name.split('.')[-1] in ('setup_module', 'teardown_module'):
return incorrect
# Dont check estimator_checks module
if func_name.split('.')[2] == 'estimator_checks':
return incorrect
# Get the arguments from the function signature
param_signature = list(filter(lambda x: x not in ignore, _get_args(func)))
# drop self
if len(param_signature) > 0 and param_signature[0] == 'self':
param_signature.remove('self')
# Analyze function's docstring
if doc is None:
with warnings.catch_warnings(record=True) as w:
try:
doc = docscrape.FunctionDoc(func)
except Exception as exp:
incorrect += [func_name + ' parsing error: ' + str(exp)]
return incorrect
if len(w):
raise RuntimeError('Error for %s:\n%s' % (func_name, w[0]))
param_docs = []
for name, type_definition, param_doc in doc['Parameters']:
# Type hints are empty only if parameter name ended with :
if not type_definition.strip():
if ':' in name and name[:name.index(':')][-1:].strip():
incorrect += [func_name +
' There was no space between the param name and '
'colon (%r)' % name]
elif name.rstrip().endswith(':'):
incorrect += [func_name +
' Parameter %r has an empty type spec. '
'Remove the colon' % (name.lstrip())]
# Create a list of parameters to compare with the parameters gotten
# from the func signature
if '*' not in name:
param_docs.append(name.split(':')[0].strip('` '))
# If one of the docstring's parameters had an error then return that
# incorrect message
if len(incorrect) > 0:
return incorrect
# Remove the parameters that should be ignored from list
param_docs = list(filter(lambda x: x not in ignore, param_docs))
# The following is derived from pytest, Copyright (c) 2004-2017 Holger
# Krekel and others, Licensed under MIT License. See
# https://github.com/pytest-dev/pytest
message = []
for i in range(min(len(param_docs), len(param_signature))):
if param_signature[i] != param_docs[i]:
message += ["There's a parameter name mismatch in function"
" docstring w.r.t. function signature, at index %s"
" diff: %r != %r" %
(i, param_signature[i], param_docs[i])]
break
if len(param_signature) > len(param_docs):
message += ["Parameters in function docstring have less items w.r.t."
" function signature, first missing item: %s" %
param_signature[len(param_docs)]]
elif len(param_signature) < len(param_docs):
message += ["Parameters in function docstring have more items w.r.t."
" function signature, first extra item: %s" %
param_docs[len(param_signature)]]
# If there wasn't any difference in the parameters themselves between
# docstring and signature including having the same length then return
# empty list
if len(message) == 0:
return []
import difflib
import pprint
param_docs_formatted = pprint.pformat(param_docs).splitlines()
param_signature_formatted = pprint.pformat(param_signature).splitlines()
message += ["Full diff:"]
message.extend(
line.strip() for line in difflib.ndiff(param_signature_formatted,
param_docs_formatted)
)
incorrect.extend(message)
# Prepend function name
incorrect = ['In function: ' + func_name] + incorrect
return incorrect
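# A minimal usage sketch (hedged), assuming this module is importable as
# sklearn.utils._testing and numpydoc is installed; accuracy_score is just
# an arbitrary public function used for illustration.
from sklearn.metrics import accuracy_score
from sklearn.utils._testing import check_docstring_parameters

problems = check_docstring_parameters(accuracy_score)
print(problems)  # expected to be [] when signature and docstring agree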
def assert_run_python_script(source_code, timeout=60):
"""Utility to check assertions in an independent Python subprocess.
The script provided in the source code should return 0 and not print
anything on stderr or stdout.
This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle
Parameters
----------
source_code : str
The Python source code to execute.
timeout : int
Time in seconds before timeout.
"""
fd, source_file = tempfile.mkstemp(suffix='_src_test_sklearn.py')
os.close(fd)
try:
with open(source_file, 'wb') as f:
f.write(source_code.encode('utf-8'))
cmd = [sys.executable, source_file]
cwd = op.normpath(op.join(op.dirname(sklearn.__file__), '..'))
env = os.environ.copy()
try:
env["PYTHONPATH"] = os.pathsep.join([cwd, env["PYTHONPATH"]])
except KeyError:
env["PYTHONPATH"] = cwd
kwargs = {
'cwd': cwd,
'stderr': STDOUT,
'env': env
}
# If coverage is running, pass the config file to the subprocess
coverage_rc = os.environ.get("COVERAGE_PROCESS_START")
if coverage_rc:
kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc
kwargs['timeout'] = timeout
try:
try:
out = check_output(cmd, **kwargs)
except CalledProcessError as e:
raise RuntimeError(u"script errored with output:\n%s"
% e.output.decode('utf-8'))
if out != b"":
raise AssertionError(out.decode('utf-8'))
except TimeoutExpired as e:
raise RuntimeError(u"script timeout, output so far:\n%s"
% e.output.decode('utf-8'))
finally:
os.unlink(source_file)
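# A minimal usage sketch (hedged), assuming this module is importable as
# sklearn.utils._testing: the snippet must exit with status 0 and print
# nothing, otherwise an error is raised.
from sklearn.utils._testing import assert_run_python_script

assert_run_python_script(
    "import sklearn\n"
    "assert hasattr(sklearn, '__version__')\n"
)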
def _convert_container(container, constructor_name, columns_name=None):
if constructor_name == 'list':
return list(container)
elif constructor_name == 'tuple':
return tuple(container)
elif constructor_name == 'array':
return np.asarray(container)
elif constructor_name == 'sparse':
return sp.sparse.csr_matrix(container)
elif constructor_name == 'dataframe':
pd = pytest.importorskip('pandas')
return pd.DataFrame(container, columns=columns_name)
elif constructor_name == 'series':
pd = pytest.importorskip('pandas')
return pd.Series(container)
elif constructor_name == 'index':
pd = pytest.importorskip('pandas')
return pd.Index(container)
elif constructor_name == 'slice':
return slice(container[0], container[1])
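# A minimal usage sketch (hedged), assuming this module is importable as
# sklearn.utils._testing: the same data materialized as the container types
# used throughout the test suite.
import numpy as np
from sklearn.utils._testing import _convert_container

data = [0, 1, 2]
assert isinstance(_convert_container(data, 'array'), np.ndarray)
assert _convert_container(data, 'tuple') == (0, 1, 2)
assert _convert_container(data, 'slice') == slice(0, 1)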


@ -0,0 +1,20 @@
"""Efficient (dense) parameter vector implementation for linear models. """
cdef class WeightVector(object):
cdef double *w_data_ptr
cdef double *aw_data_ptr
cdef double wscale
cdef double average_a
cdef double average_b
cdef int n_features
cdef double sq_norm
cdef void add(self, double *x_data_ptr, int *x_ind_ptr,
int xnnz, double c) nogil
cdef void add_average(self, double *x_data_ptr, int *x_ind_ptr,
int xnnz, double c, double num_iter) nogil
cdef double dot(self, double *x_data_ptr, int *x_ind_ptr,
int xnnz) nogil
cdef void scale(self, double c) nogil
cdef void reset_wscale(self) nogil
cdef double norm(self) nogil


@ -0,0 +1,181 @@
# Authors: Andreas Mueller
# Manoj Kumar
# License: BSD 3 clause
import numpy as np
from .validation import _deprecate_positional_args
@_deprecate_positional_args
def compute_class_weight(class_weight, *, classes, y):
"""Estimate class weights for unbalanced datasets.
Parameters
----------
class_weight : dict, 'balanced' or None
If 'balanced', class weights will be given by
``n_samples / (n_classes * np.bincount(y))``.
If a dictionary is given, keys are classes and values
are corresponding class weights.
If None is given, the class weights will be uniform.
classes : ndarray
Array of the classes occurring in the data, as given by
``np.unique(y_org)`` with ``y_org`` the original class labels.
y : array-like, shape (n_samples,)
Array of original class labels per sample.
Returns
-------
class_weight_vect : ndarray, shape (n_classes,)
Array with class_weight_vect[i] the weight for i-th class
References
----------
The "balanced" heuristic is inspired by
Logistic Regression in Rare Events Data, King, Zeng, 2001.
"""
# Import error caused by circular imports.
from ..preprocessing import LabelEncoder
if set(y) - set(classes):
raise ValueError("classes should include all valid labels that can "
"be in y")
if class_weight is None or len(class_weight) == 0:
# uniform class weights
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
elif class_weight == 'balanced':
# Find the weight of each class as present in y.
le = LabelEncoder()
y_ind = le.fit_transform(y)
if not all(np.in1d(classes, le.classes_)):
raise ValueError("classes should have valid labels that are in y")
recip_freq = len(y) / (len(le.classes_) *
np.bincount(y_ind).astype(np.float64))
weight = recip_freq[le.transform(classes)]
else:
# user-defined dictionary
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
if not isinstance(class_weight, dict):
raise ValueError("class_weight must be dict, 'balanced', or None,"
" got: %r" % class_weight)
for c in class_weight:
i = np.searchsorted(classes, c)
if i >= len(classes) or classes[i] != c:
raise ValueError("Class label {} not present.".format(c))
else:
weight[i] = class_weight[c]
return weight
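# A minimal usage sketch (hedged): 'balanced' weights for an imbalanced
# binary target, assuming the public import path sklearn.utils.class_weight.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0, 0, 0, 0, 1, 1])
weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
# n_samples / (n_classes * np.bincount(y)) -> [6 / (2 * 4), 6 / (2 * 2)]
assert np.allclose(weights, [0.75, 1.5])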
@_deprecate_positional_args
def compute_sample_weight(class_weight, y, *, indices=None):
"""Estimate sample weights by class for unbalanced datasets.
Parameters
----------
class_weight : dict, list of dicts, "balanced", or None, optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.
Note that for multioutput (including multilabel) weights should be
defined for each class of every column in its own dict. For example,
for four-class multilabel classification weights should be
[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
[{1:1}, {2:5}, {3:1}, {4:1}].
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data:
``n_samples / (n_classes * np.bincount(y))``.
For multi-output, the weights of each column of y will be multiplied.
y : array-like of shape (n_samples,) or (n_samples, n_outputs)
Array of original class labels per sample.
indices : array-like, shape (n_subsample,), or None
Array of indices to be used in a subsample. Can be of length less than
n_samples in the case of a subsample, or equal to n_samples in the
case of a bootstrap subsample with repeated indices. If None, the
sample weight will be calculated over the full sample. Only "balanced"
is supported for class_weight if this is provided.
Returns
-------
sample_weight_vect : ndarray, shape (n_samples,)
Array with sample weights as applied to the original y
"""
y = np.atleast_1d(y)
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
n_outputs = y.shape[1]
if isinstance(class_weight, str):
if class_weight not in ['balanced']:
raise ValueError('The only valid preset for class_weight is '
'"balanced". Given "%s".' % class_weight)
elif (indices is not None and
not isinstance(class_weight, str)):
raise ValueError('The only valid class_weight for subsampling is '
'"balanced". Given "%s".' % class_weight)
elif n_outputs > 1:
if (not hasattr(class_weight, "__iter__") or
isinstance(class_weight, dict)):
raise ValueError("For multi-output, class_weight should be a "
"list of dicts, or a valid string.")
if len(class_weight) != n_outputs:
raise ValueError("For multi-output, number of elements in "
"class_weight should match number of outputs.")
expanded_class_weight = []
for k in range(n_outputs):
y_full = y[:, k]
classes_full = np.unique(y_full)
classes_missing = None
if class_weight == 'balanced' or n_outputs == 1:
class_weight_k = class_weight
else:
class_weight_k = class_weight[k]
if indices is not None:
# Get class weights for the subsample, covering all classes in
# case some labels that were present in the original data are
# missing from the sample.
y_subsample = y[indices, k]
classes_subsample = np.unique(y_subsample)
weight_k = np.take(compute_class_weight(class_weight_k,
classes=classes_subsample,
y=y_subsample),
np.searchsorted(classes_subsample,
classes_full),
mode='clip')
classes_missing = set(classes_full) - set(classes_subsample)
else:
weight_k = compute_class_weight(class_weight_k,
classes=classes_full,
y=y_full)
weight_k = weight_k[np.searchsorted(classes_full, y_full)]
if classes_missing:
# Make missing classes' weight zero
weight_k[np.in1d(y_full, list(classes_missing))] = 0.
expanded_class_weight.append(weight_k)
expanded_class_weight = np.prod(expanded_class_weight,
axis=0,
dtype=np.float64)
return expanded_class_weight
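# A minimal usage sketch (hedged): per-sample 'balanced' weights; each sample
# receives the weight of its class.
import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y = np.array([0, 0, 0, 1])
sw = compute_sample_weight('balanced', y)
# class 0 -> 4 / (2 * 3), class 1 -> 4 / (2 * 1)
assert np.allclose(sw, [2 / 3, 2 / 3, 2 / 3, 2.0])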


@ -0,0 +1,143 @@
import warnings
import functools
__all__ = ["deprecated"]
class deprecated:
"""Decorator to mark a function or class as deprecated.
Issue a warning when the function is called/the class is instantiated and
add a warning to the docstring.
The optional extra argument will be appended to the deprecation message
and the docstring. Note: to use this with the default value for extra, put
in an empty pair of parentheses:
>>> from sklearn.utils import deprecated
>>> deprecated()
<sklearn.utils.deprecation.deprecated object at ...>
>>> @deprecated()
... def some_function(): pass
Parameters
----------
extra : string
to be added to the deprecation messages
"""
# Adapted from https://wiki.python.org/moin/PythonDecoratorLibrary,
# but with many changes.
def __init__(self, extra=''):
self.extra = extra
def __call__(self, obj):
"""Call method
Parameters
----------
obj : object
"""
if isinstance(obj, type):
return self._decorate_class(obj)
elif isinstance(obj, property):
# Note that this is only triggered properly if the `property`
# decorator comes before the `deprecated` decorator, like so:
#
# @deprecated(msg)
# @property
# def deprecated_attribute_(self):
# ...
return self._decorate_property(obj)
else:
return self._decorate_fun(obj)
def _decorate_class(self, cls):
msg = "Class %s is deprecated" % cls.__name__
if self.extra:
msg += "; %s" % self.extra
# FIXME: we should probably reset __new__ for full generality
init = cls.__init__
def wrapped(*args, **kwargs):
warnings.warn(msg, category=FutureWarning)
return init(*args, **kwargs)
cls.__init__ = wrapped
wrapped.__name__ = '__init__'
wrapped.__doc__ = self._update_doc(init.__doc__)
wrapped.deprecated_original = init
return cls
def _decorate_fun(self, fun):
"""Decorate function fun"""
msg = "Function %s is deprecated" % fun.__name__
if self.extra:
msg += "; %s" % self.extra
@functools.wraps(fun)
def wrapped(*args, **kwargs):
warnings.warn(msg, category=FutureWarning)
return fun(*args, **kwargs)
wrapped.__doc__ = self._update_doc(wrapped.__doc__)
# Add a reference to the wrapped function so that we can introspect
# on function arguments in Python 2 (already works in Python 3)
wrapped.__wrapped__ = fun
return wrapped
def _decorate_property(self, prop):
msg = self.extra
@property
def wrapped(*args, **kwargs):
warnings.warn(msg, category=FutureWarning)
return prop.fget(*args, **kwargs)
return wrapped
def _update_doc(self, olddoc):
newdoc = "DEPRECATED"
if self.extra:
newdoc = "%s: %s" % (newdoc, self.extra)
if olddoc:
newdoc = "%s\n\n %s" % (newdoc, olddoc)
return newdoc
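# A minimal usage sketch (hedged); `old_helper` and `new_helper` are
# hypothetical names used only for illustration:
#
#   @deprecated("use new_helper instead")
#   def old_helper():
#       return 42
#
#   import warnings
#   with warnings.catch_warnings(record=True) as caught:
#       warnings.simplefilter("always")
#       old_helper()
#   assert any(w.category is FutureWarning for w in caught)
#   assert old_helper.__doc__.startswith("DEPRECATED: use new_helper instead")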
def _is_deprecated(func):
"""Helper to check if func is wrapped by our deprecated decorator"""
closures = getattr(func, '__closure__', [])
if closures is None:
closures = []
is_deprecated = ('deprecated' in ''.join([c.cell_contents
for c in closures
if isinstance(c.cell_contents, str)]))
return is_deprecated
def _raise_dep_warning_if_not_pytest(deprecated_path, correct_path):
# Raise a deprecation warning with standardized deprecation message.
# Useful because we are now deprecating anything that isn't explicitly
# in an __init__ file.
# TODO: remove in 0.24 since this shouldn't be needed anymore.
message = (
"The {deprecated_path} module is deprecated in version "
"0.22 and will be removed in version 0.24. "
"The corresponding classes / functions "
"should instead be imported from {correct_path}. "
"Anything that cannot be imported from {correct_path} is now "
"part of the private API."
).format(deprecated_path=deprecated_path, correct_path=correct_path)
warnings.warn(message, FutureWarning)

File diff suppressed because it is too large


@ -0,0 +1,837 @@
"""
Extended math utilities.
"""
# Authors: Gael Varoquaux
# Alexandre Gramfort
# Alexandre T. Passos
# Olivier Grisel
# Lars Buitinck
# Stefan van der Walt
# Kyle Kastner
# Giorgio Patrini
# License: BSD 3 clause
import warnings
import numpy as np
from scipy import linalg, sparse
from . import check_random_state
from ._logistic_sigmoid import _log_logistic_sigmoid
from .sparsefuncs_fast import csr_row_norms
from .validation import check_array
from .validation import _deprecate_positional_args
from .deprecation import deprecated
def squared_norm(x):
"""Squared Euclidean or Frobenius norm of x.
Faster than norm(x) ** 2.
Parameters
----------
x : array_like
Returns
-------
float
The Euclidean norm when x is a vector, the Frobenius norm when x
is a matrix (2-d array).
"""
x = np.ravel(x, order='K')
if np.issubdtype(x.dtype, np.integer):
warnings.warn('Array type is integer, np.dot may overflow. '
'Data should be float type to avoid this issue',
UserWarning)
return np.dot(x, x)
def row_norms(X, squared=False):
"""Row-wise (squared) Euclidean norm of X.
Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
matrices and does not create an X.shape-sized temporary.
Performs no input validation.
Parameters
----------
X : array_like
The input array
squared : bool, optional (default = False)
If True, return squared norms.
Returns
-------
array_like
The row-wise (squared) Euclidean norm of X.
"""
if sparse.issparse(X):
if not isinstance(X, sparse.csr_matrix):
X = sparse.csr_matrix(X)
norms = csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
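# A minimal usage sketch (hedged): row_norms agrees with an explicit NumPy
# computation for both dense and CSR inputs.
import numpy as np
from scipy import sparse
from sklearn.utils.extmath import row_norms

X = np.array([[3.0, 4.0], [0.0, 1.0]])
assert np.allclose(row_norms(X), [5.0, 1.0])
assert np.allclose(row_norms(sparse.csr_matrix(X), squared=True), [25.0, 1.0])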
def fast_logdet(A):
"""Compute log(det(A)) for A symmetric
Equivalent to : np.log(nl.det(A)) but more robust.
It returns -Inf if det(A) is non positive or is not defined.
Parameters
----------
A : array_like
The matrix
"""
sign, ld = np.linalg.slogdet(A)
if not sign > 0:
return -np.inf
return ld
def density(w, **kwargs):
"""Compute density of a sparse vector
Parameters
----------
w : array_like
The sparse vector
Returns
-------
float
The density of w, between 0 and 1
"""
if hasattr(w, "toarray"):
d = float(w.nnz) / (w.shape[0] * w.shape[1])
else:
d = 0 if w is None else float((w != 0).sum()) / w.size
return d
@_deprecate_positional_args
def safe_sparse_dot(a, b, *, dense_output=False):
"""Dot product that handle the sparse matrix case correctly
Parameters
----------
a : array or sparse matrix
b : array or sparse matrix
dense_output : boolean, (default=False)
When False, ``a`` and ``b`` both being sparse will yield sparse output.
When True, output will always be a dense array.
Returns
-------
dot_product : array or sparse matrix
sparse if ``a`` and ``b`` are sparse and ``dense_output=False``.
"""
if a.ndim > 2 or b.ndim > 2:
if sparse.issparse(a):
# sparse is always 2D. Implies b is 3D+
# [i, j] @ [k, ..., l, m, n] -> [i, k, ..., l, n]
b_ = np.rollaxis(b, -2)
b_2d = b_.reshape((b.shape[-2], -1))
ret = a @ b_2d
ret = ret.reshape(a.shape[0], *b_.shape[1:])
elif sparse.issparse(b):
# sparse is always 2D. Implies a is 3D+
# [k, ..., l, m] @ [i, j] -> [k, ..., l, j]
a_2d = a.reshape(-1, a.shape[-1])
ret = a_2d @ b
ret = ret.reshape(*a.shape[:-1], b.shape[1])
else:
ret = np.dot(a, b)
else:
ret = a @ b
if (sparse.issparse(a) and sparse.issparse(b)
and dense_output and hasattr(ret, "toarray")):
return ret.toarray()
return ret
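# A minimal usage sketch (hedged): sparse @ sparse stays sparse unless
# dense_output=True is requested.
import numpy as np
from scipy import sparse
from sklearn.utils.extmath import safe_sparse_dot

a = sparse.csr_matrix(np.eye(3))
b = sparse.csr_matrix(np.arange(9.0).reshape(3, 3))
assert sparse.issparse(safe_sparse_dot(a, b))
assert isinstance(safe_sparse_dot(a, b, dense_output=True), np.ndarray)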
@_deprecate_positional_args
def randomized_range_finder(A, *, size, n_iter,
power_iteration_normalizer='auto',
random_state=None):
"""Computes an orthonormal matrix whose range approximates the range of A.
Parameters
----------
A : 2D array
The input data matrix
size : integer
Size of the return array
n_iter : integer
Number of power iterations used to stabilize the result
power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none'
Whether the power iterations are normalized with step-by-step
QR factorization (the slowest but most accurate), 'none'
(the fastest but numerically unstable when `n_iter` is large, e.g.
typically 5 or larger), or 'LU' factorization (numerically stable
but can lose slightly in accuracy). The 'auto' mode applies no
normalization if `n_iter` <= 2 and switches to LU otherwise.
.. versionadded:: 0.18
random_state : int, RandomState instance or None, optional (default=None)
The seed of the pseudo random number generator to use when shuffling
the data, i.e. getting the random vectors to initialize the algorithm.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Returns
-------
Q : 2D array
A (size x size) projection matrix, the range of which
approximates well the range of the input matrix A.
Notes
-----
Follows Algorithm 4.3 of
Finding structure with randomness: Stochastic algorithms for constructing
approximate matrix decompositions
Halko, et al., 2009 (arXiv:0909.4061) https://arxiv.org/pdf/0909.4061.pdf
An implementation of a randomized algorithm for principal component
analysis
A. Szlam et al. 2014
"""
random_state = check_random_state(random_state)
# Generating normal random vectors with shape: (A.shape[1], size)
Q = random_state.normal(size=(A.shape[1], size))
if A.dtype.kind == 'f':
# Ensure f32 is preserved as f32
Q = Q.astype(A.dtype, copy=False)
# Deal with "auto" mode
if power_iteration_normalizer == 'auto':
if n_iter <= 2:
power_iteration_normalizer = 'none'
else:
power_iteration_normalizer = 'LU'
# Perform power iterations with Q to further 'imprint' the top
# singular vectors of A in Q
for i in range(n_iter):
if power_iteration_normalizer == 'none':
Q = safe_sparse_dot(A, Q)
Q = safe_sparse_dot(A.T, Q)
elif power_iteration_normalizer == 'LU':
Q, _ = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)
Q, _ = linalg.lu(safe_sparse_dot(A.T, Q), permute_l=True)
elif power_iteration_normalizer == 'QR':
Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode='economic')
Q, _ = linalg.qr(safe_sparse_dot(A.T, Q), mode='economic')
# Sample the range of A by a linear projection of Q
# and extract an orthonormal basis
Q, _ = linalg.qr(safe_sparse_dot(A, Q), mode='economic')
return Q
@_deprecate_positional_args
def randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto',
power_iteration_normalizer='auto', transpose='auto',
flip_sign=True, random_state=0):
"""Computes a truncated randomized SVD
Parameters
----------
M : ndarray or sparse matrix
Matrix to decompose
n_components : int
Number of singular values and vectors to extract.
n_oversamples : int (default is 10)
Additional number of random vectors to sample the range of M so as
to ensure proper conditioning. The total number of random vectors
used to find the range of M is n_components + n_oversamples. Smaller
number can improve speed but can negatively impact the quality of
approximation of singular vectors and singular values.
n_iter : int or 'auto' (default is 'auto')
Number of power iterations. It can be used to deal with very noisy
problems. When 'auto', it is set to 4, unless `n_components` is small
(< .1 * min(X.shape)), in which case `n_iter` is set to 7.
This improves precision with few components.
.. versionchanged:: 0.18
power_iteration_normalizer : 'auto' (default), 'QR', 'LU', 'none'
Whether the power iterations are normalized with step-by-step
QR factorization (the slowest but most accurate), 'none'
(the fastest but numerically unstable when `n_iter` is large, e.g.
typically 5 or larger), or 'LU' factorization (numerically stable
but can lose slightly in accuracy). The 'auto' mode applies no
normalization if `n_iter` <= 2 and switches to LU otherwise.
.. versionadded:: 0.18
transpose : True, False or 'auto' (default)
Whether the algorithm should be applied to M.T instead of M. The
result should approximately be the same. The 'auto' mode will
trigger the transposition if M.shape[1] > M.shape[0] since this
implementation of randomized SVD tends to be a little faster in that
case.
.. versionchanged:: 0.18
flip_sign : boolean, (True by default)
The output of a singular value decomposition is only unique up to a
permutation of the signs of the singular vectors. If `flip_sign` is
set to `True`, the sign ambiguity is resolved by making the largest
loadings for each component in the left singular vectors positive.
random_state : int, RandomState instance or None, optional (default=None)
The seed of the pseudo random number generator to use when shuffling
the data, i.e. getting the random vectors to initialize the algorithm.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Notes
-----
This algorithm finds a (usually very good) approximate truncated
singular value decomposition using randomization to speed up the
computations. It is particularly fast on large matrices on which
you wish to extract only a small number of components. In order to
obtain further speed up, `n_iter` can be set <=2 (at the cost of
loss of precision).
References
----------
* Finding structure with randomness: Stochastic algorithms for constructing
approximate matrix decompositions
Halko, et al., 2009 https://arxiv.org/abs/0909.4061
* A randomized algorithm for the decomposition of matrices
Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert
* An implementation of a randomized algorithm for principal component
analysis
A. Szlam et al. 2014
"""
if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):
warnings.warn("Calculating SVD of a {} is expensive. "
"csr_matrix is more efficient.".format(
type(M).__name__),
sparse.SparseEfficiencyWarning)
random_state = check_random_state(random_state)
n_random = n_components + n_oversamples
n_samples, n_features = M.shape
if n_iter == 'auto':
# Checks if the number of iterations is explicitly specified
# Adjust n_iter. 7 was found a good compromise for PCA. See #5299
n_iter = 7 if n_components < .1 * min(M.shape) else 4
if transpose == 'auto':
transpose = n_samples < n_features
if transpose:
# this implementation is a bit faster with smaller shape[1]
M = M.T
Q = randomized_range_finder(
M, size=n_random, n_iter=n_iter,
power_iteration_normalizer=power_iteration_normalizer,
random_state=random_state)
# project M to the (k + p) dimensional space using the basis vectors
B = safe_sparse_dot(Q.T, M)
# compute the SVD on the thin matrix: (k + p) wide
Uhat, s, V = linalg.svd(B, full_matrices=False)
del B
U = np.dot(Q, Uhat)
if flip_sign:
if not transpose:
U, V = svd_flip(U, V)
else:
# In case of transpose u_based_decision=false
# to actually flip based on u and not v.
U, V = svd_flip(U, V, u_based_decision=False)
if transpose:
# transpose back the results according to the input convention
return V[:n_components, :].T, s[:n_components], U[:, :n_components].T
else:
return U[:, :n_components], s[:n_components], V[:n_components, :]
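# A minimal usage sketch (hedged): a rank-2 truncated SVD of a random
# 100 x 20 matrix; the factors have the usual thin-SVD shapes.
import numpy as np
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
M = rng.randn(100, 20)
U, s, Vt = randomized_svd(M, n_components=2, random_state=0)
assert U.shape == (100, 2) and s.shape == (2,) and Vt.shape == (2, 20)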
@_deprecate_positional_args
def weighted_mode(a, w, *, axis=0):
"""Returns an array of the weighted modal (most common) value in a
If there is more than one such value, only the first is returned.
The bin-count for the modal bins is also returned.
This is an extension of the algorithm in scipy.stats.mode.
Parameters
----------
a : array_like
n-dimensional array of which to find mode(s).
w : array_like
n-dimensional array of weights for each value
axis : int, optional
Axis along which to operate. Default is 0, i.e. the first axis.
Returns
-------
vals : ndarray
Array of modal values.
score : ndarray
Array of weighted counts for each mode.
Examples
--------
>>> from sklearn.utils.extmath import weighted_mode
>>> x = [4, 1, 4, 2, 4, 2]
>>> weights = [1, 1, 1, 1, 1, 1]
>>> weighted_mode(x, weights)
(array([4.]), array([3.]))
The value 4 appears three times: with uniform weights, the result is
simply the mode of the distribution.
>>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's
>>> weighted_mode(x, weights)
(array([2.]), array([3.5]))
The value 2 has the highest score: it appears twice with weights of
1.5 and 2: the sum of these is 3.5.
See Also
--------
scipy.stats.mode
"""
if axis is None:
a = np.ravel(a)
w = np.ravel(w)
axis = 0
else:
a = np.asarray(a)
w = np.asarray(w)
if a.shape != w.shape:
w = np.full(a.shape, w, dtype=w.dtype)
scores = np.unique(np.ravel(a)) # get ALL unique values
testshape = list(a.shape)
testshape[axis] = 1
oldmostfreq = np.zeros(testshape)
oldcounts = np.zeros(testshape)
for score in scores:
template = np.zeros(a.shape)
ind = (a == score)
template[ind] = w[ind]
counts = np.expand_dims(np.sum(template, axis), axis)
mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)
oldcounts = np.maximum(counts, oldcounts)
oldmostfreq = mostfrequent
return mostfrequent, oldcounts
def cartesian(arrays, out=None):
"""Generate a cartesian product of input arrays.
Parameters
----------
arrays : list of array-like
1-D arrays to form the cartesian product of.
out : ndarray
Array to place the cartesian product in.
Returns
-------
out : ndarray
2-D array of shape (M, len(arrays)) containing cartesian products
formed of input arrays.
Examples
--------
>>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
array([[1, 4, 6],
[1, 4, 7],
[1, 5, 6],
[1, 5, 7],
[2, 4, 6],
[2, 4, 7],
[2, 5, 6],
[2, 5, 7],
[3, 4, 6],
[3, 4, 7],
[3, 5, 6],
[3, 5, 7]])
"""
arrays = [np.asarray(x) for x in arrays]
shape = (len(x) for x in arrays)
dtype = arrays[0].dtype
ix = np.indices(shape)
ix = ix.reshape(len(arrays), -1).T
if out is None:
out = np.empty_like(ix, dtype=dtype)
for n, arr in enumerate(arrays):
out[:, n] = arrays[n][ix[:, n]]
return out
def svd_flip(u, v, u_based_decision=True):
"""Sign correction to ensure deterministic output from SVD.
Adjusts the columns of u and the rows of v such that the loadings in the
columns in u that are largest in absolute value are always positive.
Parameters
----------
u : ndarray
u and v are the output of `linalg.svd` or
:func:`~sklearn.utils.extmath.randomized_svd`, with matching inner
dimensions so one can compute `np.dot(u * s, v)`.
v : ndarray
u and v are the output of `linalg.svd` or
:func:`~sklearn.utils.extmath.randomized_svd`, with matching inner
dimensions so one can compute `np.dot(u * s, v)`.
u_based_decision : boolean, (default=True)
If True, use the columns of u as the basis for sign flipping.
Otherwise, use the rows of v. The choice of which variable to base the
decision on is generally algorithm dependent.
Returns
-------
u_adjusted, v_adjusted : arrays with the same dimensions as the input.
"""
if u_based_decision:
# columns of u, rows of v
max_abs_cols = np.argmax(np.abs(u), axis=0)
signs = np.sign(u[max_abs_cols, range(u.shape[1])])
u *= signs
v *= signs[:, np.newaxis]
else:
# rows of v, columns of u
max_abs_rows = np.argmax(np.abs(v), axis=1)
signs = np.sign(v[range(v.shape[0]), max_abs_rows])
u *= signs
v *= signs[:, np.newaxis]
return u, v
def log_logistic(X, out=None):
"""Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.
This implementation is numerically stable because it splits positive and
negative values::
-log(1 + exp(-x_i)) if x_i > 0
x_i - log(1 + exp(x_i)) if x_i <= 0
For the ordinary logistic function, use ``scipy.special.expit``.
Parameters
----------
X : array-like, shape (M, N) or (M, )
Argument to the logistic function
out : array-like, shape: (M, N) or (M, ), optional:
Preallocated output array.
Returns
-------
out : array, shape (M, N) or (M, )
Log of the logistic function evaluated at every point in x
Notes
-----
See the blog post describing this implementation:
http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/
"""
is_1d = X.ndim == 1
X = np.atleast_2d(X)
X = check_array(X, dtype=np.float64)
n_samples, n_features = X.shape
if out is None:
out = np.empty_like(X)
_log_logistic_sigmoid(n_samples, n_features, X, out)
if is_1d:
return np.squeeze(out)
return out
def softmax(X, copy=True):
"""
Calculate the softmax function.
The softmax function is calculated by
np.exp(X) / np.sum(np.exp(X), axis=1)
This will cause overflow when large values are exponentiated.
Hence the largest value in each row is subtracted from each data
point to prevent this.
Parameters
----------
X : array-like of floats, shape (M, N)
Argument to the logistic function
copy : bool, optional
Copy X or not.
Returns
-------
out : array, shape (M, N)
Softmax function evaluated at every point in x
"""
if copy:
X = np.copy(X)
max_prob = np.max(X, axis=1).reshape((-1, 1))
X -= max_prob
np.exp(X, X)
sum_prob = np.sum(X, axis=1).reshape((-1, 1))
X /= sum_prob
return X
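# A minimal usage sketch (hedged): each row of the output sums to one, even
# for large inputs, thanks to the max subtraction.
import numpy as np
from sklearn.utils.extmath import softmax

X = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1001.0]])
P = softmax(X)
assert np.allclose(P.sum(axis=1), 1.0)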
@deprecated("safe_min is deprecated in version 0.22 and will be removed "
"in version 0.24.")
def safe_min(X):
"""Returns the minimum value of a dense or a CSR/CSC matrix.
Adapted from https://stackoverflow.com/q/13426580
.. deprecated:: 0.22.0
Parameters
----------
X : array_like
The input array or sparse matrix
Returns
-------
float
The min value of X
"""
if sparse.issparse(X):
if len(X.data) == 0:
return 0
m = X.data.min()
return m if X.getnnz() == X.size else min(m, 0)
else:
return X.min()
def make_nonnegative(X, min_value=0):
"""Ensure `X.min()` >= `min_value`.
Parameters
----------
X : array_like
The matrix to make non-negative
min_value : float
The threshold value
Returns
-------
array_like
The thresholded array
Raises
------
ValueError
When X is sparse
"""
min_ = X.min()
if min_ < min_value:
if sparse.issparse(X):
raise ValueError("Cannot make the data matrix"
" nonnegative because it is sparse."
" Adding a value to every entry would"
" make it no longer sparse.")
X = X + (min_value - min_)
return X
# Use at least float64 for the accumulating functions to avoid precision issue
# see https://github.com/numpy/numpy/issues/9393. The float64 is also retained
# as it is in case the float overflows
def _safe_accumulator_op(op, x, *args, **kwargs):
"""
This function provides numpy accumulator functions with a float64 dtype
when used on a floating point input. This prevents accumulator overflow on
smaller floating point dtypes.
Parameters
----------
op : function
A numpy accumulator function such as np.mean or np.sum
x : numpy array
A numpy array to apply the accumulator function
*args : positional arguments
Positional arguments passed to the accumulator function after the
input x
**kwargs : keyword arguments
Keyword arguments passed to the accumulator function
Returns
-------
result : The output of the accumulator function passed to this function
"""
if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8:
result = op(x, *args, **kwargs, dtype=np.float64)
else:
result = op(x, *args, **kwargs)
return result
def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
"""Calculate mean update and a Youngs and Cramer variance update.
last_mean and last_variance are statistics computed at the last step by the
function. Both must be initialized to 0.0. In case no scaling is required,
last_variance can be None. The mean is always required and returned because
it is necessary for the calculation of the variance. last_sample_count is
the number of samples encountered until now.
From the paper "Algorithms for computing the sample variance: analysis and
recommendations", by Chan, Golub, and LeVeque.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Data to use for variance update
last_mean : array-like, shape: (n_features,)
last_variance : array-like, shape: (n_features,)
last_sample_count : array-like, shape (n_features,)
Returns
-------
updated_mean : array, shape (n_features,)
updated_variance : array, shape (n_features,)
If None, only mean is computed
updated_sample_count : array, shape (n_features,)
Notes
-----
NaNs are ignored during the algorithm.
References
----------
T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
variance: analysis and recommendations, The American Statistician,
Vol. 37, No. 3, pp. 242-247
Also, see the sparse implementation of this in
`utils.sparsefuncs.incr_mean_variance_axis` and
`utils.sparsefuncs_fast.incr_mean_variance_axis0`
"""
# old = stats until now
# new = the current increment
# updated = the aggregated stats
last_sum = last_mean * last_sample_count
new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
new_sample_count = np.sum(~np.isnan(X), axis=0)
updated_sample_count = last_sample_count + new_sample_count
updated_mean = (last_sum + new_sum) / updated_sample_count
if last_variance is None:
updated_variance = None
else:
new_unnormalized_variance = (
_safe_accumulator_op(np.nanvar, X, axis=0) * new_sample_count)
last_unnormalized_variance = last_variance * last_sample_count
with np.errstate(divide='ignore', invalid='ignore'):
last_over_new_count = last_sample_count / new_sample_count
updated_unnormalized_variance = (
last_unnormalized_variance + new_unnormalized_variance +
last_over_new_count / updated_sample_count *
(last_sum / last_over_new_count - new_sum) ** 2)
zeros = last_sample_count == 0
updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]
updated_variance = updated_unnormalized_variance / updated_sample_count
return updated_mean, updated_variance, updated_sample_count
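# A minimal usage sketch (hedged): processing the data in two batches
# reproduces the full-data mean and variance.
import numpy as np
from sklearn.utils.extmath import _incremental_mean_and_var

rng = np.random.RandomState(0)
X = rng.randn(100, 3)
mean, var, count = np.zeros(3), np.zeros(3), np.zeros(3)
for batch in (X[:60], X[60:]):
    mean, var, count = _incremental_mean_and_var(batch, mean, var, count)
assert np.allclose(mean, X.mean(axis=0))
assert np.allclose(var, X.var(axis=0))
assert np.all(count == 100)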
def _deterministic_vector_sign_flip(u):
"""Modify the sign of vectors for reproducibility
Flips the sign of elements of all the vectors (rows of u) such that
the absolute maximum element of each vector is positive.
Parameters
----------
u : ndarray
Array with vectors as its rows.
Returns
-------
u_flipped : ndarray with same shape as u
Array with the sign flipped vectors as its rows.
"""
max_abs_rows = np.argmax(np.abs(u), axis=1)
signs = np.sign(u[range(u.shape[0]), max_abs_rows])
u *= signs[:, np.newaxis]
return u
def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
"""Use high precision for cumsum and check that final value matches sum
Parameters
----------
arr : array-like
To be cumulatively summed as flat
axis : int, optional
Axis along which the cumulative sum is computed.
The default (None) is to compute the cumsum over the flattened array.
rtol : float
Relative tolerance, see ``np.allclose``
atol : float
Absolute tolerance, see ``np.allclose``
"""
out = np.cumsum(arr, axis=axis, dtype=np.float64)
expected = np.sum(arr, axis=axis, dtype=np.float64)
if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol,
atol=atol, equal_nan=True)):
warnings.warn('cumsum was found to be unstable: '
'its last element does not correspond to sum',
RuntimeWarning)
return out
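# A minimal usage sketch (hedged): the cumulative sum of a float32 array is
# accumulated in float64 and matches np.cumsum on well-behaved data.
import numpy as np
from sklearn.utils.extmath import stable_cumsum

x = np.linspace(0.0, 1.0, 5, dtype=np.float32)
out = stable_cumsum(x)
assert out.dtype == np.float64
assert np.allclose(out, np.cumsum(x, dtype=np.float64))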


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _fast_dict # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.utils.fast_dict'
correct_import_path = 'sklearn.utils'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_fast_dict, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,162 @@
"""Compatibility fixes for older version of python, numpy and scipy
If you add content to this file, please give the version of the package
at which the fix is no longer needed.
"""
# Authors: Emmanuelle Gouillart <emmanuelle.gouillart@normalesup.org>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Fabian Pedregosa <fpedregosa@acm.org>
# Lars Buitinck
#
# License: BSD 3 clause
from distutils.version import LooseVersion
import numpy as np
import scipy.sparse as sp
import scipy
import scipy.stats
from scipy.sparse.linalg import lsqr as sparse_lsqr # noqa
from numpy.ma import MaskedArray as _MaskedArray # TODO: remove in 0.25
from .deprecation import deprecated
try:
from pkg_resources import parse_version # type: ignore
except ImportError:
# setuptools not installed
parse_version = LooseVersion # type: ignore
np_version = parse_version(np.__version__)
sp_version = parse_version(scipy.__version__)
if sp_version >= parse_version('1.4'):
from scipy.sparse.linalg import lobpcg
else:
# Backport of lobpcg functionality from scipy 1.4.0, can be removed
# once support for sp_version < parse_version('1.4') is dropped
# mypy error: Name 'lobpcg' already defined (possibly by an import)
from ..externals._lobpcg import lobpcg # type: ignore # noqa
def _object_dtype_isnan(X):
return X != X
# TODO: replace by copy=False, when only scipy > 1.1 is supported.
def _astype_copy_false(X):
"""Returns the copy=False parameter for
{ndarray, csr_matrix, csc_matrix}.astype when possible,
otherwise don't specify
"""
if sp_version >= parse_version('1.1') or not sp.issparse(X):
return {'copy': False}
else:
return {}
def _joblib_parallel_args(**kwargs):
"""Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+
For joblib 0.11 this maps both ``prefer`` and ``require`` parameters to
a specific ``backend``.
Parameters
----------
prefer : str in {'processes', 'threads'} or None
Soft hint to choose the default backend if no specific backend
was selected with the parallel_backend context manager.
require : 'sharedmem' or None
Hard constraint to select the backend. If set to 'sharedmem',
the selected backend will be single-host and thread-based even
if the user asked for a non-thread based backend with
parallel_backend.
See joblib.Parallel documentation for more details
"""
import joblib
if parse_version(joblib.__version__) >= parse_version('0.12'):
return kwargs
extra_args = set(kwargs.keys()).difference({'prefer', 'require'})
if extra_args:
raise NotImplementedError('unhandled arguments %s with joblib %s'
% (list(extra_args), joblib.__version__))
args = {}
if 'prefer' in kwargs:
prefer = kwargs['prefer']
if prefer not in ['threads', 'processes', None]:
raise ValueError('prefer=%s is not supported' % prefer)
args['backend'] = {'threads': 'threading',
'processes': 'multiprocessing',
None: None}[prefer]
if 'require' in kwargs:
require = kwargs['require']
if require not in [None, 'sharedmem']:
raise ValueError('require=%s is not supported' % require)
if require == 'sharedmem':
args['backend'] = 'threading'
return args
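# A minimal usage sketch (hedged): forwarding the soft 'threads' hint to
# joblib.Parallel in a way that also works with joblib 0.11.
from joblib import Parallel, delayed
from sklearn.utils.fixes import _joblib_parallel_args

out = Parallel(n_jobs=2, **_joblib_parallel_args(prefer='threads'))(
    delayed(pow)(i, 2) for i in range(4))
assert out == [0, 1, 4, 9]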
class loguniform(scipy.stats.reciprocal):
"""A class supporting log-uniform random variables.
Parameters
----------
low : float
The minimum value
high : float
The maximum value
Methods
-------
rvs(self, size=None, random_state=None)
Generate log-uniform random variables
The most useful method for Scikit-learn usage is highlighted here.
For a full list, see
`scipy.stats.reciprocal
<https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.reciprocal.html>`_.
This list includes all functions of ``scipy.stats`` continuous
distributions such as ``pdf``.
Notes
-----
This class generates values between ``low`` and ``high`` or
low <= loguniform(low, high).rvs() <= high
The logarithmic probability density function (PDF) is uniform. When
``x`` is a uniformly distributed random variable between 0 and 1, ``10**x``
are random variables that are equally likely to be returned.
This class is an alias to ``scipy.stats.reciprocal``, which uses the
reciprocal distribution:
https://en.wikipedia.org/wiki/Reciprocal_distribution
Examples
--------
>>> from sklearn.utils.fixes import loguniform
>>> rv = loguniform(1e-3, 1e1)
>>> rvs = rv.rvs(random_state=42, size=1000)
>>> rvs.min() # doctest: +SKIP
0.0010435856341129003
>>> rvs.max() # doctest: +SKIP
9.97403052786026
"""
@deprecated(
'MaskedArray is deprecated in version 0.23 and will be removed in version '
'0.25. Use numpy.ma.MaskedArray instead.'
)
class MaskedArray(_MaskedArray):
pass # TODO: remove in 0.25


@ -0,0 +1,69 @@
"""
Graph utilities and algorithms
Graphs are represented with their adjacency matrices, preferably using
sparse matrices.
"""
# Authors: Aric Hagberg <hagberg@lanl.gov>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Jake Vanderplas <vanderplas@astro.washington.edu>
# License: BSD 3 clause
from scipy import sparse
from .graph_shortest_path import graph_shortest_path # noqa
from .validation import _deprecate_positional_args
###############################################################################
# Path and connected component analysis.
# Code adapted from networkx
@_deprecate_positional_args
def single_source_shortest_path_length(graph, source, *, cutoff=None):
"""Return the shortest path length from source to all reachable nodes.
Returns a dictionary of shortest path lengths keyed by target.
Parameters
----------
graph : sparse matrix or 2D array (preferably LIL matrix)
Adjacency matrix of the graph
source : integer
Starting node for path
cutoff : integer, optional
Depth to stop the search - only
paths of length <= cutoff are returned.
Examples
--------
>>> from sklearn.utils.graph import single_source_shortest_path_length
>>> import numpy as np
>>> graph = np.array([[ 0, 1, 0, 0],
... [ 1, 0, 1, 0],
... [ 0, 1, 0, 1],
... [ 0, 0, 1, 0]])
>>> list(sorted(single_source_shortest_path_length(graph, 0).items()))
[(0, 0), (1, 1), (2, 2), (3, 3)]
>>> graph = np.ones((6, 6))
>>> list(sorted(single_source_shortest_path_length(graph, 2).items()))
[(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]
"""
if sparse.isspmatrix(graph):
graph = graph.tolil()
else:
graph = sparse.lil_matrix(graph)
seen = {} # level (number of hops) when seen in BFS
level = 0 # the current level
next_level = [source] # list of nodes to check at the next level
while next_level:
this_level = next_level # advance to next level
next_level = set() # and start a new set (fringe)
for v in this_level:
if v not in seen:
seen[v] = level # set the level of vertex v
next_level.update(graph.rows[v])
if cutoff is not None and cutoff <= level:
break
level += 1
return seen # return all path lengths as dictionary


@ -0,0 +1,211 @@
"""Utilities for meta-estimators"""
# Author: Joel Nothman
# Andreas Mueller
# License: BSD
from typing import List, Any
from abc import ABCMeta, abstractmethod
from operator import attrgetter
from functools import update_wrapper
import numpy as np
from ..utils import _safe_indexing
from ..base import BaseEstimator
__all__ = ['if_delegate_has_method']
class _BaseComposition(BaseEstimator, metaclass=ABCMeta):
"""Handles parameter management for classifiers composed of named estimators.
"""
steps: List[Any]
@abstractmethod
def __init__(self):
pass
def _get_params(self, attr, deep=True):
out = super().get_params(deep=deep)
if not deep:
return out
estimators = getattr(self, attr)
out.update(estimators)
for name, estimator in estimators:
if hasattr(estimator, 'get_params'):
for key, value in estimator.get_params(deep=True).items():
out['%s__%s' % (name, key)] = value
return out
def _set_params(self, attr, **params):
# Ensure strict ordering of parameter setting:
# 1. All steps
if attr in params:
setattr(self, attr, params.pop(attr))
# 2. Step replacement
items = getattr(self, attr)
names = []
if items:
names, _ = zip(*items)
for name in list(params.keys()):
if '__' not in name and name in names:
self._replace_estimator(attr, name, params.pop(name))
# 3. Step parameters and other initialisation arguments
super().set_params(**params)
return self
def _replace_estimator(self, attr, name, new_val):
# assumes `name` is a valid estimator name
new_estimators = list(getattr(self, attr))
for i, (estimator_name, _) in enumerate(new_estimators):
if estimator_name == name:
new_estimators[i] = (name, new_val)
break
setattr(self, attr, new_estimators)
def _validate_names(self, names):
if len(set(names)) != len(names):
raise ValueError('Names provided are not unique: '
'{0!r}'.format(list(names)))
invalid_names = set(names).intersection(self.get_params(deep=False))
if invalid_names:
raise ValueError('Estimator names conflict with constructor '
'arguments: {0!r}'.format(sorted(invalid_names)))
invalid_names = [name for name in names if '__' in name]
if invalid_names:
raise ValueError('Estimator names must not contain __: got '
'{0!r}'.format(invalid_names))
class _IffHasAttrDescriptor:
"""Implements a conditional property using the descriptor protocol.
Using this class to create a decorator will raise an ``AttributeError``
if none of the delegates (specified in ``delegate_names``) is an attribute
of the base object or the first found delegate does not have an attribute
``attribute_name``.
This allows ducktyping of the decorated method based on
``delegate.attribute_name``. Here ``delegate`` is the first item in
``delegate_names`` for which ``hasattr(object, delegate) is True``.
See https://docs.python.org/3/howto/descriptor.html for an explanation of
descriptors.
"""
def __init__(self, fn, delegate_names, attribute_name):
self.fn = fn
self.delegate_names = delegate_names
self.attribute_name = attribute_name
# update the docstring of the descriptor
update_wrapper(self, fn)
def __get__(self, obj, type=None):
# raise an AttributeError if the attribute is not present on the object
if obj is not None:
# delegate only on instances, not the classes.
# this is to allow access to the docstrings.
for delegate_name in self.delegate_names:
try:
delegate = attrgetter(delegate_name)(obj)
except AttributeError:
continue
else:
getattr(delegate, self.attribute_name)
break
else:
attrgetter(self.delegate_names[-1])(obj)
# lambda, but not partial, allows help() to work with update_wrapper
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
# update the docstring of the returned function
update_wrapper(out, self.fn)
return out
def if_delegate_has_method(delegate):
"""Create a decorator for methods that are delegated to a sub-estimator
This enables ducktyping by hasattr returning True according to the
sub-estimator.
Parameters
----------
delegate : string, list of strings or tuple of strings
Name of the sub-estimator that can be accessed as an attribute of the
base object. If a list or a tuple of names are provided, the first
sub-estimator that is an attribute of the base object will be used.
"""
if isinstance(delegate, list):
delegate = tuple(delegate)
if not isinstance(delegate, tuple):
delegate = (delegate,)
return lambda fn: _IffHasAttrDescriptor(fn, delegate,
attribute_name=fn.__name__)
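# A minimal usage sketch (hedged); _Wrapper and the two dummy estimators are
# hypothetical classes used only for illustration.
from sklearn.utils.metaestimators import if_delegate_has_method

class _HasPredict:
    def predict(self, X):
        return X

class _NoPredict:
    pass

class _Wrapper:
    def __init__(self, estimator):
        self.estimator = estimator

    @if_delegate_has_method(delegate='estimator')
    def predict(self, X):
        return self.estimator.predict(X)

assert hasattr(_Wrapper(_HasPredict()), 'predict')
assert not hasattr(_Wrapper(_NoPredict()), 'predict')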
def _safe_split(estimator, X, y, indices, train_indices=None):
"""Create subset of dataset and properly handle kernels.
Slice X, y according to indices for cross-validation, but take care of
precomputed kernel-matrices or pairwise affinities / distances.
If ``estimator._pairwise is True``, X needs to be square and
we slice rows and columns. If ``train_indices`` is not None,
we slice rows using ``indices`` (assumed the test set) and columns
using ``train_indices``, indicating the training set.
Labels y will always be indexed only along the first axis.
Parameters
----------
estimator : object
Estimator to determine whether we should slice only rows or rows and
columns.
X : array-like, sparse matrix or iterable
Data to be indexed. If ``estimator._pairwise is True``,
this needs to be a square array-like or sparse matrix.
y : array-like, sparse matrix or iterable
Targets to be indexed.
indices : array of int
Rows to select from X and y.
If ``estimator._pairwise is True`` and ``train_indices is None``
then ``indices`` will also be used to slice columns.
train_indices : array of int or None, default=None
If ``estimator._pairwise is True`` and ``train_indices is not None``,
then ``train_indices`` will be used to slice the columns of X.
Returns
-------
X_subset : array-like, sparse matrix or list
Indexed data.
y_subset : array-like, sparse matrix or list
Indexed targets.
"""
if getattr(estimator, "_pairwise", False):
if not hasattr(X, "shape"):
raise ValueError("Precomputed kernels or affinity matrices have "
"to be passed as arrays or sparse matrices.")
# X is a precomputed square kernel matrix
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square kernel matrix")
if train_indices is None:
X_subset = X[np.ix_(indices, indices)]
else:
X_subset = X[np.ix_(indices, train_indices)]
else:
X_subset = _safe_indexing(X, indices)
if y is not None:
y_subset = _safe_indexing(y, indices)
else:
y_subset = None
return X_subset, y_subset
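# A minimal usage sketch (hedged): with a precomputed-kernel estimator,
# _safe_split slices both rows and columns of the square kernel matrix.
import numpy as np
from sklearn.svm import SVC
from sklearn.utils.metaestimators import _safe_split

X = np.random.RandomState(0).randn(6, 2)
K = X @ X.T                               # precomputed kernel, shape (6, 6)
y = np.array([0, 1, 0, 1, 0, 1])
est = SVC(kernel='precomputed')           # estimator with _pairwise == True
train, test = np.array([0, 1, 2, 3]), np.array([4, 5])
K_train, y_train = _safe_split(est, K, y, train)
K_test, y_test = _safe_split(est, K, y, test, train_indices=train)
assert K_train.shape == (4, 4) and K_test.shape == (2, 4)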


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _mocking # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.utils.mocking'
correct_import_path = 'sklearn.utils'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_mocking, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,448 @@
# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
#
# License: BSD 3 clause
"""
Multi-class / multi-label utility function
==========================================
"""
from collections.abc import Sequence
from itertools import chain
from scipy.sparse import issparse
from scipy.sparse.base import spmatrix
from scipy.sparse import dok_matrix
from scipy.sparse import lil_matrix
import numpy as np
from .validation import check_array, _assert_all_finite
def _unique_multiclass(y):
if hasattr(y, '__array__'):
return np.unique(np.asarray(y))
else:
return set(y)
def _unique_indicator(y):
return np.arange(
check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1]
)
_FN_UNIQUE_LABELS = {
'binary': _unique_multiclass,
'multiclass': _unique_multiclass,
'multilabel-indicator': _unique_indicator,
}
def unique_labels(*ys):
"""Extract an ordered array of unique labels
We don't allow:
- mix of multilabel and multiclass (single label) targets
- mix of label indicator matrix and anything else,
because there are no explicit labels
- mix of label indicator matrices of different sizes
- mix of string and integer labels
At the moment, we also don't allow "multiclass-multioutput" input type.
Parameters
----------
*ys : array-likes
Returns
-------
out : numpy array of shape [n_unique_labels]
An ordered array of unique labels.
Examples
--------
>>> from sklearn.utils.multiclass import unique_labels
>>> unique_labels([3, 5, 5, 5, 7, 7])
array([3, 5, 7])
>>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
array([1, 2, 3, 4])
>>> unique_labels([1, 2, 10], [5, 11])
array([ 1, 2, 5, 10, 11])
"""
if not ys:
raise ValueError('No argument has been passed.')
# Check that we don't mix label format
ys_types = set(type_of_target(x) for x in ys)
if ys_types == {"binary", "multiclass"}:
ys_types = {"multiclass"}
if len(ys_types) > 1:
raise ValueError("Mix type of y not allowed, got types %s" % ys_types)
label_type = ys_types.pop()
# Check consistency for the indicator format
if (label_type == "multilabel-indicator" and
len(set(check_array(y,
accept_sparse=['csr', 'csc', 'coo']).shape[1]
for y in ys)) > 1):
raise ValueError("Multi-label binary indicator input with "
"different numbers of labels")
# Get the unique set of labels
_unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
if not _unique_labels:
raise ValueError("Unknown label type: %s" % repr(ys))
ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys))
# Check that we don't mix string type with number type
if (len(set(isinstance(label, str) for label in ys_labels)) > 1):
raise ValueError("Mix of label input types (string and number)")
return np.array(sorted(ys_labels))
def _is_integral_float(y):
return y.dtype.kind == 'f' and np.all(y.astype(int) == y)
def is_multilabel(y):
""" Check if ``y`` is in a multilabel format.
Parameters
----------
y : numpy array of shape [n_samples]
Target values.
Returns
-------
out : bool
Return ``True`` if ``y`` is in a multilabel format, else ``False``.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils.multiclass import is_multilabel
>>> is_multilabel([0, 1, 0, 1])
False
>>> is_multilabel([[1], [0, 2], []])
False
>>> is_multilabel(np.array([[1, 0], [0, 0]]))
True
>>> is_multilabel(np.array([[1], [0], [0]]))
False
>>> is_multilabel(np.array([[1, 0, 0]]))
True
"""
if hasattr(y, '__array__') or isinstance(y, Sequence):
y = np.asarray(y)
if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
return False
if issparse(y):
if isinstance(y, (dok_matrix, lil_matrix)):
y = y.tocsr()
return (len(y.data) == 0 or np.unique(y.data).size == 1 and
(y.dtype.kind in 'biu' or # bool, int, uint
_is_integral_float(np.unique(y.data))))
else:
labels = np.unique(y)
return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint
_is_integral_float(labels))
def check_classification_targets(y):
"""Ensure that target y is of a non-regression type.
Only the following target types (as defined in type_of_target) are allowed:
'binary', 'multiclass', 'multiclass-multioutput',
'multilabel-indicator', 'multilabel-sequences'
Parameters
----------
y : array-like
"""
y_type = type_of_target(y)
if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
'multilabel-indicator', 'multilabel-sequences']:
raise ValueError("Unknown label type: %r" % y_type)
def type_of_target(y):
"""Determine the type of data indicated by the target.
Note that this type is the most specific type that can be inferred.
For example:
* ``binary`` is more specific but compatible with ``multiclass``.
* ``multiclass`` of integers is more specific but compatible with
``continuous``.
* ``multilabel-indicator`` is more specific but compatible with
``multiclass-multioutput``.
Parameters
----------
y : array-like
Returns
-------
target_type : string
One of:
* 'continuous': `y` is an array-like of floats that are not all
integers, and is 1d or a column vector.
* 'continuous-multioutput': `y` is a 2d array of floats that are
not all integers, and both dimensions are of size > 1.
* 'binary': `y` contains <= 2 discrete values and is 1d or a column
vector.
* 'multiclass': `y` contains more than two discrete values, is not a
sequence of sequences, and is 1d or a column vector.
* 'multiclass-multioutput': `y` is a 2d array that contains more
than two discrete values, is not a sequence of sequences, and both
dimensions are of size > 1.
* 'multilabel-indicator': `y` is a label indicator matrix, an array
of two dimensions with at least two columns, and at most 2 unique
values.
* 'unknown': `y` is array-like but none of the above, such as a 3d
array, sequence of sequences, or an array of non-sequence objects.
Examples
--------
>>> import numpy as np
>>> type_of_target([0.1, 0.6])
'continuous'
>>> type_of_target([1, -1, -1, 1])
'binary'
>>> type_of_target(['a', 'b', 'a'])
'binary'
>>> type_of_target([1.0, 2.0])
'binary'
>>> type_of_target([1, 0, 2])
'multiclass'
>>> type_of_target([1.0, 0.0, 3.0])
'multiclass'
>>> type_of_target(['a', 'b', 'c'])
'multiclass'
>>> type_of_target(np.array([[1, 2], [3, 1]]))
'multiclass-multioutput'
>>> type_of_target([[1, 2]])
'multilabel-indicator'
>>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
'continuous-multioutput'
>>> type_of_target(np.array([[0, 1], [1, 1]]))
'multilabel-indicator'
"""
valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__'))
and not isinstance(y, str))
if not valid:
raise ValueError('Expected array-like (array or non-string sequence), '
'got %r' % y)
sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray'])
if sparse_pandas:
raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
if is_multilabel(y):
return 'multilabel-indicator'
try:
y = np.asarray(y)
except ValueError:
# Known to fail in numpy 1.3 for array of arrays
return 'unknown'
# The old sequence of sequences format
try:
if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence)
and not isinstance(y[0], str)):
raise ValueError('You appear to be using a legacy multi-label data'
' representation. Sequence of sequences are no'
' longer supported; use a binary array or sparse'
' matrix instead - the MultiLabelBinarizer'
' transformer can convert to this format.')
except IndexError:
pass
# Invalid inputs
if y.ndim > 2 or (y.dtype == object and len(y) and
not isinstance(y.flat[0], str)):
return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"]
if y.ndim == 2 and y.shape[1] == 0:
return 'unknown' # [[]]
if y.ndim == 2 and y.shape[1] > 1:
suffix = "-multioutput" # [[1, 2], [1, 2]]
else:
suffix = "" # [1, 2, 3] or [[1], [2], [3]]
# check float and contains non-integer float values
if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
# [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
_assert_all_finite(y)
return 'continuous' + suffix
if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
else:
return 'binary' # [1, 2] or [["a"], ["b"]]
def _check_partial_fit_first_call(clf, classes=None):
"""Private helper function for factorizing common classes param logic
Estimators that implement the ``partial_fit`` API need to be provided with
the list of possible classes at the first call to partial_fit.
Subsequent calls to partial_fit should check that ``classes`` is still
consistent with a previous value of ``clf.classes_`` when provided.
This function returns True if it detects that this was the first call to
``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
set on ``clf``.
"""
if getattr(clf, 'classes_', None) is None and classes is None:
raise ValueError("classes must be passed on the first call "
"to partial_fit.")
elif classes is not None:
if getattr(clf, 'classes_', None) is not None:
if not np.array_equal(clf.classes_, unique_labels(classes)):
raise ValueError(
"`classes=%r` is not the same as on last call "
"to partial_fit, was: %r" % (classes, clf.classes_))
else:
# This is the first call to partial_fit
clf.classes_ = unique_labels(classes)
return True
# classes is None and clf.classes_ has already previously been set:
# nothing to do
return False
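# A minimal usage sketch (the `_PartialFitDemoClassifier` name and its
# counting logic are hypothetical, example values arbitrary): how an
# estimator's partial_fit would typically drive
# _check_partial_fit_first_call.
class _PartialFitDemoClassifier:
    def partial_fit(self, X, y, classes=None):
        if _check_partial_fit_first_call(self, classes):
            # First call: ``classes_`` has just been set from ``classes``.
            self.counts_ = np.zeros(len(self.classes_), dtype=int)
        # Later calls only update state for the already known classes
        # (this toy example assumes y contains known labels and ignores X).
        self.counts_ += np.bincount(np.searchsorted(self.classes_, y),
                                    minlength=len(self.classes_))
        return self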
def class_distribution(y, sample_weight=None):
"""Compute class priors from multioutput-multiclass target data
Parameters
----------
    y : array-like or sparse matrix of size (n_samples, n_outputs)
The labels for each example.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
classes : list of size n_outputs of arrays of size (n_classes,)
List of classes for each column.
n_classes : list of integers of size n_outputs
Number of classes in each column
class_prior : list of size n_outputs of arrays of size (n_classes,)
Class distribution of each column.
"""
classes = []
n_classes = []
class_prior = []
n_samples, n_outputs = y.shape
if sample_weight is not None:
sample_weight = np.asarray(sample_weight)
if issparse(y):
y = y.tocsc()
y_nnz = np.diff(y.indptr)
for k in range(n_outputs):
col_nonzero = y.indices[y.indptr[k]:y.indptr[k + 1]]
# separate sample weights for zero and non-zero elements
if sample_weight is not None:
nz_samp_weight = sample_weight[col_nonzero]
zeros_samp_weight_sum = (np.sum(sample_weight) -
np.sum(nz_samp_weight))
else:
nz_samp_weight = None
zeros_samp_weight_sum = y.shape[0] - y_nnz[k]
classes_k, y_k = np.unique(y.data[y.indptr[k]:y.indptr[k + 1]],
return_inverse=True)
class_prior_k = np.bincount(y_k, weights=nz_samp_weight)
# An explicit zero was found, combine its weight with the weight
# of the implicit zeros
if 0 in classes_k:
class_prior_k[classes_k == 0] += zeros_samp_weight_sum
            # If there is an implicit zero and it is not in classes and
            # class_prior, make an entry for it
if 0 not in classes_k and y_nnz[k] < y.shape[0]:
classes_k = np.insert(classes_k, 0, 0)
class_prior_k = np.insert(class_prior_k, 0,
zeros_samp_weight_sum)
classes.append(classes_k)
n_classes.append(classes_k.shape[0])
class_prior.append(class_prior_k / class_prior_k.sum())
else:
for k in range(n_outputs):
classes_k, y_k = np.unique(y[:, k], return_inverse=True)
classes.append(classes_k)
n_classes.append(classes_k.shape[0])
class_prior_k = np.bincount(y_k, weights=sample_weight)
class_prior.append(class_prior_k / class_prior_k.sum())
return (classes, n_classes, class_prior)
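# A minimal usage sketch (hypothetical `_demo_class_distribution` helper,
# arbitrary example values): class_distribution on a small dense
# multi-output target.
def _demo_class_distribution():
    y = np.array([[1, 0],
                  [2, 0],
                  [2, 1]])
    classes, n_classes, class_prior = class_distribution(y)
    # classes     -> [array([1, 2]), array([0, 1])]
    # n_classes   -> [2, 2]
    # class_prior -> [array([1/3, 2/3]), array([2/3, 1/3])]
    return classes, n_classes, class_prior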
def _ovr_decision_function(predictions, confidences, n_classes):
"""Compute a continuous, tie-breaking OvR decision function from OvO.
It is important to include a continuous value, not only votes,
to make computing AUC or calibration meaningful.
Parameters
----------
predictions : array-like, shape (n_samples, n_classifiers)
Predicted classes for each binary classifier.
confidences : array-like, shape (n_samples, n_classifiers)
Decision functions or predicted probabilities for positive class
for each binary classifier.
n_classes : int
Number of classes. n_classifiers must be
        ``n_classes * (n_classes - 1) / 2``
"""
n_samples = predictions.shape[0]
votes = np.zeros((n_samples, n_classes))
sum_of_confidences = np.zeros((n_samples, n_classes))
k = 0
for i in range(n_classes):
for j in range(i + 1, n_classes):
sum_of_confidences[:, i] -= confidences[:, k]
sum_of_confidences[:, j] += confidences[:, k]
votes[predictions[:, k] == 0, i] += 1
votes[predictions[:, k] == 1, j] += 1
k += 1
# Monotonically transform the sum_of_confidences to (-1/3, 1/3)
    # and add it to the votes. The monotonic transformation is
# f: x -> x / (3 * (|x| + 1)), it uses 1/3 instead of 1/2
# to ensure that we won't reach the limits and change vote order.
# The motivation is to use confidence levels as a way to break ties in
# the votes without switching any decision made based on a difference
# of 1 vote.
transformed_confidences = (sum_of_confidences /
(3 * (np.abs(sum_of_confidences) + 1)))
return votes + transformed_confidences
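# A minimal usage sketch (hypothetical `_demo_ovr_decision_function` helper,
# arbitrary prediction/confidence values): one sample and three classes, so
# three pairwise classifiers ordered (0 vs 1), (0 vs 2), (1 vs 2).
def _demo_ovr_decision_function():
    predictions = np.array([[1, 0, 0]])   # 1 beats 0, 0 beats 2, 1 beats 2
    confidences = np.array([[0.8, -0.3, -0.6]])
    scores = _ovr_decision_function(predictions, confidences, n_classes=3)
    # Class 1 collects two of the three votes, so np.argmax(scores) == 1;
    # the transformed confidences stay within (-1/3, 1/3) and only break
    # ties between equally voted classes.
    return scores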

View file

@ -0,0 +1,21 @@
"""Export fast murmurhash C/C++ routines + cython wrappers"""
cimport numpy as np
# The C API is disabled for now, since it requires -I flags to get
# compilation to work even when these functions are not used.
#cdef extern from "MurmurHash3.h":
# void MurmurHash3_x86_32(void* key, int len, unsigned int seed,
# void* out)
#
# void MurmurHash3_x86_128(void* key, int len, unsigned int seed,
# void* out)
#
# void MurmurHash3_x64_128(void* key, int len, unsigned int seed,
# void* out)
cpdef np.uint32_t murmurhash3_int_u32(int key, unsigned int seed)
cpdef np.int32_t murmurhash3_int_s32(int key, unsigned int seed)
cpdef np.uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed)
cpdef np.int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed)

View file

@ -0,0 +1,256 @@
"""
Our own implementation of the Newton algorithm
Unlike the scipy.optimize version, this version of the Newton conjugate
gradient solver uses only one function call to retrieve the
func value, the gradient value and a callable for the Hessian matvec
product. If the function call is very expensive (e.g. for logistic
regression with large design matrix), this approach gives very
significant speedups.
"""
# This is a modified file from scipy.optimize
# Original authors: Travis Oliphant, Eric Jones
# Modifications by Gael Varoquaux, Mathieu Blondel and Tom Dupre la Tour
# License: BSD
import numpy as np
import warnings
from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
from ..exceptions import ConvergenceWarning
from . import deprecated
class _LineSearchError(RuntimeError):
pass
def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval,
**kwargs):
"""
Same as line_search_wolfe1, but fall back to line_search_wolfe2 if
suitable step length is not found, and raise an exception if a
suitable step length is not found.
Raises
------
_LineSearchError
If no suitable step size is found
"""
ret = line_search_wolfe1(f, fprime, xk, pk, gfk,
old_fval, old_old_fval,
**kwargs)
if ret[0] is None:
# line search failed: try different one.
ret = line_search_wolfe2(f, fprime, xk, pk, gfk,
old_fval, old_old_fval, **kwargs)
if ret[0] is None:
raise _LineSearchError()
return ret
def _cg(fhess_p, fgrad, maxiter, tol):
"""
    Solve the linear system 'fhess_p . xsupi = -fgrad' iteratively
    with the conjugate gradient method.
Parameters
----------
fhess_p : callable
        Function that takes a vector as a parameter and returns the
        matrix product of the Hessian and that vector.
fgrad : ndarray, shape (n_features,) or (n_features + 1,)
Gradient vector
maxiter : int
Number of CG iterations.
tol : float
Stopping criterion.
Returns
-------
xsupi : ndarray, shape (n_features,) or (n_features + 1,)
Estimated solution
"""
xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)
ri = fgrad
psupi = -ri
i = 0
dri0 = np.dot(ri, ri)
while i <= maxiter:
if np.sum(np.abs(ri)) <= tol:
break
Ap = fhess_p(psupi)
# check curvature
curv = np.dot(psupi, Ap)
if 0 <= curv <= 3 * np.finfo(np.float64).eps:
break
elif curv < 0:
if i > 0:
break
else:
# fall back to steepest descent direction
xsupi += dri0 / curv * psupi
break
alphai = dri0 / curv
xsupi += alphai * psupi
ri = ri + alphai * Ap
dri1 = np.dot(ri, ri)
betai = dri1 / dri0
psupi = -ri + betai * psupi
i = i + 1
dri0 = dri1 # update np.dot(ri,ri) for next time.
return xsupi
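# A minimal usage sketch (hypothetical `_demo_cg` helper, arbitrary values):
# _cg returns an approximate Newton direction, i.e. a solution of
# ``H @ d = -fgrad``, given only a Hessian-vector product callable.
def _demo_cg():
    H = np.array([[3., 1.],
                  [1., 2.]])             # symmetric positive definite
    fgrad = np.array([1., -1.])
    direction = _cg(lambda p: H.dot(p), fgrad, maxiter=10, tol=1e-10)
    # direction is close to -np.linalg.solve(H, fgrad), i.e. [-0.6, 0.8]
    return direction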
@deprecated("newton_cg is deprecated in version "
"0.22 and will be removed in version 0.24.")
def newton_cg(grad_hess, func, grad, x0, args=(), tol=1e-4,
maxiter=100, maxinner=200, line_search=True, warn=True):
return _newton_cg(grad_hess, func, grad, x0, args, tol, maxiter,
maxinner, line_search, warn)
def _newton_cg(grad_hess, func, grad, x0, args=(), tol=1e-4,
maxiter=100, maxinner=200, line_search=True, warn=True):
"""
Minimization of scalar function of one or more variables using the
Newton-CG algorithm.
Parameters
----------
grad_hess : callable
Should return the gradient and a callable returning the matvec product
of the Hessian.
func : callable
Should return the value of the function.
grad : callable
        Should return the gradient of the function. This is used
        by the line search functions.
x0 : array of float
Initial guess.
args : tuple, optional
Arguments passed to func_grad_hess, func and grad.
tol : float
Stopping criterion. The iteration will stop when
``max{|g_i | i = 1, ..., n} <= tol``
where ``g_i`` is the i-th component of the gradient.
maxiter : int
Number of Newton iterations.
maxinner : int
Number of CG iterations.
line_search : boolean
Whether to use a line search or not.
warn : boolean
        Whether to warn when the algorithm did not converge.
Returns
-------
xk : ndarray of float
Estimated minimum.
"""
x0 = np.asarray(x0).flatten()
xk = x0
k = 0
if line_search:
old_fval = func(x0, *args)
old_old_fval = None
# Outer loop: our Newton iteration
while k < maxiter:
# Compute a search direction pk by applying the CG method to
# del2 f(xk) p = - fgrad f(xk) starting from 0.
fgrad, fhess_p = grad_hess(xk, *args)
absgrad = np.abs(fgrad)
if np.max(absgrad) <= tol:
break
maggrad = np.sum(absgrad)
eta = min([0.5, np.sqrt(maggrad)])
termcond = eta * maggrad
# Inner loop: solve the Newton update by conjugate gradient, to
# avoid inverting the Hessian
xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond)
alphak = 1.0
if line_search:
try:
alphak, fc, gc, old_fval, old_old_fval, gfkp1 = \
_line_search_wolfe12(func, grad, xk, xsupi, fgrad,
old_fval, old_old_fval, args=args)
except _LineSearchError:
warnings.warn('Line Search failed')
break
xk = xk + alphak * xsupi # upcast if necessary
k += 1
if warn and k >= maxiter:
warnings.warn("newton-cg failed to converge. Increase the "
"number of iterations.", ConvergenceWarning)
return xk, k
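# A minimal usage sketch (hypothetical `_demo_newton_cg` helper, arbitrary
# matrix): minimizing the convex quadratic 0.5 * ||A x||^2 with _newton_cg,
# whose unique minimum is the zero vector.
def _demo_newton_cg():
    A = np.random.RandomState(0).normal(size=(5, 5))

    def func(x):
        Ax = A.dot(x)
        return 0.5 * Ax.dot(Ax)

    def grad(x):
        return A.T.dot(A.dot(x))

    def grad_hess(x):
        # gradient plus a callable computing the Hessian-vector product
        return grad(x), lambda p: A.T.dot(A.dot(p))

    xk, n_iter = _newton_cg(grad_hess, func, grad, np.ones(5), tol=1e-8)
    # xk is close to the zero vector after a handful of Newton iterations
    return xk, n_iter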
def _check_optimize_result(solver, result, max_iter=None,
extra_warning_msg=None):
"""Check the OptimizeResult for successful convergence
Parameters
----------
solver: str
solver name. Currently only `lbfgs` is supported.
result: OptimizeResult
result of the scipy.optimize.minimize function
max_iter: {int, None}
expected maximum number of iterations
Returns
-------
n_iter: int
number of iterations
"""
# handle both scipy and scikit-learn solver names
if solver == "lbfgs":
if result.status != 0:
warning_msg = (
"{} failed to converge (status={}):\n{}.\n\n"
"Increase the number of iterations (max_iter) "
"or scale the data as shown in:\n"
" https://scikit-learn.org/stable/modules/"
"preprocessing.html"
).format(solver, result.status, result.message.decode("latin1"))
if extra_warning_msg is not None:
warning_msg += "\n" + extra_warning_msg
warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2)
if max_iter is not None:
# In scipy <= 1.0.0, nit may exceed maxiter for lbfgs.
# See https://github.com/scipy/scipy/issues/7854
n_iter_i = min(result.nit, max_iter)
else:
n_iter_i = result.nit
else:
raise NotImplementedError
return n_iter_i

View file

@ -0,0 +1,104 @@
# Author: Hamzeh Alsalhi <ha258@cornell.edu>
#
# License: BSD 3 clause
import numpy as np
import scipy.sparse as sp
import array
from . import check_random_state
from ._random import sample_without_replacement
from . import deprecated
__all__ = ['sample_without_replacement']
@deprecated("random_choice_csc is deprecated in version "
"0.22 and will be removed in version 0.24.")
def random_choice_csc(n_samples, classes, class_probability=None,
random_state=None):
return _random_choice_csc(n_samples, classes, class_probability,
random_state)
def _random_choice_csc(n_samples, classes, class_probability=None,
random_state=None):
"""Generate a sparse random matrix given column class distributions
Parameters
----------
n_samples : int,
Number of samples to draw in each column.
classes : list of size n_outputs of arrays of size (n_classes,)
List of classes for each column.
class_probability : list of size n_outputs of arrays of size (n_classes,)
Optional (default=None). Class distribution of each column. If None the
uniform distribution is assumed.
random_state : int, RandomState instance, default=None
Controls the randomness of the sampled classes.
See :term:`Glossary <random_state>`.
Returns
-------
random_matrix : sparse csc matrix of size (n_samples, n_outputs)
"""
data = array.array('i')
indices = array.array('i')
indptr = array.array('i', [0])
for j in range(len(classes)):
classes[j] = np.asarray(classes[j])
if classes[j].dtype.kind != 'i':
raise ValueError("class dtype %s is not supported" %
classes[j].dtype)
classes[j] = classes[j].astype(np.int64, copy=False)
# use uniform distribution if no class_probability is given
if class_probability is None:
class_prob_j = np.empty(shape=classes[j].shape[0])
class_prob_j.fill(1 / classes[j].shape[0])
else:
class_prob_j = np.asarray(class_probability[j])
if not np.isclose(np.sum(class_prob_j), 1.0):
raise ValueError("Probability array at index {0} does not sum to "
"one".format(j))
if class_prob_j.shape[0] != classes[j].shape[0]:
raise ValueError("classes[{0}] (length {1}) and "
"class_probability[{0}] (length {2}) have "
"different length.".format(j,
classes[j].shape[0],
class_prob_j.shape[0]))
# If 0 is not present in the classes insert it with a probability 0.0
if 0 not in classes[j]:
classes[j] = np.insert(classes[j], 0, 0)
class_prob_j = np.insert(class_prob_j, 0, 0.0)
# If there are nonzero classes choose randomly using class_probability
rng = check_random_state(random_state)
if classes[j].shape[0] > 1:
p_nonzero = 1 - class_prob_j[classes[j] == 0]
nnz = int(n_samples * p_nonzero)
ind_sample = sample_without_replacement(n_population=n_samples,
n_samples=nnz,
random_state=random_state)
indices.extend(ind_sample)
# Normalize probabilities for the nonzero elements
classes_j_nonzero = classes[j] != 0
class_probability_nz = class_prob_j[classes_j_nonzero]
class_probability_nz_norm = (class_probability_nz /
np.sum(class_probability_nz))
classes_ind = np.searchsorted(class_probability_nz_norm.cumsum(),
rng.rand(nnz))
data.extend(classes[j][classes_j_nonzero][classes_ind])
indptr.append(len(indices))
return sp.csc_matrix((data, indices, indptr),
(n_samples, len(classes)),
dtype=int)
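# A minimal usage sketch (hypothetical `_demo_random_choice_csc` helper,
# arbitrary classes and probabilities): draw a sparse (n_samples, 1) column
# whose entries follow the given class distribution; zeros stay implicit.
def _demo_random_choice_csc():
    classes = [np.array([0, 1, 2])]
    class_probability = [np.array([0.5, 0.3, 0.2])]
    random_matrix = _random_choice_csc(n_samples=10, classes=classes,
                                       class_probability=class_probability,
                                       random_state=0)
    # roughly half of the ten entries are implicit zeros, the rest 1s and 2s
    return random_matrix.toarray().ravel()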

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _seq_dataset # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.utils.seq_dataset'
correct_import_path = 'sklearn.utils'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_seq_dataset, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,80 @@
import os
from os.path import join
from sklearn._build_utils import gen_from_templates
def configuration(parent_package='', top_path=None):
import numpy
from numpy.distutils.misc_util import Configuration
config = Configuration('utils', parent_package, top_path)
libraries = []
if os.name == 'posix':
libraries.append('m')
config.add_extension('sparsefuncs_fast',
sources=['sparsefuncs_fast.pyx'],
libraries=libraries)
config.add_extension('_cython_blas',
sources=['_cython_blas.pyx'],
libraries=libraries)
config.add_extension('arrayfuncs',
sources=['arrayfuncs.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension('murmurhash',
sources=['murmurhash.pyx', join(
'src', 'MurmurHash3.cpp')],
include_dirs=['src'])
config.add_extension('graph_shortest_path',
sources=['graph_shortest_path.pyx'],
include_dirs=[numpy.get_include()])
config.add_extension('_fast_dict',
sources=['_fast_dict.pyx'],
language="c++",
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension('_openmp_helpers',
sources=['_openmp_helpers.pyx'],
libraries=libraries)
# generate _seq_dataset from template
templates = ['sklearn/utils/_seq_dataset.pyx.tp',
'sklearn/utils/_seq_dataset.pxd.tp']
gen_from_templates(templates, top_path)
config.add_extension('_seq_dataset',
sources=['_seq_dataset.pyx'],
include_dirs=[numpy.get_include()])
config.add_extension('_weight_vector',
sources=['_weight_vector.pyx'],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension("_random",
sources=["_random.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_extension("_logistic_sigmoid",
sources=["_logistic_sigmoid.pyx"],
include_dirs=[numpy.get_include()],
libraries=libraries)
config.add_subpackage('tests')
return config
if __name__ == '__main__':
from numpy.distutils.core import setup
setup(**configuration(top_path='').todict())

View file

@ -0,0 +1,548 @@
# Authors: Manoj Kumar
# Thomas Unterthiner
# Giorgio Patrini
#
# License: BSD 3 clause
import scipy.sparse as sp
import numpy as np
from .validation import _deprecate_positional_args
from .sparsefuncs_fast import (
csr_mean_variance_axis0 as _csr_mean_var_axis0,
csc_mean_variance_axis0 as _csc_mean_var_axis0,
incr_mean_variance_axis0 as _incr_mean_var_axis0)
def _raise_typeerror(X):
"""Raises a TypeError if X is not a CSR or CSC matrix"""
input_type = X.format if sp.issparse(X) else type(X)
err = "Expected a CSR or CSC sparse matrix, got %s." % input_type
raise TypeError(err)
def _raise_error_wrong_axis(axis):
if axis not in (0, 1):
raise ValueError(
"Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis)
def inplace_csr_column_scale(X, scale):
"""Inplace column scaling of a CSR matrix.
    Scale each feature of the data matrix by multiplying it with a specific
    scale provided by the caller, assuming a (n_samples, n_features) shape.
Parameters
----------
X : CSR matrix with shape (n_samples, n_features)
Matrix to normalize using the variance of the features.
scale : float array with shape (n_features,)
Array of precomputed feature-wise values to use for scaling.
"""
assert scale.shape[0] == X.shape[1]
X.data *= scale.take(X.indices, mode='clip')
def inplace_csr_row_scale(X, scale):
""" Inplace row scaling of a CSR matrix.
    Scale each sample of the data matrix by multiplying it with a specific
    scale provided by the caller, assuming a (n_samples, n_features) shape.
Parameters
----------
X : CSR sparse matrix, shape (n_samples, n_features)
Matrix to be scaled.
scale : float array with shape (n_samples,)
Array of precomputed sample-wise values to use for scaling.
"""
assert scale.shape[0] == X.shape[0]
X.data *= np.repeat(scale, np.diff(X.indptr))
def mean_variance_axis(X, axis):
"""Compute mean and variance along an axix on a CSR or CSC matrix
Parameters
----------
X : CSR or CSC sparse matrix, shape (n_samples, n_features)
Input data.
axis : int (either 0 or 1)
        Axis along which the mean and variance are computed.
Returns
-------
means : float array with shape (n_features,)
Feature-wise means
variances : float array with shape (n_features,)
Feature-wise variances
"""
_raise_error_wrong_axis(axis)
if isinstance(X, sp.csr_matrix):
if axis == 0:
return _csr_mean_var_axis0(X)
else:
return _csc_mean_var_axis0(X.T)
elif isinstance(X, sp.csc_matrix):
if axis == 0:
return _csc_mean_var_axis0(X)
else:
return _csr_mean_var_axis0(X.T)
else:
_raise_typeerror(X)
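# A minimal usage sketch (hypothetical `_demo_mean_variance_axis` helper,
# arbitrary values): feature-wise mean and variance of a small CSR matrix,
# matching np.mean / np.var on the dense array.
def _demo_mean_variance_axis():
    X = sp.csr_matrix(np.array([[0., 1.],
                                [2., 0.],
                                [4., 3.]]))
    means, variances = mean_variance_axis(X, axis=0)
    # means == np.mean(X.toarray(), axis=0) and
    # variances == np.var(X.toarray(), axis=0)
    return means, variances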
@_deprecate_positional_args
def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n):
"""Compute incremental mean and variance along an axix on a CSR or
CSC matrix.
last_mean, last_var are the statistics computed at the last step by this
function. Both must be initialized to 0-arrays of the proper size, i.e.
the number of features in X. last_n is the number of samples encountered
until now.
Parameters
----------
X : CSR or CSC sparse matrix, shape (n_samples, n_features)
Input data.
axis : int (either 0 or 1)
        Axis along which the statistics are computed.
last_mean : float array with shape (n_features,)
Array of feature-wise means to update with the new data X.
last_var : float array with shape (n_features,)
Array of feature-wise var to update with the new data X.
last_n : int with shape (n_features,)
        Number of samples seen so far, excluding X.
Returns
-------
means : float array with shape (n_features,)
Updated feature-wise means.
variances : float array with shape (n_features,)
Updated feature-wise variances.
n : int with shape (n_features,)
Updated number of seen samples.
Notes
-----
NaNs are ignored in the algorithm.
"""
_raise_error_wrong_axis(axis)
if isinstance(X, sp.csr_matrix):
if axis == 0:
return _incr_mean_var_axis0(X, last_mean=last_mean,
last_var=last_var, last_n=last_n)
else:
return _incr_mean_var_axis0(X.T, last_mean=last_mean,
last_var=last_var, last_n=last_n)
elif isinstance(X, sp.csc_matrix):
if axis == 0:
return _incr_mean_var_axis0(X, last_mean=last_mean,
last_var=last_var, last_n=last_n)
else:
return _incr_mean_var_axis0(X.T, last_mean=last_mean,
last_var=last_var, last_n=last_n)
else:
_raise_typeerror(X)
def inplace_column_scale(X, scale):
"""Inplace column scaling of a CSC/CSR matrix.
    Scale each feature of the data matrix by multiplying it with a specific
    scale provided by the caller, assuming a (n_samples, n_features) shape.
Parameters
----------
X : CSC or CSR matrix with shape (n_samples, n_features)
Matrix to normalize using the variance of the features.
scale : float array with shape (n_features,)
Array of precomputed feature-wise values to use for scaling.
"""
if isinstance(X, sp.csc_matrix):
inplace_csr_row_scale(X.T, scale)
elif isinstance(X, sp.csr_matrix):
inplace_csr_column_scale(X, scale)
else:
_raise_typeerror(X)
def inplace_row_scale(X, scale):
""" Inplace row scaling of a CSR or CSC matrix.
    Scale each row of the data matrix by multiplying it with a specific
    scale provided by the caller, assuming a (n_samples, n_features) shape.
Parameters
----------
X : CSR or CSC sparse matrix, shape (n_samples, n_features)
Matrix to be scaled.
    scale : float array with shape (n_samples,)
Array of precomputed sample-wise values to use for scaling.
"""
if isinstance(X, sp.csc_matrix):
inplace_csr_column_scale(X.T, scale)
elif isinstance(X, sp.csr_matrix):
inplace_csr_row_scale(X, scale)
else:
_raise_typeerror(X)
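# A minimal usage sketch (hypothetical `_demo_inplace_scaling` helper,
# arbitrary values): scaling only touches X.data, so the sparsity pattern
# is preserved.
def _demo_inplace_scaling():
    X = sp.csr_matrix(np.array([[1., 0., 2.],
                                [0., 3., 0.]]))
    inplace_column_scale(X, np.array([10., 100., 1000.]))
    # X is now [[10., 0., 2000.], [0., 300., 0.]]
    inplace_row_scale(X, np.array([0.5, 2.]))
    # X is now [[5., 0., 1000.], [0., 600., 0.]]
    return X.toarray()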
def inplace_swap_row_csc(X, m, n):
"""
Swaps two rows of a CSC matrix in-place.
Parameters
----------
X : scipy.sparse.csc_matrix, shape=(n_samples, n_features)
Matrix whose two rows are to be swapped.
m : int
Index of the row of X to be swapped.
n : int
Index of the row of X to be swapped.
"""
for t in [m, n]:
if isinstance(t, np.ndarray):
raise TypeError("m and n should be valid integers")
if m < 0:
m += X.shape[0]
if n < 0:
n += X.shape[0]
m_mask = X.indices == m
X.indices[X.indices == n] = m
X.indices[m_mask] = n
def inplace_swap_row_csr(X, m, n):
"""
Swaps two rows of a CSR matrix in-place.
Parameters
----------
X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
Matrix whose two rows are to be swapped.
m : int
Index of the row of X to be swapped.
n : int
Index of the row of X to be swapped.
"""
for t in [m, n]:
if isinstance(t, np.ndarray):
raise TypeError("m and n should be valid integers")
if m < 0:
m += X.shape[0]
if n < 0:
n += X.shape[0]
# The following swapping makes life easier since m is assumed to be the
# smaller integer below.
if m > n:
m, n = n, m
indptr = X.indptr
m_start = indptr[m]
m_stop = indptr[m + 1]
n_start = indptr[n]
n_stop = indptr[n + 1]
nz_m = m_stop - m_start
nz_n = n_stop - n_start
if nz_m != nz_n:
# Modify indptr first
X.indptr[m + 2:n] += nz_n - nz_m
X.indptr[m + 1] = m_start + nz_n
X.indptr[n] = n_stop - nz_m
X.indices = np.concatenate([X.indices[:m_start],
X.indices[n_start:n_stop],
X.indices[m_stop:n_start],
X.indices[m_start:m_stop],
X.indices[n_stop:]])
X.data = np.concatenate([X.data[:m_start],
X.data[n_start:n_stop],
X.data[m_stop:n_start],
X.data[m_start:m_stop],
X.data[n_stop:]])
def inplace_swap_row(X, m, n):
"""
Swaps two rows of a CSC/CSR matrix in-place.
Parameters
----------
X : CSR or CSC sparse matrix, shape=(n_samples, n_features)
Matrix whose two rows are to be swapped.
m : int
Index of the row of X to be swapped.
n : int
Index of the row of X to be swapped.
"""
if isinstance(X, sp.csc_matrix):
inplace_swap_row_csc(X, m, n)
elif isinstance(X, sp.csr_matrix):
inplace_swap_row_csr(X, m, n)
else:
_raise_typeerror(X)
def inplace_swap_column(X, m, n):
"""
Swaps two columns of a CSC/CSR matrix in-place.
Parameters
----------
X : CSR or CSC sparse matrix, shape=(n_samples, n_features)
Matrix whose two columns are to be swapped.
m : int
Index of the column of X to be swapped.
n : int
Index of the column of X to be swapped.
"""
if m < 0:
m += X.shape[1]
if n < 0:
n += X.shape[1]
if isinstance(X, sp.csc_matrix):
inplace_swap_row_csr(X, m, n)
elif isinstance(X, sp.csr_matrix):
inplace_swap_row_csc(X, m, n)
else:
_raise_typeerror(X)
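# A minimal usage sketch (hypothetical `_demo_inplace_swap` helper, arbitrary
# values): swaps only permute the index/indptr arrays, the stored values
# themselves are untouched.
def _demo_inplace_swap():
    X = sp.csr_matrix(np.array([[1., 0.],
                                [0., 2.],
                                [3., 4.]]))
    inplace_swap_row(X, 0, 2)
    # X is now [[3., 4.], [0., 2.], [1., 0.]]
    inplace_swap_column(X, 0, 1)
    # X is now [[4., 3.], [2., 0.], [0., 1.]]
    return X.toarray()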
def _minor_reduce(X, ufunc):
major_index = np.flatnonzero(np.diff(X.indptr))
    # reduceat tries to cast X.indptr to intp, which errors
# if it is int64 on a 32 bit system.
# Reinitializing prevents this where possible, see #13737
X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
value = ufunc.reduceat(X.data, X.indptr[major_index])
return major_index, value
def _min_or_max_axis(X, axis, min_or_max):
N = X.shape[axis]
if N == 0:
raise ValueError("zero-size array to reduction operation")
M = X.shape[1 - axis]
mat = X.tocsc() if axis == 0 else X.tocsr()
mat.sum_duplicates()
major_index, value = _minor_reduce(mat, min_or_max)
not_full = np.diff(mat.indptr)[major_index] < N
value[not_full] = min_or_max(value[not_full], 0)
mask = value != 0
major_index = np.compress(mask, major_index)
value = np.compress(mask, value)
if axis == 0:
res = sp.coo_matrix((value, (np.zeros(len(value)), major_index)),
dtype=X.dtype, shape=(1, M))
else:
res = sp.coo_matrix((value, (major_index, np.zeros(len(value)))),
dtype=X.dtype, shape=(M, 1))
return res.A.ravel()
def _sparse_min_or_max(X, axis, min_or_max):
if axis is None:
if 0 in X.shape:
raise ValueError("zero-size array to reduction operation")
zero = X.dtype.type(0)
if X.nnz == 0:
return zero
m = min_or_max.reduce(X.data.ravel())
if X.nnz != np.product(X.shape):
m = min_or_max(zero, m)
return m
if axis < 0:
axis += 2
if (axis == 0) or (axis == 1):
return _min_or_max_axis(X, axis, min_or_max)
else:
raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
def _sparse_min_max(X, axis):
return (_sparse_min_or_max(X, axis, np.minimum),
_sparse_min_or_max(X, axis, np.maximum))
def _sparse_nan_min_max(X, axis):
return(_sparse_min_or_max(X, axis, np.fmin),
_sparse_min_or_max(X, axis, np.fmax))
def min_max_axis(X, axis, ignore_nan=False):
"""Compute minimum and maximum along an axis on a CSR or CSC matrix and
optionally ignore NaN values.
Parameters
----------
X : CSR or CSC sparse matrix, shape (n_samples, n_features)
Input data.
axis : int (either 0 or 1)
        Axis along which the minima and maxima are computed.
ignore_nan : bool, default is False
        Whether to ignore NaN values or pass them through.
.. versionadded:: 0.20
Returns
-------
mins : float array with shape (n_features,)
Feature-wise minima
maxs : float array with shape (n_features,)
Feature-wise maxima
"""
if isinstance(X, sp.csr_matrix) or isinstance(X, sp.csc_matrix):
if ignore_nan:
return _sparse_nan_min_max(X, axis=axis)
else:
return _sparse_min_max(X, axis=axis)
else:
_raise_typeerror(X)
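# A minimal usage sketch (hypothetical `_demo_min_max_axis` helper, arbitrary
# values): implicit zeros take part in the reduction, so the minimum of an
# all-positive column with missing entries is 0.
def _demo_min_max_axis():
    X = sp.csr_matrix(np.array([[0., 2.],
                                [3., 5.]]))
    mins, maxs = min_max_axis(X, axis=0)
    # mins == [0., 2.] and maxs == [3., 5.]
    return mins, maxs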
def count_nonzero(X, axis=None, sample_weight=None):
"""A variant of X.getnnz() with extension to weighting on axis 0
Useful in efficiently calculating multilabel metrics.
Parameters
----------
X : CSR sparse matrix of shape (n_samples, n_labels)
Input data.
axis : None, 0 or 1
The axis on which the data is aggregated.
sample_weight : array-like of shape (n_samples,), default=None
Weight for each row of X.
"""
if axis == -1:
axis = 1
elif axis == -2:
axis = 0
elif X.format != 'csr':
raise TypeError('Expected CSR sparse format, got {0}'.format(X.format))
# We rely here on the fact that np.diff(Y.indptr) for a CSR
# will return the number of nonzero entries in each row.
# A bincount over Y.indices will return the number of nonzeros
# in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14.
if axis is None:
if sample_weight is None:
return X.nnz
else:
return np.dot(np.diff(X.indptr), sample_weight)
elif axis == 1:
out = np.diff(X.indptr)
if sample_weight is None:
# astype here is for consistency with axis=0 dtype
return out.astype('intp')
return out * sample_weight
elif axis == 0:
if sample_weight is None:
return np.bincount(X.indices, minlength=X.shape[1])
else:
weights = np.repeat(sample_weight, np.diff(X.indptr))
return np.bincount(X.indices, minlength=X.shape[1],
weights=weights)
else:
raise ValueError('Unsupported axis: {0}'.format(axis))
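# A minimal usage sketch (hypothetical `_demo_count_nonzero` helper, arbitrary
# values): unweighted and sample-weighted nonzero counts of a CSR matrix.
def _demo_count_nonzero():
    X = sp.csr_matrix(np.array([[1., 0., 2.],
                                [0., 0., 3.]]))
    total = count_nonzero(X)                        # 3
    per_column = count_nonzero(X, axis=0)           # [1, 0, 2]
    weighted = count_nonzero(X, axis=0,
                             sample_weight=np.array([0.5, 2.]))
    # weighted == [0.5, 0., 2.5]
    return total, per_column, weighted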
def _get_median(data, n_zeros):
"""Compute the median of data with n_zeros additional zeros.
This function is used to support sparse matrices; it modifies data in-place
"""
n_elems = len(data) + n_zeros
if not n_elems:
return np.nan
n_negative = np.count_nonzero(data < 0)
middle, is_odd = divmod(n_elems, 2)
data.sort()
if is_odd:
return _get_elem_at_rank(middle, data, n_negative, n_zeros)
return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) +
_get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.
def _get_elem_at_rank(rank, data, n_negative, n_zeros):
"""Find the value in data augmented with n_zeros for the given rank"""
if rank < n_negative:
return data[rank]
if rank - n_negative < n_zeros:
return 0
return data[rank - n_zeros]
def csc_median_axis_0(X):
"""Find the median across axis 0 of a CSC matrix.
It is equivalent to doing np.median(X, axis=0).
Parameters
----------
X : CSC sparse matrix, shape (n_samples, n_features)
Input data.
Returns
-------
median : ndarray, shape (n_features,)
Median.
"""
if not isinstance(X, sp.csc_matrix):
raise TypeError("Expected matrix of CSC format, got %s" % X.format)
indptr = X.indptr
n_samples, n_features = X.shape
median = np.zeros(n_features)
for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
# Prevent modifying X in place
data = np.copy(X.data[start: end])
nz = n_samples - data.size
median[f_ind] = _get_median(data, nz)
return median
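# A minimal usage sketch (hypothetical `_demo_csc_median_axis_0` helper,
# arbitrary values): implicit zeros are counted, so the result matches
# np.median on the dense array.
def _demo_csc_median_axis_0():
    X = sp.csc_matrix(np.array([[0., 5.],
                                [1., 0.],
                                [3., 2.]]))
    medians = csc_median_axis_0(X)
    # medians == [1., 2.] == np.median(X.toarray(), axis=0)
    return medians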

View file

@ -0,0 +1,18 @@
import numpy as np
from .extmath import stable_cumsum
def _weighted_percentile(array, sample_weight, percentile=50):
"""
Compute the weighted ``percentile`` of ``array`` with ``sample_weight``.
"""
sorted_idx = np.argsort(array)
    # Find the index of the value at the requested weighted percentile
weight_cdf = stable_cumsum(sample_weight[sorted_idx])
percentile_idx = np.searchsorted(
weight_cdf, (percentile / 100.) * weight_cdf[-1])
    # in rare cases, percentile_idx equals len(sorted_idx)
percentile_idx = np.clip(percentile_idx, 0, len(sorted_idx)-1)
return array[sorted_idx[percentile_idx]]
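# A minimal usage sketch (hypothetical `_demo_weighted_percentile` helper,
# arbitrary values): the weighted median of [1, 2, 3] where the value 3
# carries most of the total weight.
def _demo_weighted_percentile():
    array = np.array([1., 2., 3.])
    sample_weight = np.array([1., 1., 5.])
    # The cumulative weights are [1, 2, 7]; 50% of the total weight (3.5)
    # first falls within the bucket of the value 3, so the result is 3.
    return _weighted_percentile(array, sample_weight, percentile=50)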

View file

@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _testing # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.utils.testing'
correct_import_path = 'sklearn.utils'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_testing, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)

View file

@ -0,0 +1,10 @@
import pytest
import sklearn
@pytest.fixture
def print_changed_only_false():
sklearn.set_config(print_changed_only=False)
yield
sklearn.set_config(print_changed_only=True) # reset to default

View file

@ -0,0 +1,266 @@
import numpy as np
import pytest
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
def test_compute_class_weight():
# Test (and demo) compute_class_weight.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
cw = compute_class_weight("balanced", classes=classes, y=y)
# total effect of samples is preserved
class_counts = np.bincount(y)[2:]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert cw[0] < cw[1] < cw[2]
def test_compute_class_weight_not_present():
# Raise error when y does not contain all class labels
classes = np.arange(4)
y = np.asarray([0, 0, 0, 1, 1, 2])
with pytest.raises(ValueError):
compute_class_weight("balanced", classes=classes, y=y)
# Fix exception in error message formatting when missing label is a string
# https://github.com/scikit-learn/scikit-learn/issues/8312
with pytest.raises(ValueError,
match="Class label label_not_present not present"):
compute_class_weight({"label_not_present": 1.}, classes=classes, y=y)
# Raise error when y has items not in classes
classes = np.arange(2)
with pytest.raises(ValueError):
compute_class_weight("balanced", classes=classes, y=y)
with pytest.raises(ValueError):
compute_class_weight({0: 1., 1: 2.}, classes=classes, y=y)
def test_compute_class_weight_dict():
classes = np.arange(3)
class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
y = np.asarray([0, 0, 1, 2])
cw = compute_class_weight(class_weights, classes=classes, y=y)
# When the user specifies class weights, compute_class_weights should just
# return them.
assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)
# When a class weight is specified that isn't in classes, a ValueError
# should get raised
msg = 'Class label 4 not present.'
class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
with pytest.raises(ValueError, match=msg):
compute_class_weight(class_weights, classes=classes, y=y)
msg = 'Class label -1 not present.'
class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
with pytest.raises(ValueError, match=msg):
compute_class_weight(class_weights, classes=classes, y=y)
def test_compute_class_weight_invariance():
# Test that results with class_weight="balanced" is invariant wrt
# class imbalance if the number of samples is identical.
# The test uses a balanced two class dataset with 100 datapoints.
# It creates three versions, one where class 1 is duplicated
# resulting in 150 points of class 1 and 50 of class 0,
# one where there are 50 points in class 1 and 150 in class 0,
# and one where there are 100 points of each class (this one is balanced
# again).
# With balancing class weights, all three should give the same model.
X, y = make_blobs(centers=2, random_state=0)
# create dataset where class 1 is duplicated twice
X_1 = np.vstack([X] + [X[y == 1]] * 2)
y_1 = np.hstack([y] + [y[y == 1]] * 2)
# create dataset where class 0 is duplicated twice
X_0 = np.vstack([X] + [X[y == 0]] * 2)
y_0 = np.hstack([y] + [y[y == 0]] * 2)
# duplicate everything
X_ = np.vstack([X] * 2)
y_ = np.hstack([y] * 2)
# results should be identical
logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def test_compute_class_weight_balanced_negative():
# Test compute_class_weight when labels are negative
# Test with balanced class labels.
classes = np.array([-2, -1, 0])
y = np.asarray([-1, -1, 0, 0, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
assert_array_almost_equal(cw, np.array([1., 1., 1.]))
# Test with unbalanced class labels.
y = np.asarray([-1, 0, 0, -2, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
class_counts = np.bincount(y + 2)
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2. / 3, 2., 1.])
def test_compute_class_weight_balanced_unordered():
# Test compute_class_weight when classes are unordered
classes = np.array([1, 0, 3])
y = np.asarray([1, 0, 0, 3, 3, 3])
cw = compute_class_weight("balanced", classes=classes, y=y)
class_counts = np.bincount(y)[classes]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2., 1., 2. / 3])
def test_compute_class_weight_default():
# Test for the case where no weight is given for a present class.
# Current behaviour is to assign the unweighted classes a weight of 1.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
classes_len = len(classes)
# Test for non specified weights
cw = compute_class_weight(None, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, np.ones(3))
# Tests for partly specified weights
cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1., 1.])
cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1., 0.5])
def test_compute_sample_weight():
# Test (and demo) compute_sample_weight.
# Test with balanced classes
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with user-defined weights
sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
assert_array_almost_equal(sample_weight, [2., 2., 2., 1., 1., 1.])
# Test with column vector of balanced classes
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with unbalanced classes
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y)
expected_balanced = np.array([0.7777, 0.7777, 0.7777, 0.7777, 0.7777,
0.7777, 2.3333])
assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)
# Test with `None` weights
sample_weight = compute_sample_weight(None, y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 1.])
# Test with multi-output of balanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with multi-output with user-defined weights
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
assert_array_almost_equal(sample_weight, [2., 2., 2., 2., 2., 2.])
# Test with multi-output of unbalanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, expected_balanced ** 2, decimal=3)
def test_compute_sample_weight_with_subsample():
# Test compute_sample_weight with subsamples specified.
# Test with balanced classes and all samples present
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with column vector of balanced classes and all samples present
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.])
# Test with a subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(4))
assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3,
2. / 3, 2., 2., 2.])
# Test with a bootstrap subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y,
indices=[0, 1, 1, 2, 2, 3])
expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.])
assert_array_almost_equal(sample_weight, expected_balanced)
# Test with a bootstrap subsample for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y,
indices=[0, 1, 1, 2, 2, 3])
assert_array_almost_equal(sample_weight, expected_balanced ** 2)
# Test with a missing class
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])
# Test with a missing class for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.])
def test_compute_sample_weight_errors():
# Test compute_sample_weight raises errors expected.
# Invalid preset string
y = np.asarray([1, 1, 1, 2, 2, 2])
y_ = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
with pytest.raises(ValueError):
compute_sample_weight("ni", y)
with pytest.raises(ValueError):
compute_sample_weight("ni", y, indices=range(4))
with pytest.raises(ValueError):
compute_sample_weight("ni", y_)
with pytest.raises(ValueError):
compute_sample_weight("ni", y_, indices=range(4))
# Not "balanced" for subsample
with pytest.raises(ValueError):
compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))
# Not a list or preset for multi-output
with pytest.raises(ValueError):
compute_sample_weight({1: 2, 2: 1}, y_)
# Incorrect length list for multi-output
with pytest.raises(ValueError):
compute_sample_weight([{1: 2, 2: 1}], y_)
def test_compute_sample_weight_more_than_32():
# Non-regression smoke test for #12146
y = np.arange(50) # more than 32 distinct classes
indices = np.arange(50) # use subsampling
weight = compute_sample_weight('balanced', y, indices=indices)
assert_array_almost_equal(weight, np.ones(y.shape[0]))

View file

@ -0,0 +1,229 @@
import pytest
import numpy as np
from sklearn.utils._testing import assert_allclose
from sklearn.utils._cython_blas import _dot_memview
from sklearn.utils._cython_blas import _asum_memview
from sklearn.utils._cython_blas import _axpy_memview
from sklearn.utils._cython_blas import _nrm2_memview
from sklearn.utils._cython_blas import _copy_memview
from sklearn.utils._cython_blas import _scal_memview
from sklearn.utils._cython_blas import _rotg_memview
from sklearn.utils._cython_blas import _rot_memview
from sklearn.utils._cython_blas import _gemv_memview
from sklearn.utils._cython_blas import _ger_memview
from sklearn.utils._cython_blas import _gemm_memview
from sklearn.utils._cython_blas import RowMajor, ColMajor
from sklearn.utils._cython_blas import Trans, NoTrans
def _numpy_to_cython(dtype):
cython = pytest.importorskip("cython")
if dtype == np.float32:
return cython.float
elif dtype == np.float64:
return cython.double
RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {RowMajor: 'C', ColMajor: 'F'}
def _no_op(x):
return x
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
dot = _dot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
expected = x.dot(y)
actual = dot(x, y)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
asum = _asum_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.abs(x).sum()
actual = asum(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
axpy = _axpy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x + y
axpy(alpha, x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.linalg.norm(x)
actual = nrm2(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
copy = _copy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = np.empty_like(x)
expected = x.copy()
copy(x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
scal = _scal_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x
scal(alpha, x)
assert_allclose(x, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
rotg = _rotg_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
a = dtype(rng.randn())
b = dtype(rng.randn())
c, s = 0.0, 0.0
def expected_rotg(a, b):
roe = a if abs(a) > abs(b) else b
if a == 0 and b == 0:
c, s, r, z = (1, 0, 0, 0)
else:
r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
c, s = a/r, b/r
z = s if roe == a else (1 if c == 0 else 1 / c)
return r, z, c, s
expected = expected_rotg(a, b)
actual = rotg(a, b, c, s)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
rot = _rot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
c = dtype(rng.randn())
s = dtype(rng.randn())
expected_x = c * x + s * y
expected_y = c * y - s * x
rot(x, y, c, s)
assert_allclose(x, expected_x)
assert_allclose(y, expected_y)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("opA, transA",
[(_no_op, NoTrans), (np.transpose, Trans)],
ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
gemv = _gemv_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(opA(rng.random_sample((20, 10)).astype(dtype, copy=False)),
order=ORDER[order])
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(x) + beta * y
gemv(transA, alpha, A, x, beta, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
ger = _ger_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
A = np.asarray(rng.random_sample((10, 20)).astype(dtype, copy=False),
order=ORDER[order])
alpha = 2.5
expected = alpha * np.outer(x, y) + A
ger(alpha, x, y, A)
assert_allclose(A, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("opB, transB",
[(_no_op, NoTrans), (np.transpose, Trans)],
ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("opA, transA",
[(_no_op, NoTrans), (np.transpose, Trans)],
ids=["NoTrans", "Trans"])
@pytest.mark.parametrize("order", [RowMajor, ColMajor],
ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
gemm = _gemm_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(opA(rng.random_sample((30, 10)).astype(dtype, copy=False)),
order=ORDER[order])
B = np.asarray(opB(rng.random_sample((10, 20)).astype(dtype, copy=False)),
order=ORDER[order])
C = np.asarray(rng.random_sample((30, 20)).astype(dtype, copy=False),
order=ORDER[order])
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(opB(B)) + beta * C
gemm(transA, transB, alpha, A, B, beta, C)
assert_allclose(C, expected, rtol=RTOL[dtype])

View file

@ -0,0 +1,128 @@
import pytest
import types
import numpy as np
import warnings
from sklearn.dummy import DummyClassifier
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import choose_check_classifiers_labels
from sklearn.utils.estimator_checks import NotAnArray
from sklearn.utils.estimator_checks import enforce_estimator_tags_y
from sklearn.utils.estimator_checks import is_public_parameter
from sklearn.utils.estimator_checks import pairwise_estimator_convert_X
from sklearn.utils.estimator_checks import set_checking_parameters
from sklearn.utils.optimize import newton_cg
from sklearn.utils.random import random_choice_csc
from sklearn.utils import safe_indexing
# This file tests the utils that are deprecated
# TODO: remove in 0.24
def test_choose_check_classifiers_labels_deprecated():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
choose_check_classifiers_labels(None, None, None)
# TODO: remove in 0.24
def test_enforce_estimator_tags_y():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
enforce_estimator_tags_y(DummyClassifier(), np.array([0, 1]))
# TODO: remove in 0.24
def test_notanarray():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
NotAnArray([1, 2])
# TODO: remove in 0.24
def test_is_public_parameter():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
is_public_parameter('hello')
# TODO: remove in 0.24
def test_pairwise_estimator_convert_X():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
pairwise_estimator_convert_X([[1, 2]], DummyClassifier())
# TODO: remove in 0.24
def test_set_checking_parameters():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
set_checking_parameters(DummyClassifier())
# TODO: remove in 0.24
def test_newton_cg():
rng = np.random.RandomState(0)
A = rng.normal(size=(10, 10))
x0 = np.ones(10)
def func(x):
Ax = A.dot(x)
return .5 * (Ax).dot(Ax)
def grad(x):
return A.T.dot(A.dot(x))
def grad_hess(x):
return grad(x), lambda x: A.T.dot(A.dot(x))
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
newton_cg(grad_hess, func, grad, x0)
# TODO: remove in 0.24
def test_random_choice_csc():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
random_choice_csc(10, [[2]])
# TODO: remove in 0.24
def test_safe_indexing():
with pytest.warns(FutureWarning,
match="removed in version 0.24"):
safe_indexing([1, 2], 0)
# TODO: remove in 0.24
def test_partial_dependence_no_shadowing():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/15842
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
from sklearn.inspection.partial_dependence import partial_dependence as _ # noqa
# Calling all_estimators() also triggers a recursive import of all
# submodules, including deprecated ones.
all_estimators()
from sklearn.inspection import partial_dependence
assert isinstance(partial_dependence, types.FunctionType)
# TODO: remove in 0.24
def test_dict_learning_no_shadowing():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/15842
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
from sklearn.decomposition.dict_learning import dict_learning as _ # noqa
# Calling all_estimators() also triggers a recursive import of all
# submodules, including deprecated ones.
all_estimators()
from sklearn.decomposition import dict_learning
assert isinstance(dict_learning, types.FunctionType)

View file

@ -0,0 +1,59 @@
# Authors: Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause
import pickle
from sklearn.utils.deprecation import _is_deprecated
from sklearn.utils.deprecation import deprecated
from sklearn.utils._testing import assert_warns_message
@deprecated('qwerty')
class MockClass1:
pass
class MockClass2:
@deprecated('mockclass2_method')
def method(self):
pass
class MockClass3:
@deprecated()
def __init__(self):
pass
class MockClass4:
pass
@deprecated()
def mock_function():
return 10
def test_deprecated():
assert_warns_message(FutureWarning, 'qwerty', MockClass1)
assert_warns_message(FutureWarning, 'mockclass2_method',
MockClass2().method)
assert_warns_message(FutureWarning, 'deprecated', MockClass3)
val = assert_warns_message(FutureWarning, 'deprecated',
mock_function)
assert val == 10
def test_is_deprecated():
# Test if _is_deprecated helper identifies wrapping via deprecated
# NOTE it works only for class methods and functions
assert _is_deprecated(MockClass1.__init__)
assert _is_deprecated(MockClass2().method)
assert _is_deprecated(MockClass3.__init__)
assert not _is_deprecated(MockClass4.__init__)
assert _is_deprecated(mock_function)
def test_pickle():
pickle.loads(pickle.dumps(mock_function))

View file

@ -0,0 +1,640 @@
import unittest
import sys
import numpy as np
import scipy.sparse as sp
import joblib
from io import StringIO
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import deprecated
from sklearn.utils._testing import (assert_raises_regex,
ignore_warnings,
assert_warns, assert_raises,
SkipTest)
from sklearn.utils.estimator_checks import check_estimator, _NotAnArray
from sklearn.utils.estimator_checks \
import check_class_weight_balanced_linear_classifier
from sklearn.utils.estimator_checks import set_random_state
from sklearn.utils.estimator_checks import _set_checking_parameters
from sklearn.utils.estimator_checks import check_estimators_unfitted
from sklearn.utils.estimator_checks import check_fit_score_takes_y
from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
from sklearn.utils.estimator_checks import check_classifier_data_not_an_array
from sklearn.utils.estimator_checks import check_regressor_data_not_an_array
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.estimator_checks import check_outlier_corruption
from sklearn.utils.fixes import np_version, parse_version
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import NMF
from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_array
from sklearn.utils import all_estimators
class CorrectNotFittedError(ValueError):
"""Exception class to raise if estimator is used before fitting.
Like NotFittedError, it inherits from ValueError, but not from
AttributeError. Used for testing only.
"""
class BaseBadClassifier(ClassifierMixin, BaseEstimator):
def fit(self, X, y):
return self
def predict(self, X):
return np.ones(X.shape[0])
class ChangesDict(BaseEstimator):
def __init__(self, key=0):
self.key = key
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
def predict(self, X):
X = check_array(X)
self.key = 1000
return np.ones(X.shape[0])
class SetsWrongAttribute(BaseEstimator):
def __init__(self, acceptable_key=0):
self.acceptable_key = acceptable_key
def fit(self, X, y=None):
self.wrong_attribute = 0
X, y = self._validate_data(X, y)
return self
class ChangesWrongAttribute(BaseEstimator):
def __init__(self, wrong_attribute=0):
self.wrong_attribute = wrong_attribute
def fit(self, X, y=None):
self.wrong_attribute = 1
X, y = self._validate_data(X, y)
return self
class ChangesUnderscoreAttribute(BaseEstimator):
def fit(self, X, y=None):
self._good_attribute = 1
X, y = self._validate_data(X, y)
return self
class RaisesErrorInSetParams(BaseEstimator):
def __init__(self, p=0):
self.p = p
def set_params(self, **kwargs):
if 'p' in kwargs:
p = kwargs.pop('p')
if p < 0:
raise ValueError("p can't be less than 0")
self.p = p
return super().set_params(**kwargs)
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
class ModifiesValueInsteadOfRaisingError(BaseEstimator):
def __init__(self, p=0):
self.p = p
def set_params(self, **kwargs):
if 'p' in kwargs:
p = kwargs.pop('p')
if p < 0:
p = 0
self.p = p
return super().set_params(**kwargs)
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
class ModifiesAnotherValue(BaseEstimator):
def __init__(self, a=0, b='method1'):
self.a = a
self.b = b
def set_params(self, **kwargs):
if 'a' in kwargs:
a = kwargs.pop('a')
self.a = a
if a is None:
kwargs.pop('b')
self.b = 'method2'
return super().set_params(**kwargs)
def fit(self, X, y=None):
X, y = self._validate_data(X, y)
return self
class NoCheckinPredict(BaseBadClassifier):
def fit(self, X, y):
X, y = self._validate_data(X, y)
return self
class NoSparseClassifier(BaseBadClassifier):
def fit(self, X, y):
X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
if sp.issparse(X):
raise ValueError("Nonsensical Error")
return self
def predict(self, X):
X = check_array(X)
return np.ones(X.shape[0])
class CorrectNotFittedErrorClassifier(BaseBadClassifier):
def fit(self, X, y):
X, y = self._validate_data(X, y)
self.coef_ = np.ones(X.shape[1])
return self
def predict(self, X):
check_is_fitted(self)
X = check_array(X)
return np.ones(X.shape[0])
class NoSampleWeightPandasSeriesType(BaseEstimator):
def fit(self, X, y, sample_weight=None):
# Convert data
X, y = self._validate_data(
X, y,
accept_sparse=("csr", "csc"),
multi_output=True,
y_numeric=True)
# Function is only called after we verify that pandas is installed
from pandas import Series
if isinstance(sample_weight, Series):
raise ValueError("Estimator does not accept 'sample_weight'"
"of type pandas.Series")
return self
def predict(self, X):
X = check_array(X)
return np.ones(X.shape[0])
class BadBalancedWeightsClassifier(BaseBadClassifier):
def __init__(self, class_weight=None):
self.class_weight = class_weight
def fit(self, X, y):
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight
label_encoder = LabelEncoder().fit(y)
classes = label_encoder.classes_
class_weight = compute_class_weight(self.class_weight, classes=classes,
y=y)
# Intentionally modify the balanced class_weight
# to simulate a bug and raise an exception
if self.class_weight == "balanced":
class_weight += 1.
# Simply assigning coef_ to the class_weight
self.coef_ = class_weight
return self
class BadTransformerWithoutMixin(BaseEstimator):
def fit(self, X, y=None):
X = self._validate_data(X)
return self
def transform(self, X):
X = check_array(X)
return X
class NotInvariantPredict(BaseEstimator):
def fit(self, X, y):
# Convert data
X, y = self._validate_data(
X, y,
accept_sparse=("csr", "csc"),
multi_output=True,
y_numeric=True)
return self
def predict(self, X):
        # return an array of ones if X has more than one sample, else zeros
X = check_array(X)
if X.shape[0] > 1:
return np.ones(X.shape[0])
return np.zeros(X.shape[0])
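# Illustrative sketch (assumption about how the invariance check operates): a
# method is subset-invariant when predicting a slice of X matches the
# corresponding slice of the predictions on the full X, which is exactly what
# NotInvariantPredict above violates.
def _demo_subset_invariance(estimator, X, y):
    estimator.fit(X, y)
    full_predictions = estimator.predict(X)
    subset_predictions = estimator.predict(X[:2])
    return np.array_equal(subset_predictions, full_predictions[:2])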
class LargeSparseNotSupportedClassifier(BaseEstimator):
def fit(self, X, y):
X, y = self._validate_data(
X, y,
accept_sparse=("csr", "csc", "coo"),
accept_large_sparse=True,
multi_output=True,
y_numeric=True)
if sp.issparse(X):
if X.getformat() == "coo":
if X.row.dtype == "int64" or X.col.dtype == "int64":
raise ValueError(
"Estimator doesn't support 64-bit indices")
elif X.getformat() in ["csc", "csr"]:
assert "int64" not in (X.indices.dtype, X.indptr.dtype),\
"Estimator doesn't support 64-bit indices"
return self
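# Illustrative sketch (assumption): a "large" sparse matrix is simply one whose
# index arrays use 64-bit integers; casting the index dtypes of an ordinary CSR
# matrix reproduces the kind of input the accept_large_sparse check feeds in.
def _demo_int64_csr():
    X = sp.random(3, 4, density=0.5, format='csr', random_state=0)
    X.indices = X.indices.astype(np.int64)
    X.indptr = X.indptr.astype(np.int64)
    return X.indices.dtype, X.indptr.dtype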
class SparseTransformer(BaseEstimator):
def fit(self, X, y=None):
self.X_shape_ = self._validate_data(X).shape
return self
def fit_transform(self, X, y=None):
return self.fit(X, y).transform(X)
def transform(self, X):
X = check_array(X)
if X.shape[1] != self.X_shape_[1]:
raise ValueError('Bad number of features')
return sp.csr_matrix(X)
class EstimatorInconsistentForPandas(BaseEstimator):
def fit(self, X, y):
try:
from pandas import DataFrame
if isinstance(X, DataFrame):
self.value_ = X.iloc[0, 0]
else:
X = check_array(X)
self.value_ = X[1, 0]
return self
except ImportError:
X = check_array(X)
self.value_ = X[1, 0]
return self
def predict(self, X):
X = check_array(X)
return np.array([self.value_] * X.shape[0])
class UntaggedBinaryClassifier(SGDClassifier):
# Toy classifier that only supports binary classification, will fail tests.
def fit(self, X, y, coef_init=None, intercept_init=None,
sample_weight=None):
super().fit(X, y, coef_init, intercept_init, sample_weight)
if len(self.classes_) > 2:
raise ValueError('Only 2 classes are supported')
return self
def partial_fit(self, X, y, classes=None, sample_weight=None):
super().partial_fit(X=X, y=y, classes=classes,
sample_weight=sample_weight)
if len(self.classes_) > 2:
raise ValueError('Only 2 classes are supported')
return self
class TaggedBinaryClassifier(UntaggedBinaryClassifier):
# Toy classifier that only supports binary classification.
def _more_tags(self):
return {'binary_only': True}
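# Illustrative check (assumption): BaseEstimator's private _get_tags() collects
# _more_tags() from the whole MRO, so the tagged subclass should advertise
# binary_only=True while the untagged parent keeps the default of False.
def _demo_binary_only_tag():
    return (TaggedBinaryClassifier()._get_tags().get('binary_only'),
            UntaggedBinaryClassifier()._get_tags().get('binary_only'))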
class RequiresPositiveYRegressor(LinearRegression):
def fit(self, X, y):
X, y = self._validate_data(X, y, multi_output=True)
if (y <= 0).any():
raise ValueError('negative y values not supported!')
return super().fit(X, y)
def _more_tags(self):
return {"requires_positive_y": True}
def test_not_an_array_array_function():
if np_version < parse_version('1.17'):
raise SkipTest("array_function protocol not supported in numpy <1.17")
not_array = _NotAnArray(np.ones(10))
msg = "Don't want to call array_function sum!"
assert_raises_regex(TypeError, msg, np.sum, not_array)
# always returns True
assert np.may_share_memory(not_array, None)
def test_check_fit_score_takes_y_works_on_deprecated_fit():
# Tests that check_fit_score_takes_y works on a class with
# a deprecated fit method
class TestEstimatorWithDeprecatedFitMethod(BaseEstimator):
@deprecated("Deprecated for the purpose of testing "
"check_fit_score_takes_y")
def fit(self, X, y):
return self
check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod())
@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24
def test_check_estimator():
# tests that the estimator actually fails on "bad" estimators.
# not a complete test of all checks, which are very extensive.
# check that we have a set_params and can clone
msg = "it does not implement a 'get_params' method"
assert_raises_regex(TypeError, msg, check_estimator, object)
msg = "object has no attribute '_get_tags'"
assert_raises_regex(AttributeError, msg, check_estimator, object())
# check that values returned by get_params match set_params
msg = "get_params result does not match what was passed to set_params"
assert_raises_regex(AssertionError, msg, check_estimator,
ModifiesValueInsteadOfRaisingError())
assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
assert_raises_regex(AssertionError, msg, check_estimator,
ModifiesAnotherValue())
# check that we have a fit method
msg = "object has no attribute 'fit'"
assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
# check that fit does input validation
msg = "ValueError not raised"
assert_raises_regex(AssertionError, msg, check_estimator,
BaseBadClassifier)
assert_raises_regex(AssertionError, msg, check_estimator,
BaseBadClassifier())
# check that sample_weights in fit accepts pandas.Series type
try:
from pandas import Series # noqa
msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
"'sample_weight' parameter is of type pandas.Series")
assert_raises_regex(
ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
except ImportError:
pass
# check that predict does input validation (doesn't accept dicts in input)
msg = "Estimator doesn't check for NaN and inf in predict"
assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
assert_raises_regex(AssertionError, msg, check_estimator,
NoCheckinPredict())
# check that estimator state does not change
# at transform/predict/predict_proba time
msg = 'Estimator changes __dict__ during predict'
assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
msg = ('Estimator ChangesWrongAttribute should not change or mutate '
'the parameter wrong_attribute from 0 to 1 during fit.')
assert_raises_regex(AssertionError, msg,
check_estimator, ChangesWrongAttribute)
check_estimator(ChangesUnderscoreAttribute)
# check that `fit` doesn't add any public attribute
msg = (r'Estimator adds public attribute\(s\) during the fit method.'
' Estimators are only allowed to add private attributes'
' either started with _ or ended'
' with _ but wrong_attribute added')
assert_raises_regex(AssertionError, msg,
check_estimator, SetsWrongAttribute)
# check for invariant method
name = NotInvariantPredict.__name__
method = 'predict'
msg = ("{method} of {name} is not invariant when applied "
"to a subset.").format(method=method, name=name)
assert_raises_regex(AssertionError, msg,
check_estimator, NotInvariantPredict)
# check for sparse matrix input handling
name = NoSparseClassifier.__name__
msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to stdout instead of raising
    # an error, so as not to remove the original traceback; that means we need
    # to jump through some hoops to catch it (an equivalent contextlib-based
    # capture is sketched after this test).
old_stdout = sys.stdout
string_buffer = StringIO()
sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except Exception:
        pass
finally:
sys.stdout = old_stdout
assert msg in string_buffer.getvalue()
# Large indices test on bad estimator
msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to '
r'support \S{3}_64 matrix, and is not failing gracefully.*')
assert_raises_regex(AssertionError, msg, check_estimator,
LargeSparseNotSupportedClassifier)
# does error on binary_only untagged estimator
msg = 'Only 2 classes are supported'
assert_raises_regex(ValueError, msg, check_estimator,
UntaggedBinaryClassifier)
# non-regression test for estimators transforming to sparse data
check_estimator(SparseTransformer())
# doesn't error on actual estimator
check_estimator(LogisticRegression)
check_estimator(LogisticRegression(C=0.01))
check_estimator(MultiTaskElasticNet)
check_estimator(MultiTaskElasticNet())
# doesn't error on binary_only tagged estimator
check_estimator(TaggedBinaryClassifier)
# Check regressor with requires_positive_y estimator tag
msg = 'negative y values not supported!'
assert_raises_regex(ValueError, msg, check_estimator,
RequiresPositiveYRegressor)
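# Alternative capture sketch (assumption, behaviourally equivalent to the
# manual sys.stdout swap used in test_check_estimator above):
# contextlib.redirect_stdout scopes the redirection to a with-block and
# restores stdout even when the wrapped call raises.
def _demo_capture_stdout(fn, *args):
    from contextlib import redirect_stdout
    buffer = StringIO()
    with redirect_stdout(buffer):
        try:
            fn(*args)
        except Exception:
            pass
    return buffer.getvalue()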
def test_check_outlier_corruption():
    # should raise AssertionError: the decision values in the disputed range
    # between 1 and 2 predicted outliers are not tied, so the deviation from
    # the requested contamination cannot be explained by ties
    decision = np.array([0., 1., 1.5, 2.])
    assert_raises(AssertionError, check_outlier_corruption, 1, 2, decision)
    # should pass: the corresponding decision values are tied
    decision = np.array([0., 1., 1., 2.])
    check_outlier_corruption(1, 2, decision)
def test_check_estimator_transformer_no_mixin():
# check that TransformerMixin is not required for transformer tests to run
assert_raises_regex(AttributeError, '.*fit_transform.*',
check_estimator, BadTransformerWithoutMixin())
def test_check_estimator_clones():
# check that check_estimator doesn't modify the estimator it receives
from sklearn.datasets import load_iris
iris = load_iris()
for Estimator in [GaussianMixture, LinearRegression,
RandomForestClassifier, NMF, SGDClassifier,
MiniBatchKMeans]:
with ignore_warnings(category=FutureWarning):
# when 'est = SGDClassifier()'
est = Estimator()
_set_checking_parameters(est)
set_random_state(est)
# without fitting
old_hash = joblib.hash(est)
check_estimator(est)
assert old_hash == joblib.hash(est)
with ignore_warnings(category=FutureWarning):
# when 'est = SGDClassifier()'
est = Estimator()
_set_checking_parameters(est)
set_random_state(est)
# with fitting
est.fit(iris.data + 10, iris.target)
old_hash = joblib.hash(est)
check_estimator(est)
assert old_hash == joblib.hash(est)
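# Illustrative sketch (assumption): joblib.hash returns a digest of the whole
# object state, so any attribute added or mutated while check_estimator runs
# would change the hash, which is what the clone test above relies on.
def _demo_joblib_hash_detects_mutation():
    est = LinearRegression()
    unfitted_hash = joblib.hash(est)
    est.fit([[0.0], [1.0]], [0.0, 1.0])  # fitting adds coef_, intercept_, ...
    return unfitted_hash != joblib.hash(est)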
def test_check_estimators_unfitted():
# check that a ValueError/AttributeError is raised when calling predict
# on an unfitted estimator
msg = "NotFittedError not raised by predict"
assert_raises_regex(AssertionError, msg, check_estimators_unfitted,
"estimator", NoSparseClassifier())
    # check that CorrectNotFittedError inherits from either ValueError
    # or AttributeError
check_estimators_unfitted("estimator", CorrectNotFittedErrorClassifier())
def test_check_no_attributes_set_in_init():
class NonConformantEstimatorPrivateSet(BaseEstimator):
def __init__(self):
self.you_should_not_set_this_ = None
class NonConformantEstimatorNoParamSet(BaseEstimator):
def __init__(self, you_should_set_this_=None):
pass
assert_raises_regex(AssertionError,
"Estimator estimator_name should not set any"
" attribute apart from parameters during init."
r" Found attributes \['you_should_not_set_this_'\].",
check_no_attributes_set_in_init,
'estimator_name',
NonConformantEstimatorPrivateSet())
assert_raises_regex(AssertionError,
"Estimator estimator_name should store all "
"parameters as an attribute during init. "
"Did not find attributes "
r"\['you_should_set_this_'\].",
check_no_attributes_set_in_init,
'estimator_name',
NonConformantEstimatorNoParamSet())
def test_check_estimator_pairwise():
# check that check_estimator() works on estimator with _pairwise
# kernel or metric
# test precomputed kernel
est = SVC(kernel='precomputed')
check_estimator(est)
# test precomputed metric
est = KNeighborsRegressor(metric='precomputed')
check_estimator(est)
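# Illustrative sketch (assumption): with kernel='precomputed' the SVC is fit on
# an (n_samples, n_samples) Gram matrix instead of raw features, which is why
# check_estimator needs the pairwise special-casing exercised above.
def _demo_precomputed_kernel():
    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    gram = X @ X.T  # linear kernel computed by hand
    return SVC(kernel='precomputed').fit(gram, y).predict(gram)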
def test_check_classifier_data_not_an_array():
assert_raises_regex(AssertionError,
'Not equal to tolerance',
check_classifier_data_not_an_array,
'estimator_name',
EstimatorInconsistentForPandas())
def test_check_regressor_data_not_an_array():
assert_raises_regex(AssertionError,
'Not equal to tolerance',
check_regressor_data_not_an_array,
'estimator_name',
EstimatorInconsistentForPandas())
@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24
def test_check_estimator_required_parameters_skip():
    # TODO: remove whole test in 0.24 since passing classes to
    # check_estimator() isn't supported anymore
class MyEstimator(BaseEstimator):
_required_parameters = ["special_parameter"]
def __init__(self, special_parameter):
self.special_parameter = special_parameter
assert_raises_regex(SkipTest, r"Can't instantiate estimator MyEstimator "
r"which requires parameters "
r"\['special_parameter'\]",
check_estimator, MyEstimator)
def run_tests_without_pytest():
"""Runs the tests in this file without using pytest.
"""
main_module = sys.modules['__main__']
test_functions = [getattr(main_module, name) for name in dir(main_module)
if name.startswith('test_')]
test_cases = [unittest.FunctionTestCase(fn) for fn in test_functions]
suite = unittest.TestSuite()
suite.addTests(test_cases)
runner = unittest.TextTestRunner()
runner.run(suite)
def test_check_class_weight_balanced_linear_classifier():
    # check that ill-computed balanced weights raise an exception
assert_raises_regex(AssertionError,
"Classifier estimator_name is not computing"
" class_weight=balanced properly.",
check_class_weight_balanced_linear_classifier,
'estimator_name',
BadBalancedWeightsClassifier)
def test_all_estimators_all_public():
    # all_estimators should not fail when pytest is not installed and should
    # return only public estimators
    estimators = all_estimators()
    # all_estimators() yields (name, class) pairs; check the estimator name
    for name, _ in estimators:
        assert not name.startswith("_")
if __name__ == '__main__':
# This module is run as a script to check that we have no dependency on
# pytest for estimator checks.
run_tests_without_pytest()

Some files were not shown because too many files have changed in this diff