Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
0
venv/Lib/site-packages/sklearn/_loss/__init__.py
Normal file
Binary file not shown.
355
venv/Lib/site-packages/sklearn/_loss/glm_distribution.py
Normal file
@@ -0,0 +1,355 @@
"""
Distribution functions used in GLM
"""

# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
from collections import namedtuple
import numbers

import numpy as np
from scipy.special import xlogy


DistributionBoundary = namedtuple("DistributionBoundary",
                                  ("value", "inclusive"))

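# Illustration (a sketch, not part of the upstream module): a
# DistributionBoundary pairs a bound with a flag saying whether the bound
# itself is a valid value; the Poisson support [0, inf) would be encoded as
#
#   DistributionBoundary(0, inclusive=True)
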
class ExponentialDispersionModel(metaclass=ABCMeta):
    r"""Base class for reproductive Exponential Dispersion Models (EDM).

    The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by

    .. math:: p(y| \theta, \phi) = c(y, \phi)
        \exp\left(\frac{\theta y-A(\theta)}{\phi}\right)
        = \tilde{c}(y, \phi)
        \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right)

    with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`,
    variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`,
    unit variance :math:`v(y_\textrm{pred})` and
    unit deviance :math:`d(y,y_\textrm{pred})`.

    Methods
    -------
    deviance
    deviance_derivative
    in_y_range
    unit_deviance
    unit_deviance_derivative
    unit_variance

    References
    ----------
    https://en.wikipedia.org/wiki/Exponential_dispersion_model.
    """

    def in_y_range(self, y):
        """Returns ``True`` if y is in the valid range of Y~EDM.

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.
        """
        # Note that currently supported distributions have +inf upper bound

        if not isinstance(self._lower_bound, DistributionBoundary):
            raise TypeError('_lower_bound attribute must be of type '
                            'DistributionBoundary')

        if self._lower_bound.inclusive:
            return np.greater_equal(y, self._lower_bound.value)
        else:
            return np.greater(y, self._lower_bound.value)

    @abstractmethod
    def unit_variance(self, y_pred):
        r"""Compute the unit variance function.

        The unit variance :math:`v(y_\textrm{pred})` determines the variance
        as a function of the mean :math:`y_\textrm{pred}` by
        :math:`\mathrm{Var}[Y_i] = \phi/s_i \cdot v(y_\textrm{pred}_i)`.
        It can also be derived from the unit deviance
        :math:`d(y,y_\textrm{pred})` as

        .. math:: v(y_\textrm{pred}) = \frac{2}{
            \frac{\partial^2 d(y,y_\textrm{pred})}{
            \partial y_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}}

        See also :func:`variance`.

        Parameters
        ----------
        y_pred : array of shape (n_samples,)
            Predicted mean.
        """

    @abstractmethod
    def unit_deviance(self, y, y_pred, check_input=False):
        r"""Compute the unit deviance.

        The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
        log-likelihood as
        :math:`d(y,y_\textrm{pred}) = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.

        check_input : bool, default=False
            If True, raise an exception on invalid y or y_pred values;
            otherwise they will be propagated as NaN.

        Returns
        -------
        deviance : array of shape (n_samples,)
            Computed deviance.
        """

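    # For instance (illustrative, not in the upstream source), for the Normal
    # distribution loglike(y, y_pred, phi) = -(y - y_pred)**2 / (2 * phi) + c,
    # so d(y, y_pred) = -2*phi*(loglike(y, y_pred, phi) - loglike(y, y, phi))
    #                 = (y - y_pred)**2, matching the p == 0 Tweedie branch
    # further below.
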
    def unit_deviance_derivative(self, y, y_pred):
        r"""Compute the derivative of the unit deviance w.r.t. y_pred.

        The derivative of the unit deviance is given by
        :math:`\frac{\partial}{\partial y_\textrm{pred}}d(y,y_\textrm{pred})
        = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}`
        with unit variance :math:`v(y_\textrm{pred})`.

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.
        """
        return -2 * (y - y_pred) / self.unit_variance(y_pred)

    def deviance(self, y, y_pred, weights=1):
        r"""Compute the deviance.

        The deviance is a weighted sum of the per sample unit deviances,
        :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)`
        with weights :math:`s_i` and unit deviance
        :math:`d(y,y_\textrm{pred})`.
        In terms of the log-likelihood it is :math:`D = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\frac{\phi}{s})
        - loglike(y,y,\frac{\phi}{s})\right)`.

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.

        weights : {int, array of shape (n_samples,)}, default=1
            Weights or exposure to which the variance is inversely
            proportional.
        """
        return np.sum(weights * self.unit_deviance(y, y_pred))

    def deviance_derivative(self, y, y_pred, weights=1):
        r"""Compute the derivative of the deviance w.r.t. y_pred.

        It gives :math:`\frac{\partial}{\partial y_\textrm{pred}}
        D(y, y_\textrm{pred}; weights)`.

        Parameters
        ----------
        y : array, shape (n_samples,)
            Target values.

        y_pred : array, shape (n_samples,)
            Predicted mean.

        weights : {int, array of shape (n_samples,)}, default=1
            Weights or exposure to which the variance is inversely
            proportional.
        """
        return weights * self.unit_deviance_derivative(y, y_pred)


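# Usage sketch (illustrative, not in the upstream source): a concrete
# subclass only needs ``unit_variance`` and ``unit_deviance``; ``deviance``
# and ``deviance_derivative`` then follow as the weighted forms above. With
# the NormalDistribution defined further below:
#
#   y, y_pred = np.array([1.0, 2.0]), np.array([1.5, 1.5])
#   NormalDistribution().deviance(y, y_pred)             # sum((y-y_pred)**2) = 0.5
#   NormalDistribution().deviance_derivative(y, y_pred)  # -2*(y-y_pred) = [1., -1.]
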
class TweedieDistribution(ExponentialDispersionModel):
    r"""A class for the Tweedie distribution.

    A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]`
    is uniquely defined by its mean-variance relationship
    :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^{power}`.

    Special cases are:

    ===== ================
    Power Distribution
    ===== ================
    0     Normal
    1     Poisson
    (1,2) Compound Poisson
    2     Gamma
    3     Inverse Gaussian
    ===== ================

    Parameters
    ----------
    power : float, default=0
        The variance power of the `unit_variance`
        :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.
        For ``0<power<1``, no distribution exists.
    """
    def __init__(self, power=0):
        self.power = power

    @property
    def power(self):
        return self._power

    @power.setter
    def power(self, power):
        # We use a property with a setter to update the lower and
        # upper bound when the power parameter is updated, e.g. in grid
        # search.
        if not isinstance(power, numbers.Real):
            raise TypeError('power must be a real number, input was {0}'
                            .format(power))

        if power <= 0:
            # Extreme Stable or Normal distribution
            self._lower_bound = DistributionBoundary(-np.inf, inclusive=False)
        elif 0 < power < 1:
            raise ValueError('Tweedie distribution is only defined for '
                             'power<=0 and power>=1.')
        elif 1 <= power < 2:
            # Poisson or Compound Poisson distribution
            self._lower_bound = DistributionBoundary(0, inclusive=True)
        elif power >= 2:
            # Gamma, Positive Stable, Inverse Gaussian distributions
            self._lower_bound = DistributionBoundary(0, inclusive=False)
        else:  # pragma: no cover
            # this branch should be unreachable.
            raise ValueError

        self._power = power

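    # Example of the setter's side effect (a sketch, not in the upstream
    # source): assigning a new power re-derives the support, e.g.
    #
    #   dist = TweedieDistribution(power=1.5)  # lower bound 0, inclusive
    #   dist.power = 2.5                       # lower bound 0, exclusive
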
    def unit_variance(self, y_pred):
        r"""Compute the unit variance of a Tweedie distribution
        :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.

        Parameters
        ----------
        y_pred : array of shape (n_samples,)
            Predicted mean.
        """
        return np.power(y_pred, self.power)

    def unit_deviance(self, y, y_pred, check_input=False):
        r"""Compute the unit deviance.

        The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
        log-likelihood as
        :math:`d(y,y_\textrm{pred}) = -2\phi\cdot
        \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`

        Parameters
        ----------
        y : array of shape (n_samples,)
            Target values.

        y_pred : array of shape (n_samples,)
            Predicted mean.

        check_input : bool, default=False
            If True, raise an exception on invalid y or y_pred values;
            otherwise they will be propagated as NaN.

        Returns
        -------
        deviance : array of shape (n_samples,)
            Computed deviance.
        """
        p = self.power

        if check_input:
            message = ("Mean Tweedie deviance error with power={} can only be "
                       "used on ".format(p))
            if p < 0:
                # 'Extreme stable', y any real number, y_pred > 0
                if (y_pred <= 0).any():
                    raise ValueError(message + "strictly positive y_pred.")
            elif p == 0:
                # Normal, y and y_pred can be any real number
                pass
            elif 0 < p < 1:
                raise ValueError("Tweedie deviance is only defined for "
                                 "power<=0 and power>=1.")
            elif 1 <= p < 2:
                # Poisson and Compound Poisson distribution, y >= 0, y_pred > 0
                if (y < 0).any() or (y_pred <= 0).any():
                    raise ValueError(message + "non-negative y and strictly "
                                     "positive y_pred.")
            elif p >= 2:
                # Gamma and Extreme stable distribution, y and y_pred > 0
                if (y <= 0).any() or (y_pred <= 0).any():
                    raise ValueError(message
                                     + "strictly positive y and y_pred.")
            else:  # pragma: no cover
                # Unreachable statement
                raise ValueError

        if p < 0:
            # 'Extreme stable', y any real number, y_pred > 0
            dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p))
                       - y * np.power(y_pred, 1-p) / (1-p)
                       + np.power(y_pred, 2-p) / (2-p))

        elif p == 0:
            # Normal distribution, y and y_pred any real number
            dev = (y - y_pred)**2
        elif p < 1:
            raise ValueError("Tweedie deviance is only defined for power<=0 "
                             "and power>=1.")
        elif p == 1:
            # Poisson distribution
            dev = 2 * (xlogy(y, y/y_pred) - y + y_pred)
        elif p == 2:
            # Gamma distribution
            dev = 2 * (np.log(y_pred/y) + y/y_pred - 1)
        else:
            dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p))
                       - y * np.power(y_pred, 1-p) / (1-p)
                       + np.power(y_pred, 2-p) / (2-p))
        return dev

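# Numerical check of the Poisson branch above (a sketch, not in the upstream
# source): for y=2 and y_pred=1 the unit deviance is
# 2 * (xlogy(2, 2) - 2 + 1) = 4*log(2) - 2 ~= 0.7726, reproduced by
#
#   TweedieDistribution(power=1).unit_deviance(np.array([2.]), np.array([1.]))
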
class NormalDistribution(TweedieDistribution):
    """Class for the Normal (aka Gaussian) distribution."""
    def __init__(self):
        super().__init__(power=0)


class PoissonDistribution(TweedieDistribution):
    """Class for the scaled Poisson distribution."""
    def __init__(self):
        super().__init__(power=1)


class GammaDistribution(TweedieDistribution):
    """Class for the Gamma distribution."""
    def __init__(self):
        super().__init__(power=2)


class InverseGaussianDistribution(TweedieDistribution):
    """Class for the scaled Inverse Gaussian distribution."""
    def __init__(self):
        super().__init__(power=3)


EDM_DISTRIBUTIONS = {
    'normal': NormalDistribution,
    'poisson': PoissonDistribution,
    'gamma': GammaDistribution,
    'inverse-gaussian': InverseGaussianDistribution,
}
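The EDM_DISTRIBUTIONS mapping at the end of the module lets callers resolve a distribution class from a user-facing family name. A minimal lookup sketch (the family variable is illustrative, not part of this commit):

    family = 'poisson'
    distribution = EDM_DISTRIBUTIONS[family]()   # a PoissonDistribution
    distribution.in_y_range([0, 1, 2]).all()     # True: all values in [0, inf)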
0
venv/Lib/site-packages/sklearn/_loss/tests/__init__.py
Normal file
Binary file not shown.
112
venv/Lib/site-packages/sklearn/_loss/tests/test_glm_distribution.py
Normal file
@@ -0,0 +1,112 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from numpy.testing import (
    assert_allclose,
    assert_array_equal,
)
from scipy.optimize import check_grad
import pytest

from sklearn._loss.glm_distribution import (
    TweedieDistribution,
    NormalDistribution, PoissonDistribution,
    GammaDistribution, InverseGaussianDistribution,
    DistributionBoundary
)


@pytest.mark.parametrize(
    'family, expected',
    [(NormalDistribution(), [True, True, True]),
     (PoissonDistribution(), [False, True, True]),
     (TweedieDistribution(power=1.5), [False, True, True]),
     (GammaDistribution(), [False, False, True]),
     (InverseGaussianDistribution(), [False, False, True]),
     (TweedieDistribution(power=4.5), [False, False, True])])
def test_family_bounds(family, expected):
    """Test the valid range of distributions at -1, 0, 1."""
    result = family.in_y_range([-1, 0, 1])
    assert_array_equal(result, expected)


def test_invalid_distribution_bound():
    dist = TweedieDistribution()
    dist._lower_bound = 0
    with pytest.raises(TypeError,
                       match="must be of type DistributionBoundary"):
        dist.in_y_range([-1, 0, 1])


def test_tweedie_distribution_power():
    msg = "distribution is only defined for power<=0 and power>=1"
    with pytest.raises(ValueError, match=msg):
        TweedieDistribution(power=0.5)

    with pytest.raises(TypeError, match="must be a real number"):
        TweedieDistribution(power=1j)

    with pytest.raises(TypeError, match="must be a real number"):
        dist = TweedieDistribution()
        dist.power = 1j

    dist = TweedieDistribution()
    assert isinstance(dist._lower_bound, DistributionBoundary)

    assert dist._lower_bound.inclusive is False
    dist.power = 1
    assert dist._lower_bound.value == 0.0
    assert dist._lower_bound.inclusive is True


@pytest.mark.parametrize(
    'family, chk_values',
    [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]),
     (PoissonDistribution(), [0.1, 1.5]),
     (GammaDistribution(), [0.1, 1.5]),
     (InverseGaussianDistribution(), [0.1, 1.5]),
     (TweedieDistribution(power=-2.5), [0.1, 1.5]),
     (TweedieDistribution(power=-1), [0.1, 1.5]),
     (TweedieDistribution(power=1.5), [0.1, 1.5]),
     (TweedieDistribution(power=2.5), [0.1, 1.5]),
     (TweedieDistribution(power=-4), [0.1, 1.5])])
def test_deviance_zero(family, chk_values):
    """Test deviance(y,y) = 0 for different families."""
    for x in chk_values:
        assert_allclose(family.deviance(x, x), 0, atol=1e-9)


@pytest.mark.parametrize(
    'family',
    [NormalDistribution(),
     PoissonDistribution(),
     GammaDistribution(),
     InverseGaussianDistribution(),
     TweedieDistribution(power=-2.5),
     TweedieDistribution(power=-1),
     TweedieDistribution(power=1.5),
     TweedieDistribution(power=2.5),
     TweedieDistribution(power=-4)],
    ids=lambda x: x.__class__.__name__
)
def test_deviance_derivative(family):
    """Test deviance derivative for different families."""
    rng = np.random.RandomState(0)
    y_true = rng.rand(10)
    # make data positive
    y_true += np.abs(y_true.min()) + 1e-2

    y_pred = y_true + np.fmax(rng.rand(10), 0.)

    dev = family.deviance(y_true, y_pred)
    assert isinstance(dev, float)
    dev_derivative = family.deviance_derivative(y_true, y_pred)
    assert dev_derivative.shape == y_pred.shape

    err = check_grad(
        lambda y_pred: family.deviance(y_true, y_pred),
        lambda y_pred: family.deviance_derivative(y_true, y_pred),
        y_pred,
    ) / np.linalg.norm(dev_derivative)
    assert abs(err) < 1e-6
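The gradient test relies on scipy.optimize.check_grad, which returns the 2-norm of the difference between the analytic gradient and a finite-difference approximation; dividing by the norm of the gradient turns that into a relative error. To run just this file (assuming pytest is available in the same environment):

    pytest venv/Lib/site-packages/sklearn/_loss/tests/test_glm_distribution.py -q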