Uploaded Test files
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
15
venv/Lib/site-packages/sklearn/linear_model/_glm/__init__.py
Normal file
@@ -0,0 +1,15 @@
# License: BSD 3 clause

from .glm import (
    GeneralizedLinearRegressor,
    PoissonRegressor,
    GammaRegressor,
    TweedieRegressor
)

__all__ = [
    "GeneralizedLinearRegressor",
    "PoissonRegressor",
    "GammaRegressor",
    "TweedieRegressor"
]
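This __init__ re-exports the estimators from the private ``glm`` module; in upstream scikit-learn (>= 0.23, where this code originates) they are additionally exposed under the public ``sklearn.linear_model`` namespace. The test files later in this commit use both import paths:

from sklearn.linear_model._glm import GeneralizedLinearRegressor
from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor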
615
venv/Lib/site-packages/sklearn/linear_model/_glm/glm.py
Normal file
@@ -0,0 +1,615 @@
"""
Generalized Linear Models with Exponential Dispersion Family
"""

# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# some parts and tricks stolen from other sklearn files.
# License: BSD 3 clause

import numbers

import numpy as np
import scipy.optimize

from ...base import BaseEstimator, RegressorMixin
from ...utils import check_array, check_X_y
from ...utils.optimize import _check_optimize_result
from ...utils.validation import check_is_fitted, _check_sample_weight
from ..._loss.glm_distribution import (
    ExponentialDispersionModel,
    TweedieDistribution,
    EDM_DISTRIBUTIONS
)
from .link import (
    BaseLink,
    IdentityLink,
    LogLink,
)


def _safe_lin_pred(X, coef):
    """Compute the linear predictor taking care if intercept is present."""
    if coef.size == X.shape[1] + 1:
        return X @ coef[1:] + coef[0]
    else:
        return X @ coef


def _y_pred_deviance_derivative(coef, X, y, weights, family, link):
    """Compute y_pred and the derivative of the deviance w.r.t coef."""
    lin_pred = _safe_lin_pred(X, coef)
    y_pred = link.inverse(lin_pred)
    d1 = link.inverse_derivative(lin_pred)
    temp = d1 * family.deviance_derivative(y, y_pred, weights)
    if coef.size == X.shape[1] + 1:
        devp = np.concatenate(([temp.sum()], temp @ X))
    else:
        devp = temp @ X  # same as X.T @ temp
    return y_pred, devp
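# Note on the gradient computed above: with eta = X @ coef and y_pred = h(eta),
# the chain rule gives d(deviance)/d(coef) = (h'(eta) * deviance'(y, y_pred)) @ X;
# the intercept entry, when present, is the plain sum of the per-sample terms.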
class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin):
    """Regression via a penalized Generalized Linear Model (GLM).

    GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at
    fitting and predicting the mean of the target y as y_pred=h(X*w).
    Therefore, the fit minimizes the following objective function with L2
    priors as regularizer::

        1/(2*sum(s)) * deviance(y, h(X*w); s)
        + 1/2 * alpha * ||w||_2^2

    with inverse link function h and s=sample_weight.
    The parameter ``alpha`` corresponds to the lambda parameter in glmnet.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    Parameters
    ----------
    alpha : float, default=1
        Constant that multiplies the penalty term and thus determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (X @ coef + intercept).

    family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \
            or an ExponentialDispersionModel instance, default='normal'
        The distributional assumption of the GLM, i.e. which distribution
        from the EDM specifies the loss function to be minimized.

    link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \
            default='auto'
        The link function of the GLM, i.e. mapping from linear predictor
        `X @ coef + intercept` to prediction `y_pred`. Option 'auto' sets
        the link depending on the chosen family as follows:

        - 'identity' for Normal distribution
        - 'log' for Poisson, Gamma and Inverse Gaussian distributions

    solver : 'lbfgs', default='lbfgs'
        Algorithm to use in the optimization problem:

        'lbfgs'
            Calls scipy's L-BFGS-B optimizer.

    max_iter : int, default=100
        The maximal number of iterations for the solver.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to
        ``fit`` as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for
        verbosity.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.
    """
    def __init__(self, *, alpha=1.0,
                 fit_intercept=True, family='normal', link='auto',
                 solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False,
                 verbose=0):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.family = family
        self.link = link
        self.solver = solver
        self.max_iter = max_iter
        self.tol = tol
        self.warm_start = warm_start
        self.verbose = verbose
    def fit(self, X, y, sample_weight=None):
        """Fit a Generalized Linear Model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : returns an instance of self.
        """
        if isinstance(self.family, ExponentialDispersionModel):
            self._family_instance = self.family
        elif self.family in EDM_DISTRIBUTIONS:
            self._family_instance = EDM_DISTRIBUTIONS[self.family]()
        else:
            raise ValueError(
                "The family must be an instance of class"
                " ExponentialDispersionModel or an element of"
                " ['normal', 'poisson', 'gamma', 'inverse-gaussian']"
                "; got (family={0})".format(self.family))

        # Guarantee that self._link_instance is set to an instance of
        # class BaseLink
        if isinstance(self.link, BaseLink):
            self._link_instance = self.link
        else:
            if self.link == 'auto':
                if isinstance(self._family_instance, TweedieDistribution):
                    if self._family_instance.power <= 0:
                        self._link_instance = IdentityLink()
                    if self._family_instance.power >= 1:
                        self._link_instance = LogLink()
                else:
                    raise ValueError("No default link known for the "
                                     "specified distribution family. Please "
                                     "set link manually, i.e. not to 'auto'; "
                                     "got (link='auto', family={})"
                                     .format(self.family))
            elif self.link == 'identity':
                self._link_instance = IdentityLink()
            elif self.link == 'log':
                self._link_instance = LogLink()
            else:
                raise ValueError(
                    "The link must be an instance of class Link or "
                    "an element of ['auto', 'identity', 'log']; "
                    "got (link={0})".format(self.link))

        if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:
            raise ValueError("Penalty term must be a non-negative number;"
                             " got (alpha={0})".format(self.alpha))
        if not isinstance(self.fit_intercept, bool):
            raise ValueError("The argument fit_intercept must be bool;"
                             " got {0}".format(self.fit_intercept))
        if self.solver not in ['lbfgs']:
            raise ValueError("GeneralizedLinearRegressor supports only the "
                             "solver 'lbfgs'; got {0}".format(self.solver))
        solver = self.solver
        if (not isinstance(self.max_iter, numbers.Integral)
                or self.max_iter <= 0):
            raise ValueError("Maximum number of iterations must be a "
                             "positive integer;"
                             " got (max_iter={0!r})".format(self.max_iter))
        if not isinstance(self.tol, numbers.Number) or self.tol <= 0:
            raise ValueError("Tolerance for stopping criteria must be "
                             "positive; got (tol={0!r})".format(self.tol))
        if not isinstance(self.warm_start, bool):
            raise ValueError("The argument warm_start must be bool;"
                             " got {0}".format(self.warm_start))

        family = self._family_instance
        link = self._link_instance

        X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'],
                         dtype=[np.float64, np.float32],
                         y_numeric=True, multi_output=False)

        weights = _check_sample_weight(sample_weight, X)

        _, n_features = X.shape

        if not np.all(family.in_y_range(y)):
            raise ValueError("Some value(s) of y are out of the valid "
                             "range for family {0}"
                             .format(family.__class__.__name__))
        # TODO: if alpha=0 check that X is not rank deficient

        # rescaling of sample_weight
        #
        # IMPORTANT NOTE: Since we want to minimize
        # 1/(2*sum(sample_weight)) * deviance + L2,
        # deviance = sum(sample_weight * unit_deviance),
        # we rescale weights such that sum(weights) = 1 and this becomes
        # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance)
        weights = weights / weights.sum()

        if self.warm_start and hasattr(self, 'coef_'):
            if self.fit_intercept:
                coef = np.concatenate((np.array([self.intercept_]),
                                       self.coef_))
            else:
                coef = self.coef_
        else:
            if self.fit_intercept:
                coef = np.zeros(n_features + 1)
                coef[0] = link(np.average(y, weights=weights))
            else:
                coef = np.zeros(n_features)

        # algorithms for optimization

        if solver == 'lbfgs':
            def func(coef, X, y, weights, alpha, family, link):
                y_pred, devp = _y_pred_deviance_derivative(
                    coef, X, y, weights, family, link
                )
                dev = family.deviance(y, y_pred, weights)
                # offset if coef[0] is intercept
                offset = 1 if self.fit_intercept else 0
                coef_scaled = alpha * coef[offset:]
                obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled)
                objp = 0.5 * devp
                objp[offset:] += coef_scaled
                return obj, objp

            args = (X, y, weights, self.alpha, family, link)

            opt_res = scipy.optimize.minimize(
                func, coef, method="L-BFGS-B", jac=True,
                options={
                    "maxiter": self.max_iter,
                    "iprint": (self.verbose > 0) - 1,
                    "gtol": self.tol,
                    "ftol": 1e3 * np.finfo(float).eps,
                },
                args=args)
            self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
            coef = opt_res.x

        if self.fit_intercept:
            self.intercept_ = coef[0]
            self.coef_ = coef[1:]
        else:
            # set intercept to zero as the other linear models do
            self.intercept_ = 0.
            self.coef_ = coef

        return self
    def _linear_predictor(self, X):
        """Compute the linear_predictor = `X @ coef_ + intercept_`.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array of shape (n_samples,)
            Returns predicted values of linear predictor.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=[np.float64, np.float32], ensure_2d=True,
                        allow_nd=False)
        return X @ self.coef_ + self.intercept_

    def predict(self, X):
        """Predict using GLM with feature matrix X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array of shape (n_samples,)
            Returns predicted values.
        """
        # check_array is done in _linear_predictor
        eta = self._linear_predictor(X)
        y_pred = self._link_instance.inverse(eta)
        return y_pred
    def score(self, X, y, sample_weight=None):
        """Compute D^2, the percentage of deviance explained.

        D^2 is a generalization of the coefficient of determination R^2.
        R^2 uses squared error and D^2 uses deviance. Note that those two
        are equal for ``family='normal'``.

        D^2 is defined as
        :math:`D^2 = 1 - \\frac{D(y_{true}, y_{pred})}{D_{null}}`, where
        :math:`D_{null}` is the null deviance, i.e. the deviance of a model
        with intercept alone, which corresponds to
        :math:`y_{pred} = \\bar{y}`. The mean :math:`\\bar{y}` is averaged
        by sample_weight. The best possible score is 1.0, and it can be
        negative (because the model can be arbitrarily worse).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,)
            True values of target.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            D^2 of self.predict(X) w.r.t. y.
        """
        # Note, default score defined in RegressorMixin is R^2 score.
        # TODO: make D^2 a score function in module metrics (and thereby get
        # input validation and so on)
        weights = _check_sample_weight(sample_weight, X)
        y_pred = self.predict(X)
        dev = self._family_instance.deviance(y, y_pred, weights=weights)
        y_mean = np.average(y, weights=weights)
        dev_null = self._family_instance.deviance(y, y_mean, weights=weights)
        return 1 - dev / dev_null

    def _more_tags(self):
        # create the _family_instance if fit wasn't called yet.
        if hasattr(self, '_family_instance'):
            _family_instance = self._family_instance
        elif isinstance(self.family, ExponentialDispersionModel):
            _family_instance = self.family
        elif self.family in EDM_DISTRIBUTIONS:
            _family_instance = EDM_DISTRIBUTIONS[self.family]()
        else:
            raise ValueError
        return {"requires_positive_y": not _family_instance.in_y_range(-1.0)}
class PoissonRegressor(GeneralizedLinearRegressor):
    """Generalized Linear Model with a Poisson distribution.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    Parameters
    ----------
    alpha : float, default=1
        Constant that multiplies the penalty term and thus determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (X @ coef + intercept).

    max_iter : int, default=100
        The maximal number of iterations for the solver.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to
        ``fit`` as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for
        verbosity.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.
    """
    def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100,
                 tol=1e-4, warm_start=False, verbose=0):

        super().__init__(alpha=alpha, fit_intercept=fit_intercept,
                         family="poisson", link='log', max_iter=max_iter,
                         tol=tol, warm_start=warm_start, verbose=verbose)

    @property
    def family(self):
        # Make this attribute read-only to avoid mis-uses e.g. in GridSearch.
        return "poisson"

    @family.setter
    def family(self, value):
        if value != "poisson":
            raise ValueError("PoissonRegressor.family must be 'poisson'!")
class GammaRegressor(GeneralizedLinearRegressor):
    """Generalized Linear Model with a Gamma distribution.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    Parameters
    ----------
    alpha : float, default=1
        Constant that multiplies the penalty term and thus determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (X @ coef + intercept).

    max_iter : int, default=100
        The maximal number of iterations for the solver.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to
        ``fit`` as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for
        verbosity.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.
    """
    def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100,
                 tol=1e-4, warm_start=False, verbose=0):

        super().__init__(alpha=alpha, fit_intercept=fit_intercept,
                         family="gamma", link='log', max_iter=max_iter,
                         tol=tol, warm_start=warm_start, verbose=verbose)

    @property
    def family(self):
        # Make this attribute read-only to avoid mis-uses e.g. in GridSearch.
        return "gamma"

    @family.setter
    def family(self, value):
        if value != "gamma":
            raise ValueError("GammaRegressor.family must be 'gamma'!")
class TweedieRegressor(GeneralizedLinearRegressor):
    """Generalized Linear Model with a Tweedie distribution.

    This estimator can be used to model different GLMs depending on the
    ``power`` parameter, which determines the underlying distribution.

    Read more in the :ref:`User Guide <Generalized_linear_regression>`.

    Parameters
    ----------
    power : float, default=0
        The power determines the underlying target distribution according
        to the following table:

        +-------+------------------------+
        | Power | Distribution           |
        +=======+========================+
        | 0     | Normal                 |
        +-------+------------------------+
        | 1     | Poisson                |
        +-------+------------------------+
        | (1,2) | Compound Poisson Gamma |
        +-------+------------------------+
        | 2     | Gamma                  |
        +-------+------------------------+
        | 3     | Inverse Gaussian       |
        +-------+------------------------+

        For ``0 < power < 1``, no distribution exists.

    alpha : float, default=1
        Constant that multiplies the penalty term and thus determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).

    link : {'auto', 'identity', 'log'}, default='auto'
        The link function of the GLM, i.e. mapping from linear predictor
        `X @ coef + intercept` to prediction `y_pred`. Option 'auto' sets
        the link depending on the chosen family as follows:

        - 'identity' for Normal distribution
        - 'log' for Poisson, Gamma and Inverse Gaussian distributions

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (X @ coef + intercept).

    max_iter : int, default=100
        The maximal number of iterations for the solver.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to
        ``fit`` as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for
        verbosity.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.
    """
    def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True,
                 link='auto', max_iter=100, tol=1e-4,
                 warm_start=False, verbose=0):

        super().__init__(alpha=alpha, fit_intercept=fit_intercept,
                         family=TweedieDistribution(power=power), link=link,
                         max_iter=max_iter, tol=tol,
                         warm_start=warm_start, verbose=verbose)

    @property
    def family(self):
        # We use a property with a setter to make sure that the family is
        # always a Tweedie distribution, and that self.power and
        # self.family.power are identical by construction.
        dist = TweedieDistribution(power=self.power)
        # TODO: make the returned object immutable
        return dist

    @family.setter
    def family(self, value):
        if isinstance(value, TweedieDistribution):
            self.power = value.power
        else:
            raise TypeError("TweedieRegressor.family must be of type "
                            "TweedieDistribution!")
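For orientation, a minimal usage sketch of the estimators defined above (the data below is synthetic and purely illustrative; assumes scikit-learn >= 0.23, where these estimators are public API):

import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
# Counts with a log-linear mean, matching PoissonRegressor's LogLink.
y = rng.poisson(np.exp(X @ np.array([0.5, -0.2, 0.1])))

glm = PoissonRegressor(alpha=1e-3, max_iter=300).fit(X, y)
print(glm.coef_, glm.intercept_)
print(glm.score(X, y))  # D^2, the fraction of deviance explained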
110
venv/Lib/site-packages/sklearn/linear_model/_glm/link.py
Normal file
@@ -0,0 +1,110 @@
"""
Link functions used in GLM
"""

# Author: Christian Lorentzen <lorentzen.ch@googlemail.com>
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod

import numpy as np
from scipy.special import expit, logit


class BaseLink(metaclass=ABCMeta):
    """Abstract base class for Link functions."""

    @abstractmethod
    def __call__(self, y_pred):
        """Compute the link function g(y_pred).

        The link function links the mean y_pred=E[Y] to the so called linear
        predictor (X*w), i.e. g(y_pred) = linear predictor.

        Parameters
        ----------
        y_pred : array of shape (n_samples,)
            Usually the (predicted) mean.
        """

    @abstractmethod
    def derivative(self, y_pred):
        """Compute the derivative of the link g'(y_pred).

        Parameters
        ----------
        y_pred : array of shape (n_samples,)
            Usually the (predicted) mean.
        """

    @abstractmethod
    def inverse(self, lin_pred):
        """Compute the inverse link function h(lin_pred).

        Gives the inverse relationship between linear predictor and the mean
        y_pred=E[Y], i.e. h(linear predictor) = y_pred.

        Parameters
        ----------
        lin_pred : array of shape (n_samples,)
            Usually the (fitted) linear predictor.
        """

    @abstractmethod
    def inverse_derivative(self, lin_pred):
        """Compute the derivative of the inverse link function h'(lin_pred).

        Parameters
        ----------
        lin_pred : array of shape (n_samples,)
            Usually the (fitted) linear predictor.
        """


class IdentityLink(BaseLink):
    """The identity link function g(x)=x."""

    def __call__(self, y_pred):
        return y_pred

    def derivative(self, y_pred):
        return np.ones_like(y_pred)

    def inverse(self, lin_pred):
        return lin_pred

    def inverse_derivative(self, lin_pred):
        return np.ones_like(lin_pred)


class LogLink(BaseLink):
    """The log link function g(x)=log(x)."""

    def __call__(self, y_pred):
        return np.log(y_pred)

    def derivative(self, y_pred):
        return 1 / y_pred

    def inverse(self, lin_pred):
        return np.exp(lin_pred)

    def inverse_derivative(self, lin_pred):
        return np.exp(lin_pred)


class LogitLink(BaseLink):
    """The logit link function g(x)=logit(x)."""

    def __call__(self, y_pred):
        return logit(y_pred)

    def derivative(self, y_pred):
        return 1 / (y_pred * (1 - y_pred))

    def inverse(self, lin_pred):
        return expit(lin_pred)

    def inverse_derivative(self, lin_pred):
        ep = expit(lin_pred)
        return ep * (1 - ep)
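The algebra these classes rely on — ``g(h(x)) = x`` and therefore ``g'(h(x)) = 1/h'(x)`` — can be checked directly; a small sketch using the private module, as the tests in this commit do:

import numpy as np
from sklearn.linear_model._glm.link import LogLink

link = LogLink()
x = np.linspace(-3, 3, 7)      # linear predictor values
y_pred = link.inverse(x)       # h(x) = exp(x)
assert np.allclose(link(y_pred), x)                 # g(h(x)) = x
assert np.allclose(link.derivative(y_pred),
                   1 / link.inverse_derivative(x))  # g'(h(x)) = 1/h'(x)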
@@ -0,0 +1 @@
# License: BSD 3 clause
@@ -0,0 +1,431 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause

import numpy as np
from numpy.testing import assert_allclose
import pytest
import warnings

from sklearn.datasets import make_regression
from sklearn.linear_model._glm import GeneralizedLinearRegressor
from sklearn.linear_model import (
    TweedieRegressor,
    PoissonRegressor,
    GammaRegressor
)
from sklearn.linear_model._glm.link import (
    IdentityLink,
    LogLink,
)
from sklearn._loss.glm_distribution import (
    TweedieDistribution,
    NormalDistribution, PoissonDistribution,
    GammaDistribution, InverseGaussianDistribution,
)
from sklearn.linear_model import Ridge
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split


@pytest.fixture(scope="module")
def regression_data():
    X, y = make_regression(n_samples=107,
                           n_features=10,
                           n_informative=80, noise=0.5,
                           random_state=2)
    return X, y


def test_sample_weights_validation():
    """Test the raised errors in the validation of sample_weight."""
    # scalar value but not positive
    X = [[1]]
    y = [1]
    weights = 0
    glm = GeneralizedLinearRegressor()

    # Positive weights are accepted
    glm.fit(X, y, sample_weight=1)

    # 2d array
    weights = [[0]]
    with pytest.raises(ValueError, match="must be 1D array or scalar"):
        glm.fit(X, y, weights)

    # 1d but wrong length
    weights = [1, 0]
    msg = r"sample_weight.shape == \(2,\), expected \(1,\)!"
    with pytest.raises(ValueError, match=msg):
        glm.fit(X, y, weights)


@pytest.mark.parametrize('name, instance',
                         [('normal', NormalDistribution()),
                          ('poisson', PoissonDistribution()),
                          ('gamma', GammaDistribution()),
                          ('inverse-gaussian',
                           InverseGaussianDistribution())])
def test_glm_family_argument(name, instance):
    """Test GLM family argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y)
    assert isinstance(glm._family_instance, instance.__class__)

    glm = GeneralizedLinearRegressor(family='not a family')
    with pytest.raises(ValueError, match="family must be"):
        glm.fit(X, y)


@pytest.mark.parametrize('name, instance',
                         [('identity', IdentityLink()),
                          ('log', LogLink())])
def test_glm_link_argument(name, instance):
    """Test GLM link argument set as string."""
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y)
    assert isinstance(glm._link_instance, instance.__class__)

    glm = GeneralizedLinearRegressor(family='normal', link='not a link')
    with pytest.raises(ValueError, match="link must be"):
        glm.fit(X, y)


@pytest.mark.parametrize('family, expected_link_class', [
    ('normal', IdentityLink),
    ('poisson', LogLink),
    ('gamma', LogLink),
    ('inverse-gaussian', LogLink),
])
def test_glm_link_auto(family, expected_link_class):
    # Make sure link='auto' delivers the expected link function
    y = np.array([0.1, 0.5])  # in range of all distributions
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family=family, link='auto').fit(X, y)
    assert isinstance(glm._link_instance, expected_link_class)


@pytest.mark.parametrize('alpha', ['not a number', -4.2])
def test_glm_alpha_argument(alpha):
    """Test GLM for invalid alpha argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(family='normal', alpha=alpha)
    with pytest.raises(ValueError,
                       match="Penalty term must be a non-negative"):
        glm.fit(X, y)


@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]])
def test_glm_fit_intercept_argument(fit_intercept):
    """Test GLM for invalid fit_intercept argument."""
    y = np.array([1, 2])
    X = np.array([[1], [1]])
    glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept)
    with pytest.raises(ValueError, match="fit_intercept must be bool"):
        glm.fit(X, y)


@pytest.mark.parametrize('solver',
                         ['not a solver', 1, [1]])
def test_glm_solver_argument(solver):
    """Test GLM for invalid solver argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(solver=solver)
    with pytest.raises(ValueError):
        glm.fit(X, y)


@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]])
def test_glm_max_iter_argument(max_iter):
    """Test GLM for invalid max_iter argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(max_iter=max_iter)
    with pytest.raises(ValueError, match="must be a positive integer"):
        glm.fit(X, y)


@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]])
def test_glm_tol_argument(tol):
    """Test GLM for invalid tol argument."""
    y = np.array([1, 2])
    X = np.array([[1], [2]])
    glm = GeneralizedLinearRegressor(tol=tol)
    with pytest.raises(ValueError,
                       match="stopping criteria must be positive"):
        glm.fit(X, y)


@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]])
def test_glm_warm_start_argument(warm_start):
    """Test GLM for invalid warm_start argument."""
    y = np.array([1, 2])
    X = np.array([[1], [1]])
    glm = GeneralizedLinearRegressor(warm_start=warm_start)
    with pytest.raises(ValueError, match="warm_start must be bool"):
        glm.fit(X, y)


@pytest.mark.parametrize('fit_intercept', [False, True])
def test_glm_identity_regression(fit_intercept):
    """Test GLM regression with identity link on a simple dataset."""
    coef = [1., 2.]
    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
    y = np.dot(X, coef)
    glm = GeneralizedLinearRegressor(alpha=0, family='normal',
                                     link='identity',
                                     fit_intercept=fit_intercept, tol=1e-12)
    if fit_intercept:
        glm.fit(X[:, 1:], y)
        assert_allclose(glm.coef_, coef[1:], rtol=1e-10)
        assert_allclose(glm.intercept_, coef[0], rtol=1e-10)
    else:
        glm.fit(X, y)
        assert_allclose(glm.coef_, coef, rtol=1e-12)


@pytest.mark.parametrize('fit_intercept', [False, True])
@pytest.mark.parametrize('alpha', [0.0, 1.0])
@pytest.mark.parametrize('family', ['normal', 'poisson', 'gamma'])
def test_glm_sample_weight_consistency(fit_intercept, alpha, family):
    """Test that the impact of sample_weight is consistent."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 5

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    glm_params = dict(alpha=alpha, family=family, link='auto',
                      fit_intercept=fit_intercept)

    glm = GeneralizedLinearRegressor(**glm_params).fit(X, y)
    coef = glm.coef_.copy()

    # sample_weight=np.ones(..) should be equivalent to sample_weight=None
    sample_weight = np.ones(y.shape)
    glm.fit(X, y, sample_weight=sample_weight)
    assert_allclose(glm.coef_, coef, rtol=1e-12)

    # sample_weight are normalized to 1, so scaling them has no effect
    sample_weight = 2 * np.ones(y.shape)
    glm.fit(X, y, sample_weight=sample_weight)
    assert_allclose(glm.coef_, coef, rtol=1e-12)

    # setting one element of sample_weight to 0 is equivalent to removing
    # the corresponding sample
    sample_weight = np.ones(y.shape)
    sample_weight[-1] = 0
    glm.fit(X, y, sample_weight=sample_weight)
    coef1 = glm.coef_.copy()
    glm.fit(X[:-1], y[:-1])
    assert_allclose(glm.coef_, coef1, rtol=1e-12)

    # check that multiplying sample_weight by 2 is equivalent
    # to repeating corresponding samples twice
    X2 = np.concatenate([X, X[:n_samples//2]], axis=0)
    y2 = np.concatenate([y, y[:n_samples//2]])
    sample_weight_1 = np.ones(len(y))
    sample_weight_1[:n_samples//2] = 2

    glm1 = GeneralizedLinearRegressor(**glm_params).fit(
        X, y, sample_weight=sample_weight_1
    )

    glm2 = GeneralizedLinearRegressor(**glm_params).fit(
        X2, y2, sample_weight=None
    )
    assert_allclose(glm1.coef_, glm2.coef_)


@pytest.mark.parametrize('fit_intercept', [True, False])
@pytest.mark.parametrize(
    'family',
    [NormalDistribution(), PoissonDistribution(),
     GammaDistribution(), InverseGaussianDistribution(),
     TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)])
def test_glm_log_regression(fit_intercept, family):
    """Test GLM regression with log link on a simple dataset."""
    coef = [0.2, -0.1]
    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
    y = np.exp(np.dot(X, coef))
    glm = GeneralizedLinearRegressor(
        alpha=0, family=family, link='log',
        fit_intercept=fit_intercept, tol=1e-7)
    if fit_intercept:
        res = glm.fit(X[:, 1:], y)
        assert_allclose(res.coef_, coef[1:], rtol=1e-6)
        assert_allclose(res.intercept_, coef[0], rtol=1e-6)
    else:
        res = glm.fit(X, y)
        assert_allclose(res.coef_, coef, rtol=2e-6)


@pytest.mark.parametrize('fit_intercept', [True, False])
def test_warm_start(fit_intercept):
    n_samples, n_features = 110, 10
    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_features-2, noise=0.5,
                           random_state=42)

    glm1 = GeneralizedLinearRegressor(
        warm_start=False,
        fit_intercept=fit_intercept,
        max_iter=1000
    )
    glm1.fit(X, y)

    glm2 = GeneralizedLinearRegressor(
        warm_start=True,
        fit_intercept=fit_intercept,
        max_iter=1
    )
    # As we intentionally set max_iter=1, L-BFGS-B will issue a
    # ConvergenceWarning which we here simply ignore.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        glm2.fit(X, y)
    assert glm1.score(X, y) > glm2.score(X, y)
    glm2.set_params(max_iter=1000)
    glm2.fit(X, y)
    # The two models are not exactly identical since the lbfgs solver
    # computes the approximate hessian from previous iterations, which
    # will not be strictly identical in the case of a warm start.
    assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5)
    assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4)


@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)])
@pytest.mark.parametrize('fit_intercept', [True, False])
@pytest.mark.parametrize('sample_weight', [None, True])
def test_normal_ridge_comparison(n_samples, n_features, fit_intercept,
                                 sample_weight, request):
    """Compare with Ridge regression for Normal distributions."""
    test_size = 10
    X, y = make_regression(n_samples=n_samples + test_size,
                           n_features=n_features,
                           n_informative=n_features-2, noise=0.5,
                           random_state=42)

    if n_samples > n_features:
        ridge_params = {"solver": "svd"}
    else:
        ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7}

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=0
    )

    alpha = 1.0
    if sample_weight is None:
        sw_train = None
        alpha_ridge = alpha * n_samples
    else:
        sw_train = np.random.RandomState(0).rand(len(y_train))
        alpha_ridge = alpha * sw_train.sum()

    # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2
    ridge = Ridge(alpha=alpha_ridge, normalize=False,
                  random_state=42, fit_intercept=fit_intercept,
                  **ridge_params)
    ridge.fit(X_train, y_train, sample_weight=sw_train)

    glm = GeneralizedLinearRegressor(alpha=alpha, family='normal',
                                     link='identity',
                                     fit_intercept=fit_intercept,
                                     max_iter=300,
                                     tol=1e-5)
    glm.fit(X_train, y_train, sample_weight=sw_train)
    assert glm.coef_.shape == (X.shape[1], )
    assert_allclose(glm.coef_, ridge.coef_, atol=5e-5)
    assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5)
    assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4)
    assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4)


def test_poisson_glmnet():
    """Compare Poisson regression with L2 regularization and LogLink to
    glmnet.
    """
    # library("glmnet")
    # options(digits=10)
    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
    # x <- data.matrix(df[,c("a", "b")])
    # y <- df$y
    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
    #               standardize=F, thresh=1e-10, nlambda=10000)
    # coef(fit, s=1)
    # (Intercept) -0.12889386979
    # a            0.29019207995
    # b            0.03741173122
    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
    y = np.array([0, 1, 1, 2])
    glm = GeneralizedLinearRegressor(alpha=1,
                                     fit_intercept=True, family='poisson',
                                     link='log', tol=1e-7,
                                     max_iter=300)
    glm.fit(X, y)
    assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5)
    assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5)


def test_convergence_warning(regression_data):
    X, y = regression_data

    est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20)
    with pytest.warns(ConvergenceWarning):
        est.fit(X, y)


def test_poisson_regression_family(regression_data):
    # Make sure the family attribute is read-only to prevent searching over
    # it e.g. in a grid search
    est = PoissonRegressor()
    assert est.family == "poisson"

    msg = "PoissonRegressor.family must be 'poisson'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0


def test_gamma_regression_family(regression_data):
    # Make sure the family attribute is read-only to prevent searching over
    # it e.g. in a grid search
    est = GammaRegressor()
    assert est.family == "gamma"

    msg = "GammaRegressor.family must be 'gamma'!"
    with pytest.raises(ValueError, match=msg):
        est.family = 0


def test_tweedie_regression_family(regression_data):
    # Make sure the family attribute is always a TweedieDistribution and that
    # the power attribute is properly updated
    power = 2.0
    est = TweedieRegressor(power=power)
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == power
    assert est.power == power

    new_power = 0
    new_family = TweedieDistribution(power=new_power)
    est.family = new_family
    assert isinstance(est.family, TweedieDistribution)
    assert est.family.power == new_power
    assert est.power == new_power

    msg = "TweedieRegressor.family must be of type TweedieDistribution!"
    with pytest.raises(TypeError, match=msg):
        est.family = None


@pytest.mark.parametrize(
    'estimator, value',
    [
        (PoissonRegressor(), True),
        (GammaRegressor(), True),
        (TweedieRegressor(power=1.5), True),
        (TweedieRegressor(power=0), False)
    ],
)
def test_tags(estimator, value):
    assert estimator._get_tags()['requires_positive_y'] is value
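Assuming these files sit at their upstream scikit-learn locations (``sklearn/linear_model/_glm/tests/``), the suite above can be run in isolation with pytest:

pytest -q sklearn/linear_model/_glm/tests/test_glm.py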
@@ -0,0 +1,45 @@
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import numpy as np
from numpy.testing import assert_allclose
import pytest
from scipy.optimize import check_grad

from sklearn.linear_model._glm.link import (
    IdentityLink,
    LogLink,
    LogitLink,
)


LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink]


@pytest.mark.parametrize('Link', LINK_FUNCTIONS)
def test_link_properties(Link):
    """Test link inverse and derivative."""
    rng = np.random.RandomState(42)
    x = rng.rand(100) * 100
    link = Link()
    if isinstance(link, LogitLink):
        # careful for large x, note expit(36) = 1
        # limit max eta to 15
        x = x / 100 * 15
    assert_allclose(link(link.inverse(x)), x)
    # if g(h(x)) = x, then g'(h(x)) = 1/h'(x)
    # g = link, h = link.inverse
    assert_allclose(link.derivative(link.inverse(x)),
                    1 / link.inverse_derivative(x))


@pytest.mark.parametrize('Link', LINK_FUNCTIONS)
def test_link_derivative(Link):
    link = Link()
    x = np.random.RandomState(0).rand(1)
    err = check_grad(link, link.derivative, x) / link.derivative(x)
    assert abs(err) < 1e-6

    err = (check_grad(link.inverse, link.inverse_derivative, x)
           / link.derivative(x))
    assert abs(err) < 1e-6
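For context, ``scipy.optimize.check_grad(func, grad, x0)`` returns the 2-norm of the difference between ``grad(x0)`` and a finite-difference estimate of ``func``'s gradient at ``x0``; dividing by the derivative's magnitude, as the test above does, turns that into a relative error. A self-contained sketch of the same pattern:

import numpy as np
from scipy.optimize import check_grad

# the analytic derivative of sin is cos; the error should be tiny
x0 = np.array([0.3])
err = check_grad(lambda z: np.sin(z[0]), lambda z: np.cos(z), x0)
assert err < 1e-6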