Uploaded Test files
parent f584ad9d97
commit 2e81cb7d99

16627 changed files with 2065359 additions and 102444 deletions
5 binary files not shown.
@@ -0,0 +1,26 @@
import pytest
import numpy as np

from sklearn.neural_network._base import binary_log_loss
from sklearn.neural_network._base import log_loss


def test_binary_log_loss_1_prob_finite():
    # y_proba equal to one should result in a finite logloss
    y_true = np.array([[0, 0, 1]]).T
    y_prob = np.array([[0.9, 1.0, 1.0]]).T

    loss = binary_log_loss(y_true, y_prob)
    assert np.isfinite(loss)


@pytest.mark.parametrize("y_true, y_prob", [
    (np.array([[1, 0, 0], [0, 1, 0]]),
     np.array([[0., 1., 0.], [0.9, 0.05, 0.05]])),
    (np.array([[0, 0, 1]]).T,
     np.array([[0.9, 1.0, 1.0]]).T),
])
def test_log_loss_1_prob_finite(y_true, y_prob):
    # y_proba equal to 1 should result in a finite logloss
    loss = log_loss(y_true, y_prob)
    assert np.isfinite(loss)
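Note on what these two tests pin down: a naive log loss evaluates log(1 - y_prob) at y_prob == 1.0 and produces -inf/nan, so the implementation has to clip probabilities away from 0 and 1 before taking logs. A minimal standalone sketch of that clipping idea (the function name and eps value are illustrative, not sklearn's exact code):

import numpy as np

def clipped_binary_log_loss(y_true, y_prob, eps=1e-10):
    # Clip probabilities into [eps, 1 - eps] so both logs stay finite.
    y_prob = np.clip(y_prob, eps, 1 - eps)
    return -np.sum(y_true * np.log(y_prob)
                   + (1 - y_true) * np.log(1 - y_prob)) / y_prob.shape[0]

y_true = np.array([[0, 0, 1]]).T
y_prob = np.array([[0.9, 1.0, 1.0]]).T
print(np.isfinite(clipped_binary_log_loss(y_true, y_prob)))  # True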
venv/Lib/site-packages/sklearn/neural_network/tests/test_mlp.py (new file, 718 lines)

@@ -0,0 +1,718 @@
"""
|
||||
Testing for Multi-layer Perceptron module (sklearn.neural_network)
|
||||
"""
|
||||
|
||||
# Author: Issam H. Laradji
|
||||
# License: BSD 3 clause
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
import warnings
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
|
||||
from numpy.testing import assert_almost_equal, assert_array_equal
|
||||
|
||||
from sklearn.datasets import load_digits, load_boston, load_iris
|
||||
from sklearn.datasets import make_regression, make_multilabel_classification
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from io import StringIO
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.neural_network import MLPRegressor
|
||||
from sklearn.preprocessing import LabelBinarizer
|
||||
from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
||||
from scipy.sparse import csr_matrix
|
||||
from sklearn.utils._testing import ignore_warnings
|
||||
|
||||
|
||||
ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]
|
||||
|
||||
X_digits, y_digits = load_digits(n_class=3, return_X_y=True)
|
||||
|
||||
X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
|
||||
y_digits_multi = y_digits[:200]
|
||||
|
||||
X_digits, y_digits = load_digits(n_class=2, return_X_y=True)
|
||||
|
||||
X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
|
||||
y_digits_binary = y_digits[:200]
|
||||
|
||||
classification_datasets = [(X_digits_multi, y_digits_multi),
|
||||
(X_digits_binary, y_digits_binary)]
|
||||
|
||||
boston = load_boston()
|
||||
|
||||
Xboston = StandardScaler().fit_transform(boston.data)[: 200]
|
||||
yboston = boston.target[:200]
|
||||
|
||||
regression_datasets = [(Xboston, yboston)]
|
||||
|
||||
iris = load_iris()
|
||||
|
||||
X_iris = iris.data
|
||||
y_iris = iris.target
|
||||
|
||||
|
||||
def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]

    alpha_vectors = []
    alpha_values = np.arange(2)
    absolute_sum = lambda x: np.sum(np.abs(x))

    for alpha in alpha_values:
        mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1)
        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
        alpha_vectors.append(np.array([absolute_sum(mlp.coefs_[0]),
                                       absolute_sum(mlp.coefs_[1])]))

    for i in range(len(alpha_values) - 1):
        assert (alpha_vectors[i] > alpha_vectors[i + 1]).all()
def test_fit():
    # Test that the algorithm solution is equal to a worked out example.
    X = np.array([[0.6, 0.8, 0.7]])
    y = np.array([0])
    mlp = MLPClassifier(solver='sgd', learning_rate_init=0.1, alpha=0.1,
                        activation='logistic', random_state=1, max_iter=1,
                        hidden_layer_sizes=2, momentum=0)
    # set weights
    mlp.coefs_ = [0] * 2
    mlp.intercepts_ = [0] * 2
    mlp.n_outputs_ = 1
    mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]])
    mlp.coefs_[1] = np.array([[0.1], [0.2]])
    mlp.intercepts_[0] = np.array([0.1, 0.1])
    mlp.intercepts_[1] = np.array([1.0])
    mlp._coef_grads = [] * 2
    mlp._intercept_grads = [] * 2

    # Initialize parameters
    mlp.n_iter_ = 0
    mlp.learning_rate_ = 0.1

    # Compute the number of layers
    mlp.n_layers_ = 3

    # Pre-allocate gradient matrices
    mlp._coef_grads = [0] * (mlp.n_layers_ - 1)
    mlp._intercept_grads = [0] * (mlp.n_layers_ - 1)

    mlp.out_activation_ = 'logistic'
    mlp.t_ = 0
    mlp.best_loss_ = np.inf
    mlp.loss_curve_ = []
    mlp._no_improvement_count = 0
    mlp._intercept_velocity = [np.zeros_like(intercepts) for
                               intercepts in
                               mlp.intercepts_]
    mlp._coef_velocity = [np.zeros_like(coefs) for coefs in
                          mlp.coefs_]

    mlp.partial_fit(X, y, classes=[0, 1])
    # Manually worked out example
    # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1)
    #      = 0.679178699175393
    # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1)
    #      = 0.574442516811659
    # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1)
    #      = 0.7654329236196236
    # d21 = -(0 - 0.765) = 0.765
    # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667
    # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374
    # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200
    # W1grad12 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244
    # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336
    # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992
    # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002
    # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244
    # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294
    # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911
    # b1grad1 = d11 = 0.01667
    # b1grad2 = d12 = 0.0374
    # b2grad = d21 = 0.765
    # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1],
    #      [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992],
    #      [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664,
    #      0.096008], [0.4939998, -0.002244]]
    # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 *
    #      [[0.5294], [0.45911]] = [[0.04706], [0.154089]]
    # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374]
    #      = [0.098333, 0.09626]
    # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235
    assert_almost_equal(mlp.coefs_[0], np.array([[0.098, 0.195756],
                                                 [0.2956664, 0.096008],
                                                 [0.4939998, -0.002244]]),
                        decimal=3)
    assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]),
                        decimal=3)
    assert_almost_equal(mlp.intercepts_[0],
                        np.array([0.098333, 0.09626]), decimal=3)
    assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3)
    # Testing output
    # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 +
    #      0.7 * 0.4939998 + 0.098333) = 0.677
    # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 +
    #      0.7 * -0.002244 + 0.09626) = 0.572
    # o1 = h * W2 + b21 = 0.677 * 0.04706 +
    #      0.572 * 0.154089 + 0.9235 = 1.043
    # prob = sigmoid(o1) = 0.739
    assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3)
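The hand-worked numbers in the comments above can be replayed outside the test in a few lines (a standalone check, not part of the committed file; g is the logistic activation the test configures):

import numpy as np

def g(z):
    # logistic activation used by the worked example
    return 1.0 / (1.0 + np.exp(-z))

h1 = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1)  # 0.679178699...
h2 = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0.0 + 0.1)  # 0.574442516...
o1 = g(h1 * 0.1 + h2 * 0.2 + 1.0)                # 0.765432923...
print(h1, h2, o1)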
def test_gradient():
    # Test gradient.

    # This makes sure that the activation functions and their derivatives
    # are correct. The numerical and analytical computation of the gradient
    # should be close.
    for n_labels in [2, 3]:
        n_samples = 5
        n_features = 10
        random_state = np.random.RandomState(seed=42)
        X = random_state.rand(n_samples, n_features)
        y = 1 + np.mod(np.arange(n_samples) + 1, n_labels)
        Y = LabelBinarizer().fit_transform(y)

        for activation in ACTIVATION_TYPES:
            mlp = MLPClassifier(activation=activation, hidden_layer_sizes=10,
                                solver='lbfgs', alpha=1e-5,
                                learning_rate_init=0.2, max_iter=1,
                                random_state=1)
            mlp.fit(X, y)

            theta = np.hstack([l.ravel() for l in mlp.coefs_ +
                               mlp.intercepts_])

            layer_units = ([X.shape[1]] + [mlp.hidden_layer_sizes] +
                           [mlp.n_outputs_])

            activations = []
            deltas = []
            coef_grads = []
            intercept_grads = []

            activations.append(X)
            for i in range(mlp.n_layers_ - 1):
                activations.append(np.empty((X.shape[0],
                                             layer_units[i + 1])))
                deltas.append(np.empty((X.shape[0],
                                        layer_units[i + 1])))

                fan_in = layer_units[i]
                fan_out = layer_units[i + 1]
                coef_grads.append(np.empty((fan_in, fan_out)))
                intercept_grads.append(np.empty(fan_out))

            # analytically compute the gradients
            def loss_grad_fun(t):
                return mlp._loss_grad_lbfgs(t, X, Y, activations, deltas,
                                            coef_grads, intercept_grads)

            [value, grad] = loss_grad_fun(theta)
            numgrad = np.zeros(np.size(theta))
            n = np.size(theta, 0)
            E = np.eye(n)
            epsilon = 1e-5
            # numerically compute the gradients
            for i in range(n):
                dtheta = E[:, i] * epsilon
                numgrad[i] = ((loss_grad_fun(theta + dtheta)[0] -
                               loss_grad_fun(theta - dtheta)[0]) /
                              (epsilon * 2.0))
            assert_almost_equal(numgrad, grad)
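test_gradient builds its central-difference quotient inline; the same scheme, factored into a reusable helper, looks like this (a sketch with hypothetical names, not sklearn code):

import numpy as np

def numerical_grad(f, theta, eps=1e-5):
    # Central differences: (f(theta + eps*e_i) - f(theta - eps*e_i)) / (2*eps)
    grad = np.zeros_like(theta)
    for i in range(theta.size):
        e = np.zeros_like(theta)
        e[i] = eps
        grad[i] = (f(theta + e) - f(theta - e)) / (2.0 * eps)
    return grad

# f(theta) = theta . theta has exact gradient 2 * theta
theta = np.array([1.0, -2.0, 3.0])
print(numerical_grad(lambda t: t @ t, theta))  # approx [2., -4., 6.]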
@pytest.mark.parametrize('X,y', classification_datasets)
def test_lbfgs_classification(X, y):
    # Test lbfgs on classification.
    # It should achieve a score higher than 0.95 for the binary and multi-class
    # versions of the digits dataset.
    X_train = X[:150]
    y_train = y[:150]
    X_test = X[150:]
    expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind)

    for activation in ACTIVATION_TYPES:
        mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
                            max_iter=150, shuffle=True, random_state=1,
                            activation=activation)
        mlp.fit(X_train, y_train)
        y_predict = mlp.predict(X_test)
        assert mlp.score(X_train, y_train) > 0.95
        assert ((y_predict.shape[0], y_predict.dtype.kind) ==
                expected_shape_dtype)


@pytest.mark.parametrize('X,y', regression_datasets)
def test_lbfgs_regression(X, y):
    # Test lbfgs on the boston dataset, a regression problem.
    for activation in ACTIVATION_TYPES:
        mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50,
                           max_iter=150, shuffle=True, random_state=1,
                           activation=activation)
        mlp.fit(X, y)
        if activation == 'identity':
            assert mlp.score(X, y) > 0.84
        else:
            # Non linear models perform much better than linear bottleneck:
            assert mlp.score(X, y) > 0.95


@pytest.mark.parametrize('X,y', classification_datasets)
def test_lbfgs_classification_maxfun(X, y):
    # Test lbfgs parameter max_fun.
    # It should independently limit the number of iterations for lbfgs.
    max_fun = 10
    # classification tests
    for activation in ACTIVATION_TYPES:
        mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
                            max_iter=150, max_fun=max_fun, shuffle=True,
                            random_state=1, activation=activation)
        with pytest.warns(ConvergenceWarning):
            mlp.fit(X, y)
            assert max_fun >= mlp.n_iter_


@pytest.mark.parametrize('X,y', regression_datasets)
def test_lbfgs_regression_maxfun(X, y):
    # Test lbfgs parameter max_fun.
    # It should independently limit the number of iterations for lbfgs.
    max_fun = 10
    # regression tests
    for activation in ACTIVATION_TYPES:
        mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50,
                           max_iter=150, max_fun=max_fun, shuffle=True,
                           random_state=1, activation=activation)
        with pytest.warns(ConvergenceWarning):
            mlp.fit(X, y)
            assert max_fun >= mlp.n_iter_

    mlp.max_fun = -1
    with pytest.raises(ValueError):
        mlp.fit(X, y)
def test_learning_rate_warmstart():
    # Tests that warm_start reuses past solutions.
    X = [[3, 2], [1, 6], [5, 6], [-2, -4]]
    y = [1, 1, 1, 0]
    for learning_rate in ["invscaling", "constant"]:
        mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=4,
                            learning_rate=learning_rate, max_iter=1,
                            power_t=0.25, warm_start=True)
        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
            prev_eta = mlp._optimizer.learning_rate
            mlp.fit(X, y)
            post_eta = mlp._optimizer.learning_rate

        if learning_rate == 'constant':
            assert prev_eta == post_eta
        elif learning_rate == 'invscaling':
            assert (mlp.learning_rate_init / pow(8 + 1, mlp.power_t) ==
                    post_eta)


def test_multilabel_classification():
    # Test that multi-label classification works as expected.
    # test fit method
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
                        max_iter=150, random_state=0, activation='logistic',
                        learning_rate_init=0.2)
    mlp.fit(X, y)
    assert mlp.score(X, y) > 0.97

    # test partial fit method
    mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
                        random_state=0, activation='logistic', alpha=1e-5,
                        learning_rate_init=0.2)
    for i in range(100):
        mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
    assert mlp.score(X, y) > 0.9

    # Make sure early stopping still works now that splitting is stratified by
    # default (it is disabled for multilabel classification)
    mlp = MLPClassifier(early_stopping=True)
    mlp.fit(X, y).predict(X)


def test_multioutput_regression():
    # Test that multi-output regression works as expected
    X, y = make_regression(n_samples=200, n_targets=5)
    mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=50, max_iter=200,
                       random_state=1)
    mlp.fit(X, y)
    assert mlp.score(X, y) > 0.9


def test_partial_fit_classes_error():
    # Tests that passing different classes to partial_fit raises an error
    X = [[3, 2]]
    y = [0]
    clf = MLPClassifier(solver='sgd')
    clf.partial_fit(X, y, classes=[0, 1])
    with pytest.raises(ValueError):
        clf.partial_fit(X, y, classes=[1, 2])


def test_partial_fit_classification():
    # Test partial_fit on classification.
    # `partial_fit` should yield the same results as 'fit' for binary and
    # multi-class classification.
    for X, y in classification_datasets:
        mlp = MLPClassifier(solver='sgd', max_iter=100, random_state=1,
                            tol=0, alpha=1e-5, learning_rate_init=0.2)

        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
        pred1 = mlp.predict(X)
        mlp = MLPClassifier(solver='sgd', random_state=1, alpha=1e-5,
                            learning_rate_init=0.2)
        for i in range(100):
            mlp.partial_fit(X, y, classes=np.unique(y))
        pred2 = mlp.predict(X)
        assert_array_equal(pred1, pred2)
        assert mlp.score(X, y) > 0.95
def test_partial_fit_unseen_classes():
    # Non-regression test for bug 6994
    # Tests for labeling errors in partial fit

    clf = MLPClassifier(random_state=0)
    clf.partial_fit([[1], [2], [3]], ["a", "b", "c"],
                    classes=["a", "b", "c", "d"])
    clf.partial_fit([[4]], ["d"])
    assert clf.score([[1], [2], [3], [4]], ["a", "b", "c", "d"]) > 0


def test_partial_fit_regression():
    # Test partial_fit on regression.
    # `partial_fit` should yield the same results as 'fit' for regression.
    X = Xboston
    y = yboston

    for momentum in [0, .9]:
        mlp = MLPRegressor(solver='sgd', max_iter=100, activation='relu',
                           random_state=1, learning_rate_init=0.01,
                           batch_size=X.shape[0], momentum=momentum)
        with warnings.catch_warnings(record=True):
            # catch convergence warning
            mlp.fit(X, y)
        pred1 = mlp.predict(X)
        mlp = MLPRegressor(solver='sgd', activation='relu',
                           learning_rate_init=0.01, random_state=1,
                           batch_size=X.shape[0], momentum=momentum)
        for i in range(100):
            mlp.partial_fit(X, y)

        pred2 = mlp.predict(X)
        assert_almost_equal(pred1, pred2, decimal=2)
        score = mlp.score(X, y)
        assert score > 0.75


def test_partial_fit_errors():
    # Test partial_fit error handling.
    X = [[3, 2], [1, 6]]
    y = [1, 0]

    # no classes passed
    with pytest.raises(ValueError):
        MLPClassifier(solver='sgd').partial_fit(X, y, classes=[2])

    # lbfgs doesn't support partial_fit
    assert not hasattr(MLPClassifier(solver='lbfgs'), 'partial_fit')


@pytest.mark.parametrize(
    "args",
    [{'hidden_layer_sizes': -1},
     {'max_iter': -1},
     {'shuffle': 'true'},
     {'alpha': -1},
     {'learning_rate_init': -1},
     {'momentum': 2},
     {'momentum': -0.5},
     {'nesterovs_momentum': 'invalid'},
     {'early_stopping': 'invalid'},
     {'validation_fraction': 1},
     {'validation_fraction': -0.5},
     {'beta_1': 1},
     {'beta_1': -0.5},
     {'beta_2': 1},
     {'beta_2': -0.5},
     {'epsilon': -0.5},
     {'n_iter_no_change': -1},
     {'solver': 'hadoken'},
     {'learning_rate': 'converge'},
     {'activation': 'cloak'}]
)
def test_params_errors(args):
    # Test that invalid parameters raise value error
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier

    with pytest.raises(ValueError):
        clf(**args).fit(X, y)


def test_predict_proba_binary():
    # Test that predict_proba works as expected for binary class.
    X = X_digits_binary[:50]
    y = y_digits_binary[:50]

    clf = MLPClassifier(hidden_layer_sizes=5, activation='logistic',
                        random_state=1)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
    y_proba = clf.predict_proba(X)
    y_log_proba = clf.predict_log_proba(X)

    (n_samples, n_classes) = y.shape[0], 2

    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert y_proba.shape == (n_samples, n_classes)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))

    assert roc_auc_score(y, y_proba[:, 1]) == 1.0


def test_predict_proba_multiclass():
    # Test that predict_proba works as expected for multi class.
    X = X_digits_multi[:10]
    y = y_digits_multi[:10]

    clf = MLPClassifier(hidden_layer_sizes=5)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
    y_proba = clf.predict_proba(X)
    y_log_proba = clf.predict_log_proba(X)

    (n_samples, n_classes) = y.shape[0], np.unique(y).size

    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert y_proba.shape == (n_samples, n_classes)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))


def test_predict_proba_multilabel():
    # Test that predict_proba works as expected for multilabel.
    # Multilabel should not use softmax which makes probabilities sum to 1
    X, Y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    n_samples, n_classes = Y.shape

    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
                        random_state=0)
    clf.fit(X, Y)
    y_proba = clf.predict_proba(X)

    assert y_proba.shape == (n_samples, n_classes)
    assert_array_equal(y_proba > 0.5, Y)

    y_log_proba = clf.predict_log_proba(X)
    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))
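Why test_predict_proba_multilabel asserts that the row sums differ from 1: with independent labels the output layer applies an element-wise sigmoid rather than a softmax, so per-label probabilities need not sum to one. A toy illustration (standalone, the logit values are arbitrary):

import numpy as np

z = np.array([[2.0, 0.5, -1.0]])  # logits for one sample, three labels

softmax = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
sigmoid = 1.0 / (1.0 + np.exp(-z))

print(softmax.sum(axis=1))  # [1.] -- softmax rows always sum to one
print(sigmoid.sum(axis=1))  # ~[1.77] -- independent per-label probabilities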
def test_shuffle():
    # Test that the shuffle parameter affects the training process (it should)
    X, y = make_regression(n_samples=50, n_features=5, n_targets=1,
                           random_state=0)

    # The coefficients will be identical if both do or do not shuffle
    for shuffle in [True, False]:
        mlp1 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                            random_state=0, shuffle=shuffle)
        mlp2 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                            random_state=0, shuffle=shuffle)
        mlp1.fit(X, y)
        mlp2.fit(X, y)

        assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])

    # The coefficients will be slightly different if shuffle=True
    mlp1 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                        random_state=0, shuffle=True)
    mlp2 = MLPRegressor(hidden_layer_sizes=1, max_iter=1, batch_size=1,
                        random_state=0, shuffle=False)
    mlp1.fit(X, y)
    mlp2.fit(X, y)

    assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])


def test_sparse_matrices():
    # Test that sparse and dense input matrices output the same results.
    X = X_digits_binary[:50]
    y = y_digits_binary[:50]
    X_sparse = csr_matrix(X)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=15,
                        random_state=1)
    mlp.fit(X, y)
    pred1 = mlp.predict(X)
    mlp.fit(X_sparse, y)
    pred2 = mlp.predict(X_sparse)
    assert_almost_equal(pred1, pred2)
    pred1 = mlp.predict(X)
    pred2 = mlp.predict(X_sparse)
    assert_array_equal(pred1, pred2)


def test_tolerance():
    # Test tolerance.
    # It should force the solver to exit the loop when it converges.
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(tol=0.5, max_iter=3000, solver='sgd')
    clf.fit(X, y)
    assert clf.max_iter > clf.n_iter_


def test_verbose_sgd():
    # Test verbose.
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(solver='sgd', max_iter=2, verbose=10,
                        hidden_layer_sizes=2)
    old_stdout = sys.stdout
    sys.stdout = output = StringIO()

    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        clf.partial_fit(X, y)

    sys.stdout = old_stdout
    assert 'Iteration' in output.getvalue()


def test_early_stopping():
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]
    tol = 0.2
    clf = MLPClassifier(tol=tol, max_iter=3000, solver='sgd',
                        early_stopping=True)
    clf.fit(X, y)
    assert clf.max_iter > clf.n_iter_

    valid_scores = clf.validation_scores_
    best_valid_score = clf.best_validation_score_
    assert max(valid_scores) == best_valid_score
    assert best_valid_score + tol > valid_scores[-2]
    assert best_valid_score + tol > valid_scores[-1]


def test_adaptive_learning_rate():
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(tol=0.5, max_iter=3000, solver='sgd',
                        learning_rate='adaptive')
    clf.fit(X, y)
    assert clf.max_iter > clf.n_iter_
    assert 1e-6 > clf._optimizer.learning_rate


@ignore_warnings(category=RuntimeWarning)
def test_warm_start():
    X = X_iris
    y = y_iris

    y_2classes = np.array([0] * 75 + [1] * 75)
    y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70)
    y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50)
    y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38)
    y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 + [4] * 30)

    # No error raised
    clf = MLPClassifier(hidden_layer_sizes=2, solver='lbfgs',
                        warm_start=True).fit(X, y)
    clf.fit(X, y)
    clf.fit(X, y_3classes)

    for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes):
        clf = MLPClassifier(hidden_layer_sizes=2, solver='lbfgs',
                            warm_start=True).fit(X, y)
        message = ('warm_start can only be used where `y` has the same '
                   'classes as in the previous call to fit.'
                   ' Previously got [0 1 2], `y` has %s' % np.unique(y_i))
        with pytest.raises(ValueError, match=re.escape(message)):
            clf.fit(X, y_i)


def test_n_iter_no_change():
    # test n_iter_no_change using binary data set
    # the classifier fitting process is not prone to loss curve fluctuations
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]
    tol = 0.01
    max_iter = 3000

    # test multiple n_iter_no_change
    for n_iter_no_change in [2, 5, 10, 50, 100]:
        clf = MLPClassifier(tol=tol, max_iter=max_iter, solver='sgd',
                            n_iter_no_change=n_iter_no_change)
        clf.fit(X, y)

        # validate n_iter_no_change
        assert clf._no_improvement_count == n_iter_no_change + 1
        assert max_iter > clf.n_iter_


@ignore_warnings(category=ConvergenceWarning)
def test_n_iter_no_change_inf():
    # test n_iter_no_change using binary data set
    # the fitting process should go to max_iter iterations
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]

    # set a ridiculous tolerance
    # this should always trigger _update_no_improvement_count()
    tol = 1e9

    # fit
    n_iter_no_change = np.inf
    max_iter = 3000
    clf = MLPClassifier(tol=tol, max_iter=max_iter, solver='sgd',
                        n_iter_no_change=n_iter_no_change)
    clf.fit(X, y)

    # validate n_iter_no_change doesn't cause early stopping
    assert clf.n_iter_ == max_iter

    # validate _update_no_improvement_count() was always triggered
    assert clf._no_improvement_count == clf.n_iter_ - 1
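Both n_iter_no_change tests hinge on how the private no-improvement counter evolves each iteration. Bookkeeping of roughly this shape is consistent with the assertions above (a hedged sketch; the names are illustrative, and only _no_improvement_count itself appears in the tests):

def update_no_improvement(count, best_loss, last_loss, tol):
    # Count iterations whose loss failed to beat best_loss by at least tol.
    if last_loss > best_loss - tol:
        count += 1   # no sufficient improvement this iteration
    else:
        count = 0    # a real improvement resets the counter
    best_loss = min(best_loss, last_loss)
    return count, best_loss

# If training stops once count > n_iter_no_change, the first test can assert
# count == n_iter_no_change + 1 after an early exit, and the inf test sees
# the counter grow as n_iter_ - 1 under the absurd tol = 1e9.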
def test_early_stopping_stratified():
    # Make sure data splitting for early stopping is stratified
    X = [[1, 2], [2, 3], [3, 4], [4, 5]]
    y = [0, 0, 0, 1]

    mlp = MLPClassifier(early_stopping=True)
    with pytest.raises(
            ValueError,
            match='The least populated class in y has only 1 member'):
        mlp.fit(X, y)

venv/Lib/site-packages/sklearn/neural_network/tests/test_rbm.py (new file, 191 lines)

@@ -0,0 +1,191 @@
import sys
import re

import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, lil_matrix
from sklearn.utils._testing import (assert_almost_equal, assert_array_equal)

from sklearn.datasets import load_digits
from io import StringIO
from sklearn.neural_network import BernoulliRBM
from sklearn.utils.validation import assert_all_finite

Xdigits, _ = load_digits(return_X_y=True)
Xdigits -= Xdigits.min()
Xdigits /= Xdigits.max()


def test_fit():
    X = Xdigits.copy()

    rbm = BernoulliRBM(n_components=64, learning_rate=0.1,
                       batch_size=10, n_iter=7, random_state=9)
    rbm.fit(X)

    assert_almost_equal(rbm.score_samples(X).mean(), -21., decimal=0)

    # in-place tricks shouldn't have modified X
    assert_array_equal(X, Xdigits)


def test_partial_fit():
    X = Xdigits.copy()
    rbm = BernoulliRBM(n_components=64, learning_rate=0.1,
                       batch_size=20, random_state=9)
    n_samples = X.shape[0]
    n_batches = int(np.ceil(float(n_samples) / rbm.batch_size))
    batch_slices = np.array_split(X, n_batches)

    for i in range(7):
        for batch in batch_slices:
            rbm.partial_fit(batch)

    assert_almost_equal(rbm.score_samples(X).mean(), -21., decimal=0)
    assert_array_equal(X, Xdigits)


def test_transform():
    X = Xdigits[:100]
    rbm1 = BernoulliRBM(n_components=16, batch_size=5,
                        n_iter=5, random_state=42)
    rbm1.fit(X)

    Xt1 = rbm1.transform(X)
    Xt2 = rbm1._mean_hiddens(X)

    assert_array_equal(Xt1, Xt2)


def test_small_sparse():
    # BernoulliRBM should work on small sparse matrices.
    X = csr_matrix(Xdigits[:4])
    BernoulliRBM().fit(X)  # no exception


def test_small_sparse_partial_fit():
    for sparse in [csc_matrix, csr_matrix]:
        X_sparse = sparse(Xdigits[:100])
        X = Xdigits[:100].copy()

        rbm1 = BernoulliRBM(n_components=64, learning_rate=0.1,
                            batch_size=10, random_state=9)
        rbm2 = BernoulliRBM(n_components=64, learning_rate=0.1,
                            batch_size=10, random_state=9)

        rbm1.partial_fit(X_sparse)
        rbm2.partial_fit(X)

        assert_almost_equal(rbm1.score_samples(X).mean(),
                            rbm2.score_samples(X).mean(),
                            decimal=0)


def test_sample_hiddens():
    rng = np.random.RandomState(0)
    X = Xdigits[:100]
    rbm1 = BernoulliRBM(n_components=2, batch_size=5,
                        n_iter=5, random_state=42)
    rbm1.fit(X)

    h = rbm1._mean_hiddens(X[0])
    hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0)

    assert_almost_equal(h, hs, decimal=1)


def test_fit_gibbs():
    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]]
    # from the same input
    rng = np.random.RandomState(42)
    X = np.array([[0.], [1.]])
    rbm1 = BernoulliRBM(n_components=2, batch_size=2,
                        n_iter=42, random_state=rng)
    # this many iterations are needed
    rbm1.fit(X)
    assert_almost_equal(rbm1.components_,
                        np.array([[0.02649814], [0.02009084]]), decimal=4)
    assert_almost_equal(rbm1.gibbs(X), X)
    return rbm1


def test_fit_gibbs_sparse():
    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from
    # the same input even when the input is sparse, and test against non-sparse
    rbm1 = test_fit_gibbs()
    rng = np.random.RandomState(42)
    from scipy.sparse import csc_matrix
    X = csc_matrix([[0.], [1.]])
    rbm2 = BernoulliRBM(n_components=2, batch_size=2,
                        n_iter=42, random_state=rng)
    rbm2.fit(X)
    assert_almost_equal(rbm2.components_,
                        np.array([[0.02649814], [0.02009084]]), decimal=4)
    assert_almost_equal(rbm2.gibbs(X), X.toarray())
    assert_almost_equal(rbm1.components_, rbm2.components_)


def test_gibbs_smoke():
    # Check if we don't get NaNs sampling the full digits dataset.
    # Also check that sampling again will yield different results.
    X = Xdigits
    rbm1 = BernoulliRBM(n_components=42, batch_size=40,
                        n_iter=20, random_state=42)
    rbm1.fit(X)
    X_sampled = rbm1.gibbs(X)
    assert_all_finite(X_sampled)
    X_sampled2 = rbm1.gibbs(X)
    assert np.all((X_sampled != X_sampled2).max(axis=1))


def test_score_samples():
    # Test score_samples (pseudo-likelihood) method.
    # Assert that pseudo-likelihood is computed without clipping.
    # See Fabian's blog, http://bit.ly/1iYefRk
    rng = np.random.RandomState(42)
    X = np.vstack([np.zeros(1000), np.ones(1000)])
    rbm1 = BernoulliRBM(n_components=10, batch_size=2,
                        n_iter=10, random_state=rng)
    rbm1.fit(X)
    assert (rbm1.score_samples(X) < -300).all()

    # Sparse vs. dense should not affect the output. Also test sparse input
    # validation.
    rbm1.random_state = 42
    d_score = rbm1.score_samples(X)
    rbm1.random_state = 42
    s_score = rbm1.score_samples(lil_matrix(X))
    assert_almost_equal(d_score, s_score)

    # Test numerical stability (#2785): would previously generate infinities
    # and crash with an exception.
    with np.errstate(under='ignore'):
        rbm1.score_samples([np.arange(1000) * 100])


def test_rbm_verbose():
    rbm = BernoulliRBM(n_iter=2, verbose=10)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        rbm.fit(Xdigits)
    finally:
        sys.stdout = old_stdout


def test_sparse_and_verbose():
    # Make sure RBM works with sparse input when verbose=True
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    from scipy.sparse import csc_matrix
    X = csc_matrix([[0.], [1.]])
    rbm = BernoulliRBM(n_components=2, batch_size=2, n_iter=1,
                       random_state=42, verbose=True)
    try:
        rbm.fit(X)
        s = sys.stdout.getvalue()
        # make sure output is sound
        assert re.match(r"\[BernoulliRBM\] Iteration 1,"
                        r" pseudo-likelihood = -?(\d)+(\.\d+)?,"
                        r" time = (\d|\.)+s", s)
    finally:
        sys.stdout = old_stdout
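Several of the RBM tests above (test_fit_gibbs, test_gibbs_smoke) exercise BernoulliRBM.gibbs, i.e. one visible-hidden-visible alternation. A minimal sketch of that step (illustrative only, not BernoulliRBM's code path; it assumes the fitted attributes components_, intercept_hidden_ and intercept_visible_ as inputs):

import numpy as np

def gibbs_step(v, W, b_hidden, b_visible, rng):
    # One alternation: sample h ~ P(h|v), then v' ~ P(v|h).
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    p_h = sigmoid(v @ W.T + b_hidden)       # P(h=1 | v); W is (n_hidden, n_visible)
    h = rng.random_sample(p_h.shape) < p_h  # Bernoulli sample of hidden units
    p_v = sigmoid(h @ W + b_visible)        # P(v=1 | h)
    return (rng.random_sample(p_v.shape) < p_v).astype(float)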
@@ -0,0 +1,108 @@
import numpy as np

from sklearn.neural_network._stochastic_optimizers import (BaseOptimizer,
                                                           SGDOptimizer,
                                                           AdamOptimizer)
from sklearn.utils._testing import assert_array_equal


shapes = [(4, 6), (6, 8), (7, 8, 9)]


def test_base_optimizer():
    params = [np.zeros(shape) for shape in shapes]

    for lr in [10 ** i for i in range(-3, 4)]:
        optimizer = BaseOptimizer(params, lr)
        assert optimizer.trigger_stopping('', False)


def test_sgd_optimizer_no_momentum():
    params = [np.zeros(shape) for shape in shapes]

    for lr in [10 ** i for i in range(-3, 4)]:
        optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False)
        grads = [np.random.random(shape) for shape in shapes]
        expected = [param - lr * grad for param, grad in zip(params, grads)]
        optimizer.update_params(grads)

        for exp, param in zip(expected, optimizer.params):
            assert_array_equal(exp, param)


def test_sgd_optimizer_momentum():
    params = [np.zeros(shape) for shape in shapes]
    lr = 0.1

    for momentum in np.arange(0.5, 0.9, 0.1):
        optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=False)
        velocities = [np.random.random(shape) for shape in shapes]
        optimizer.velocities = velocities
        grads = [np.random.random(shape) for shape in shapes]
        updates = [momentum * velocity - lr * grad
                   for velocity, grad in zip(velocities, grads)]
        expected = [param + update for param, update in zip(params, updates)]
        optimizer.update_params(grads)

        for exp, param in zip(expected, optimizer.params):
            assert_array_equal(exp, param)


def test_sgd_optimizer_trigger_stopping():
    params = [np.zeros(shape) for shape in shapes]
    lr = 2e-6
    optimizer = SGDOptimizer(params, lr, lr_schedule='adaptive')
    assert not optimizer.trigger_stopping('', False)
    assert lr / 5 == optimizer.learning_rate
    assert optimizer.trigger_stopping('', False)


def test_sgd_optimizer_nesterovs_momentum():
    params = [np.zeros(shape) for shape in shapes]
    lr = 0.1

    for momentum in np.arange(0.5, 0.9, 0.1):
        optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=True)
        velocities = [np.random.random(shape) for shape in shapes]
        optimizer.velocities = velocities
        grads = [np.random.random(shape) for shape in shapes]
        updates = [momentum * velocity - lr * grad
                   for velocity, grad in zip(velocities, grads)]
        updates = [momentum * update - lr * grad
                   for update, grad in zip(updates, grads)]
        expected = [param + update for param, update in zip(params, updates)]
        optimizer.update_params(grads)

        for exp, param in zip(expected, optimizer.params):
            assert_array_equal(exp, param)


def test_adam_optimizer():
    params = [np.zeros(shape) for shape in shapes]
    lr = 0.001
    epsilon = 1e-8

    for beta_1 in np.arange(0.9, 1.0, 0.05):
        for beta_2 in np.arange(0.995, 1.0, 0.001):
            optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon)
            ms = [np.random.random(shape) for shape in shapes]
            vs = [np.random.random(shape) for shape in shapes]
            t = 10
            optimizer.ms = ms
            optimizer.vs = vs
            optimizer.t = t - 1
            grads = [np.random.random(shape) for shape in shapes]

            ms = [beta_1 * m + (1 - beta_1) * grad
                  for m, grad in zip(ms, grads)]
            vs = [beta_2 * v + (1 - beta_2) * (grad ** 2)
                  for v, grad in zip(vs, grads)]
            learning_rate = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
            updates = [-learning_rate * m / (np.sqrt(v) + epsilon)
                       for m, v in zip(ms, vs)]
            expected = [param + update
                        for param, update in zip(params, updates)]

            optimizer.update_params(grads)
            for exp, param in zip(expected, optimizer.params):
                assert_array_equal(exp, param)
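test_adam_optimizer spells out the expected Adam arithmetic inline; collected into a single function, the update it checks looks like this (a sketch mirroring the test's math, not the _stochastic_optimizers source):

import numpy as np

def adam_update(param, grad, m, v, t, lr=0.001, beta_1=0.9,
                beta_2=0.999, epsilon=1e-8):
    # Exponential moving averages of the gradient and its square,
    # then a bias-corrected step size for iteration t.
    m = beta_1 * m + (1 - beta_1) * grad
    v = beta_2 * v + (1 - beta_2) * grad ** 2
    lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    return param - lr_t * m / (np.sqrt(v) + epsilon), m, v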