430 lines
16 KiB
Python
430 lines
16 KiB
Python
|
"""Hessian update strategies for quasi-Newton optimization methods."""
|
||
|
import numpy as np
|
||
|
from numpy.linalg import norm
|
||
|
from scipy.linalg import get_blas_funcs
|
||
|
from warnings import warn
|
||
|
|
||
|
|
||
|
__all__ = ['HessianUpdateStrategy', 'BFGS', 'SR1']
|
||
|
|
||
|
|
||
|
class HessianUpdateStrategy(object):
|
||
|
"""Interface for implementing Hessian update strategies.
|
||
|
|
||
|
Many optimization methods make use of Hessian (or inverse Hessian)
|
||
|
approximations, such as the quasi-Newton methods BFGS, SR1, L-BFGS.
|
||
|
Some of these approximations, however, do not actually need to store
|
||
|
the entire matrix or can compute the internal matrix product with a
|
||
|
given vector in a very efficiently manner. This class serves as an
|
||
|
abstract interface between the optimization algorithm and the
|
||
|
quasi-Newton update strategies, giving freedom of implementation
|
||
|
to store and update the internal matrix as efficiently as possible.
|
||
|
Different choices of initialization and update procedure will result
|
||
|
in different quasi-Newton strategies.
|
||
|
|
||
|
Four methods should be implemented in derived classes: ``initialize``,
|
||
|
``update``, ``dot`` and ``get_matrix``.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Any instance of a class that implements this interface,
|
||
|
can be accepted by the method ``minimize`` and used by
|
||
|
the compatible solvers to approximate the Hessian (or
|
||
|
inverse Hessian) used by the optimization algorithms.
|
||
|
"""
|
||
|
|
||
|
def initialize(self, n, approx_type):
|
||
|
"""Initialize internal matrix.
|
||
|
|
||
|
Allocate internal memory for storing and updating
|
||
|
the Hessian or its inverse.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
n : int
|
||
|
Problem dimension.
|
||
|
approx_type : {'hess', 'inv_hess'}
|
||
|
Selects either the Hessian or the inverse Hessian.
|
||
|
When set to 'hess' the Hessian will be stored and updated.
|
||
|
When set to 'inv_hess' its inverse will be used instead.
|
||
|
"""
|
||
|
raise NotImplementedError("The method ``initialize(n, approx_type)``"
|
||
|
" is not implemented.")
|
||
|
|
||
|
def update(self, delta_x, delta_grad):
|
||
|
"""Update internal matrix.
|
||
|
|
||
|
Update Hessian matrix or its inverse (depending on how 'approx_type'
|
||
|
is defined) using information about the last evaluated points.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
delta_x : ndarray
|
||
|
The difference between two points the gradient
|
||
|
function have been evaluated at: ``delta_x = x2 - x1``.
|
||
|
delta_grad : ndarray
|
||
|
The difference between the gradients:
|
||
|
``delta_grad = grad(x2) - grad(x1)``.
|
||
|
"""
|
||
|
raise NotImplementedError("The method ``update(delta_x, delta_grad)``"
|
||
|
" is not implemented.")
|
||
|
|
||
|
def dot(self, p):
|
||
|
"""Compute the product of the internal matrix with the given vector.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
p : array_like
|
||
|
1-D array representing a vector.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Hp : array
|
||
|
1-D represents the result of multiplying the approximation matrix
|
||
|
by vector p.
|
||
|
"""
|
||
|
raise NotImplementedError("The method ``dot(p)``"
|
||
|
" is not implemented.")
|
||
|
|
||
|
def get_matrix(self):
|
||
|
"""Return current internal matrix.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
H : ndarray, shape (n, n)
|
||
|
Dense matrix containing either the Hessian
|
||
|
or its inverse (depending on how 'approx_type'
|
||
|
is defined).
|
||
|
"""
|
||
|
raise NotImplementedError("The method ``get_matrix(p)``"
|
||
|
" is not implemented.")
|
||
|
|
||
|
|
||
|
class FullHessianUpdateStrategy(HessianUpdateStrategy):
|
||
|
"""Hessian update strategy with full dimensional internal representation.
|
||
|
"""
|
||
|
_syr = get_blas_funcs('syr', dtype='d') # Symmetric rank 1 update
|
||
|
_syr2 = get_blas_funcs('syr2', dtype='d') # Symmetric rank 2 update
|
||
|
# Symmetric matrix-vector product
|
||
|
_symv = get_blas_funcs('symv', dtype='d')
|
||
|
|
||
|
def __init__(self, init_scale='auto'):
|
||
|
self.init_scale = init_scale
|
||
|
# Until initialize is called we can't really use the class,
|
||
|
# so it makes sense to set everything to None.
|
||
|
self.first_iteration = None
|
||
|
self.approx_type = None
|
||
|
self.B = None
|
||
|
self.H = None
|
||
|
|
||
|
def initialize(self, n, approx_type):
|
||
|
"""Initialize internal matrix.
|
||
|
|
||
|
Allocate internal memory for storing and updating
|
||
|
the Hessian or its inverse.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
n : int
|
||
|
Problem dimension.
|
||
|
approx_type : {'hess', 'inv_hess'}
|
||
|
Selects either the Hessian or the inverse Hessian.
|
||
|
When set to 'hess' the Hessian will be stored and updated.
|
||
|
When set to 'inv_hess' its inverse will be used instead.
|
||
|
"""
|
||
|
self.first_iteration = True
|
||
|
self.n = n
|
||
|
self.approx_type = approx_type
|
||
|
if approx_type not in ('hess', 'inv_hess'):
|
||
|
raise ValueError("`approx_type` must be 'hess' or 'inv_hess'.")
|
||
|
# Create matrix
|
||
|
if self.approx_type == 'hess':
|
||
|
self.B = np.eye(n, dtype=float)
|
||
|
else:
|
||
|
self.H = np.eye(n, dtype=float)
|
||
|
|
||
|
def _auto_scale(self, delta_x, delta_grad):
|
||
|
# Heuristic to scale matrix at first iteration.
|
||
|
# Described in Nocedal and Wright "Numerical Optimization"
|
||
|
# p.143 formula (6.20).
|
||
|
s_norm2 = np.dot(delta_x, delta_x)
|
||
|
y_norm2 = np.dot(delta_grad, delta_grad)
|
||
|
ys = np.abs(np.dot(delta_grad, delta_x))
|
||
|
if ys == 0.0 or y_norm2 == 0 or s_norm2 == 0:
|
||
|
return 1
|
||
|
if self.approx_type == 'hess':
|
||
|
return y_norm2 / ys
|
||
|
else:
|
||
|
return ys / y_norm2
|
||
|
|
||
|
def _update_implementation(self, delta_x, delta_grad):
|
||
|
raise NotImplementedError("The method ``_update_implementation``"
|
||
|
" is not implemented.")
|
||
|
|
||
|
def update(self, delta_x, delta_grad):
|
||
|
"""Update internal matrix.
|
||
|
|
||
|
Update Hessian matrix or its inverse (depending on how 'approx_type'
|
||
|
is defined) using information about the last evaluated points.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
delta_x : ndarray
|
||
|
The difference between two points the gradient
|
||
|
function have been evaluated at: ``delta_x = x2 - x1``.
|
||
|
delta_grad : ndarray
|
||
|
The difference between the gradients:
|
||
|
``delta_grad = grad(x2) - grad(x1)``.
|
||
|
"""
|
||
|
if np.all(delta_x == 0.0):
|
||
|
return
|
||
|
if np.all(delta_grad == 0.0):
|
||
|
warn('delta_grad == 0.0. Check if the approximated '
|
||
|
'function is linear. If the function is linear '
|
||
|
'better results can be obtained by defining the '
|
||
|
'Hessian as zero instead of using quasi-Newton '
|
||
|
'approximations.', UserWarning)
|
||
|
return
|
||
|
if self.first_iteration:
|
||
|
# Get user specific scale
|
||
|
if self.init_scale == "auto":
|
||
|
scale = self._auto_scale(delta_x, delta_grad)
|
||
|
else:
|
||
|
scale = float(self.init_scale)
|
||
|
# Scale initial matrix with ``scale * np.eye(n)``
|
||
|
if self.approx_type == 'hess':
|
||
|
self.B *= scale
|
||
|
else:
|
||
|
self.H *= scale
|
||
|
self.first_iteration = False
|
||
|
self._update_implementation(delta_x, delta_grad)
|
||
|
|
||
|
def dot(self, p):
|
||
|
"""Compute the product of the internal matrix with the given vector.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
p : array_like
|
||
|
1-D array representing a vector.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Hp : array
|
||
|
1-D represents the result of multiplying the approximation matrix
|
||
|
by vector p.
|
||
|
"""
|
||
|
if self.approx_type == 'hess':
|
||
|
return self._symv(1, self.B, p)
|
||
|
else:
|
||
|
return self._symv(1, self.H, p)
|
||
|
|
||
|
def get_matrix(self):
|
||
|
"""Return the current internal matrix.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
M : ndarray, shape (n, n)
|
||
|
Dense matrix containing either the Hessian or its inverse
|
||
|
(depending on how `approx_type` was defined).
|
||
|
"""
|
||
|
if self.approx_type == 'hess':
|
||
|
M = np.copy(self.B)
|
||
|
else:
|
||
|
M = np.copy(self.H)
|
||
|
li = np.tril_indices_from(M, k=-1)
|
||
|
M[li] = M.T[li]
|
||
|
return M
|
||
|
|
||
|
|
||
|
class BFGS(FullHessianUpdateStrategy):
|
||
|
"""Broyden-Fletcher-Goldfarb-Shanno (BFGS) Hessian update strategy.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
exception_strategy : {'skip_update', 'damp_update'}, optional
|
||
|
Define how to proceed when the curvature condition is violated.
|
||
|
Set it to 'skip_update' to just skip the update. Or, alternatively,
|
||
|
set it to 'damp_update' to interpolate between the actual BFGS
|
||
|
result and the unmodified matrix. Both exceptions strategies
|
||
|
are explained in [1]_, p.536-537.
|
||
|
min_curvature : float
|
||
|
This number, scaled by a normalization factor, defines the
|
||
|
minimum curvature ``dot(delta_grad, delta_x)`` allowed to go
|
||
|
unaffected by the exception strategy. By default is equal to
|
||
|
1e-8 when ``exception_strategy = 'skip_update'`` and equal
|
||
|
to 0.2 when ``exception_strategy = 'damp_update'``.
|
||
|
init_scale : {float, 'auto'}
|
||
|
Matrix scale at first iteration. At the first
|
||
|
iteration the Hessian matrix or its inverse will be initialized
|
||
|
with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
|
||
|
Set it to 'auto' in order to use an automatic heuristic for choosing
|
||
|
the initial scale. The heuristic is described in [1]_, p.143.
|
||
|
By default uses 'auto'.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The update is based on the description in [1]_, p.140.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
|
||
|
Second Edition (2006).
|
||
|
"""
|
||
|
|
||
|
def __init__(self, exception_strategy='skip_update', min_curvature=None,
|
||
|
init_scale='auto'):
|
||
|
if exception_strategy == 'skip_update':
|
||
|
if min_curvature is not None:
|
||
|
self.min_curvature = min_curvature
|
||
|
else:
|
||
|
self.min_curvature = 1e-8
|
||
|
elif exception_strategy == 'damp_update':
|
||
|
if min_curvature is not None:
|
||
|
self.min_curvature = min_curvature
|
||
|
else:
|
||
|
self.min_curvature = 0.2
|
||
|
else:
|
||
|
raise ValueError("`exception_strategy` must be 'skip_update' "
|
||
|
"or 'damp_update'.")
|
||
|
|
||
|
super(BFGS, self).__init__(init_scale)
|
||
|
self.exception_strategy = exception_strategy
|
||
|
|
||
|
def _update_inverse_hessian(self, ys, Hy, yHy, s):
|
||
|
"""Update the inverse Hessian matrix.
|
||
|
|
||
|
BFGS update using the formula:
|
||
|
|
||
|
``H <- H + ((H*y).T*y + s.T*y)/(s.T*y)^2 * (s*s.T)
|
||
|
- 1/(s.T*y) * ((H*y)*s.T + s*(H*y).T)``
|
||
|
|
||
|
where ``s = delta_x`` and ``y = delta_grad``. This formula is
|
||
|
equivalent to (6.17) in [1]_ written in a more efficient way
|
||
|
for implementation.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
|
||
|
Second Edition (2006).
|
||
|
"""
|
||
|
self.H = self._syr2(-1.0 / ys, s, Hy, a=self.H)
|
||
|
self.H = self._syr((ys+yHy)/ys**2, s, a=self.H)
|
||
|
|
||
|
def _update_hessian(self, ys, Bs, sBs, y):
|
||
|
"""Update the Hessian matrix.
|
||
|
|
||
|
BFGS update using the formula:
|
||
|
|
||
|
``B <- B - (B*s)*(B*s).T/s.T*(B*s) + y*y^T/s.T*y``
|
||
|
|
||
|
where ``s`` is short for ``delta_x`` and ``y`` is short
|
||
|
for ``delta_grad``. Formula (6.19) in [1]_.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
|
||
|
Second Edition (2006).
|
||
|
"""
|
||
|
self.B = self._syr(1.0 / ys, y, a=self.B)
|
||
|
self.B = self._syr(-1.0 / sBs, Bs, a=self.B)
|
||
|
|
||
|
def _update_implementation(self, delta_x, delta_grad):
|
||
|
# Auxiliary variables w and z
|
||
|
if self.approx_type == 'hess':
|
||
|
w = delta_x
|
||
|
z = delta_grad
|
||
|
else:
|
||
|
w = delta_grad
|
||
|
z = delta_x
|
||
|
# Do some common operations
|
||
|
wz = np.dot(w, z)
|
||
|
Mw = self.dot(w)
|
||
|
wMw = Mw.dot(w)
|
||
|
# Guarantee that wMw > 0 by reinitializing matrix.
|
||
|
# While this is always true in exact arithmetics,
|
||
|
# indefinite matrix may appear due to roundoff errors.
|
||
|
if wMw <= 0.0:
|
||
|
scale = self._auto_scale(delta_x, delta_grad)
|
||
|
# Reinitialize matrix
|
||
|
if self.approx_type == 'hess':
|
||
|
self.B = scale * np.eye(self.n, dtype=float)
|
||
|
else:
|
||
|
self.H = scale * np.eye(self.n, dtype=float)
|
||
|
# Do common operations for new matrix
|
||
|
Mw = self.dot(w)
|
||
|
wMw = Mw.dot(w)
|
||
|
# Check if curvature condition is violated
|
||
|
if wz <= self.min_curvature * wMw:
|
||
|
# If the option 'skip_update' is set
|
||
|
# we just skip the update when the condion
|
||
|
# is violated.
|
||
|
if self.exception_strategy == 'skip_update':
|
||
|
return
|
||
|
# If the option 'damp_update' is set we
|
||
|
# interpolate between the actual BFGS
|
||
|
# result and the unmodified matrix.
|
||
|
elif self.exception_strategy == 'damp_update':
|
||
|
update_factor = (1-self.min_curvature) / (1 - wz/wMw)
|
||
|
z = update_factor*z + (1-update_factor)*Mw
|
||
|
wz = np.dot(w, z)
|
||
|
# Update matrix
|
||
|
if self.approx_type == 'hess':
|
||
|
self._update_hessian(wz, Mw, wMw, z)
|
||
|
else:
|
||
|
self._update_inverse_hessian(wz, Mw, wMw, z)
|
||
|
|
||
|
|
||
|
class SR1(FullHessianUpdateStrategy):
|
||
|
"""Symmetric-rank-1 Hessian update strategy.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
min_denominator : float
|
||
|
This number, scaled by a normalization factor,
|
||
|
defines the minimum denominator magnitude allowed
|
||
|
in the update. When the condition is violated we skip
|
||
|
the update. By default uses ``1e-8``.
|
||
|
init_scale : {float, 'auto'}, optional
|
||
|
Matrix scale at first iteration. At the first
|
||
|
iteration the Hessian matrix or its inverse will be initialized
|
||
|
with ``init_scale*np.eye(n)``, where ``n`` is the problem dimension.
|
||
|
Set it to 'auto' in order to use an automatic heuristic for choosing
|
||
|
the initial scale. The heuristic is described in [1]_, p.143.
|
||
|
By default uses 'auto'.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The update is based on the description in [1]_, p.144-146.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] Nocedal, Jorge, and Stephen J. Wright. "Numerical optimization"
|
||
|
Second Edition (2006).
|
||
|
"""
|
||
|
|
||
|
def __init__(self, min_denominator=1e-8, init_scale='auto'):
|
||
|
self.min_denominator = min_denominator
|
||
|
super(SR1, self).__init__(init_scale)
|
||
|
|
||
|
def _update_implementation(self, delta_x, delta_grad):
|
||
|
# Auxiliary variables w and z
|
||
|
if self.approx_type == 'hess':
|
||
|
w = delta_x
|
||
|
z = delta_grad
|
||
|
else:
|
||
|
w = delta_grad
|
||
|
z = delta_x
|
||
|
# Do some common operations
|
||
|
Mw = self.dot(w)
|
||
|
z_minus_Mw = z - Mw
|
||
|
denominator = np.dot(w, z_minus_Mw)
|
||
|
# If the denominator is too small
|
||
|
# we just skip the update.
|
||
|
if np.abs(denominator) <= self.min_denominator*norm(w)*norm(z_minus_Mw):
|
||
|
return
|
||
|
# Update matrix
|
||
|
if self.approx_type == 'hess':
|
||
|
self.B = self._syr(1/denominator, z_minus_Mw, a=self.B)
|
||
|
else:
|
||
|
self.H = self._syr(1/denominator, z_minus_Mw, a=self.H)
|