gonum/optimize/newton.go
Brendan Tracey b545e3e77e optimize: Refactor gradient convergence and remove DefaultSettings (#772)

The current API design makes it easy to make the mistake of not using DefaultSettings. This change makes the zero value of Settings do the right thing.

The one remaining behavior supplied by DefaultSettings is the handling of the GradientTolerance. The special case is necessary because gradient-based Local methods (BFGS, LBFGS, CG, etc.) typically _define_ convergence by the value of the gradient, while Global methods (CMAES, GuessAndCheck) are defined by _not_ converging when the gradient is small. The problem is to provide these two completely different default behaviors without knowing the Method. The solution is to treat a very small gradient as a method-based convergence, in the same way that a small spread of data is a convergence of CMAES. Thus, from the perspective of Settings, the default behavior is never to converge based on the gradient, but all of the Local methods will converge when a value close to the minimum is found. The default threshold is set so small that users should never want a smaller value, and a user can still set a (more reasonable) convergence threshold through Settings.

Fixes 677.
2018-12-23 08:17:27 -05:00
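
To make the new default concrete, here is a minimal sketch of the resulting behavior (assuming the optimize.Minimize signature and the Settings.GradientThreshold field of the current API; the quadratic objective is purely illustrative):

package main

import (
	"fmt"
	"log"

	"gonum.org/v1/gonum/optimize"
)

func main() {
	// A simple convex objective: f(x) = (x-2)^2, minimized at x = 2.
	p := optimize.Problem{
		Func: func(x []float64) float64 { return (x[0] - 2) * (x[0] - 2) },
		Grad: func(grad, x []float64) { grad[0] = 2 * (x[0] - 2) },
	}
	x0 := []float64{10}

	// Zero-value Settings (nil here): Settings itself requests no
	// gradient-based convergence, but a Local method such as BFGS still
	// stops once the gradient norm falls below the tiny built-in default.
	result, err := optimize.Minimize(p, x0, nil, &optimize.BFGS{})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(result.X, result.Status)

	// A (more reasonable) explicit threshold can still be requested.
	settings := &optimize.Settings{GradientThreshold: 1e-6}
	result, err = optimize.Minimize(p, x0, settings, &optimize.BFGS{})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(result.X, result.Status)
}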

// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package optimize

import (
	"math"

	"gonum.org/v1/gonum/mat"
)

const maxNewtonModifications = 20

// Newton implements a modified Newton's method for Hessian-based unconstrained
// minimization. It applies regularization when the Hessian is not positive
// definite, and it can converge to a local minimum from any starting point.
//
// Newton iteratively forms a quadratic model to the objective function f and
// tries to minimize this approximate model. It generates a sequence of
// locations x_k by means of
//  solve H_k d_k = -∇f_k for d_k,
//  x_{k+1} = x_k + α_k d_k,
// where H_k is the Hessian matrix of f at x_k and α_k is a step size found by
// a line search.
//
// Away from a minimizer H_k may not be positive definite and d_k may not be a
// descent direction. Newton implements a Hessian modification strategy that
// adds successively larger multiples of identity to H_k until it becomes
// positive definite. Note that the repeated trial factorization of the
// modified Hessian involved in this process can be computationally expensive.
//
// If the Hessian matrix cannot be formed explicitly or if the computational
// cost of its factorization is prohibitive, the BFGS or L-BFGS quasi-Newton
// methods can be used instead.
type Newton struct {
	// Linesearcher is used for selecting suitable steps along the descent
	// direction d. Accepted steps should satisfy at least one of the Wolfe,
	// Goldstein or Armijo conditions.
	// If Linesearcher == nil, an appropriate default is chosen.
	Linesearcher Linesearcher

	// Increase is the factor by which a scalar tau is successively increased
	// so that (H + tau*I) is positive definite. Larger values reduce the
	// number of trial Hessian factorizations, but also reduce the second-order
	// information in H.
	// Increase must be greater than 1. If Increase is 0, it is defaulted to 5.
	Increase float64

	// GradStopThreshold sets the threshold for stopping if the gradient norm
	// gets too small. If GradStopThreshold is 0 it is defaulted to 1e-12, and
	// if it is NaN the setting is not used.
	GradStopThreshold float64

	status Status
	err    error

	ls *LinesearchMethod

	hess *mat.SymDense // Storage for a copy of the Hessian matrix.
	chol mat.Cholesky  // Storage for the Cholesky factorization.
	tau  float64
}

func (n *Newton) Status() (Status, error) {
	return n.status, n.err
}
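
// Init initializes the method for a problem of the given dimension and
// returns 1: Newton does not run concurrent optimization tasks.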
func (n *Newton) Init(dim, tasks int) int {
	n.status = NotTerminated
	n.err = nil
	return 1
}
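
// Run runs the optimization loop, exchanging evaluation requests and results
// with the caller over the operation and result channels.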
func (n *Newton) Run(operation chan<- Task, result <-chan Task, tasks []Task) {
	n.status, n.err = localOptimizer{}.run(n, n.GradStopThreshold, operation, result, tasks)
	close(operation)
}

func (n *Newton) initLocal(loc *Location) (Operation, error) {
	if n.Increase == 0 {
		n.Increase = 5
	}
	if n.Increase <= 1 {
		panic("optimize: Newton.Increase must be greater than 1")
	}
	if n.Linesearcher == nil {
		n.Linesearcher = &Bisection{}
	}
	if n.ls == nil {
		n.ls = &LinesearchMethod{}
	}
	n.ls.Linesearcher = n.Linesearcher
	n.ls.NextDirectioner = n
	return n.ls.Init(loc)
}
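
// iterateLocal performs a single optimization iteration by delegating to the
// underlying line-search method.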
func (n *Newton) iterateLocal(loc *Location) (Operation, error) {
	return n.ls.Iterate(loc)
}
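
// InitDirection allocates storage for a copy of the Hessian, resets the
// regularization parameter tau, stores the initial descent direction in dir
// and returns the initial step size for the line search.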
func (n *Newton) InitDirection(loc *Location, dir []float64) (stepSize float64) {
	dim := len(loc.X)
	n.hess = resizeSymDense(n.hess, dim)
	n.tau = 0
	return n.NextDirection(loc, dir)
}
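
// NextDirection stores in dir the descent direction obtained by solving the
// (possibly regularized) Newton system H_k d_k = -∇f_k, and returns the step
// size at which the line search should start.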
func (n *Newton) NextDirection(loc *Location, dir []float64) (stepSize float64) {
	// This method implements Algorithm 3.3 (Cholesky with Added Multiple of
	// the Identity) from Nocedal, Wright (2006), 2nd edition.

	dim := len(loc.X)
	d := mat.NewVecDense(dim, dir)
	grad := mat.NewVecDense(dim, loc.Gradient)
	n.hess.CopySym(loc.Hessian)

	// Find the smallest diagonal entry of the Hessian.
	minA := n.hess.At(0, 0)
	for i := 1; i < dim; i++ {
		a := n.hess.At(i, i)
		if a < minA {
			minA = a
		}
	}
	// If the smallest diagonal entry is positive, the Hessian may be positive
	// definite, and so first attempt to apply the Cholesky factorization to
	// the un-modified Hessian. If the smallest entry is negative, use the
	// final tau from the last iteration if regularization was needed,
	// otherwise guess an appropriate value for tau.
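	// The constant 0.001 below plays the role of the parameter beta in
	// Algorithm 3.3, keeping the initial and successive shifts away from zero.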
	if minA > 0 {
		n.tau = 0
	} else if n.tau == 0 {
		n.tau = -minA + 0.001
	}
	for k := 0; k < maxNewtonModifications; k++ {
		if n.tau != 0 {
			// Add a multiple of identity to the Hessian.
			for i := 0; i < dim; i++ {
				n.hess.SetSym(i, i, loc.Hessian.At(i, i)+n.tau)
			}
		}
		// Try to apply the Cholesky factorization.
		pd := n.chol.Factorize(n.hess)
		if pd {
			// Store the solution in d's backing array, dir.
			n.chol.SolveVec(d, grad)
			d.ScaleVec(-1, d)
			return 1
		}
		// Modified Hessian is not PD, so increase tau.
		n.tau = math.Max(n.Increase*n.tau, 0.001)
	}

	// Hessian modification failed to get a PD matrix. Return the negative
	// gradient as the descent direction.
	d.ScaleVec(-1, grad)
	return 1
}
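
// Needs reports that Newton requires both the gradient and the Hessian of
// the objective function to be evaluated.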
func (n *Newton) Needs() struct {
	Gradient bool
	Hessian  bool
} {
	return struct {
		Gradient bool
		Hessian  bool
	}{true, true}
}
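
For illustration, a minimal sketch of driving Newton through optimize.Minimize (assuming the current gonum/optimize API; in particular, the Problem.Grad and Problem.Hess signatures have shifted slightly between versions). Starting the Rosenbrock function at (0, 1), where the Hessian is indefinite, exercises the regularization loop in NextDirection:

package main

import (
	"fmt"
	"log"

	"gonum.org/v1/gonum/mat"
	"gonum.org/v1/gonum/optimize"
)

func main() {
	// The two-dimensional Rosenbrock function, minimized at (1, 1).
	p := optimize.Problem{
		Func: func(x []float64) float64 {
			a, b := 1-x[0], x[1]-x[0]*x[0]
			return a*a + 100*b*b
		},
		Grad: func(grad, x []float64) {
			grad[0] = -2*(1-x[0]) - 400*x[0]*(x[1]-x[0]*x[0])
			grad[1] = 200 * (x[1] - x[0]*x[0])
		},
		Hess: func(hess *mat.SymDense, x []float64) {
			hess.SetSym(0, 0, 2-400*x[1]+1200*x[0]*x[0])
			hess.SetSym(0, 1, -400*x[0])
			hess.SetSym(1, 1, 200)
		},
	}

	// At (0, 1) the Hessian is [[-398, 0], [0, 200]]: indefinite, so the
	// first NextDirection calls go through the tau-modification loop.
	result, err := optimize.Minimize(p, []float64{0, 1}, nil, &optimize.Newton{})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(result.X, result.Status)
}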