mirror of
https://github.com/gonum/gonum.git
synced 2025-10-24 15:43:07 +08:00
314 lines
10 KiB
Go
314 lines
10 KiB
Go
package optimize
|
|
|
|
import (
|
|
"math"
|
|
|
|
"gonum.org/v1/gonum/floats"
|
|
)
|
|
|
|
const (
|
|
iterationRestartFactor = 6
|
|
angleRestartThreshold = -0.9
|
|
)
|
|
|
|
// CGVariant calculates the scaling parameter, β, used for updating the
|
|
// conjugate direction in the nonlinear conjugate gradient (CG) method.
|
|
type CGVariant interface {
|
|
// Init is called at the first iteration and provides a way to initialize
|
|
// any internal state.
|
|
Init(loc *Location)
|
|
// Beta returns the value of the scaling parameter that is computed
|
|
// according to the particular variant of the CG method.
|
|
Beta(grad, gradPrev, dirPrev []float64) float64
|
|
}
|
|
|
|
// CG implements the nonlinear conjugate gradient method for solving nonlinear
|
|
// unconstrained optimization problems. It is a line search method that
|
|
// generates the search directions d_k according to the formula
|
|
// d_{k+1} = -∇f_{k+1} + β_k*d_k, d_0 = -∇f_0.
|
|
// Variants of the conjugate gradient method differ in the choice of the
|
|
// parameter β_k. The conjugate gradient method usually requires fewer function
|
|
// evaluations than the gradient descent method and no matrix storage, but
|
|
// L-BFGS is usually more efficient.
|
|
//
|
|
// CG implements a restart strategy that takes the steepest descent direction
|
|
// (i.e., d_{k+1} = -∇f_{k+1}) whenever any of the following conditions holds:
|
|
//
|
|
// - A certain number of iterations has elapsed without a restart. This number
|
|
// is controllable via IterationRestartFactor and if equal to 0, it is set to
|
|
// a reasonable default based on the problem dimension.
|
|
// - The angle between the gradients at two consecutive iterations ∇f_k and
|
|
// ∇f_{k+1} is too large.
|
|
// - The direction d_{k+1} is not a descent direction.
|
|
// - β_k returned from CGVariant.Beta is equal to zero.
|
|
//
|
|
// The line search for CG must yield step sizes that satisfy the strong Wolfe
|
|
// conditions at every iteration, otherwise the generated search direction
|
|
// might fail to be a descent direction. The line search should be more
|
|
// stringent compared with those for Newton-like methods, which can be achieved
|
|
// by setting the gradient constant in the strong Wolfe conditions to a small
|
|
// value.
|
|
//
|
|
// See also William Hager, Hongchao Zhang, A survey of nonlinear conjugate
|
|
// gradient methods. Pacific Journal of Optimization, 2 (2006), pp. 35-58, and
|
|
// references therein.
|
|
type CG struct {
|
|
// Linesearcher must satisfy the strong Wolfe conditions at every iteration.
|
|
// If Linesearcher == nil, an appropriate default is chosen.
|
|
Linesearcher Linesearcher
|
|
// Variant implements the particular CG formula for computing β_k.
|
|
// If Variant is nil, an appropriate default is chosen.
|
|
Variant CGVariant
|
|
// InitialStep estimates the initial line search step size, because the CG
|
|
// method does not generate well-scaled search directions.
|
|
// If InitialStep is nil, an appropriate default is chosen.
|
|
InitialStep StepSizer
|
|
|
|
// IterationRestartFactor determines the frequency of restarts based on the
|
|
// problem dimension. The negative gradient direction is taken whenever
|
|
// ceil(IterationRestartFactor*(problem dimension)) iterations have elapsed
|
|
// without a restart. For medium and large-scale problems
|
|
// IterationRestartFactor should be set to 1, low-dimensional problems a
|
|
// larger value should be chosen. Note that if the ceil function returns 1,
|
|
// CG will be identical to gradient descent.
|
|
// If IterationRestartFactor is 0, it will be set to 6.
|
|
// CG will panic if IterationRestartFactor is negative.
|
|
IterationRestartFactor float64
|
|
// AngleRestartThreshold sets the threshold angle for restart. The method
|
|
// is restarted if the cosine of the angle between two consecutive
|
|
// gradients is smaller than or equal to AngleRestartThreshold, that is, if
|
|
// ∇f_k·∇f_{k+1} / (|∇f_k| |∇f_{k+1}|) <= AngleRestartThreshold.
|
|
// A value of AngleRestartThreshold closer to -1 (successive gradients in
|
|
// exact opposite directions) will tend to reduce the number of restarts.
|
|
// If AngleRestartThreshold is 0, it will be set to -0.9.
|
|
// CG will panic if AngleRestartThreshold is not in the interval [-1, 0].
|
|
AngleRestartThreshold float64
|
|
|
|
ls *LinesearchMethod
|
|
|
|
restartAfter int
|
|
iterFromRestart int
|
|
|
|
dirPrev []float64
|
|
gradPrev []float64
|
|
gradPrevNorm float64
|
|
}
|
|
|
|
func (cg *CG) Init(loc *Location) (Operation, error) {
|
|
if cg.IterationRestartFactor < 0 {
|
|
panic("cg: IterationRestartFactor is negative")
|
|
}
|
|
if cg.AngleRestartThreshold < -1 || cg.AngleRestartThreshold > 0 {
|
|
panic("cg: AngleRestartThreshold not in [-1, 0]")
|
|
}
|
|
|
|
if cg.Linesearcher == nil {
|
|
cg.Linesearcher = &MoreThuente{CurvatureFactor: 0.1}
|
|
}
|
|
if cg.Variant == nil {
|
|
cg.Variant = &HestenesStiefel{}
|
|
}
|
|
if cg.InitialStep == nil {
|
|
cg.InitialStep = &FirstOrderStepSize{}
|
|
}
|
|
|
|
if cg.IterationRestartFactor == 0 {
|
|
cg.IterationRestartFactor = iterationRestartFactor
|
|
}
|
|
if cg.AngleRestartThreshold == 0 {
|
|
cg.AngleRestartThreshold = angleRestartThreshold
|
|
}
|
|
|
|
if cg.ls == nil {
|
|
cg.ls = &LinesearchMethod{}
|
|
}
|
|
cg.ls.Linesearcher = cg.Linesearcher
|
|
cg.ls.NextDirectioner = cg
|
|
|
|
return cg.ls.Init(loc)
|
|
}
|
|
|
|
func (cg *CG) Iterate(loc *Location) (Operation, error) {
|
|
return cg.ls.Iterate(loc)
|
|
}
|
|
|
|
func (cg *CG) InitDirection(loc *Location, dir []float64) (stepSize float64) {
|
|
dim := len(loc.X)
|
|
|
|
cg.restartAfter = int(math.Ceil(cg.IterationRestartFactor * float64(dim)))
|
|
cg.iterFromRestart = 0
|
|
|
|
// The initial direction is always the negative gradient.
|
|
copy(dir, loc.Gradient)
|
|
floats.Scale(-1, dir)
|
|
|
|
cg.dirPrev = resize(cg.dirPrev, dim)
|
|
copy(cg.dirPrev, dir)
|
|
cg.gradPrev = resize(cg.gradPrev, dim)
|
|
copy(cg.gradPrev, loc.Gradient)
|
|
cg.gradPrevNorm = floats.Norm(loc.Gradient, 2)
|
|
|
|
cg.Variant.Init(loc)
|
|
return cg.InitialStep.Init(loc, dir)
|
|
}
|
|
|
|
func (cg *CG) NextDirection(loc *Location, dir []float64) (stepSize float64) {
|
|
copy(dir, loc.Gradient)
|
|
floats.Scale(-1, dir)
|
|
|
|
cg.iterFromRestart++
|
|
var restart bool
|
|
if cg.iterFromRestart == cg.restartAfter {
|
|
// Restart because too many iterations have been taken without a restart.
|
|
restart = true
|
|
}
|
|
|
|
gDot := floats.Dot(loc.Gradient, cg.gradPrev)
|
|
gNorm := floats.Norm(loc.Gradient, 2)
|
|
if gDot <= cg.AngleRestartThreshold*gNorm*cg.gradPrevNorm {
|
|
// Restart because the angle between the last two gradients is too large.
|
|
restart = true
|
|
}
|
|
|
|
// Compute the scaling factor β_k even when restarting, because cg.Variant
|
|
// may be keeping an inner state that needs to be updated at every iteration.
|
|
beta := cg.Variant.Beta(loc.Gradient, cg.gradPrev, cg.dirPrev)
|
|
if beta == 0 {
|
|
// β_k == 0 means that the steepest descent direction will be taken, so
|
|
// indicate that the method is in fact being restarted.
|
|
restart = true
|
|
}
|
|
if !restart {
|
|
// The method is not being restarted, so update the descent direction.
|
|
floats.AddScaled(dir, beta, cg.dirPrev)
|
|
if floats.Dot(loc.Gradient, dir) >= 0 {
|
|
// Restart because the new direction is not a descent direction.
|
|
restart = true
|
|
copy(dir, loc.Gradient)
|
|
floats.Scale(-1, dir)
|
|
}
|
|
}
|
|
|
|
// Get the initial line search step size from the StepSizer even if the
|
|
// method was restarted, because StepSizers need to see every iteration.
|
|
stepSize = cg.InitialStep.StepSize(loc, dir)
|
|
if restart {
|
|
// The method was restarted and since the steepest descent direction is
|
|
// not related to the previous direction, discard the estimated step
|
|
// size from cg.InitialStep and use step size of 1 instead.
|
|
stepSize = 1
|
|
// Reset to 0 the counter of iterations taken since the last restart.
|
|
cg.iterFromRestart = 0
|
|
}
|
|
|
|
copy(cg.gradPrev, loc.Gradient)
|
|
copy(cg.dirPrev, dir)
|
|
cg.gradPrevNorm = gNorm
|
|
return stepSize
|
|
}
|
|
|
|
func (*CG) Needs() struct {
|
|
Gradient bool
|
|
Hessian bool
|
|
} {
|
|
return struct {
|
|
Gradient bool
|
|
Hessian bool
|
|
}{true, false}
|
|
}
|
|
|
|
// FletcherReeves implements the Fletcher-Reeves variant of the CG method that
|
|
// computes the scaling parameter β_k according to the formula
|
|
// β_k = |∇f_{k+1}|^2 / |∇f_k|^2.
|
|
type FletcherReeves struct {
|
|
prevNorm float64
|
|
}
|
|
|
|
func (fr *FletcherReeves) Init(loc *Location) {
|
|
fr.prevNorm = floats.Norm(loc.Gradient, 2)
|
|
}
|
|
|
|
func (fr *FletcherReeves) Beta(grad, _, _ []float64) (beta float64) {
|
|
norm := floats.Norm(grad, 2)
|
|
beta = (norm / fr.prevNorm) * (norm / fr.prevNorm)
|
|
fr.prevNorm = norm
|
|
return beta
|
|
}
|
|
|
|
// PolakRibierePolyak implements the Polak-Ribiere-Polyak variant of the CG
|
|
// method that computes the scaling parameter β_k according to the formula
|
|
// β_k = max(0, ∇f_{k+1}·y_k / |∇f_k|^2),
|
|
// where y_k = ∇f_{k+1} - ∇f_k.
|
|
type PolakRibierePolyak struct {
|
|
prevNorm float64
|
|
}
|
|
|
|
func (pr *PolakRibierePolyak) Init(loc *Location) {
|
|
pr.prevNorm = floats.Norm(loc.Gradient, 2)
|
|
}
|
|
|
|
func (pr *PolakRibierePolyak) Beta(grad, gradPrev, _ []float64) (beta float64) {
|
|
norm := floats.Norm(grad, 2)
|
|
dot := floats.Dot(grad, gradPrev)
|
|
beta = (norm*norm - dot) / (pr.prevNorm * pr.prevNorm)
|
|
pr.prevNorm = norm
|
|
return math.Max(0, beta)
|
|
}
|
|
|
|
// HestenesStiefel implements the Hestenes-Stiefel variant of the CG method
|
|
// that computes the scaling parameter β_k according to the formula
|
|
// β_k = max(0, ∇f_{k+1}·y_k / d_k·y_k),
|
|
// where y_k = ∇f_{k+1} - ∇f_k.
|
|
type HestenesStiefel struct {
|
|
y []float64
|
|
}
|
|
|
|
func (hs *HestenesStiefel) Init(loc *Location) {
|
|
hs.y = resize(hs.y, len(loc.Gradient))
|
|
}
|
|
|
|
func (hs *HestenesStiefel) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
|
|
floats.SubTo(hs.y, grad, gradPrev)
|
|
beta = floats.Dot(grad, hs.y) / floats.Dot(dirPrev, hs.y)
|
|
return math.Max(0, beta)
|
|
}
|
|
|
|
// DaiYuan implements the Dai-Yuan variant of the CG method that computes the
|
|
// scaling parameter β_k according to the formula
|
|
// β_k = |∇f_{k+1}|^2 / d_k·y_k,
|
|
// where y_k = ∇f_{k+1} - ∇f_k.
|
|
type DaiYuan struct {
|
|
y []float64
|
|
}
|
|
|
|
func (dy *DaiYuan) Init(loc *Location) {
|
|
dy.y = resize(dy.y, len(loc.Gradient))
|
|
}
|
|
|
|
func (dy *DaiYuan) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
|
|
floats.SubTo(dy.y, grad, gradPrev)
|
|
norm := floats.Norm(grad, 2)
|
|
return norm * norm / floats.Dot(dirPrev, dy.y)
|
|
}
|
|
|
|
// HagerZhang implements the Hager-Zhang variant of the CG method that computes the
|
|
// scaling parameter β_k according to the formula
|
|
// β_k = (y_k - 2 d_k |y_k|^2/(d_k·y_k))·∇f_{k+1} / (d_k·y_k),
|
|
// where y_k = ∇f_{k+1} - ∇f_k.
|
|
type HagerZhang struct {
|
|
y []float64
|
|
}
|
|
|
|
func (hz *HagerZhang) Init(loc *Location) {
|
|
hz.y = resize(hz.y, len(loc.Gradient))
|
|
}
|
|
|
|
func (hz *HagerZhang) Beta(grad, gradPrev, dirPrev []float64) (beta float64) {
|
|
floats.SubTo(hz.y, grad, gradPrev)
|
|
dirDotY := floats.Dot(dirPrev, hz.y)
|
|
gDotY := floats.Dot(grad, hz.y)
|
|
gDotDir := floats.Dot(grad, dirPrev)
|
|
yNorm := floats.Norm(hz.y, 2)
|
|
return (gDotY - 2*gDotDir*yNorm*yNorm/dirDotY) / dirDotY
|
|
}
|