diff --git a/src/OptimKit.jl b/src/OptimKit.jl
index a03c2f3..09b1f3e 100644
--- a/src/OptimKit.jl
+++ b/src/OptimKit.jl
@@ -12,9 +12,19 @@ _add!(vdst, vsrc, α) = LinearAlgebra.axpy!(α, vsrc, vdst)
 _precondition(x, g) = g
 _finalize!(x, f, g, numiter) = x, f, g
 
-abstract type OptimizationAlgorithm
+# print error message and return eps(x) if x is negative
+function safe_sqrt(x::Real)
+    return if x >= 0
+        sqrt(x)
+    else
+        ϵ = eps(typeof(x))
+        -x < ϵ^(3 / 4) || @error "sqrt of negative number: $x"
+        ϵ
+    end
 end
+abstract type OptimizationAlgorithm
+end
 
 const _xlast = Ref{Any}()
 const _glast = Ref{Any}()
 const _dlast = Ref{Any}()
diff --git a/src/cg.jl b/src/cg.jl
index 2aa8c3f..ef6bcd9 100644
--- a/src/cg.jl
+++ b/src/cg.jl
@@ -24,13 +24,13 @@ function optimize(fg, x, alg::ConjugateGradient;
     f, g = fg(x)
     numfg = 1
     innergg = inner(x, g, g)
-    normgrad = sqrt(innergg)
+    normgrad = safe_sqrt(innergg)
     fhistory = [f]
     normgradhistory = [normgrad]
 
     # compute here once to define initial value of α in scale-invariant way
     Pg = precondition(x, g)
-    normPg = sqrt(inner(x, Pg, Pg))
+    normPg = safe_sqrt(inner(x, Pg, Pg))
     α = 1/(normPg) # initial guess: scale invariant
     # α = one(normgrad)
 
@@ -71,7 +71,7 @@ function optimize(fg, x, alg::ConjugateGradient;
         numiter += 1
         x, f, g = finalize!(x, f, g, numiter)
         innergg = inner(x, g, g)
-        normgrad = sqrt(innergg)
+        normgrad = safe_sqrt(innergg)
         push!(fhistory, f)
         push!(normgradhistory, normgrad)
 
diff --git a/src/gd.jl b/src/gd.jl
index eefd265..3bd4ec2 100644
--- a/src/gd.jl
+++ b/src/gd.jl
@@ -19,13 +19,13 @@ function optimize(fg, x, alg::GradientDescent;
     f, g = fg(x)
     numfg = 1
     innergg = inner(x, g, g)
-    normgrad = sqrt(innergg)
+    normgrad = safe_sqrt(innergg)
     fhistory = [f]
     normgradhistory = [normgrad]
 
     # compute here once to define initial value of α in scale-invariant way
     Pg = precondition(x, g)
-    normPg = sqrt(inner(x, Pg, Pg))
+    normPg = safe_sqrt(inner(x, Pg, Pg))
     α = 1/(normPg) # initial guess: scale invariant
 
     numiter = 0
@@ -46,7 +46,7 @@ function optimize(fg, x, alg::GradientDescent;
         numiter += 1
         x, f, g = finalize!(x, f, g, numiter)
         innergg = inner(x, g, g)
-        normgrad = sqrt(innergg)
+        normgrad = safe_sqrt(innergg)
         push!(fhistory, f)
         push!(normgradhistory, normgrad)
 
diff --git a/src/lbfgs.jl b/src/lbfgs.jl
index 0c6d377..cc97367 100644
--- a/src/lbfgs.jl
+++ b/src/lbfgs.jl
@@ -21,7 +21,7 @@ function optimize(fg, x, alg::LBFGS;
     f, g = fg(x)
     numfg = 1
     innergg = inner(x, g, g)
-    normgrad = sqrt(innergg)
+    normgrad = safe_sqrt(innergg)
     fhistory = [f]
     normgradhistory = [normgrad]
 
@@ -43,7 +43,7 @@ function optimize(fg, x, alg::LBFGS;
             η = scale!(Hg, -1)
         else
             Pg = precondition(x, deepcopy(g))
-            normPg = sqrt(inner(x, Pg, Pg))
+            normPg = safe_sqrt(inner(x, Pg, Pg))
             η = scale!(Pg, -1/normPg) # initial guess: scale invariant
         end
 
@@ -64,7 +64,7 @@ function optimize(fg, x, alg::LBFGS;
         numiter += 1
         x, f, g = finalize!(x, f, g, numiter)
         innergg = inner(x, g, g)
-        normgrad = sqrt(innergg)
+        normgrad = safe_sqrt(innergg)
         push!(fhistory, f)
         push!(normgradhistory, normgrad)
 
@@ -95,8 +95,8 @@ function optimize(fg, x, alg::LBFGS;
         # define new isometric transport such that, applying it to transported ηprev,
         # it returns a vector proportional to ξ but with the norm of ηprev
         # still has norm normη because transport is isometric
-        normη = sqrt(inner(x, ηprev, ηprev))
-        normξ = sqrt(inner(x, ξ, ξ))
+        normη = safe_sqrt(inner(x, ηprev, ηprev))
+        normξ = safe_sqrt(inner(x, ξ, ξ))
         β = normη/normξ
         if !(inner(x, ξ, ηprev) ≈ normξ * normη) # ξ and η are not parallel
             ξ₁ = ηprev
@@ -131,7 +131,7 @@ function optimize(fg, x, alg::LBFGS;
         innerss = inner(x, s, s)
 
         if innersy/innerss > normgrad/10000
-            norms = sqrt(innerss)
+            norms = safe_sqrt(innerss)
             ρ = innerss/innersy
             push!(H, (scale!(s, 1/norms), scale!(y, 1/norms), ρ))
         end