diff --git a/docs/Project.toml b/docs/Project.toml
index f96a0980e9f..539215aa818 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -9,6 +9,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
 Dualization = "191a621a-6537-11e9-281d-650236a99e60"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
@@ -46,6 +47,7 @@ Distributions = "0.25"
 Documenter = "=1.2.1"
 DocumenterCitations = "1"
 Dualization = "0.5"
+Enzyme = "0.11.19"
 GLPK = "=1.1.3"
 HTTP = "1.5.4"
 HiGHS = "=1.8.0"
diff --git a/docs/make.jl b/docs/make.jl
index 17c482e6b98..630ef0f358c 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -330,6 +330,7 @@ const _PAGES = [
         "tutorials/nonlinear/querying_hessians.md",
         "tutorials/nonlinear/complementarity.md",
         "tutorials/nonlinear/classifiers.md",
+        "tutorials/nonlinear/operator_ad.md",
     ],
     "Conic programs" => [
         "tutorials/conic/introduction.md",
diff --git a/docs/src/manual/nonlinear.md b/docs/src/manual/nonlinear.md
index 76dcdee5b78..51832d2be9e 100644
--- a/docs/src/manual/nonlinear.md
+++ b/docs/src/manual/nonlinear.md
@@ -551,11 +551,16 @@ log(f(x))
 ### Gradients and Hessians

 By default, JuMP will use automatic differentiation to compute the gradient and
-Hessian of user-defined operators. If your function is not amenable to
-automatic differentiation, or you can compute analytic derivatives, you may pass
-additional arguments to [`@operator`](@ref) to compute the first- and
+Hessian of user-defined operators. If your function is not amenable to the
+default automatic differentiation, or you can compute analytic derivatives, you
+may pass additional arguments to [`@operator`](@ref) to compute the first- and
 second-derivatives.

+!!! tip
+    The tutorial [Automatic differentiation of user-defined operators](@ref)
+    has examples of how to use third-party Julia packages to compute automatic
+    derivatives.
+
 #### Univariate functions

 For univariate functions, a gradient function `∇f` returns a number that
diff --git a/docs/src/tutorials/nonlinear/operator_ad.jl b/docs/src/tutorials/nonlinear/operator_ad.jl
new file mode 100644
index 00000000000..955805515dc
--- /dev/null
+++ b/docs/src/tutorials/nonlinear/operator_ad.jl
@@ -0,0 +1,322 @@
# Copyright (c) 2024 Oscar Dowson and contributors #src
# #src
# Permission is hereby granted, free of charge, to any person obtaining a copy #src
# of this software and associated documentation files (the "Software"), to deal #src
# in the Software without restriction, including without limitation the rights #src
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell #src
# copies of the Software, and to permit persons to whom the Software is #src
# furnished to do so, subject to the following conditions: #src
# #src
# The above copyright notice and this permission notice shall be included in all #src
# copies or substantial portions of the Software. #src
# #src
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR #src
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, #src
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE #src +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER #src +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, #src +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #src +# SOFTWARE. #src + +# # Automatic differentiation of user-defined operators + +# The purpose of this tutorial is to demonstrate how to apply automatic +# differentiation to [User-defined operators](@ref jump_user_defined_operators). + +# !!! tip +# This tutorial is for advanced users. As an alternative, consider using +# [Function tracing](@ref) instead of creating an operator, and if an +# operator is necessary, consider using the default of `@operator(model, op_f, N, f)` +# instead of passing explicit [Gradients and Hessians](@ref). + +# This tutorial uses the following packages: + +using JuMP +import Enzyme +import ForwardDiff +import Ipopt +import Test + +# As a simple example, we consider the Rosenbrock function: + +f(x::T...) where {T} = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2 + +# Here's the value at a random point: + +x = rand(2) + +#- + +f(x...) + +# ## Analytic derivative + +# If expressions are simple enough, you can provide analytic functions for the +# gradient and Hessian. + +# ### Gradient + +# The Rosenbrock function has the gradient vector: + +function analytic_∇f(g::AbstractVector, x...) + g[1] = 400 * x[1]^3 - 400 * x[1] * x[2] + 2 * x[1] - 2 + g[2] = 200 * (x[2] - x[1]^2) + return +end + +# Let's evaluate it at the same vector `x`: + +analytic_g = zeros(2) +analytic_∇f(analytic_g, x...) +analytic_g + +# ### Hessian + +# The Hessian matrix is: + +function analytic_∇²f(H::AbstractMatrix, x...) + H[1, 1] = 1200 * x[1]^2 - 400 * x[2] + 2 + ## H[1, 2] = -400 * x[1] <-- not needed because Hessian is symmetric + H[2, 1] = -400 * x[1] + H[2, 2] = 200.0 + return +end + +# Note that because the Hessian is symmetric, JuMP requires that we fill in only +# the lower triangle. + +analytic_H = zeros(2, 2) +analytic_∇²f(analytic_H, x...) +analytic_H + +# ### JuMP example + +# Putting our analytic functions together, we get: + +function analytic_rosenbrock() + model = Model(Ipopt.Optimizer) + set_silent(model) + @variable(model, x[1:2]) + @operator(model, op_rosenbrock, 2, f, analytic_∇f, analytic_∇²f) + @objective(model, Min, op_rosenbrock(x[1], x[2])) + optimize!(model) + Test.@test is_solved_and_feasible(model) + return value.(x) +end + +analytic_rosenbrock() + +# ## ForwardDiff + +# Instead of analytic functions, you can use [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) +# to compute derivatives. + +# !!! info +# If you do not specify a gradient or Hessian, JuMP will use ForwardDiff.jl +# to compute derivatives by default. We provide this section as a worked +# example of what is going on under the hood. + +# ### Pros and cons + +# The main benefit of ForwardDiff is that it is simple, robust, and works with a +# broad range of Julia syntax. + +# The main downside is that `f` must be a function that accepts arguments of +# `x::Real...`. See [Common mistakes when writing a user-defined operator](@ref) +# for more details. + +# ### Gradient + +# The gradient can be computed using `ForwardDiff.gradient!`. Note that +# ForwardDiff expects a single `Vector{T}` argument, but we receive `x` as a +# tuple, so we need `y -> f(y...)` and `collect(x)` to get things in the right +# format. 
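
# Before wrapping this in the signature that JuMP requires, it may help to see
# the pattern in isolation. The following is a small illustrative check (the
# name `fdiff_direct_g` is used only here): it calls `ForwardDiff.gradient`
# directly on the closure `y -> f(y...)` at the point `collect(x)`, and
# compares the result with the analytic gradient computed above.

fdiff_direct_g = ForwardDiff.gradient(y -> f(y...), collect(x))
Test.@test ≈(analytic_g, fdiff_direct_g)

# The wrapper below packages the same pattern into the `(g, x...)` form that
# JuMP expects for a gradient callback.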

function fdiff_∇f(g::AbstractVector{T}, x::Vararg{T,N}) where {T,N}
    ForwardDiff.gradient!(g, y -> f(y...), collect(x))
    return
end

# Let's check that we find the analytic solution:

fdiff_g = zeros(2)
fdiff_∇f(fdiff_g, x...)
Test.@test ≈(analytic_g, fdiff_g)

# ### Hessian

# The Hessian is a bit more complicated, but the code to implement it is:

function fdiff_∇²f(H::AbstractMatrix{T}, x::Vararg{T,N}) where {T,N}
    h = ForwardDiff.hessian(y -> f(y...), collect(x))
    for i in 1:N, j in 1:i
        H[i, j] = h[i, j]
    end
    return
end

# Let's check that we find the analytic solution:

fdiff_H = zeros(2, 2)
fdiff_∇²f(fdiff_H, x...)
Test.@test ≈(analytic_H, fdiff_H)

# ### JuMP example

# The code for computing the gradient and Hessian using ForwardDiff can be
# re-used for many operators. Thus, it is helpful to encapsulate it in a
# function:

"""
    fdiff_derivatives(f::Function) -> Tuple{Function,Function}

Return a tuple of functions that evaluate the gradient and Hessian of `f` using
ForwardDiff.jl.
"""
function fdiff_derivatives(f::Function)
    function ∇f(g::AbstractVector{T}, x::Vararg{T,N}) where {T,N}
        ForwardDiff.gradient!(g, y -> f(y...), collect(x))
        return
    end
    function ∇²f(H::AbstractMatrix{T}, x::Vararg{T,N}) where {T,N}
        h = ForwardDiff.hessian(y -> f(y...), collect(x))
        for i in 1:N, j in 1:i
            H[i, j] = h[i, j]
        end
        return
    end
    return ∇f, ∇²f
end

# Here's an example using `fdiff_derivatives`:

function fdiff_rosenbrock()
    model = Model(Ipopt.Optimizer)
    set_silent(model)
    @variable(model, x[1:2])
    @operator(model, op_rosenbrock, 2, f, fdiff_derivatives(f)...)
    @objective(model, Min, op_rosenbrock(x[1], x[2]))
    optimize!(model)
    Test.@test is_solved_and_feasible(model)
    return value.(x)
end

fdiff_rosenbrock()

# ## Enzyme

# Another library for automatic differentiation in Julia is [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl).

# ### Pros and cons

# The main benefit of Enzyme is that it can produce fast derivatives for
# functions with many input arguments.

# The main downsides are that it may throw unusual errors if your code uses an
# unsupported feature of Julia, and that it may have large compile times.

# !!! warning
#     The JuMP developers cannot help you debug error messages related to
#     Enzyme. If the operator works, it works. If not, we suggest you try a
#     different automatic differentiation library. See [juliadiff.org](https://juliadiff.org/)
#     for details.

# ### Gradient

# The gradient can be computed using `Enzyme.autodiff` with the
# `Enzyme.Reverse` mode. We need to wrap `x` in `Enzyme.Active` to indicate that
# we want to compute the derivatives with respect to these arguments.

function enzyme_∇f(g::AbstractVector{T}, x::Vararg{T,N}) where {T,N}
    g .= Enzyme.autodiff(Enzyme.Reverse, f, Enzyme.Active.(x)...)[1]
    return
end

# Let's check that we find the analytic solution:

enzyme_g = zeros(2)
enzyme_∇f(enzyme_g, x...)
Test.@test ≈(analytic_g, enzyme_g)

# ### Hessian

# We can compute the Hessian in Enzyme using forward-over-reverse automatic
# differentiation.

# The code to implement the Hessian in Enzyme is complicated, so we will not
# explain it in detail; see the [Enzyme documentation](https://enzymead.github.io/Enzyme.jl/v0.11.20/generated/autodiff/#Vector-forward-over-reverse).
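
# Before reading that code, it may help to see the idea behind
# "forward-over-reverse" in a simpler setting: the `j`-th column of the Hessian
# is the directional derivative of the gradient in the `j`-th coordinate
# direction. Here is a minimal sketch of that identity, using ForwardDiff for
# both layers instead of Enzyme; the names `hessian_column` and `seed` are
# illustrative only and are not used in the Enzyme code that follows.

function hessian_column(j)
    seed = [i == j ? 1.0 : 0.0 for i in 1:2]
    ∇f_vec(y) = ForwardDiff.gradient(z -> f(z...), y)
    return ForwardDiff.derivative(t -> ∇f_vec(collect(x) .+ t .* seed), 0.0)
end

## Only the first column of `analytic_H` is fully populated (we filled in the
## lower triangle), so compare against that column:
Test.@test ≈(hessian_column(1), analytic_H[:, 1])

# The Enzyme implementation below applies the same idea, but uses
# `Enzyme.BatchDuplicated` to seed every coordinate direction at once, so that
# a single forward-mode pass over the reverse-mode gradient returns all of the
# Hessian columns.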

function enzyme_∇²f(H::AbstractMatrix{T}, x::Vararg{T,N}) where {T,N}
    ## direction(i) returns a tuple with a `1` in the `i`'th entry and `0`
    ## otherwise.
    direction(i) = ntuple(j -> Enzyme.Active(T(i == j)), N)
    ## As the inner function, compute the gradient using Reverse mode.
    ∇f_deferred(x...) = Enzyme.autodiff_deferred(Enzyme.Reverse, f, x...)[1]
    ## For the outer autodiff, use Forward mode.
    hess = Enzyme.autodiff(
        Enzyme.Forward,
        ∇f_deferred,
        ## Compute multiple evaluations of Forward mode, each time using `x` but
        ## initializing with a different direction.
        Enzyme.BatchDuplicated.(Enzyme.Active.(x), ntuple(direction, N))...,
    )[1]
    ## Unpack Enzyme's `hess` data structure into the matrix `H` expected by
    ## JuMP.
    for j in 1:N, i in 1:j
        H[j, i] = hess[j][i]
    end
    return
end

# Let's check that we find the analytic solution:

enzyme_H = zeros(2, 2)
enzyme_∇²f(enzyme_H, x...)
Test.@test ≈(analytic_H, enzyme_H)

# ### JuMP example

# The code for computing the gradient and Hessian using Enzyme can be re-used
# for many operators. Thus, it is helpful to encapsulate it in a function:

"""
    enzyme_derivatives(f::Function) -> Tuple{Function,Function}

Return a tuple of functions that evaluate the gradient and Hessian of `f` using
Enzyme.jl.
"""
function enzyme_derivatives(f::Function)
    function ∇f(g::AbstractVector{T}, x::Vararg{T,N}) where {T,N}
        g .= Enzyme.autodiff(Enzyme.Reverse, f, Enzyme.Active.(x)...)[1]
        return
    end
    function ∇²f(H::AbstractMatrix{T}, x::Vararg{T,N}) where {T,N}
        direction(i) = ntuple(j -> Enzyme.Active(T(i == j)), N)
        ∇f_deferred(x...) = Enzyme.autodiff_deferred(Enzyme.Reverse, f, x...)[1]
        hess = Enzyme.autodiff(
            Enzyme.Forward,
            ∇f_deferred,
            Enzyme.BatchDuplicated.(Enzyme.Active.(x), ntuple(direction, N))...,
        )[1]
        for j in 1:N, i in 1:j
            H[j, i] = hess[j][i]
        end
        return
    end
    return ∇f, ∇²f
end

# Here's an example using `enzyme_derivatives`:

function enzyme_rosenbrock()
    model = Model(Ipopt.Optimizer)
    set_silent(model)
    @variable(model, x[1:2])
    @operator(model, op_rosenbrock, 2, f, enzyme_derivatives(f)...)
    @objective(model, Min, op_rosenbrock(x[1], x[2]))
    optimize!(model)
    Test.@test is_solved_and_feasible(model)
    return value.(x)
end

enzyme_rosenbrock()
diff --git a/docs/styles/Vocab/JuMP-Vocab/accept.txt b/docs/styles/Vocab/JuMP-Vocab/accept.txt
index 9bc7f0dd666..d50aeb6e437 100644
--- a/docs/styles/Vocab/JuMP-Vocab/accept.txt
+++ b/docs/styles/Vocab/JuMP-Vocab/accept.txt
@@ -128,6 +128,7 @@ Dualization
 EAGO
 [Ee]mbotech
 [Ee]cos
+Enzyme
 Geomstats
 Geoopt
 GLPK
diff --git a/src/nlp_expr.jl b/src/nlp_expr.jl
index 3ef3072b131..b63f14bc878 100644
--- a/src/nlp_expr.jl
+++ b/src/nlp_expr.jl
@@ -957,7 +957,7 @@ function add_nonlinear_operator(
     return NonlinearOperator(f, name)
 end

-function _catch_redefinition_constant_error(op::Symbol, f::Function)
+function _catch_redefinition_constant_error(op::Symbol, f::Function, args...)
     if op == Symbol(f)
         error("""
             Unable to add the nonlinear operator `:$op` with the same name as