Merge pull request #26 from SciNim/differentiation
Showing 7 changed files with 1,010 additions and 61 deletions.
@@ -0,0 +1,199 @@
import std/strformat
import arraymancer

proc diff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T =
  ## Numerically calculate the derivative of f(x) at x0 using a step size h.
  ## Uses forward difference which has accuracy O(h).
  result = (f(x0 + h) - f(x0)) / h

proc diff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T =
  ## Numerically calculate the derivative of f(x) at x0 using a step size h.
  ## Uses backward difference which has accuracy O(h).
  result = (f(x0) - f(x0 - h)) / h

proc diff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T =
  ## Numerically calculate the derivative of f(x) at x0 using a step size h.
  ## Uses central difference which has accuracy O(h^2).
  result = (f(x0 + h) - f(x0 - h)) / (2*h)

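# --- Editor's usage sketch (not part of the original commit): the derivative
# of sin is cos, so the error of each scheme is easy to inspect. Expect the
# central difference to be markedly more accurate (O(h^2) vs O(h)).
# `fSin` and `dExact` are illustrative names.
when isMainModule:
  import std/math
  proc fSin(x: float): float = sin(x)
  let dExact = cos(1.0)
  echo "forward error:  ", abs(diff1dForward(fSin, 1.0) - dExact)
  echo "backward error: ", abs(diff1dBackward(fSin, 1.0) - dExact)
  echo "central error:  ", abs(diff1dCentral(fSin, 1.0) - dExact)
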
proc secondDiff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T =
  ## Numerically calculate the second derivative of f(x) at x0 using a step size h.
  ## Uses forward difference which has accuracy O(h).
  result = (f(x0 + 2*h) - 2*f(x0 + h) + f(x0)) / (h*h)

proc secondDiff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T =
  ## Numerically calculate the second derivative of f(x) at x0 using a step size h.
  ## Uses backward difference which has accuracy O(h).
  result = (f(x0) - 2*f(x0 - h) + f(x0 - 2*h)) / (h*h)

proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T =
  ## Numerically calculate the second derivative of f(x) at x0 using a step size h.
  ## Uses central difference which has accuracy O(h^2).
  result = (f(x0 + h) - 2*f(x0) + f(x0 - h)) / (h*h)

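# --- Editor's usage sketch (illustrative): the second derivative of exp at 0
# is exp(0) = 1. Because these stencils divide by h*h, the default h = 1e-6
# loses many significant digits to cancellation; a step near 1e-4 (roughly
# the fourth root of machine epsilon) usually works better here.
when isMainModule:
  import std/math
  proc fExp(x: float): float = exp(x)
  echo secondDiff1dCentral(fExp, 0.0, 1e-4) # ~1.0
  echo secondDiff1dForward(fExp, 0.0, 1e-4) # ~1.0, but only O(h) accurate
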
proc tensorGradient*[U; T: not Tensor](
  f: proc(x: Tensor[U]): T,
  x0: Tensor[U],
  h: U = U(1e-6),
  fastMode: bool = false
): Tensor[T] =
  ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h.
  ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative.
  ## When fastMode is true it will instead use the forward difference, which only uses one function evaluation per derivative but is less accurate.
  assert x0.rank == 1 # must be a 1d vector
  let f0 = f(x0) # reused by every forward difference when fastMode is true
  let xLen = x0.shape[0]
  result = newTensor[T](xLen)
  var x = x0.clone()
  for i in 0 ..< xLen:
    x[i] += h
    let fPlusH = f(x)
    if fastMode:
      x[i] -= h # restore to original
      result[i] = (fPlusH - f0) / h
    else:
      x[i] -= 2*h
      let fMinusH = f(x)
      x[i] += h # restore to original (± float error)
      result[i] = (fPlusH - fMinusH) / (2 * h)

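# --- Editor's usage sketch (illustrative): for f(x) = sum_i x_i^2 the exact
# gradient is 2*x, so both modes should print roughly [2, 4, 6].
# `sumSquares` and `xg` are illustrative names.
when isMainModule:
  proc sumSquares(x: Tensor[float]): float = sum(x *. x)
  let xg = [1.0, 2.0, 3.0].toTensor
  echo tensorGradient(sumSquares, xg)                  # central difference
  echo tensorGradient(sumSquares, xg, fastMode = true) # forward difference
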
proc tensorGradient*[U, T](
  f: proc(x: Tensor[U]): Tensor[T],
  x0: Tensor[U],
  h: U = U(1e-6),
  fastMode: bool = false
): Tensor[T] =
  ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h.
  ## Every column is the gradient of one component of f.
  ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative.
  ## When fastMode is true it will instead use the forward difference, which only uses one function evaluation per derivative but is less accurate.
  assert x0.rank == 1 # must be a 1d vector
  let f0 = f(x0) # sizes the output; also reused by the forward differences when fastMode is true
  assert f0.rank == 1
  let rows = x0.shape[0]
  let cols = f0.shape[0]
  result = newTensor[T](rows, cols)
  var x = x0.clone()
  for i in 0 ..< rows:
    x[i] += h
    let fPlusH = f(x)
    if fastMode:
      x[i] -= h # restore to original
      result[i, _] = ((fPlusH - f0) / h).reshape(1, cols)
    else:
      x[i] -= 2*h
      let fMinusH = f(x)
      x[i] += h # restore to original (± float error)
      result[i, _] = ((fPlusH - fMinusH) / (2 * h)).reshape(1, cols)

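# --- Editor's usage sketch (illustrative): f maps R^2 to R^2, so the result
# is a 2x2 tensor with result[i, j] = d f_j / d x_i (one column per output
# component). At (2, 3) this is [[3, 1], [2, 1]]. `fVec` is an illustrative name.
when isMainModule:
  proc fVec(x: Tensor[float]): Tensor[float] =
    [x[0] * x[1], x[0] + x[1]].toTensor
  echo tensorGradient(fVec, [2.0, 3.0].toTensor)
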
proc tensorJacobian*[U, T](
  f: proc(x: Tensor[U]): Tensor[T],
  x0: Tensor[U],
  h: U = U(1e-6),
  fastMode: bool = false
): Tensor[T] =
  ## Calculates the Jacobian of f(x) w.r.t vector x at x0 using step size h.
  ## Every row is the gradient of one component of f.
  ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative.
  ## When fastMode is true it will instead use the forward difference, which only uses one function evaluation per derivative but is less accurate.
  transpose(tensorGradient(f, x0, h, fastMode))

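# --- Editor's usage sketch (illustrative): reusing `fVec` from the sketch
# above, the Jacobian is the transpose of tensorGradient, i.e. one row per
# output component: [[3, 2], [1, 1]] at (2, 3).
when isMainModule:
  echo tensorJacobian(fVec, [2.0, 3.0].toTensor)
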
proc mixedDerivative*[U, T](f: proc(x: Tensor[U]): T, x0: var Tensor[U], indices: (int, int), h: U = U(1e-6)): T =
  ## Numerically calculate the mixed second derivative d^2 f / (dx_i dx_j) of f
  ## at x0 using a four-point central difference with step size h.
  ## x0 is mutated during evaluation but restored (± float error) before returning.
  result = 0
  let i = indices[0]
  let j = indices[1]
  # f(x+h, y+h)
  x0[i] += h
  x0[j] += h
  result += f(x0)

  # f(x+h, y-h)
  x0[j] -= 2*h
  result -= f(x0)

  # f(x-h, y-h)
  x0[i] -= 2*h
  result += f(x0)

  # f(x-h, y+h)
  x0[j] += 2*h
  result -= f(x0)

  # restore x0
  x0[i] += h
  x0[j] -= h

  result *= 1 / (4 * h*h)

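# --- Editor's usage sketch (illustrative): for f(x, y) = x^2 * y the mixed
# derivative d^2 f / (dx dy) is 2x, i.e. 4 at (2, 3). Note that x0 is a `var`
# parameter, so a mutable tensor must be passed. `fMixed` is an illustrative name.
when isMainModule:
  proc fMixed(x: Tensor[float]): float = x[0] * x[0] * x[1]
  var xm = [2.0, 3.0].toTensor
  echo mixedDerivative(fMixed, xm, (0, 1), 1e-4) # ~4.0
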
proc tensorHessian*[U; T: not Tensor](
  f: proc(x: Tensor[U]): T,
  x0: Tensor[U],
  h: U = U(1e-6)
): Tensor[T] =
  ## Numerically calculate the Hessian of f(x) w.r.t vector x at x0 using step size h.
  ## Only the upper triangle is computed; the lower triangle is filled in by symmetry.
  assert x0.rank == 1 # must be a 1d vector
  let xLen = x0.shape[0]
  var x = x0.clone()
  result = zeros[T](xLen, xLen)
  for i in 0 ..< xLen:
    for j in i ..< xLen:
      let mixed = mixedDerivative(f, x, (i, j), h)
      result[i, j] = mixed
      result[j, i] = mixed

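# --- Editor's usage sketch (illustrative): the Hessian of the quadratic
# f(x) = x_0^2 + 3*x_0*x_1 is constant: [[2, 3], [3, 0]].
# `fQuad` is an illustrative name.
when isMainModule:
  proc fQuad(x: Tensor[float]): float = x[0]*x[0] + 3.0*x[0]*x[1]
  echo tensorHessian(fQuad, [1.0, 1.0].toTensor, 1e-4)
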
proc checkGradient*[U; T: not Tensor](f: proc(x: Tensor[U]): T, fGrad: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], tol: T): bool =
  ## Checks if the provided gradient function `fGrad` gives the same values as the numeric gradient.
  let numGrad = tensorGradient(f, x0)
  let grad = fGrad(x0)
  result = true
  for i, x in abs(numGrad - grad):
    if x > tol:
      echo fmt"Gradient at index {i[0]} has error: {x} (tol = {tol})"
      result = false

proc checkGradient*[U; T](f: proc(x: Tensor[U]): Tensor[T], fGrad: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], tol: T): bool =
  ## Checks if the provided gradient function `fGrad` gives the same values as the numeric gradient.
  let numGrad = tensorGradient(f, x0)
  let grad = fGrad(x0)
  result = true
  for i, x in abs(numGrad - grad):
    if x > tol:
      echo fmt"Gradient at index {i} has error: {x} (tol = {tol})" # 2-D index: print all coordinates
      result = false

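# --- Editor's usage sketch (illustrative): verifying an analytic gradient
# against the numeric one. For f(x) = sum_i x_i^2 the analytic gradient is
# 2*x, so this should print true. `fCheck`/`fCheckGrad` are illustrative names.
when isMainModule:
  proc fCheck(x: Tensor[float]): float = sum(x *. x)
  proc fCheckGrad(x: Tensor[float]): Tensor[float] = 2.0 * x
  echo checkGradient(fCheck, fCheckGrad, [1.0, -2.0].toTensor, 1e-4)
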
when isMainModule:
  import std/math
  import benchy
  proc f1(x: Tensor[float]): Tensor[float] =
    x.sum(0)
  let x0 = ones[float](10)
  echo tensorGradient(f1, x0, 1e-6)
  echo tensorGradient(f1, x0, 1e-6, true)
  echo tensorJacobian(f1, x0, 1e-6)

  proc f2(x: Tensor[float]): float =
    sum(x)
  echo tensorGradient(f2, x0, 1e-6)
  echo tensorGradient(f2, x0, 1e-6, true)

  let N = 1000
  timeIt "slow mode":
    for i in 0 .. N:
      keep tensorGradient(f1, x0, 1e-6, false)
  timeIt "fast mode":
    for i in 0 .. N:
      keep tensorGradient(f1, x0, 1e-6, true)
  timeIt "slow mode float":
    for i in 0 .. N:
      keep tensorGradient(f2, x0, 1e-6, false)
  timeIt "fast mode float":
    for i in 0 .. N:
      keep tensorGradient(f2, x0, 1e-6, true)
  timeIt "jacobian slow":
    for i in 0 .. N:
      keep tensorJacobian(f1, x0, 1e-6, false)