From b4622145fb05ad9198042edeb0f1201ed5ed61db Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Tue, 8 Feb 2022 17:08:00 +0100 Subject: [PATCH 01/25] basic 1d differentiation methods --- src/numericalnim/differentiation.nim | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 src/numericalnim/differentiation.nim diff --git a/src/numericalnim/differentiation.nim b/src/numericalnim/differentiation.nim new file mode 100644 index 0000000..e0d6a86 --- /dev/null +++ b/src/numericalnim/differentiation.nim @@ -0,0 +1,31 @@ +import math + +proc diff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U): T = + ## Numerically calculate the derivative of f(x) at x0 using a step size h. + ## Uses forward difference which has accuracy O(h) + result = (f(x0 + h) - f(x0)) / h + +proc diff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U): T = + ## Numerically calculate the derivative of f(x) at x0 using a step size h. + ## Uses backward difference which has accuracy O(h) + result = (f(x0) - f(x0 - h)) / h + +proc diff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = + ## Numerically calculate the derivative of f(x) at x0 using a step size h. + ## Uses central difference which has accuracy O(h^2) + result = (f(x0 + h) - f(x0 - h)) / (2*h) + +proc secondDiff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U): T = + ## Numerically calculate the second derivative of f(x) at x0 using a step size h. + result = (f(x0 + 2*h) - 2*f(x0 + h) + f(x0)) / (h*h) + +proc secondDiff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U): T = + ## Numerically calculate the second derivative of f(x) at x0 using a step size h. + result = (f(x0) - 2*f(x0 - h) + f(x0 - 2*h)) / (h*h) + +proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = + ## Numerically calculate the second derivative of f(x) at x0 using a step size h. + ## Uses central difference which has accuracy O(h^2) + result = (f(x0 + h) - 2*f(x0) + f(x0 - h)) / (h*h) + + From 2242e4a74659977184518657ca43ade565439a94 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Tue, 8 Feb 2022 20:17:22 +0100 Subject: [PATCH 02/25] implement jacobian/gradient of tensor functions --- src/numericalnim/differentiation.nim | 64 +++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/src/numericalnim/differentiation.nim b/src/numericalnim/differentiation.nim index e0d6a86..f8de4f4 100644 --- a/src/numericalnim/differentiation.nim +++ b/src/numericalnim/differentiation.nim @@ -1,4 +1,4 @@ -import math +import arraymancer proc diff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U): T = ## Numerically calculate the derivative of f(x) at x0 using a step size h. @@ -28,4 +28,66 @@ proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = ## Uses central difference which has accuracy O(h^2) result = (f(x0 + h) - 2*f(x0) + f(x0 - h)) / (h*h) +proc tensorGradient*[U, T]( + f: proc(x: Tensor[U]): Tensor[T], + x0: Tensor[U], + h: U, + fastMode: bool = false + ): Tensor[T] = + ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. + ## Every column is the gradient of one component of f. + assert x0.rank == 1 # must be a 1d vector + let f0 = f(x0) # make use of this with a `fastMode` switch so we use forward difference instead of central difference? 
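+  # Central difference (default): (f(x0 + h) - f(x0 - h)) / (2*h), error O(h^2), two function evaluations per input component.
+  # Forward difference (fastMode): (f(x0 + h) - f0) / h, error O(h), reuses f0 so only one evaluation per input component.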
+ assert f0.rank == 1 + let rows = x0.shape[0] + let cols = f0.shape[0] + result = newTensor[T](rows, cols) + var x = x0.clone() + for i in 0 ..< rows: + x[i] += h + let fPlusH = f(x) + if fastMode: + x[i] -= h # restore to original + result[i, _] = ((fPlusH - f0) / h).reshape(1, cols) + else: + x[i] -= 2*h + let fMinusH = f(x) + x[i] += h # restore to original (± float error) + result[i, _] = ((fPlusH - fMinusH) / (2 * h)).reshape(1, cols) + +proc tensorJacobian*[U, T]( + f: proc(x: Tensor[U]): Tensor[T], + x0: Tensor[U], + h: U, + fastMode: bool = false + ): Tensor[T] = + transpose(tensorGradient(f, x0, h, fastMode)) + + +when isMainModule: + import std/math + import benchy + proc f1(x: Tensor[float]): Tensor[float] = + result = zeros[float](2) + result[0] = x[0] + 1 + result[1] = x[0]^2 + x[1]^2 + let x0 = [1.0, 1.0].toTensor + echo tensorGradient(f1, x0, 1e-6) + echo tensorGradient(f1, x0, 1e-6, true) + echo tensorJacobian(f1, x0, 1e-6) + + let N = 1000 + timeIt "slow mode": + for i in 0 .. N: + keep tensorGradient(f1, x0, 1e-6, false) + timeIt "fast mode": + for i in 0 .. N: + keep tensorGradient(f1, x0, 1e-6, true) + + timeIt "jacobian": + for i in 0 .. N: + keep tensorJacobian(f1, x0, 1e-6, true) + + + From a13b9a8fb8d51e279e4d23e523ca22165d5fee3e Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Tue, 8 Feb 2022 22:23:15 +0100 Subject: [PATCH 03/25] add version with scalar output --- src/numericalnim/differentiation.nim | 46 +++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/src/numericalnim/differentiation.nim b/src/numericalnim/differentiation.nim index f8de4f4..cf2f434 100644 --- a/src/numericalnim/differentiation.nim +++ b/src/numericalnim/differentiation.nim @@ -28,6 +28,30 @@ proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = ## Uses central difference which has accuracy O(h^2) result = (f(x0 + h) - 2*f(x0) + f(x0 - h)) / (h*h) +proc tensorGradient*[U; T: not Tensor]( + f: proc(x: Tensor[U]): T, + x0: Tensor[U], + h: U, + fastMode: bool = false + ): Tensor[T] = + ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. + assert x0.rank == 1 # must be a 1d vector + let f0 = f(x0) # make use of this with a `fastMode` switch so we use forward difference instead of central difference? + let xLen = x0.shape[0] + result = newTensor[T](xLen) + var x = x0.clone() + for i in 0 ..< xLen: + x[i] += h + let fPlusH = f(x) + if fastMode: + x[i] -= h # restore to original + result[i] = (fPlusH - f0) / h + else: + x[i] -= 2*h + let fMinusH = f(x) + x[i] += h # restore to original (± float error) + result[i] = (fPlusH - fMinusH) / (2 * h) + proc tensorGradient*[U, T]( f: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], @@ -68,14 +92,17 @@ when isMainModule: import std/math import benchy proc f1(x: Tensor[float]): Tensor[float] = - result = zeros[float](2) - result[0] = x[0] + 1 - result[1] = x[0]^2 + x[1]^2 - let x0 = [1.0, 1.0].toTensor + x.sum(0) + let x0 = ones[float](10) echo tensorGradient(f1, x0, 1e-6) echo tensorGradient(f1, x0, 1e-6, true) echo tensorJacobian(f1, x0, 1e-6) + proc f2(x: Tensor[float]): float = + sum(x) + echo tensorGradient(f2, x0, 1e-6) + echo tensorGradient(f2, x0, 1e-6, true) + let N = 1000 timeIt "slow mode": for i in 0 .. N: @@ -83,10 +110,15 @@ when isMainModule: timeIt "fast mode": for i in 0 .. N: keep tensorGradient(f1, x0, 1e-6, true) - - timeIt "jacobian": + timeIt "slow mode float": + for i in 0 .. 
N: + keep tensorGradient(f2, x0, 1e-6, false) + timeIt "fast mode float": + for i in 0 .. N: + keep tensorGradient(f2, x0, 1e-6, true) + timeIt "jacobian slow": for i in 0 .. N: - keep tensorJacobian(f1, x0, 1e-6, true) + keep tensorJacobian(f1, x0, 1e-6, false) From f9967a4a98524c943f21443c582e487c0fc8e008 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Tue, 8 Feb 2022 22:24:19 +0100 Subject: [PATCH 04/25] add doc comment to jacobian --- src/numericalnim/differentiation.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/numericalnim/differentiation.nim b/src/numericalnim/differentiation.nim index cf2f434..3114850 100644 --- a/src/numericalnim/differentiation.nim +++ b/src/numericalnim/differentiation.nim @@ -85,6 +85,8 @@ proc tensorJacobian*[U, T]( h: U, fastMode: bool = false ): Tensor[T] = + ## Calculates the jacobian of f(x) w.r.t vector x at x0 using step size h. + ## Every row is the gradient of one component of f. transpose(tensorGradient(f, x0, h, fastMode)) From ebddfd02d3b39f98ec5c1be44dfdba3b3edb270f Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 09:56:57 +0100 Subject: [PATCH 05/25] rename file and export it --- src/numericalnim.nim | 2 ++ .../{differentiation.nim => differentiate.nim} | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) rename src/numericalnim/{differentiation.nim => differentiate.nim} (88%) diff --git a/src/numericalnim.nim b/src/numericalnim.nim index 905ec3a..8308749 100644 --- a/src/numericalnim.nim +++ b/src/numericalnim.nim @@ -8,5 +8,7 @@ import numericalnim/optimize export optimize import numericalnim/interpolate export interpolate +import numericalnim/differentiate +export differentiate import ./numericalnim/common/commonTypes export commonTypes diff --git a/src/numericalnim/differentiation.nim b/src/numericalnim/differentiate.nim similarity index 88% rename from src/numericalnim/differentiation.nim rename to src/numericalnim/differentiate.nim index 3114850..8c63d5e 100644 --- a/src/numericalnim/differentiation.nim +++ b/src/numericalnim/differentiate.nim @@ -1,29 +1,29 @@ import arraymancer -proc diff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U): T = +proc diff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = ## Numerically calculate the derivative of f(x) at x0 using a step size h. ## Uses forward difference which has accuracy O(h) result = (f(x0 + h) - f(x0)) / h -proc diff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U): T = +proc diff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = ## Numerically calculate the derivative of f(x) at x0 using a step size h. ## Uses backward difference which has accuracy O(h) result = (f(x0) - f(x0 - h)) / h -proc diff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = +proc diff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = ## Numerically calculate the derivative of f(x) at x0 using a step size h. ## Uses central difference which has accuracy O(h^2) result = (f(x0 + h) - f(x0 - h)) / (2*h) -proc secondDiff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U): T = +proc secondDiff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = ## Numerically calculate the second derivative of f(x) at x0 using a step size h. 
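+  ## Uses forward difference which has accuracy O(h)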
result = (f(x0 + 2*h) - 2*f(x0 + h) + f(x0)) / (h*h) -proc secondDiff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U): T = +proc secondDiff1dBackward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = ## Numerically calculate the second derivative of f(x) at x0 using a step size h. result = (f(x0) - 2*f(x0 - h) + f(x0 - 2*h)) / (h*h) -proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = +proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = ## Numerically calculate the second derivative of f(x) at x0 using a step size h. ## Uses central difference which has accuracy O(h^2) result = (f(x0 + h) - 2*f(x0) + f(x0 - h)) / (h*h) @@ -31,7 +31,7 @@ proc secondDiff1dCentral*[U, T](f: proc(x: U): T, x0: U, h: U): T = proc tensorGradient*[U; T: not Tensor]( f: proc(x: Tensor[U]): T, x0: Tensor[U], - h: U, + h: U = U(1e-6), fastMode: bool = false ): Tensor[T] = ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. @@ -55,7 +55,7 @@ proc tensorGradient*[U; T: not Tensor]( proc tensorGradient*[U, T]( f: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], - h: U, + h: U = U(1e-6), fastMode: bool = false ): Tensor[T] = ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. @@ -82,7 +82,7 @@ proc tensorGradient*[U, T]( proc tensorJacobian*[U, T]( f: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], - h: U, + h: U = U(1e-6), fastMode: bool = false ): Tensor[T] = ## Calculates the jacobian of f(x) w.r.t vector x at x0 using step size h. From ae22c8d16e2805f2cb0f8ca9f9bc77059f86a9ed Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 09:57:07 +0100 Subject: [PATCH 06/25] add tests for 1d procs --- tests/test_differentiate.nim | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tests/test_differentiate.nim diff --git a/tests/test_differentiate.nim b/tests/test_differentiate.nim new file mode 100644 index 0000000..9a305ea --- /dev/null +++ b/tests/test_differentiate.nim @@ -0,0 +1,47 @@ +import std/[unittest, math] +import numericalnim +import arraymancer + +proc f_1d(x: float): float = 2 * sin(x) * cos(x) # same as sin(2*x) +proc exact_deriv_1d(x: float): float = 2*cos(2*x) +proc exact_secondderiv_1d(x: float): float = -4*sin(2*x) + +suite "1D numeric differentiation": + test "Forward difference": + for x in numericalnim.linspace(0, 10, 100): + let numDiff = diff1dForward(f_1d, x) + let exact = exact_deriv_1d(x) + check abs(numDiff - exact) < 3e-6 + + test "Backward difference": + for x in numericalnim.linspace(0, 10, 100): + let numDiff = diff1dBackward(f_1d, x) + let exact = exact_deriv_1d(x) + check abs(numDiff - exact) < 3e-6 + + test "Central difference": + for x in numericalnim.linspace(0, 10, 100): + let numDiff = diff1dCentral(f_1d, x) + let exact = exact_deriv_1d(x) + check abs(numDiff - exact) < 2e-9 + + test "Forward second difference": + for x in numericalnim.linspace(0, 10, 100): + let numDiff = secondDiff1dForward(f_1d, x) + let exact = exact_secondderiv_1d(x) + check abs(numDiff - exact) < 4e-3 + + test "Backward second difference": + for x in numericalnim.linspace(0, 10, 100): + let numDiff = secondDiff1dBackward(f_1d, x) + let exact = exact_secondderiv_1d(x) + check abs(numDiff - exact) < 4e-3 + + test "Central second difference": + for x in numericalnim.linspace(0, 10, 100): + let numDiff = secondDiff1dCentral(f_1d, x) + let exact = exact_secondderiv_1d(x) + check abs(numDiff - exact) < 4e-4 + +suite "Multi 
dimensional numeric gradients": + discard \ No newline at end of file From c5d2256e6f0d807020bae97f2005ce9ddd304938 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 10:41:25 +0100 Subject: [PATCH 07/25] multidim tests pass! --- tests/test_differentiate.nim | 73 +++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/tests/test_differentiate.nim b/tests/test_differentiate.nim index 9a305ea..23be0a9 100644 --- a/tests/test_differentiate.nim +++ b/tests/test_differentiate.nim @@ -43,5 +43,76 @@ suite "1D numeric differentiation": let exact = exact_secondderiv_1d(x) check abs(numDiff - exact) < 4e-4 +proc fScalar(x: Tensor[float]): float = + # This will be a function of three variables + # f(x0, x1, x2) = x0^2 + 2 * x0 * x1 + sin(x2) + result = x[0]*x[0] + 2 * x[0] * x[1] + sin(x[2]) + +proc scalarGradient(x: Tensor[float]): Tensor[float] = + # Gradient is (2*x0 + 2*x1, 2*x0, cos(x2)) + result = zeros_like(x) + result[0] = 2*x[0] + 2*x[1] + result[1] = 2*x[0] + result[2] = cos(x[2]) + +proc fMultidim(x: Tensor[float]): Tensor[float] = + # Function will have 3 inputs and 2 outputs (important that they aren't the same for testing!) + # f(x0, x1, x2) = (x0*x1*x2^2, x1*sin(2*x2)) + result = zeros[float](2) + result[0] = x[0]*x[1]*x[2]*x[2] + result[1] = x[1] * sin(2*x[2]) + +proc multidimGradient(x: Tensor[float]): Tensor[float] = + # The gradient (Jacobian transposed) is: + # x1*x2^2 0 + # x0*x2^2 sin(2*x2) + # 2*x0*x1*x2 2*x1*cos(2*x2) + result = zeros[float](3, 2) + result[0, 0] = x[1]*x[2]*x[2] + result[0, 1] = 0 + result[1, 0] = x[0]*x[2]*x[2] + result[1, 1] = sin(2*x[2]) + result[2, 0] = 2*x[0]*x[1]*x[2] + result[2, 1] = 2*x[1]*cos(2*x[2]) + + suite "Multi dimensional numeric gradients": - discard \ No newline at end of file + test "Scalar valued function of 3 variables": + for x in numericalnim.linspace(0, 1, 10): + for y in numericalnim.linspace(0, 1, 10): + for z in numericalnim.linspace(0, 1, 10): + let x0 = [x, y, z].toTensor + let numGrad = tensorGradient(fScalar, x0) + let exact = scalarGradient(x0) + for err in abs(numGrad - exact): + check err < 5e-10 + + test "Scalar valued function of 3 variables (fast mode)": + for x in numericalnim.linspace(0, 1, 10): + for y in numericalnim.linspace(0, 1, 10): + for z in numericalnim.linspace(0, 1, 10): + let x0 = [x, y, z].toTensor + let numGrad = tensorGradient(fScalar, x0, fastMode=true) + let exact = scalarGradient(x0) + for err in abs(numGrad - exact): + check err < 5e-6 + + test "Multi-dimensional function of 3 variables": + for x in numericalnim.linspace(0, 1, 10): + for y in numericalnim.linspace(0, 1, 10): + for z in numericalnim.linspace(0, 1, 10): + let x0 = [x, y, z].toTensor + let numGrad = tensorGradient(fMultidim, x0) + let exact = multidimGradient(x0) + for err in abs(numGrad - exact): + check err < 1e-10 + + test "Multi-dimensional function of 3 variables (fast mode)": + for x in numericalnim.linspace(0, 1, 10): + for y in numericalnim.linspace(0, 1, 10): + for z in numericalnim.linspace(0, 1, 10): + let x0 = [x, y, z].toTensor + let numGrad = tensorGradient(fMultidim, x0, fastMode=true) + let exact = multidimGradient(x0) + for err in abs(numGrad - exact): + check err < 2e-6 From 9c17bc5b851504b14f15b7345d03179e141409df Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 10:45:42 +0100 Subject: [PATCH 08/25] add jacobian test --- tests/test_differentiate.nim 
| 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_differentiate.nim b/tests/test_differentiate.nim index 23be0a9..642a5da 100644 --- a/tests/test_differentiate.nim +++ b/tests/test_differentiate.nim @@ -116,3 +116,14 @@ suite "Multi dimensional numeric gradients": let exact = multidimGradient(x0) for err in abs(numGrad - exact): check err < 2e-6 + + + test "Jacobian": + for x in numericalnim.linspace(0, 1, 10): + for y in numericalnim.linspace(0, 1, 10): + for z in numericalnim.linspace(0, 1, 10): + let x0 = [x, y, z].toTensor + let numJacobian = tensorJacobian(fMultidim, x0) + let exact = multidimGradient(x0).transpose + for err in abs(numJacobian - exact): + check err < 1e-10 From bd84aa3847781b7f58d81aa1eda5ef80dc908948 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 10:51:05 +0100 Subject: [PATCH 09/25] add documentation comments on fastMode --- src/numericalnim/differentiate.nim | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/numericalnim/differentiate.nim b/src/numericalnim/differentiate.nim index 8c63d5e..e28faf3 100644 --- a/src/numericalnim/differentiate.nim +++ b/src/numericalnim/differentiate.nim @@ -35,6 +35,8 @@ proc tensorGradient*[U; T: not Tensor]( fastMode: bool = false ): Tensor[T] = ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. + ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. + ## When fastMode is true it will instead use the forward difference which only uses 1 function evaluation per derivative but is less accurate. assert x0.rank == 1 # must be a 1d vector let f0 = f(x0) # make use of this with a `fastMode` switch so we use forward difference instead of central difference? let xLen = x0.shape[0] @@ -60,6 +62,8 @@ proc tensorGradient*[U, T]( ): Tensor[T] = ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. ## Every column is the gradient of one component of f. + ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. + ## When fastMode is true it will instead use the forward difference which only uses 1 function evaluation per derivative but is less accurate. assert x0.rank == 1 # must be a 1d vector let f0 = f(x0) # make use of this with a `fastMode` switch so we use forward difference instead of central difference? assert f0.rank == 1 @@ -87,6 +91,8 @@ proc tensorJacobian*[U, T]( ): Tensor[T] = ## Calculates the jacobian of f(x) w.r.t vector x at x0 using step size h. ## Every row is the gradient of one component of f. + ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. + ## When fastMode is true it will instead use the forward difference which only uses 1 function evaluation per derivative but is less accurate. 
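+  ## For f mapping n inputs to m outputs (x0 of rank 1 and length n) the result has shape (m, n),
+  ## with result[i, j] being the derivative of output i with respect to input j.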
transpose(tensorGradient(f, x0, h, fastMode)) From 8da26e9ecf865146c97a2ba45a8c0b09dcb2121e Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 12:01:23 +0100 Subject: [PATCH 10/25] implement multivariate steepest descent function --- src/numericalnim/optimize.nim | 49 +++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index af03c93..b1800f9 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -2,6 +2,7 @@ import strformat import arraymancer import sequtils import math +import ./differentiate proc steepest_descent*(deriv: proc(x: float64): float64, start: float64, gamma: float64 = 0.01, precision: float64 = 1e-5, max_iters: Natural = 1000):float64 {.inline.} = ## Gradient descent optimization algorithm for finding local minimums of a function with derivative 'deriv' @@ -120,9 +121,53 @@ proc secant*(f: proc(x: float64): float64, start: array[2, float64], precision: raise newException(ArithmeticError, "Maximum iterations for Secant method exceeded") return xCurrent +############################## +## Multidimensional methods ## +############################## + +proc vectorNorm*[T](v: Tensor[T]): T = + ## Calculates the norm of the vector, ie the sqrt(Σ vᵢ²) + assert v.rank == 1, "v must be a 1d vector!" + result = sqrt(v.dot(v)) + +proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = + ## Minimize scalar-valued function f. + var x = x0.clone() + var fNorm = abs(f(x0)) + var gradient = tensorGradient(f, x0, fastMode=fastMode) + var gradNorm = vectorNorm(gradient) + var iters: int + while gradNorm > tol*(1 + fNorm) and iters < 10000: + let p = -gradient + x += alpha * p + let fx = f(x) + fNorm = abs(fx) + gradient = tensorGradient(f, x, fastMode=fastMode) + gradNorm = vectorNorm(gradient) + iters += 1 + if iters >= 10000: + echo "Limit of 10000 iterations reached!" + #echo iters, " iterations done!" + result = x + +when isMainModule: + import benchy + proc f1(x: Tensor[float]): float = + result = x[0]*x[0] + x[1]*x[1] - 10 - - + let x0 = [10.0, 10.0].toTensor + + let sol1 = steepestDescent(f1, x0, tol=1e-10, fastMode=false) + let sol2 = steepestDescent(f1, x0, tol=1e-10, fastMode=true) + echo sol1 + echo sol2 + echo f1(sol1) + echo f1(sol2) + + timeIt "slow mode": + keep steepestDescent(f1, x0, tol=1e-10, fastMode=false) + timeIt "fast mode": + keep steepestDescent(f1, x0, tol=1e-10, fastMode=true) From 6ffd6a8c697fc4674b98a62292eff8033e746c79 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 13:38:56 +0100 Subject: [PATCH 11/25] levmarq that works on linear and quadratic functions at least!!! --- src/numericalnim/optimize.nim | 64 +++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index b1800f9..3512662 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -130,6 +130,11 @@ proc vectorNorm*[T](v: Tensor[T]): T = assert v.rank == 1, "v must be a 1d vector!" 
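+  # dot(v, v) is the sum of squared elements, so this is the Euclidean (L2) norm used in the convergence checks below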
result = sqrt(v.dot(v)) +proc eye[T](n: int): Tensor[T] = + result = zeros[T](n, n) + for i in 0 ..< n: + result[i, i] = 1 + proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = ## Minimize scalar-valued function f. var x = x0.clone() @@ -149,14 +154,58 @@ proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = echo "Limit of 10000 iterations reached!" #echo iters, " iterations done!" result = x + +proc levmarq*[U, T](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol = U(1e-6), lambda0 = U(1), fastMode = false): Tensor[U] = + assert xData.rank == 1 + assert yData.rank == 1 + assert params0.rank == 1 + let xLen = xData.shape[0] + let yLen = yData.shape[0] + let paramsLen = params0.shape[0] + assert xLen == yLen + + let residualFunc = # proc that returns the residual vector + proc (params: Tensor[U]): Tensor[T] = + result = map2_inline(xData, yData): + f(params, x) - y + + var lambdaCoeff = lambda0 + + var params = params0.clone() + var gradient = tensorGradient(residualFunc, params, fastMode=fastMode) + var residuals = residualFunc(params) + var resNorm = vectorNorm(residuals) + var gradNorm = vectorNorm(squeeze(gradient * residuals.reshape(xLen, 1))) + var iters: int + let eyeNN = eye[T](paramsLen) + while gradNorm > tol*(1 + resNorm) and iters < 10000: + let rhs = -gradient * residuals.reshape(xLen, 1) + let lhs = gradient * gradient.transpose + lambdaCoeff * eyeNN + let p = solve(lhs, rhs) + params += p * alpha + gradient = tensorGradient(residualFunc, params, fastMode=fastMode) + residuals = residualFunc(params) + let newGradNorm = vectorNorm(squeeze(gradient * residuals.reshape(xLen, 1))) + if newGradNorm / gradNorm < 0.9: # we have improved, decrease lambda → more Gauss-Newton + lambdaCoeff = max(lambdaCoeff / 3, 1e-4) + elif newGradNorm / gradNorm > 1.2: # we have done worse than last ste, increase lambda → more Steepest descent + lambdaCoeff = min(lambdaCoeff * 2, 20) + # else: don't change anything + gradNorm = newGradNorm + resNorm = vectorNorm(residuals) + iters += 1 + result = params + + when isMainModule: import benchy - proc f1(x: Tensor[float]): float = + # Steepest descent: +#[ proc f1(x: Tensor[float]): float = result = x[0]*x[0] + x[1]*x[1] - 10 let x0 = [10.0, 10.0].toTensor - + echo eye[float](10) let sol1 = steepestDescent(f1, x0, tol=1e-10, fastMode=false) let sol2 = steepestDescent(f1, x0, tol=1e-10, fastMode=true) echo sol1 @@ -167,7 +216,16 @@ when isMainModule: timeIt "slow mode": keep steepestDescent(f1, x0, tol=1e-10, fastMode=false) timeIt "fast mode": - keep steepestDescent(f1, x0, tol=1e-10, fastMode=true) + keep steepestDescent(f1, x0, tol=1e-10, fastMode=true) ]# + + # Lev-Marq: + proc fFit(params: Tensor[float], x: float): float = + params[0] + params[1] * x + params[2] * x*x + + let xData = arraymancer.linspace(0, 10, 100) + let yData = 1.5 +. xData * 6.28 + xData *. 
xData * -5.79 + let params0 = [0.0, 0.0, 0.0].toTensor + echo levmarq(fFit, params0, xData, yData) From 5a615f37791c78c4b3f8fc903e26ac2daaea914c Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 16:55:42 +0100 Subject: [PATCH 12/25] adjust function parameters --- src/numericalnim/optimize.nim | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index 3512662..465b03d 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -155,7 +155,7 @@ proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = #echo iters, " iterations done!" result = x -proc levmarq*[U, T](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol = U(1e-6), lambda0 = U(1), fastMode = false): Tensor[U] = +proc levmarq*[U, T](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol: U = U(1e-6), lambda0: U = U(1), fastMode = false): Tensor[U] = assert xData.rank == 1 assert yData.rank == 1 assert params0.rank == 1 @@ -223,9 +223,13 @@ when isMainModule: params[0] + params[1] * x + params[2] * x*x let xData = arraymancer.linspace(0, 10, 100) - let yData = 1.5 +. xData * 6.28 + xData *. xData * -5.79 + let yData = 1.5 +. xData * 6.28 + xData *. xData * -5.79 + randomNormalTensor[float](xData.shape[0], 0.0, 0.1) let params0 = [0.0, 0.0, 0.0].toTensor echo levmarq(fFit, params0, xData, yData) + timeIt "slow mode": + keep levmarq(fFit, params0, xData, yData, fastMode=false) + timeIt "fast mode": + keep levmarq(fFit, params0, xData, yData, fastMode=true) From 34dd3168c759be90051c88de5138e3c230a47372 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 9 Feb 2022 20:04:29 +0100 Subject: [PATCH 13/25] implement tensorHessian and tests --- src/numericalnim/differentiate.nim | 65 ++++++++++++++++++++++++++---- tests/test_differentiate.nim | 22 ++++++++++ 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/src/numericalnim/differentiate.nim b/src/numericalnim/differentiate.nim index e28faf3..51dfdca 100644 --- a/src/numericalnim/differentiate.nim +++ b/src/numericalnim/differentiate.nim @@ -33,7 +33,7 @@ proc tensorGradient*[U; T: not Tensor]( x0: Tensor[U], h: U = U(1e-6), fastMode: bool = false - ): Tensor[T] = + ): Tensor[T] = ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. ## When fastMode is true it will instead use the forward difference which only uses 1 function evaluation per derivative but is less accurate. @@ -59,7 +59,7 @@ proc tensorGradient*[U, T]( x0: Tensor[U], h: U = U(1e-6), fastMode: bool = false - ): Tensor[T] = + ): Tensor[T] = ## Calculates the gradient of f(x) w.r.t vector x at x0 using step size h. ## Every column is the gradient of one component of f. ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. @@ -88,12 +88,61 @@ proc tensorJacobian*[U, T]( x0: Tensor[U], h: U = U(1e-6), fastMode: bool = false - ): Tensor[T] = - ## Calculates the jacobian of f(x) w.r.t vector x at x0 using step size h. - ## Every row is the gradient of one component of f. 
- ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. - ## When fastMode is true it will instead use the forward difference which only uses 1 function evaluation per derivative but is less accurate. - transpose(tensorGradient(f, x0, h, fastMode)) + ): Tensor[T] = + ## Calculates the jacobian of f(x) w.r.t vector x at x0 using step size h. + ## Every row is the gradient of one component of f. + ## By default it uses central difference for approximating the derivatives. This requires two function evaluations per derivative. + ## When fastMode is true it will instead use the forward difference which only uses 1 function evaluation per derivative but is less accurate. + transpose(tensorGradient(f, x0, h, fastMode)) + +proc mixedDerivative*[U, T](f: proc(x: Tensor[U]): T, x0: var Tensor[U], indices: (int, int), h: U = U(1e-6)): T = + result = 0 + let i = indices[0] + let j = indices[1] + # f(x+h, y+h) + x0[i] += h + x0[j] += h + result += f(x0) + + # f(x+h, y-h) + x0[j] -= 2*h + result -= f(x0) + + # f(x-h, y-h) + x0[i] -= 2*h + result += f(x0) + + # f(x-h, y+h) + x0[j] += 2*h + result -= f(x0) + + # restore x0 + x0[i] += h + x0[j] -= h + + result *= 1 / (4 * h*h) + + +proc tensorHessian*[U; T: not Tensor]( + f: proc(x: Tensor[U]): T, + x0: Tensor[U], + h: U = U(1e-6) + ): Tensor[T] = + assert x0.rank == 1 # must be a 1d vector + let f0 = f(x0) + let xLen = x0.shape[0] + var x = x0.clone() + result = zeros[T](xLen, xLen) + for i in 0 ..< xLen: + for j in i ..< xLen: + let mixed = mixedDerivative(f, x, (i, j), h) + result[i, j] = mixed + result[j, i] = mixed + + + + + when isMainModule: diff --git a/tests/test_differentiate.nim b/tests/test_differentiate.nim index 642a5da..cd7bb4f 100644 --- a/tests/test_differentiate.nim +++ b/tests/test_differentiate.nim @@ -55,6 +55,18 @@ proc scalarGradient(x: Tensor[float]): Tensor[float] = result[1] = 2*x[0] result[2] = cos(x[2]) +proc scalarHessian(x: Tensor[float]): Tensor[float] = + result = zeros[float](3, 3) + result[0,0] = 2 + result[0,1] = 2 + result[1,0] = 2 + result[0,2] = 0 + result[2,0] = 0 + result[1,1] = 0 + result[1,2] = 0 + result[2,1] = 0 + result[2,2] = -sin(x[2]) + proc fMultidim(x: Tensor[float]): Tensor[float] = # Function will have 3 inputs and 2 outputs (important that they aren't the same for testing!) 
# f(x0, x1, x2) = (x0*x1*x2^2, x1*sin(2*x2)) @@ -127,3 +139,13 @@ suite "Multi dimensional numeric gradients": let exact = multidimGradient(x0).transpose for err in abs(numJacobian - exact): check err < 1e-10 + + test "Hessian scalar valued function": + for x in numericalnim.linspace(0, 1, 10): + for y in numericalnim.linspace(0, 1, 10): + for z in numericalnim.linspace(0, 1, 10): + let x0 = [x, y, z].toTensor + let numHessian = tensorHessian(fScalar, x0) + let exact = scalarHessian(x0) + for err in abs(numHessian - exact): + check err < 3e-4 From 22fd1bbe1f0904a32c2948743a996010f1dc2d8d Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Thu, 10 Feb 2022 09:57:08 +0100 Subject: [PATCH 14/25] basic newton solver --- src/numericalnim/optimize.nim | 54 +++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index 465b03d..3614ca8 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -151,7 +151,28 @@ proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = gradNorm = vectorNorm(gradient) iters += 1 if iters >= 10000: - echo "Limit of 10000 iterations reached!" + discard "Limit of 10000 iterations reached!" + #echo iters, " iterations done!" + result = x + +proc newton*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = + var x = x0.clone() + var fNorm = abs(f(x)) + var gradient = tensorGradient(f, x, fastMode=fastMode) + var gradNorm = vectorNorm(gradient) + var hessian = tensorHessian(f, x) + var iters: int + while gradNorm > tol*(1 + fNorm) and iters < 10000: + let p = -solve(hessian, gradient) + x += alpha * p + let fx = f(x) + fNorm = abs(fx) + gradient = tensorGradient(f, x, fastMode=fastMode) + gradNorm = vectorNorm(gradient) + hessian = tensorHessian(f, x) + iters += 1 + if iters >= 10000: + discard "Limit of 10000 iterations reached!" #echo iters, " iterations done!" 
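+  # return the last iterate whether or not the gradient-norm tolerance was reached within 10000 iterations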
result = x @@ -201,25 +222,28 @@ proc levmarq*[U, T](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xDa when isMainModule: import benchy # Steepest descent: -#[ proc f1(x: Tensor[float]): float = - result = x[0]*x[0] + x[1]*x[1] - 10 + proc f1(x: Tensor[float]): float = + # result = -20*exp(-0.2*sqrt(0.5 * (x[0]*x[0] + x[1]*x[1]))) - exp(0.5*(cos(2*PI*x[0]) + cos(2*PI*x[1]))) + E + 20 # Ackley + result = (1 - x[0])^2 + 100*(x[1] - x[0]^2)^2 - let x0 = [10.0, 10.0].toTensor - echo eye[float](10) - let sol1 = steepestDescent(f1, x0, tol=1e-10, fastMode=false) - let sol2 = steepestDescent(f1, x0, tol=1e-10, fastMode=true) + let x0 = [-1.0, -1.0].toTensor + let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) echo sol1 - echo sol2 echo f1(sol1) - echo f1(sol2) + echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=false) + echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=true) - timeIt "slow mode": - keep steepestDescent(f1, x0, tol=1e-10, fastMode=false) - timeIt "fast mode": - keep steepestDescent(f1, x0, tol=1e-10, fastMode=true) ]# + timeIt "steepest slow mode": + keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=false) + timeIt "steepest fast mode": + keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) + timeIt "newton slow mode": + keep newton(f1, x0, tol=1e-8, fastMode=false) + timeIt "newton fast mode": + keep newton(f1, x0, tol=1e-8, fastMode=true) # Lev-Marq: - proc fFit(params: Tensor[float], x: float): float = +#[ proc fFit(params: Tensor[float], x: float): float = params[0] + params[1] * x + params[2] * x*x let xData = arraymancer.linspace(0, 10, 100) @@ -229,7 +253,7 @@ when isMainModule: timeIt "slow mode": keep levmarq(fFit, params0, xData, yData, fastMode=false) timeIt "fast mode": - keep levmarq(fFit, params0, xData, yData, fastMode=true) + keep levmarq(fFit, params0, xData, yData, fastMode=true) ]# From 351c4bdaae7a27c0e19c6ebcb47826b7d39516bc Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Thu, 10 Feb 2022 19:57:21 +0100 Subject: [PATCH 15/25] implement bfgs --- src/numericalnim/optimize.nim | 46 ++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index 3614ca8..939c61c 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -135,7 +135,7 @@ proc eye[T](n: int): Tensor[T] = for i in 0 ..< n: result[i, i] = 1 -proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = ## Minimize scalar-valued function f. var x = x0.clone() var fNorm = abs(f(x0)) @@ -155,7 +155,7 @@ proc steepestDescent*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = #echo iters, " iterations done!" result = x -proc newton*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = var x = x0.clone() var fNorm = abs(f(x)) var gradient = tensorGradient(f, x, fastMode=fastMode) @@ -176,7 +176,36 @@ proc newton*[U, T](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), t #echo iters, " iterations done!" 
result = x -proc levmarq*[U, T](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol: U = U(1e-6), lambda0: U = U(1), fastMode = false): Tensor[U] = +proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = + var x = x0.clone() + let xLen = x.shape[0] + var fNorm = abs(f(x)) + var gradient = tensorGradient(f, x, fastMode=fastMode) + var gradNorm = vectorNorm(gradient) + var hessianB = eye[T](xLen) # inverse of the approximated hessian + var iters: int + while gradNorm > tol*(1 + fNorm) and iters < 10000: + let p = -hessianB * gradient.reshape(xLen, 1) + x += alpha * p + let newGradient = tensorGradient(f, x, fastMode=fastMode) + let sk = alpha * p.reshape(xLen, 1) + let yk = (newGradient - gradient).reshape(xLen, 1) + let sk_yk_dot = dot(sk.squeeze, yk.squeeze) + hessianB += (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) * (sk * sk.transpose) - ((hessianB * yk) * sk.transpose + sk * (yk.transpose * hessianB)) / sk_yk_dot + + gradient = newGradient + let fx = f(x) + fNorm = abs(fx) + gradNorm = vectorNorm(gradient) + iters += 1 + if iters >= 10000: + discard "Limit of 10000 iterations reached!" + #echo iters, " iterations done!" + result = x + + + +proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol: U = U(1e-6), lambda0: U = U(1), fastMode = false): Tensor[U] = assert xData.rank == 1 assert yData.rank == 1 assert params0.rank == 1 @@ -225,13 +254,18 @@ when isMainModule: proc f1(x: Tensor[float]): float = # result = -20*exp(-0.2*sqrt(0.5 * (x[0]*x[0] + x[1]*x[1]))) - exp(0.5*(cos(2*PI*x[0]) + cos(2*PI*x[1]))) + E + 20 # Ackley result = (1 - x[0])^2 + 100*(x[1] - x[0]^2)^2 + for ix in x: + result += (ix - 1)^2 - let x0 = [-1.0, -1.0].toTensor + #let x0 = [-1.0, -1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].toTensor + let x0 = ones[float](500) * -1 let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) echo sol1 echo f1(sol1) echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=false) echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=true) + echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false) + echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true) timeIt "steepest slow mode": keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=false) @@ -241,6 +275,10 @@ when isMainModule: keep newton(f1, x0, tol=1e-8, fastMode=false) timeIt "newton fast mode": keep newton(f1, x0, tol=1e-8, fastMode=true) + timeIt "bfgs slow mode": + keep bfgs(f1, x0, tol=1e-8, fastMode=false) + timeIt "bfgs fast mode": + keep bfgs(f1, x0, tol=1e-8, fastMode=true) # Lev-Marq: #[ proc fFit(params: Tensor[float], x: float): float = From ef6338752058138786a0dc943530bbcd3fbec6a2 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Fri, 11 Feb 2022 22:22:29 +0100 Subject: [PATCH 16/25] somewhat working lbfgs function --- src/numericalnim/optimize.nim | 65 +++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index 939c61c..df78760 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -1,7 +1,5 @@ -import strformat +import std/[strformat, sequtils, math, deques] import arraymancer -import sequtils -import math import ./differentiate proc steepest_descent*(deriv: proc(x: 
float64): float64, start: float64, gamma: float64 = 0.01, precision: float64 = 1e-5, max_iters: Natural = 1000):float64 {.inline.} = @@ -180,7 +178,7 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) - var gradient = tensorGradient(f, x, fastMode=fastMode) + var gradient = 0.01*tensorGradient(f, x, fastMode=fastMode) var gradNorm = vectorNorm(gradient) var hessianB = eye[T](xLen) # inverse of the approximated hessian var iters: int @@ -203,7 +201,57 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = #echo iters, " iterations done!" result = x +proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = + var x = x0.clone() + let xLen = x.shape[0] + var fNorm = abs(f(x)) + var gradient = 0.01*tensorGradient(f, x, fastMode=fastMode) + var gradNorm = vectorNorm(gradient) + var iters: int + let m = 10 # number of past iterations to save + var sk_queue = initDeque[Tensor[U]](m) + var yk_queue = initDeque[Tensor[T]](m) + # the problem is the first iteration as the gradient is huge and no adjustments are made + while gradNorm > tol*(1 + fNorm) and iters < 10000: + #echo "grad: ", gradient + #echo "x: ", x + var q = gradient.clone() + # we want to loop from latest inserted to oldest + # → we insert at the beginning and pop at the end + var alphas: seq[U] + for i in 0 ..< sk_queue.len: + let rho_i = 1 / dot(sk_queue[i], yk_queue[i]) + let alpha_i = rho_i * dot(sk_queue[i], q) + q -= alpha_i * yk_queue[i] + alphas.add alpha_i + let gamma = if sk_queue.len == 0: 1.0 else: dot(sk_queue[0], yk_queue[0]) / dot(yk_queue[0], yk_queue[0]) + #echo gamma + var z = gamma * q + for i in countdown(sk_queue.len - 1, 0): + let rho_i = 1 / dot(sk_queue[i], yk_queue[i]) + let beta_i = rho_i * dot(yk_queue[i], z) + let alpha_i = alphas[i] + z += sk_queue[i] * (alpha_i - beta_i) + z = -z + let p = z + #echo "q: ", q + x += alpha * p + sk_queue.addFirst alpha*p + let newGradient = tensorGradient(f, x, fastMode=fastMode) + let yk = newGradient - gradient + yk_queue.addFirst yk + gradient = newGradient + let fx = f(x) + fNorm = abs(fx) + gradNorm = vectorNorm(gradient) + if sk_queue.len > m: discard sk_queue.popLast + if yk_queue.len > m: discard yk_queue.popLast + iters += 1 + if iters >= 10000: + discard "Limit of 10000 iterations reached!" + #echo iters, " iterations done!" 
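+  # x is the final iterate; the two-loop recursion above rebuilds the search direction from the m most
+  # recent (s, y) pairs instead of storing a full inverse Hessian approximation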
+ result = x proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol: U = U(1e-6), lambda0: U = U(1), fastMode = false): Tensor[U] = assert xData.rank == 1 @@ -257,8 +305,8 @@ when isMainModule: for ix in x: result += (ix - 1)^2 - #let x0 = [-1.0, -1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].toTensor - let x0 = ones[float](500) * -1 + #let x0 = [-0.5, 2.0].toTensor + let x0 = ones[float](500) * 0 let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) echo sol1 echo f1(sol1) @@ -266,6 +314,7 @@ when isMainModule: echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=true) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true) + echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false) timeIt "steepest slow mode": keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=false) @@ -279,6 +328,10 @@ when isMainModule: keep bfgs(f1, x0, tol=1e-8, fastMode=false) timeIt "bfgs fast mode": keep bfgs(f1, x0, tol=1e-8, fastMode=true) + timeIt "lbfgs slow mode": + keep lbfgs(f1, x0, tol=1e-8, fastMode=false) + timeIt "lbfgs fast mode": + keep lbfgs(f1, x0, tol=1e-8, fastMode=true) # Lev-Marq: #[ proc fFit(params: Tensor[float], x: float): float = From 5ee3fd49446c229d836121a3ed186b3cb3df4ee7 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Sat, 12 Feb 2022 22:44:40 +0100 Subject: [PATCH 17/25] optimzed version of bfgs (remove old one after test) --- src/numericalnim/optimize.nim | 116 ++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 6 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index df78760..a7279cc 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -183,13 +183,30 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = var hessianB = eye[T](xLen) # inverse of the approximated hessian var iters: int while gradNorm > tol*(1 + fNorm) and iters < 10000: + #echo "Hessian iter ", iters, ": ", hessianB let p = -hessianB * gradient.reshape(xLen, 1) + #echo "p iter ", iters, ": ", p x += alpha * p let newGradient = tensorGradient(f, x, fastMode=fastMode) let sk = alpha * p.reshape(xLen, 1) + #gemm(1.0, hessianB, hessianB, 0.0, newGradient) let yk = (newGradient - gradient).reshape(xLen, 1) let sk_yk_dot = dot(sk.squeeze, yk.squeeze) - hessianB += (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) * (sk * sk.transpose) - ((hessianB * yk) * sk.transpose + sk * (yk.transpose * hessianB)) / sk_yk_dot + let prefactor1 = (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) + let prefactor2 = 1 / sk_yk_dot + let m1 = (sk * sk.transpose) + let m2_1 = (hessianB * yk) * sk.transpose + let m2_2 = sk * (yk.transpose * hessianB) + #echo "prefactor2: ", prefactor2 + hessianB += prefactor1 * m1 + #echo "hessian2: ", hessianB + #echo "hessian: ", hessianB + #echo "yk: ", yk + hessianB -= prefactor2 * m2_1 + #echo "checkpoint1: ", (hessianB * yk) + #echo "hessian2.5: ", hessianB + hessianB -= prefactor2 * m2_2 + #echo "hessian3: ", hessianB gradient = newGradient let fx = f(x) @@ -201,6 +218,84 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = #echo iters, " iterations done!" 
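+  # x is the final iterate; the gemm/gemv calls above update hessianB and the preallocated temporaries
+  # in place, avoiding most of the per-iteration matrix allocations of `bfgs`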
result = x +proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = + # Use gemm and gemv with preallocated Tensors and setting beta = 0 + var x = x0.clone() + let xLen = x.shape[0] + var fNorm = abs(f(x)) + var gradient = 0.01*tensorGradient(f, x, fastMode=fastMode) + var gradNorm = vectorNorm(gradient) + var hessianB = eye[T](xLen) # inverse of the approximated hessian + var p = newTensor[U](xLen) + var tempVector1 = newTensor[U](xLen, 1) + var tempVector2 = newTensor[U](1, xLen) + var iters: int + while gradNorm > tol*(1 + fNorm) and iters < 10000: + # We are using hessianB in calculating it so we are modifying it prior to its use! + + + #echo "Hessian iter ", iters, ": ", hessianB + #let p = -hessianB * gradient.reshape(xLen, 1) + gemv(-1.0, hessianB, gradient.reshape(xLen, 1), 0.0, p) + #echo "p iter ", iters, ": ", p + #echo "x iter ", iters, ": ", x + #echo "gradient iter ", iters, ": ", gradient + x += alpha * p + let newGradient = tensorGradient(f, x, fastMode=fastMode) + let sk = alpha * p.reshape(xLen, 1) + + let yk = (newGradient - gradient).reshape(xLen, 1) + let sk_yk_dot = dot(sk.squeeze, yk.squeeze) + # gemm(1.0, hessianB, hessianB, 0.0, newGradient) + # Do the calculation in steps, minimizing allocations + # sk * sk.transpose is matrix × matrix + # how to deal with the vector.T × vector = Matrix? Use gemm? gemm() can be used as += if beta is set to 1.0 + #echo "aim: ", hessianB + (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) * (sk * sk.transpose) - ((hessianB * yk) * sk.transpose + sk * (yk.transpose * hessianB)) / sk_yk_dot + #echo "real Hessian: ", hessianB + (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) * (sk * sk.transpose) + let prefactor1 = (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) + let prefactor2 = 1 / sk_yk_dot + gemv(1.0, hessianB, yk, 0.0, tempVector1) # temp1 = hessianB * yk + gemm(1.0, yk.transpose, hessianB, 0.0, tempVector2) # temp2 = yk.transpose * hessianB + + gemm(prefactor1, sk, sk.transpose, 1.0, hessianB) # hessianB += prefactor1 * sk * sk.transpose + + gemm(-prefactor2, tempVector1, sk.transpose, 1.0, hessianB) # hessianB -= prefactor2 * temp1 * sk.transpose + + gemm(-prefactor2, sk, tempVector2, 1.0, hessianB) # hessianB -= prefactor2 * sk * temp2 + #echo "hessian2: ", hessianB # This is correct + + # somewhere down here the error occurs!↓ + + # reuse vector p: + #gemv(1.0, hessianB, yk, 0.0, tempVector1) # temp1 = hessianB * yk + #echo "checkpoint1: ", tempVector1 # this is incorrect!!! + + # + # + # Rewrite with transposes as gemv instead! + # (A * B)^T = B^T * A^T → yk.transpose * hessianB = transpose(hessianB.transpose * yk) + # + + #gemm(1.0, yk.transpose, hessianB, 0.0, p) + #gemv(1.0, hessianB.transpose, yk, 0.0, tempVector) # p = yk.transpose * hessianB + #tempVector = tempVector.transpose + + + #echo "hessian3: ", hessianB + + #hessianB += (sk_yk_dot + dot(yk.squeeze, squeeze(hessianB * yk))) / (sk_yk_dot * sk_yk_dot) * (sk * sk.transpose) - ((hessianB * yk) * sk.transpose + sk * (yk.transpose * hessianB)) / sk_yk_dot + #tempVector = tempVector.transpose # reverse it back to column vector + + gradient = newGradient + let fx = f(x) + fNorm = abs(fx) + gradNorm = vectorNorm(gradient) + iters += 1 + if iters >= 10000: + discard "Limit of 10000 iterations reached!" + #echo iters, " iterations done!" 
+ result = x + proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = var x = x0.clone() let xLen = x.shape[0] @@ -302,19 +397,24 @@ when isMainModule: proc f1(x: Tensor[float]): float = # result = -20*exp(-0.2*sqrt(0.5 * (x[0]*x[0] + x[1]*x[1]))) - exp(0.5*(cos(2*PI*x[0]) + cos(2*PI*x[1]))) + E + 20 # Ackley result = (1 - x[0])^2 + 100*(x[1] - x[0]^2)^2 - for ix in x: - result += (ix - 1)^2 + if x.shape[0] > 2: + for ix in x[2 .. _]: + result += (ix - 1)^2 #let x0 = [-0.5, 2.0].toTensor - let x0 = ones[float](500) * 0 - let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) + let x0 = ones[float](500) * -1 + #[ let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) echo sol1 echo f1(sol1) echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=false) echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=true) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true) - echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false) + echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false) ]# + echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false) + echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=false) + echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true) + echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) timeIt "steepest slow mode": keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=false) @@ -328,6 +428,10 @@ when isMainModule: keep bfgs(f1, x0, tol=1e-8, fastMode=false) timeIt "bfgs fast mode": keep bfgs(f1, x0, tol=1e-8, fastMode=true) + timeIt "optimized bfgs slow mode": + keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=false) + timeIt "optimized bfgs fast mode": + keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) timeIt "lbfgs slow mode": keep lbfgs(f1, x0, tol=1e-8, fastMode=false) timeIt "lbfgs fast mode": From 49715b2d97a3a80ccd7fb12647ddc8a368451333 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Sat, 19 Feb 2022 12:26:16 +0100 Subject: [PATCH 18/25] line search but it's not fast... --- src/numericalnim/optimize.nim | 119 +++++++++++++++++++++++++++++----- 1 file changed, 103 insertions(+), 16 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index a7279cc..a2a63b0 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -133,8 +133,59 @@ proc eye[T](n: int): Tensor[T] = for i in 0 ..< n: result[i, i] = 1 -proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +type LineSearchCriterion = enum + Armijo, Wolfe, WolfeStrong, None + +proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U], f: proc(x: Tensor[U]): T, criterion: LineSearchCriterion, fastMode: bool = false) = + # note: set initial alpha for the methods as 1 / sqrt(dot(grad, grad)) so first step has length 1. + if criterion == None: + return + var gradient = tensorGradient(f, x0, fastMode=fastMode) + let dirDerivInit = dot(gradient, p) + + if 0 < dirDerivInit: + # p is pointing uphill, use whatever alpha we currently have. + return + + let fInit = f(x0) + var counter = 0 + alpha = 1 + while counter < 20: + let x = x0 + alpha * p + let fx = f(x) + gradient = tensorGradient(f, x, fastMode=fastMode) + counter += 1 + + if fx > fInit + 1e-4*alpha * dirDerivInit: # c1 = 1e-4 multiply as well? 
Doesn't seem to work + alpha *= 0.5 + else: + if criterion == Armijo: + return + let dirDeriv = dot(gradient, p) + if dirDeriv < 0.9 * dirDerivInit: + alpha *= 2.1 + else: + if criterion == Wolfe: + return + if dirDeriv > -0.9*dirDerivInit: + alpha *= 0.5 + else: + return + if alpha < 1e-3: + alpha = 1e-3 + return + elif alpha > 1e2: + alpha = 1e2 + return + + + + + + +proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false, criterion: LineSearchCriterion = None): Tensor[U] = ## Minimize scalar-valued function f. + var alpha = alpha var x = x0.clone() var fNorm = abs(f(x0)) var gradient = tensorGradient(f, x0, fastMode=fastMode) @@ -142,6 +193,7 @@ proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], var iters: int while gradNorm > tol*(1 + fNorm) and iters < 10000: let p = -gradient + line_search(alpha, p, x, f, criterion, fastMode) x += alpha * p let fx = f(x) fNorm = abs(fx) @@ -218,8 +270,9 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = #echo iters, " iterations done!" result = x -proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false, criterion: LineSearchCriterion = None): Tensor[U] = # Use gemm and gemv with preallocated Tensors and setting beta = 0 + var alpha = alpha var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) @@ -240,6 +293,7 @@ proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo "p iter ", iters, ": ", p #echo "x iter ", iters, ": ", x #echo "gradient iter ", iters, ": ", gradient + line_search(alpha, p, x, f, criterion, fastMode) x += alpha * p let newGradient = tensorGradient(f, x, fastMode=fastMode) let sk = alpha * p.reshape(xLen, 1) @@ -296,14 +350,15 @@ proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo iters, " iterations done!" 
result = x -proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false, m: int = 10, criterion: LineSearchCriterion = None): Tensor[U] = + var alpha = alpha var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) var gradient = 0.01*tensorGradient(f, x, fastMode=fastMode) var gradNorm = vectorNorm(gradient) var iters: int - let m = 10 # number of past iterations to save + #let m = 10 # number of past iterations to save var sk_queue = initDeque[Tensor[U]](m) var yk_queue = initDeque[Tensor[T]](m) # the problem is the first iteration as the gradient is huge and no adjustments are made @@ -330,6 +385,7 @@ proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U z = -z let p = z #echo "q: ", q + line_search(alpha, p, x, f, criterion, fastMode) x += alpha * p sk_queue.addFirst alpha*p let newGradient = tensorGradient(f, x, fastMode=fastMode) @@ -402,7 +458,7 @@ when isMainModule: result += (ix - 1)^2 #let x0 = [-0.5, 2.0].toTensor - let x0 = ones[float](500) * -1 + let x0 = ones[float](100) * -1 #[ let sol1 = steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) echo sol1 echo f1(sol1) @@ -410,17 +466,45 @@ when isMainModule: echo "Newton: ", newton(f1, x0, tol=1e-8, fastMode=true) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true) - echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false) ]# + echo "LBFGS:", lbfgs(f1, x0, tol=1e-8, fastMode=false) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=false) echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=false) echo "BFGS: ", bfgs(f1, x0, tol=1e-8, fastMode=true) - echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) + echo "BFGS opt: ", bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) ]# - timeIt "steepest slow mode": - keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=false) - timeIt "steepest fast mode": - keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) - timeIt "newton slow mode": + #[ echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=None) + echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Armijo) + echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Wolfe) + echo bfgs_optimized(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=WolfeStrong) + echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=None) + echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Armijo) + echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Wolfe) + echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=WolfeStrong) ]# + + timeIt "steepest slow mode None": + keep bfgs_optimized(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=None) + + timeIt "steepest slow mode Armijo": + keep bfgs_optimized(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=Armijo) + + timeIt "steepest slow mode Wolfe": + keep bfgs_optimized(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=Wolfe) + + timeIt "steepest slow mode WolfeStrong": + keep bfgs_optimized(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=WolfeStrong) + + timeIt "steepest slow mode None": + keep lbfgs(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=None) + + timeIt "steepest slow mode Armijo": + keep lbfgs(f1, x0, 
tol=1e-8, alpha=1, fastMode=false, criterion=Armijo) + + timeIt "steepest slow mode Wolfe": + keep lbfgs(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=Wolfe) + + timeIt "steepest slow mode WolfeStrong": + keep lbfgs(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=WolfeStrong) +#[ timeIt "newton slow mode": keep newton(f1, x0, tol=1e-8, fastMode=false) timeIt "newton fast mode": keep newton(f1, x0, tol=1e-8, fastMode=true) @@ -428,14 +512,17 @@ when isMainModule: keep bfgs(f1, x0, tol=1e-8, fastMode=false) timeIt "bfgs fast mode": keep bfgs(f1, x0, tol=1e-8, fastMode=true) - timeIt "optimized bfgs slow mode": - keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=false) - timeIt "optimized bfgs fast mode": - keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) timeIt "lbfgs slow mode": keep lbfgs(f1, x0, tol=1e-8, fastMode=false) timeIt "lbfgs fast mode": keep lbfgs(f1, x0, tol=1e-8, fastMode=true) + timeIt "optimized bfgs slow mode": + keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=false) + timeIt "optimized bfgs fast mode": + keep bfgs_optimized(f1, x0, tol=1e-8, fastMode=true) + timeIt "steepest fast mode": + keep steepestDescent(f1, x0, tol=1e-8, alpha=0.001, fastMode=true) ]# + # Lev-Marq: #[ proc fFit(params: Tensor[float], x: float): float = From dea6b390f8f5a31fa66f5a8d20e4f4227d522e29 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Mon, 21 Feb 2022 20:42:31 +0100 Subject: [PATCH 19/25] implement basic tests for all optimization methods --- src/numericalnim/optimize.nim | 4 +- tests/test_optimize.nim | 165 ++++++++++++++++++++++------------ 2 files changed, 111 insertions(+), 58 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index a2a63b0..290c773 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -183,7 +183,7 @@ proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U], f: proc(x: Te -proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.1), tol: U = U(1e-6), fastMode: bool = false, criterion: LineSearchCriterion = None): Tensor[U] = +proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.001), tol: U = U(1e-6), fastMode: bool = false, criterion: LineSearchCriterion = None): Tensor[U] = ## Minimize scalar-valued function f. var alpha = alpha var x = x0.clone() @@ -443,6 +443,8 @@ proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Te gradNorm = newGradNorm resNorm = vectorNorm(residuals) iters += 1 + if iters == 10000: + echo "levmarq reached maximum number of iterations!" 
result = params diff --git a/tests/test_optimize.nim b/tests/test_optimize.nim index fd03379..7b1f5ae 100644 --- a/tests/test_optimize.nim +++ b/tests/test_optimize.nim @@ -1,61 +1,112 @@ -import unittest, math, sequtils, arraymancer +import std/[unittest, math, sequtils, random] +import arraymancer import numericalnim -test "steepest_descent func": - proc df(x: float): float = 4 * x^3 - 9.0 * x^2 - let start = 6.0 - let gamma = 0.01 - let precision = 0.00001 - let max_iters = 10000 - let correct = 2.24996 - let value = steepest_descent(df, start, gamma, precision, max_iters) - check isClose(value, correct, tol = 1e-5) - -test "steepest_descent func starting at zero": - proc df(x: float): float = 4 * x^3 - 9.0 * x^2 + 4 - let start = 0.0 - let correct = -0.59301 - let value = steepest_descent(df, start) - check isClose(value, correct, tol = 1e-5) - -#[ -test "conjugate_gradient func": - var A = toSeq([4.0, 1.0, 1.0, 3.0]).toTensor.reshape(2,2).astype(float64) - var x = toSeq([2.0, 1.0]).toTensor.reshape(2,1) - var b = toSeq([1.0,2.0]).toTensor.reshape(2,1) - let tol = 0.001 - let correct = toSeq([0.090909, 0.636363]).toTensor.reshape(2,1).astype(float64) - - let value = conjugate_gradient(A, b, x, tol) - check isClose(value, correct, tol = 1e-5) -]# -test "Newtons 1 dimension func": - proc f(x:float64): float64 = (1.0 / 3.0) * x ^ 3 - 2 * x ^ 2 + 3 * x - proc df(x:float64): float64 = x ^ 2 - 4 * x + 3 - let x = 0.5 - let correct = 0.0 - let value = newtons(f, df, x, 0.000001, 1000) - check isClose(value, correct, tol=1e-5) - -test "Newtons 1 dimension func default args": - proc f(x:float64): float64 = (1.0 / 3.0) * x ^ 3 - 2 * x ^ 2 + 3 * x - proc df(x:float64): float64 = x ^ 2 - 4 * x + 3 - let x = 0.5 - let correct = 0.0 - let value = newtons(f, df, x) - check isClose(value, correct, tol=1e-5) - -test "Newtons unable to find a root": - proc bad_f(x:float64): float64 = pow(E, x) + 1 - proc bad_df(x:float64): float64 = pow(E, x) - expect(ArithmeticError): - discard newtons(bad_f, bad_df, 0, 0.000001, 1000) - -test "Secant 1 dimension func": - proc f(x:float64): float64 = (1.0 / 3.0) * x ^ 3 - 2 * x ^ 2 + 3 * x - let x = [1.0, 0.5] - let correct = 0.0 - let value = secant(f, x, 1e-5, 10) - check isClose(value, correct, tol=1e-5) +suite "1D": + test "steepest_descent func": + proc df(x: float): float = 4 * x^3 - 9.0 * x^2 + let start = 6.0 + let gamma = 0.01 + let precision = 0.00001 + let max_iters = 10000 + let correct = 2.24996 + let value = steepest_descent(df, start, gamma, precision, max_iters) + check isClose(value, correct, tol = 1e-5) + + test "steepest_descent func starting at zero": + proc df(x: float): float = 4 * x^3 - 9.0 * x^2 + 4 + let start = 0.0 + let correct = -0.59301 + let value = steepest_descent(df, start) + check isClose(value, correct, tol = 1e-5) + + #[ + test "conjugate_gradient func": + var A = toSeq([4.0, 1.0, 1.0, 3.0]).toTensor.reshape(2,2).astype(float64) + var x = toSeq([2.0, 1.0]).toTensor.reshape(2,1) + var b = toSeq([1.0,2.0]).toTensor.reshape(2,1) + let tol = 0.001 + let correct = toSeq([0.090909, 0.636363]).toTensor.reshape(2,1).astype(float64) + + let value = conjugate_gradient(A, b, x, tol) + check isClose(value, correct, tol = 1e-5) + ]# + test "Newtons 1 dimension func": + proc f(x:float64): float64 = (1.0 / 3.0) * x ^ 3 - 2 * x ^ 2 + 3 * x + proc df(x:float64): float64 = x ^ 2 - 4 * x + 3 + let x = 0.5 + let correct = 0.0 + let value = newtons(f, df, x, 0.000001, 1000) + check isClose(value, correct, tol=1e-5) + + test "Newtons 1 dimension func 
default args": + proc f(x:float64): float64 = (1.0 / 3.0) * x ^ 3 - 2 * x ^ 2 + 3 * x + proc df(x:float64): float64 = x ^ 2 - 4 * x + 3 + let x = 0.5 + let correct = 0.0 + let value = newtons(f, df, x) + check isClose(value, correct, tol=1e-5) + + test "Newtons unable to find a root": + proc bad_f(x:float64): float64 = pow(E, x) + 1 + proc bad_df(x:float64): float64 = pow(E, x) + expect(ArithmeticError): + discard newtons(bad_f, bad_df, 0, 0.000001, 1000) + + test "Secant 1 dimension func": + proc f(x:float64): float64 = (1.0 / 3.0) * x ^ 3 - 2 * x ^ 2 + 3 * x + let x = [1.0, 0.5] + let correct = 0.0 + let value = secant(f, x, 1e-5, 10) + check isClose(value, correct, tol=1e-5) + + +############################### +## Multi-dimensional methods ## +############################### + +suite "Multi-dim": + proc bananaFunc(x: Tensor[float]): float = + ## Function of 2 variables with minimum at (1, 1) + ## And it looks like a banana 🍌 + result = (1 - x[0])^2 + 100*(x[1] - x[0]^2)^2 + + let x0 = [-1.0, -1.0].toTensor() + let correct = [1.0, 1.0].toTensor() + + test "Steepest Gradient": + let xSol = steepestDescent(bananaFunc, x0.clone) + for x in abs(correct - xSol): + check x < 2e-2 + + test "Newton": + let xSol = newton(bananaFunc, x0.clone) + for x in abs(correct - xSol): + check x < 3e-10 + + test "BFGS": + let xSol = bfgs(bananaFunc, x0.clone) + for x in abs(correct - xSol): + check x < 3e-7 + + test "L-BFGS": + let xSol = lbfgs(bananaFunc, x0.clone) + for x in abs(correct - xSol): + check x < 7e-10 + + let correctParams = [10.4, -0.45].toTensor() + proc fitFunc(params: Tensor[float], x: float): float = + params[0] * exp(params[1] * x) + + let xData = arraymancer.linspace(0.0, 10.0, 100) + randomize(1337) + let yData = correctParams[0] * exp(correctParams[1] * xData) + randomNormalTensor([100], 0.0, 1e-2) + + let params0 = [0.0, 0.0].toTensor() + + test "levmarq": + let paramsSol = levmarq(fitFunc, params0, xData, yData) + for x in abs(paramsSol - correctParams): + check x < 9e-4 From 2725dfc065584d01e9fcc1bc34e11abc0036c6a3 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Tue, 22 Feb 2022 16:29:34 +0100 Subject: [PATCH 20/25] increase tolerance to please 1.4.8 --- tests/test_optimize.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_optimize.nim b/tests/test_optimize.nim index 7b1f5ae..71aacdd 100644 --- a/tests/test_optimize.nim +++ b/tests/test_optimize.nim @@ -107,6 +107,6 @@ suite "Multi-dim": test "levmarq": let paramsSol = levmarq(fitFunc, params0, xData, yData) for x in abs(paramsSol - correctParams): - check x < 9e-4 + check x < 1.3e-3 From 772097c1bdce79e9d260e0606764dd110d575cad Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Tue, 22 Feb 2022 18:52:56 +0100 Subject: [PATCH 21/25] add OptimOptions type --- src/numericalnim/optimize.nim | 74 +++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index 290c773..edca85d 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -123,6 +123,23 @@ proc secant*(f: proc(x: float64): float64, start: array[2, float64], precision: ## Multidimensional methods ## ############################## +type LineSearchCriterion = enum + Armijo, Wolfe, WolfeStrong, NoLineSearch + +type OptimOptions*[U] = object + tol*, alpha*, lambda0*: U + fastMode*: bool + maxIterations*: int + 
lineSearchCriterion*: LineSearchCriterion + +proc optimOptions*[U](tol: U = U(1e-6), alpha: U = U(1), lambda0: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U] = + result.tol = tol + result.alpha = alpha + result.lambda0 = lambda0 + result.fastMode = fastMode + result.maxIterations = maxIterations + result.lineSearchCriterion = lineSearchCriterion + proc vectorNorm*[T](v: Tensor[T]): T = ## Calculates the norm of the vector, ie the sqrt(Σ vᵢ²) assert v.rank == 1, "v must be a 1d vector!" @@ -133,12 +150,9 @@ proc eye[T](n: int): Tensor[T] = for i in 0 ..< n: result[i, i] = 1 -type LineSearchCriterion = enum - Armijo, Wolfe, WolfeStrong, None - proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U], f: proc(x: Tensor[U]): T, criterion: LineSearchCriterion, fastMode: bool = false) = # note: set initial alpha for the methods as 1 / sqrt(dot(grad, grad)) so first step has length 1. - if criterion == None: + if criterion == NoLineSearch: return var gradient = tensorGradient(f, x0, fastMode=fastMode) let dirDerivInit = dot(gradient, p) @@ -183,21 +197,21 @@ proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U], f: proc(x: Te -proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(0.001), tol: U = U(1e-6), fastMode: bool = false, criterion: LineSearchCriterion = None): Tensor[U] = +proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U] = optimOptions[U](alpha = U(0.001))): Tensor[U] = ## Minimize scalar-valued function f. - var alpha = alpha + var alpha = options.alpha var x = x0.clone() var fNorm = abs(f(x0)) - var gradient = tensorGradient(f, x0, fastMode=fastMode) + var gradient = tensorGradient(f, x0, fastMode=options.fastMode) var gradNorm = vectorNorm(gradient) var iters: int - while gradNorm > tol*(1 + fNorm) and iters < 10000: + while gradNorm > options.tol*(1 + fNorm) and iters < 10000: let p = -gradient - line_search(alpha, p, x, f, criterion, fastMode) + line_search(alpha, p, x, f, options.lineSearchCriterion, options.fastMode) x += alpha * p let fx = f(x) fNorm = abs(fx) - gradient = tensorGradient(f, x, fastMode=fastMode) + gradient = tensorGradient(f, x, fastMode=options.fastMode) gradNorm = vectorNorm(gradient) iters += 1 if iters >= 10000: @@ -205,19 +219,21 @@ proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo iters, " iterations done!" 
result = x -proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U] = optimOptions[U]()): Tensor[U] = + var alpha = options.alpha var x = x0.clone() var fNorm = abs(f(x)) - var gradient = tensorGradient(f, x, fastMode=fastMode) + var gradient = tensorGradient(f, x, fastMode=options.fastMode) var gradNorm = vectorNorm(gradient) var hessian = tensorHessian(f, x) var iters: int - while gradNorm > tol*(1 + fNorm) and iters < 10000: + while gradNorm > options.tol*(1 + fNorm) and iters < 10000: let p = -solve(hessian, gradient) + line_search(alpha, p, x, f, options.lineSearchCriterion, options.fastMode) x += alpha * p let fx = f(x) fNorm = abs(fx) - gradient = tensorGradient(f, x, fastMode=fastMode) + gradient = tensorGradient(f, x, fastMode=options.fastMode) gradNorm = vectorNorm(gradient) hessian = tensorHessian(f, x) iters += 1 @@ -226,7 +242,7 @@ proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U #echo iters, " iterations done!" result = x -proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc bfgs_old*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) @@ -270,20 +286,20 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = #echo iters, " iterations done!" result = x -proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false, criterion: LineSearchCriterion = None): Tensor[U] = +proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U] = optimOptions[U]()): Tensor[U] = # Use gemm and gemv with preallocated Tensors and setting beta = 0 - var alpha = alpha + var alpha = options.alpha var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) - var gradient = 0.01*tensorGradient(f, x, fastMode=fastMode) + var gradient = 0.01*tensorGradient(f, x, fastMode=options.fastMode) var gradNorm = vectorNorm(gradient) var hessianB = eye[T](xLen) # inverse of the approximated hessian var p = newTensor[U](xLen) var tempVector1 = newTensor[U](xLen, 1) var tempVector2 = newTensor[U](1, xLen) var iters: int - while gradNorm > tol*(1 + fNorm) and iters < 10000: + while gradNorm > options.tol*(1 + fNorm) and iters < 10000: # We are using hessianB in calculating it so we are modifying it prior to its use! @@ -293,9 +309,9 @@ proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo "p iter ", iters, ": ", p #echo "x iter ", iters, ": ", x #echo "gradient iter ", iters, ": ", gradient - line_search(alpha, p, x, f, criterion, fastMode) + line_search(alpha, p, x, f, options.lineSearchCriterion, options.fastMode) x += alpha * p - let newGradient = tensorGradient(f, x, fastMode=fastMode) + let newGradient = tensorGradient(f, x, fastMode=options.fastMode) let sk = alpha * p.reshape(xLen, 1) let yk = (newGradient - gradient).reshape(xLen, 1) @@ -350,19 +366,19 @@ proc bfgs_optimized*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo iters, " iterations done!" 
result = x -proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false, m: int = 10, criterion: LineSearchCriterion = None): Tensor[U] = - var alpha = alpha +proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = 10, options: OptimOptions[U] = optimOptions[U]()): Tensor[U] = + var alpha = options.alpha var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) - var gradient = 0.01*tensorGradient(f, x, fastMode=fastMode) + var gradient = 0.01*tensorGradient(f, x, fastMode=options.fastMode) var gradNorm = vectorNorm(gradient) var iters: int #let m = 10 # number of past iterations to save var sk_queue = initDeque[Tensor[U]](m) var yk_queue = initDeque[Tensor[T]](m) # the problem is the first iteration as the gradient is huge and no adjustments are made - while gradNorm > tol*(1 + fNorm) and iters < 10000: + while gradNorm > options.tol*(1 + fNorm) and iters < 10000: #echo "grad: ", gradient #echo "x: ", x var q = gradient.clone() @@ -385,10 +401,10 @@ proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U z = -z let p = z #echo "q: ", q - line_search(alpha, p, x, f, criterion, fastMode) + line_search(alpha, p, x, f, options.lineSearchCriterion, options.fastMode) x += alpha * p sk_queue.addFirst alpha*p - let newGradient = tensorGradient(f, x, fastMode=fastMode) + let newGradient = tensorGradient(f, x, fastMode=options.fastMode) let yk = newGradient - gradient yk_queue.addFirst yk gradient = newGradient @@ -483,7 +499,7 @@ when isMainModule: echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=Wolfe) echo lbfgs(f1, x0, tol=1e-8, alpha=0.001, fastMode=false, criterion=WolfeStrong) ]# - timeIt "steepest slow mode None": + #[ timeIt "steepest slow mode None": keep bfgs_optimized(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=None) timeIt "steepest slow mode Armijo": @@ -505,7 +521,7 @@ when isMainModule: keep lbfgs(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=Wolfe) timeIt "steepest slow mode WolfeStrong": - keep lbfgs(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=WolfeStrong) + keep lbfgs(f1, x0, tol=1e-8, alpha=1, fastMode=false, criterion=WolfeStrong) ]# #[ timeIt "newton slow mode": keep newton(f1, x0, tol=1e-8, fastMode=false) timeIt "newton fast mode": From 49f8050ba02ccd6dd75858560318ccde6729aed4 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Wed, 23 Feb 2022 22:24:44 +0100 Subject: [PATCH 22/25] improve options types with specialization for each methods (inspired by @Vindaar) --- src/numericalnim/optimize.nim | 90 +++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index edca85d..cbc54d2 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -126,13 +126,20 @@ proc secant*(f: proc(x: float64): float64, start: array[2, float64], precision: type LineSearchCriterion = enum Armijo, Wolfe, WolfeStrong, NoLineSearch -type OptimOptions*[U] = object - tol*, alpha*, lambda0*: U - fastMode*: bool - maxIterations*: int - lineSearchCriterion*: LineSearchCriterion - -proc optimOptions*[U](tol: U = U(1e-6), alpha: U = U(1), lambda0: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U] = +type + OptimOptions*[U, AO] = object + tol*, alpha*: U + fastMode*: bool + maxIterations*: 
int + lineSearchCriterion*: LineSearchCriterion + algoOptions*: AO + StandardOptions* = object + LevMarqOptions*[U] = object + lambda0*: U + LBFGSOptions*[U] = object + savedIterations*: int + +proc optimOptions*[U](tol: U = U(1e-6), alpha: U = U(1), lambda0: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U, StandardOptions] = result.tol = tol result.alpha = alpha result.lambda0 = lambda0 @@ -140,6 +147,46 @@ proc optimOptions*[U](tol: U = U(1e-6), alpha: U = U(1), lambda0: U = U(1), fast result.maxIterations = maxIterations result.lineSearchCriterion = lineSearchCriterion +proc steepestDescentOptions*[U](tol: U = U(1e-6), alpha: U = U(0.001), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U, StandardOptions] = + result.tol = tol + result.alpha = alpha + result.fastMode = fastMode + result.maxIterations = maxIterations + result.lineSearchCriterion = lineSearchCriterion + +proc newtonOptions*[U](tol: U = U(1e-6), alpha: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U, StandardOptions] = + result.tol = tol + result.alpha = alpha + result.fastMode = fastMode + result.maxIterations = maxIterations + result.lineSearchCriterion = lineSearchCriterion + +proc bfgsOptions*[U](tol: U = U(1e-6), alpha: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U, StandardOptions] = + result.tol = tol + result.alpha = alpha + result.fastMode = fastMode + result.maxIterations = maxIterations + result.lineSearchCriterion = lineSearchCriterion + +proc lbfgsOptions*[U](savedIterations: int = 10, tol: U = U(1e-6), alpha: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U, LBFGSOptions[U]] = + result.tol = tol + result.alpha = alpha + result.fastMode = fastMode + result.maxIterations = maxIterations + result.lineSearchCriterion = lineSearchCriterion + result.algoOptions.savedIterations = savedIterations + +proc levmarqOptions*[U](lambda0: U = U(1), tol: U = U(1e-6), alpha: U = U(1), fastMode: bool = false, maxIterations: int = 10000, lineSearchCriterion: LineSearchCriterion = NoLineSearch): OptimOptions[U, LevMarqOptions[U]] = + result.tol = tol + result.alpha = alpha + result.fastMode = fastMode + result.maxIterations = maxIterations + result.lineSearchCriterion = lineSearchCriterion + result.algoOptions.lambda0 = lambda0 + + + + proc vectorNorm*[T](v: Tensor[T]): T = ## Calculates the norm of the vector, ie the sqrt(Σ vᵢ²) assert v.rank == 1, "v must be a 1d vector!" @@ -193,11 +240,7 @@ proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U], f: proc(x: Te return - - - - -proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U] = optimOptions[U](alpha = U(0.001))): Tensor[U] = +proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = steepestDescentOptions[U]()): Tensor[U] = ## Minimize scalar-valued function f. var alpha = options.alpha var x = x0.clone() @@ -219,7 +262,7 @@ proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo iters, " iterations done!" 
result = x -proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U] = optimOptions[U]()): Tensor[U] = +proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = newtonOptions[U]()): Tensor[U] = var alpha = options.alpha var x = x0.clone() var fNorm = abs(f(x)) @@ -286,7 +329,7 @@ proc bfgs_old*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: #echo iters, " iterations done!" result = x -proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U] = optimOptions[U]()): Tensor[U] = +proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = bfgsOptions[U]()): Tensor[U] = # Use gemm and gemv with preallocated Tensors and setting beta = 0 var alpha = options.alpha var x = x0.clone() @@ -366,7 +409,7 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: O #echo iters, " iterations done!" result = x -proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = 10, options: OptimOptions[U] = optimOptions[U]()): Tensor[U] = +proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = 10, options: OptimOptions[U, LBFGSOptions[U]] = lbfgsOptions[U]()): Tensor[U] = var alpha = options.alpha var x = x0.clone() let xLen = x.shape[0] @@ -420,7 +463,7 @@ proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = #echo iters, " iterations done!" result = x -proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], alpha = U(1), tol: U = U(1e-6), lambda0: U = U(1), fastMode = false): Tensor[U] = +proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], options: OptimOptions[U, LevmarqOptions[U]] = levmarqOptions[U]()): Tensor[U] = assert xData.rank == 1 assert yData.rank == 1 assert params0.rank == 1 @@ -434,21 +477,26 @@ proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Te result = map2_inline(xData, yData): f(params, x) - y - var lambdaCoeff = lambda0 + let errorFunc = # proc that returns the scalar error + proc (params: Tensor[U]): T = + let res = residualFunc(params) + result = dot(res, res) + + var lambdaCoeff = options.algoOptions.lambda0 var params = params0.clone() - var gradient = tensorGradient(residualFunc, params, fastMode=fastMode) + var gradient = tensorGradient(residualFunc, params, fastMode=options.fastMode) var residuals = residualFunc(params) var resNorm = vectorNorm(residuals) var gradNorm = vectorNorm(squeeze(gradient * residuals.reshape(xLen, 1))) var iters: int let eyeNN = eye[T](paramsLen) - while gradNorm > tol*(1 + resNorm) and iters < 10000: + while gradNorm > options.tol*(1 + resNorm) and iters < 10000: let rhs = -gradient * residuals.reshape(xLen, 1) let lhs = gradient * gradient.transpose + lambdaCoeff * eyeNN let p = solve(lhs, rhs) - params += p * alpha - gradient = tensorGradient(residualFunc, params, fastMode=fastMode) + params += p * options.alpha + gradient = tensorGradient(residualFunc, params, fastMode=options.fastMode) residuals = residualFunc(params) let newGradNorm = vectorNorm(squeeze(gradient * residuals.reshape(xLen, 1))) if newGradNorm / gradNorm < 0.9: # we have improved, decrease lambda → more Gauss-Newton From 147d7a440653b3f333e3bddd0c964d5210f266b1 Mon Sep 17 00:00:00 2001 From: HugoGranstrom 
<5092565+HugoGranstrom@users.noreply.github.com> Date: Sun, 6 Mar 2022 19:06:19 +0100 Subject: [PATCH 23/25] checkGradient --- src/numericalnim/differentiate.nim | 23 ++++++++++++++++++++--- tests/test_differentiate.nim | 4 ++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/numericalnim/differentiate.nim b/src/numericalnim/differentiate.nim index 51dfdca..733f018 100644 --- a/src/numericalnim/differentiate.nim +++ b/src/numericalnim/differentiate.nim @@ -1,3 +1,4 @@ +import std/strformat import arraymancer proc diff1dForward*[U, T](f: proc(x: U): T, x0: U, h: U = U(1e-6)): T = @@ -139,9 +140,25 @@ proc tensorHessian*[U; T: not Tensor]( result[i, j] = mixed result[j, i] = mixed - - - +proc checkGradient*[U; T: not Tensor](f: proc(x: Tensor[U]): T, fGrad: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], tol: T): bool = + ## Checks if the provided gradient function `fGrad` gives the same values as numeric gradient. + let numGrad = tensorGradient(f, x0) + let grad = fGrad(x0) + result = true + for i, x in abs(numGrad - grad): + if x > tol: + echo fmt"Gradient at index {i[0]} has error: {x} (tol = {tol})" + result = false + +proc checkGradient*[U; T](f: proc(x: Tensor[U]): Tensor[T], fGrad: proc(x: Tensor[U]): Tensor[T], x0: Tensor[U], tol: T): bool = + ## Checks if the provided gradient function `fGrad` gives the same values as numeric gradient. + let numGrad = tensorGradient(f, x0) + let grad = fGrad(x0) + result = true + for i, x in abs(numGrad - grad): + if x > tol: + echo fmt"Gradient at index {i[0]} has error: {x} (tol = {tol})" + result = false diff --git a/tests/test_differentiate.nim b/tests/test_differentiate.nim index cd7bb4f..e150313 100644 --- a/tests/test_differentiate.nim +++ b/tests/test_differentiate.nim @@ -140,6 +140,10 @@ suite "Multi dimensional numeric gradients": for err in abs(numJacobian - exact): check err < 1e-10 + test "checkGradient": + check checkGradient(fScalar, scalarGradient, [0.5, 0.5, 0.5].toTensor, 6e-11) + check checkGradient(fMultidim, multidimGradient, [0.5, 0.5, 0.5].toTensor, 4e-12) + test "Hessian scalar valued function": for x in numericalnim.linspace(0, 1, 10): for y in numericalnim.linspace(0, 1, 10): From 5ec1dd24725b5105c0aa7d044f67453ee41f6659 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Sun, 8 May 2022 22:20:32 +0200 Subject: [PATCH 24/25] allow supplying the analytic gradient --- src/numericalnim/optimize.nim | 31 ++++++++++++++++++------------- tests/test_optimize.nim | 28 ++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/numericalnim/optimize.nim b/src/numericalnim/optimize.nim index cbc54d2..18a0963 100644 --- a/src/numericalnim/optimize.nim +++ b/src/numericalnim/optimize.nim @@ -239,13 +239,18 @@ proc line_search*[U, T](alpha: var U, p: Tensor[T], x0: Tensor[U], f: proc(x: Te alpha = 1e2 return +template analyticOrNumericGradient(analytic, f, x, options: untyped): untyped = + if analytic.isNil: + tensorGradient(f, x, fastMode=options.fastMode) + else: + analytic(x) -proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = steepestDescentOptions[U]()): Tensor[U] = +proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = steepestDescentOptions[U](), analyticGradient: proc(x: Tensor[U]): Tensor[T] = nil): Tensor[U] = ## Minimize scalar-valued function f. 
var alpha = options.alpha var x = x0.clone() var fNorm = abs(f(x0)) - var gradient = tensorGradient(f, x0, fastMode=options.fastMode) + var gradient = analyticOrNumericGradient(analyticGradient, f, x0, options) #tensorGradient(f, x0, fastMode=options.fastMode) var gradNorm = vectorNorm(gradient) var iters: int while gradNorm > options.tol*(1 + fNorm) and iters < 10000: @@ -254,7 +259,7 @@ proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], x += alpha * p let fx = f(x) fNorm = abs(fx) - gradient = tensorGradient(f, x, fastMode=options.fastMode) + gradient = analyticOrNumericGradient(analyticGradient, f, x, options) #tensorGradient(f, x, fastMode=options.fastMode) gradNorm = vectorNorm(gradient) iters += 1 if iters >= 10000: @@ -262,11 +267,11 @@ proc steepestDescent*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], #echo iters, " iterations done!" result = x -proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = newtonOptions[U]()): Tensor[U] = +proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = newtonOptions[U](), analyticGradient: proc(x: Tensor[U]): Tensor[T] = nil): Tensor[U] = var alpha = options.alpha var x = x0.clone() var fNorm = abs(f(x)) - var gradient = tensorGradient(f, x, fastMode=options.fastMode) + var gradient = analyticOrNumericGradient(analyticGradient, f, x0, options) var gradNorm = vectorNorm(gradient) var hessian = tensorHessian(f, x) var iters: int @@ -276,7 +281,7 @@ proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: x += alpha * p let fx = f(x) fNorm = abs(fx) - gradient = tensorGradient(f, x, fastMode=options.fastMode) + gradient = analyticOrNumericGradient(analyticGradient, f, x, options) gradNorm = vectorNorm(gradient) hessian = tensorHessian(f, x) iters += 1 @@ -285,7 +290,7 @@ proc newton*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: #echo iters, " iterations done!" result = x -proc bfgs_old*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false): Tensor[U] = +proc bfgs_old*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: U = U(1), tol: U = U(1e-6), fastMode: bool = false, analyticGradient: proc(x: Tensor[U]): Tensor[T] = nil): Tensor[U] = var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) @@ -329,13 +334,13 @@ proc bfgs_old*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], alpha: #echo iters, " iterations done!" 
result = x -proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = bfgsOptions[U]()): Tensor[U] = +proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = bfgsOptions[U](), analyticGradient: proc(x: Tensor[U]): Tensor[T] = nil): Tensor[U] = # Use gemm and gemv with preallocated Tensors and setting beta = 0 var alpha = options.alpha var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) - var gradient = 0.01*tensorGradient(f, x, fastMode=options.fastMode) + var gradient = 0.01*analyticOrNumericGradient(analyticGradient, f, x0, options) var gradNorm = vectorNorm(gradient) var hessianB = eye[T](xLen) # inverse of the approximated hessian var p = newTensor[U](xLen) @@ -354,7 +359,7 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: O #echo "gradient iter ", iters, ": ", gradient line_search(alpha, p, x, f, options.lineSearchCriterion, options.fastMode) x += alpha * p - let newGradient = tensorGradient(f, x, fastMode=options.fastMode) + let newGradient = analyticOrNumericGradient(analyticGradient, f, x, options) #tensorGradient(f, x, fastMode=options.fastMode) let sk = alpha * p.reshape(xLen, 1) let yk = (newGradient - gradient).reshape(xLen, 1) @@ -409,12 +414,12 @@ proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: O #echo iters, " iterations done!" result = x -proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = 10, options: OptimOptions[U, LBFGSOptions[U]] = lbfgsOptions[U]()): Tensor[U] = +proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = 10, options: OptimOptions[U, LBFGSOptions[U]] = lbfgsOptions[U](), analyticGradient: proc(x: Tensor[U]): Tensor[T] = nil): Tensor[U] = var alpha = options.alpha var x = x0.clone() let xLen = x.shape[0] var fNorm = abs(f(x)) - var gradient = 0.01*tensorGradient(f, x, fastMode=options.fastMode) + var gradient = 0.01*analyticOrNumericGradient(analyticGradient, f, x0, options) var gradNorm = vectorNorm(gradient) var iters: int #let m = 10 # number of past iterations to save @@ -447,7 +452,7 @@ proc lbfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], m: int = line_search(alpha, p, x, f, options.lineSearchCriterion, options.fastMode) x += alpha * p sk_queue.addFirst alpha*p - let newGradient = tensorGradient(f, x, fastMode=options.fastMode) + let newGradient = analyticOrNumericGradient(analyticGradient, f, x, options) let yk = newGradient - gradient yk_queue.addFirst yk gradient = newGradient diff --git a/tests/test_optimize.nim b/tests/test_optimize.nim index 71aacdd..329cdfe 100644 --- a/tests/test_optimize.nim +++ b/tests/test_optimize.nim @@ -70,30 +70,58 @@ suite "Multi-dim": ## Function of 2 variables with minimum at (1, 1) ## And it looks like a banana 🍌 result = (1 - x[0])^2 + 100*(x[1] - x[0]^2)^2 + + proc bananaBend(x: Tensor[float]): Tensor[float] = + ## Calculates the gradient of the banana function + result = newTensor[float](2) + result[0] = -2 * (1 - x[0]) + 100 * 2 * (x[1] - x[0]*x[0]) * -2 * x[0] # this one is wrong + result[1] = 100 * 2 * (x[1] - x[0]*x[0]) let x0 = [-1.0, -1.0].toTensor() let correct = [1.0, 1.0].toTensor() + doAssert checkGradient(bananaFunc, bananaBend, x0, 1e-6), "Analytic gradient is wrong in test!" 
+ test "Steepest Gradient": let xSol = steepestDescent(bananaFunc, x0.clone) for x in abs(correct - xSol): check x < 2e-2 + + test "Steepest Gradient analytic": + let xSol = steepestDescent(bananaFunc, x0.clone, analyticGradient=bananaBend) + for x in abs(correct - xSol): + check x < 2e-2 test "Newton": let xSol = newton(bananaFunc, x0.clone) for x in abs(correct - xSol): check x < 3e-10 + test "Newton analytic": + let xSol = newton(bananaFunc, x0.clone, analyticGradient=bananaBend) + for x in abs(correct - xSol): + check x < 3e-10 + test "BFGS": let xSol = bfgs(bananaFunc, x0.clone) for x in abs(correct - xSol): check x < 3e-7 + test "BFGS analytic": + let xSol = bfgs(bananaFunc, x0.clone, analyticGradient=bananaBend) + for x in abs(correct - xSol): + check x < 3e-7 + test "L-BFGS": let xSol = lbfgs(bananaFunc, x0.clone) for x in abs(correct - xSol): check x < 7e-10 + test "L-BFGS analytic": + let xSol = lbfgs(bananaFunc, x0.clone, analyticGradient=bananaBend) + for x in abs(correct - xSol): + check x < 7e-10 + let correctParams = [10.4, -0.45].toTensor() proc fitFunc(params: Tensor[float], x: float): float = params[0] * exp(params[1] * x) From d9827d5f6c6c6070b3504cc46bfa329336cc19a6 Mon Sep 17 00:00:00 2001 From: HugoGranstrom <5092565+HugoGranstrom@users.noreply.github.com> Date: Mon, 9 May 2022 13:21:58 +0200 Subject: [PATCH 25/25] bump version and update changelog --- changelog.md | 30 ++++++++++++++++++++++++++++++ numericalnim.nimble | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 60ffe1c..2265348 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,33 @@ +# v0.8.0 - 09.05.2022 +## Optimization has joined the chat +Multi-variate optimization and differentiation has been introduced. + +- `numericalnim/differentiate` offers `tensorGradient(f, x)` which calculates the gradient of `f` w.r.t `x` using finite differences, `tensorJacobian` (returns the transpose of the gradient), `tensorHessian`, `mixedDerivative`. It also provides `checkGradient(f, analyticGrad, x, tol)` to verify that the analytic gradient is correct by comparing it to the finite difference approximation. +- `numericalnim/optimize` now has several multi-variate optimization methods: + - `steepestDescent` + - `newton` + - `bfgs` + - `lbfgs` + - They all have the function signatures like: + ```nim + proc bfgs*[U; T: not Tensor](f: proc(x: Tensor[U]): T, x0: Tensor[U], options: OptimOptions[U, StandardOptions] = bfgsOptions[U](), analyticGradient: proc(x: Tensor[U]): Tensor[T] = nil): Tensor[U] + ``` + where `f` is the function to be minimized, `x0` is the starting guess, `options` contain options like tolerance (each method has it own options type which can be created by for example `lbfgsOptions` or `newtonOptions`), `analyticGradient` can be supplied to avoid having to do finite difference approximations of the derivatives. + - There are 4 different line search methods supported and those are set in the `options`: `Armijo, Wolfe, WolfeStrong, NoLineSearch`. + - `levmarq`: non-linear least-square optimizer + ```nim + proc levmarq*[U; T: not Tensor](f: proc(params: Tensor[U], x: U): T, params0: Tensor[U], xData: Tensor[U], yData: Tensor[T], options: OptimOptions[U, LevmarqOptions[U]] = levmarqOptions[U]()): Tensor[U] + ``` + - `f` is the function you want to fit to the parameters in `param` and `x` is the value to evaluate the function at. 
+ - `params0` is the initial guess for the parameters + - `xData` is a 1D Tensor with the x points and `yData` is a 1D Tensor with the y points. + - `options` can be created using `levmarqOptions`. + - Returns the final parameters + + +Note: There are basic tests to ensure these methods converge for simple problems, but they are not tested on more complex problems and should be considered experimental until more tests have been done. Please try them out, but don't rely on them for anything important for now. Also, the API isn't set in stone yet so expect that it may change in future versions. + + # v0.7.1 -25.01.2022 Add a `nimCI` task for the Nim CI to run now that the tests have external dependencies. diff --git a/numericalnim.nimble b/numericalnim.nimble index e544b42..f915240 100644 --- a/numericalnim.nimble +++ b/numericalnim.nimble @@ -1,5 +1,5 @@ # Package Information -version = "0.7.1" +version = "0.8.0" author = "Hugo Granström" description = "A collection of numerical methods written in Nim. Current features: integration, ode, optimization." license = "MIT"
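
For reference, below is a minimal usage sketch of the v0.8.0 API introduced by this patch series. It is not part of the patches themselves: it assumes the procs and option constructors are exported exactly as shown in the diffs above, and the names `rosenbrock`, `rosenbrockGrad` and `expModel` are illustrative only. The test function, data and calls mirror those in tests/test_optimize.nim.

```nim
# Usage sketch of the v0.8.0 optimization/differentiation API (assumption:
# numericalnim with these patches plus arraymancer are installed).
import std/math
import arraymancer
import numericalnim

proc rosenbrock(x: Tensor[float]): float =
  ## Banana-shaped test function with minimum at (1, 1), as in tests/test_optimize.nim.
  (1 - x[0])^2 + 100*(x[1] - x[0]^2)^2

proc rosenbrockGrad(x: Tensor[float]): Tensor[float] =
  ## Analytic gradient of `rosenbrock`.
  result = newTensor[float](2)
  result[0] = -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]^2)
  result[1] = 200*(x[1] - x[0]^2)

let x0 = [-1.0, -1.0].toTensor()

# Verify the analytic gradient against the finite-difference gradient.
doAssert checkGradient(rosenbrock, rosenbrockGrad, x0, 1e-6)

# Quasi-Newton minimization, with and without the analytic gradient.
let solBfgs = bfgs(rosenbrock, x0.clone(), options = bfgsOptions[float](tol = 1e-8))
let solLbfgs = lbfgs(rosenbrock, x0.clone(), analyticGradient = rosenbrockGrad)
echo solBfgs   # both should be close to [1.0, 1.0]
echo solLbfgs

# Non-linear least squares: fit a*exp(b*x) to synthetic data with levmarq.
proc expModel(params: Tensor[float], x: float): float =
  params[0] * exp(params[1] * x)

let xData = arraymancer.linspace(0.0, 10.0, 100)
let yData = 10.4 * exp(-0.45 * xData)   # elementwise exp on the tensor
let fitted = levmarq(expModel, [0.0, 0.0].toTensor(), xData, yData)
echo fitted    # should be close to [10.4, -0.45]
```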