diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..75cf67a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea/*
+__pycache__
diff --git a/LICENCE.txt b/LICENCE.txt
new file mode 100644
index 0000000..df865b2
--- /dev/null
+++ b/LICENCE.txt
@@ -0,0 +1,12 @@
+(C) Copyright 1999 - 2006, Carl Edward Rasmussen
+
+Permission is granted for anyone to copy, use, or modify these
+programs and accompanying documents for purposes of research or
+education, provided this copyright notice is retained, and note is
+made of any changes that have been made.
+
+These programs and documents are distributed without any warranty,
+express or implied. As the programs were written for research
+purposes only, they have not been tested to the degree that would be
+advisable in any important application. All use of these programs is
+entirely at the user's own risk.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a224caa
--- /dev/null
+++ b/README.md
@@ -0,0 +1,56 @@
+# Minimize [![Python 3.6+](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/)
+
+This repository is a Python implementation of C.E. Rasmussen's [minimize](http://learning.eng.cam.ac.uk/carl/code/minimize/) function, which finds a (local) minimum of a (nonlinear) multivariate function. The function uses conjugate gradients and approximate linesearches based on polynomial interpolation with Wolfe-Powell conditions. The user supplies a function which returns the function value as well as the partial derivatives with respect to the variables to be minimized.
+
+Notes:
+- Tested for Python >= 3.6
+
+## Usage
+
+Here the two-dimensional [Rosenbrock](https://en.wikipedia.org/wiki/Rosenbrock_function) function is used to show how minimize works. The function returns the function value and the partial derivatives with respect to the variables to be minimized.
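+
+A minimal sketch of such a function, taken from the general-dimension implementation in `test/rosenbrock.py`:
+
+```python
+import numpy as np
+
+def rosenbrock(x):
+    # f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1 - x(i))^2
+    D = len(x)
+    f = np.sum(100 * (x[1:D] - x[:D-1] ** 2) ** 2 + (1 - x[:D-1]) ** 2)
+    df = np.zeros((D, 1))  # column vector of partial derivatives
+    df[:D-1] = -400 * x[:D-1] * (x[1:D] - x[:D-1] ** 2) - 2 * (1 - x[:D-1])
+    df[1:D] += 200 * (x[1:D] - x[:D-1] ** 2)
+    return f, df
+```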
+
+Starting from initial conditions X0 = np.array([[-1], [0]]) and length = 100 "linesearches":
+
+```
+>> X, convergence, i = minimize(rosenbrock, X0, length=100)
+
+X = array([[1.],
+           [1.]])
+
+convergence = array([[ 1.04000000e+02, -1.00000000e+00,  0.00000000e+00],
+                     [ 2.37965977e+00, -5.16910495e-01,  2.39153220e-01],
+                     [ 2.32706339e+00, -5.24606889e-01,  2.70076996e-01],
+                     [ 2.29625954e+00, -5.10173722e-01,  2.72781176e-01],
+                     [ 1.81191347e+00, -3.05092426e-01,  6.01197093e-02],
+                     [ 1.71447070e+00, -2.19978689e-01,  8.38263203e-04],
+                     [ 6.91491002e-01,  1.87118858e-01,  1.74877000e-02],
+                     [ 6.76960265e-01,  1.78542297e-01,  2.72217021e-02],
+                     [ 4.89960939e-01,  3.46292785e-01,  9.48931429e-02],
+                     [ 3.51192555e-01,  4.84116914e-01,  2.05204619e-01],
+                     [ 2.54267052e-01,  4.96129750e-01,  2.48098759e-01],
+                     [ 1.69404850e-01,  6.14217154e-01,  3.62918220e-01],
+                     [ 1.16369088e-01,  7.14147306e-01,  4.91389897e-01],
+                     [ 6.91860437e-02,  7.37951393e-01,  5.46845078e-01],
+                     [ 4.18391999e-02,  8.13230887e-01,  6.53003913e-01],
+                     [ 3.04624305e-02,  8.66986583e-01,  7.40365353e-01],
+                     [ 1.13408338e-02,  8.96263305e-01,  8.05695260e-01],
+                     [ 3.87028837e-03,  9.46723116e-01,  8.93072397e-01],
+                     [ 9.20534056e-04,  9.86675590e-01,  9.70802928e-01],
+                     [ 2.32598777e-04,  9.84776380e-01,  9.69692858e-01],
+                     [ 2.17647876e-06,  9.99792629e-01,  9.99439237e-01],
+                     [ 2.39305242e-08,  9.99883820e-01,  9.99777867e-01],
+                     [ 2.08976920e-08,  9.99932591e-01,  9.99877974e-01],
+                     [ 3.49850571e-09,  9.99940971e-01,  9.99881570e-01],
+                     [ 1.46880712e-15,  1.00000000e+00,  1.00000000e+00],
+                     [ 3.28531626e-18,  1.00000000e+00,  1.00000000e+00],
+                     [ 1.09458505e-20,  1.00000000e+00,  1.00000000e+00],
+                     [ 3.45589993e-21,  1.00000000e+00,  1.00000000e+00],
+                     [ 3.44606626e-21,  1.00000000e+00,  1.00000000e+00],
+                     [ 3.38668373e-21,  1.00000000e+00,  1.00000000e+00],
+                     [ 2.51712978e-24,  1.00000000e+00,  1.00000000e+00],
+                     [ 4.43734259e-31,  1.00000000e+00,  1.00000000e+00]])
+
+i = 33
+```
+
+The minimum of the function occurs at X = [1., 1.] with a function value of 0 and is found after 33 linesearches. The convergence array returned by minimize has the function values in the first column and the parameters being minimized in the following D columns. If the length parameter is set to a negative value (e.g. length=-100), then the algorithm is limited by function evaluations rather than linesearches. The figure shows the result of the above minimization.
+
+![](img/convergence.png)
\ No newline at end of file
diff --git a/img/convergence.png b/img/convergence.png
new file mode 100644
index 0000000..fe67af5
Binary files /dev/null and b/img/convergence.png differ
diff --git a/minimize.py b/minimize.py
new file mode 100644
index 0000000..05caa48
--- /dev/null
+++ b/minimize.py
@@ -0,0 +1,179 @@
+import numpy as np
+import warnings
+warnings.filterwarnings("ignore", category=RuntimeWarning)  # suppress np.sqrt RuntimeWarnings; nan results are handled explicitly below
+
+def minimize(f, X, length, args=(), reduction=None, verbose=True):
+    """
+    Minimize a differentiable multivariate function.
+
+    Parameters
+    ----------
+    f : function to minimize. The function must return the value
+        of the function (float) and a numpy array of partial
+        derivatives of shape (D,) with respect to X, where D is
+        the dimensionality of the function.
+
+    X : numpy array - Shape : (D, 1)
+        Initial guess.
+
+    length : int
+        The length of the run. If positive, length gives the maximum
+        number of linesearches; if negative, its absolute value gives
+        the maximum number of function evaluations.
+
+    args : tuple
+        Tuple of parameters to be passed to the function f.
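+        For example, as exercised in test/minimize_rosenbrock_with_args.py:
+        minimize(rosenbrock_with_args, X, length, args=(100, 400)).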
+
+    reduction : float
+        The expected reduction in the function value in the first
+        linesearch (if None, defaults to 1.0).
+
+    verbose : bool
+        If True, prints the progress of minimize (default is True).
+
+    Returns
+    -------
+    Xs : numpy array - Shape : (D, 1)
+        The found solution.
+
+    convergence : numpy array - Shape : (i, D+1)
+        Convergence information. The first column is the function values
+        returned by the function being minimized. The next D columns are
+        the guesses of X during the minimization process.
+
+    i : int
+        Number of linesearches or function evaluations, depending on which
+        was selected.
+
+    The function returns when either its length is up, or if no further
+    progress can be made (i.e., we are at a (local) minimum, or so close
+    that, due to numerical problems, we cannot get any closer).
+
+    Copyright (C) 2001 - 2006 by Carl Edward Rasmussen (2006-09-08).
+    Converted to Python by David Lines (2019-23-08)
+    """
+    INT = 0.1    # don't reevaluate within 0.1 of the limit of the current bracket
+    EXT = 3.0    # extrapolate maximum 3 times the current step size
+    MAX = 20     # max 20 function evaluations per linesearch
+    RATIO = 10   # maximum allowed slope ratio
+    SIG = 0.1
+    RHO = SIG / 2
+    # SIG and RHO control the Wolfe-Powell conditions.
+    # SIG is the maximum allowed absolute ratio between previous and new
+    # slopes (derivatives in the search direction), thus setting SIG to low
+    # (positive) values forces higher precision in the linesearches.
+    # RHO is the minimum allowed fraction of the expected decrease (computed
+    # from the slope at the initial point of the linesearch). The constants
+    # must satisfy 0 < RHO < SIG < 1. Tuning SIG (depending on the nature of
+    # the function to be optimized) may speed up the minimization; it is
+    # probably not worth playing much with RHO.
+
+    print("Minimizing %s ..." % f)
+
+    red = 1.0 if reduction is None else reduction
+
+    S = 'Linesearch' if length > 0 else 'Function evaluation'
+
+    i = 0                                      # run length counter
+    ls_failed = False                          # no previous linesearch has failed
+    f0, df0 = f(X, *args)                      # get initial function value and gradient
+    df0 = df0.reshape(-1, 1)
+    fX = [f0]                                  # function value history
+    Xd = [X]                                   # parameter history
+    i += (length < 0)                          # count epochs
+    s = -df0                                   # get column vector
+    d0 = -s.T @ s                              # initial search direction (steepest) and slope
+    x3 = red / (1 - d0)                        # initial step is red/(|s|+1)
+
+    while i < abs(length):                     # while not finished
+        i += (length > 0)                      # count iterations
+
+        X0 = X; F0 = f0; dF0 = df0             # copy current values
+        M = MAX if length > 0 else min(MAX, -length - i)
+
+        while 1:                               # extrapolate as long as necessary
+            x2 = 0; f2 = f0; d2 = d0; f3 = f0; df3 = df0
+            success = False
+
+            while not success and M > 0:
+                try:
+                    M -= 1; i += (length < 0)  # count epochs
+                    f3, df3 = f(X + x3 * s, *args)
+                    df3 = df3.reshape(-1, 1)
+                    if np.isnan(f3) or np.isinf(f3) or np.any(np.isnan(df3) | np.isinf(df3)):
+                        raise Exception('Either nan or inf in function eval or gradients')
+                    success = True
+                except Exception:              # catch any error occurring in f
+                    x3 = (x2 + x3) / 2         # bisect and try again
+
+            if f3 < F0:
+                X0 = X + x3 * s; F0 = f3; dF0 = df3    # keep best values
+
+            d3 = df3.T @ s                             # new slope
+            if d3 > SIG * d0 or f3 > f0 + x3 * RHO * d0 or M == 0:
+                break                                  # finished extrapolating
+
+            x1 = x2; f1 = f2; d1 = d2                  # move point 2 to point 1
+            x2 = x3; f2 = f3; d2 = d3                  # move point 3 to point 2
+            A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1)   # make cubic extrapolation
+            B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1)
+            x3 = x1 - d1 * (x2 - x1) ** 2 / (B + np.sqrt(B * B - A * d1 * (x2 - x1)))  # num. error possible, ok!
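+            # x3 is the stationary point of the cubic Hermite interpolant
+            # through (x1, f1) and (x2, f2) with slopes d1 and d2; the
+            # clamps below keep it within [x2 + INT*(x2-x1), EXT*x2].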
+            if np.iscomplex(x3) or np.isnan(x3) or np.isinf(x3) or x3 < 0:  # num prob | wrong sign
+                x3 = x2 * EXT
+            elif x3 > x2 * EXT:                        # extrapolate maximum amount
+                x3 = x2 * EXT
+            elif x3 < x2 + INT * (x2 - x1):            # too close to previous point
+                x3 = x2 + INT * (x2 - x1)
+
+        while (abs(d3) > -SIG * d0 or f3 > f0 + x3 * RHO * d0) and M > 0:  # keep interpolating
+
+            if d3 > 0 or f3 > f0 + x3 * RHO * d0:      # choose subinterval
+                x4 = x3; f4 = f3; d4 = d3              # move point 3 to point 4
+            else:
+                x2 = x3; f2 = f3; d2 = d3              # move point 3 to point 2
+
+            if f4 > f0:
+                x3 = x2 - (0.5 * d2 * (x4 - x2) ** 2) / (f4 - f2 - d2 * (x4 - x2))  # quadratic interpolation
+            else:
+                A = 6 * (f2 - f4) / (x4 - x2) + 3 * (d4 + d2)   # cubic interpolation
+                B = 3 * (f4 - f2) - (2 * d2 + d4) * (x4 - x2)
+                x3 = x2 + (np.sqrt(B * B - A * d2 * (x4 - x2) ** 2) - B) / A  # num. error possible, ok!
+
+            if np.isnan(x3) or np.isinf(x3):
+                x3 = (x2 + x4) / 2                     # if we had a numerical problem, bisect
+
+            x3 = max(min(x3, x4 - INT * (x4 - x2)), x2 + INT * (x4 - x2))  # don't accept too close
+            f3, df3 = f(X + x3 * s, *args)
+            df3 = df3.reshape(-1, 1)
+            if f3 < F0:
+                X0 = X + x3 * s; F0 = f3; dF0 = df3    # keep best values
+
+            M -= 1; i += (length < 0)                  # count epochs
+            d3 = df3.T @ s                             # new slope
+
+        if abs(d3) < -SIG * d0 and f3 < f0 + x3 * RHO * d0:  # if linesearch succeeded
+            X = X + x3 * s; f0 = f3; fX.append(f0); Xd.append(X)  # update variables
+            if verbose:
+                print('%s %6i; Value %4.6e\r' % (S, i, f0))
+            s = (df3.T @ df3 - df0.T @ df3) / (df0.T @ df0) * s - df3  # Polak-Ribiere CG direction
+            df0 = df3                                  # swap derivatives
+            d3 = d0; d0 = df0.T @ s
+            if d0 > 0:                                 # new slope must be negative
+                s = -df0.reshape(-1, 1); d0 = -s.T @ s # otherwise use steepest direction
+            x3 = x3 * min(RATIO, d3 / (d0 - np.finfo(np.double).tiny))  # slope ratio but max RATIO
+            ls_failed = False                          # this linesearch did not fail
+        else:
+            X = X0; f0 = F0; df0 = dF0                 # restore best point so far
+            if ls_failed or i > abs(length):           # linesearch failed twice in a row,
+                break                                  # or we ran out of time, so we give up
+            s = -df0.reshape(-1, 1); d0 = -s.T @ s     # try steepest descent
+            x3 = 1 / (1 - d0)
+            ls_failed = True                           # this linesearch failed
+
+    convergence = np.hstack((np.array(fX).reshape(-1, 1), np.array(Xd)[:, :, 0]))  # bundle convergence info
+    Xs = X  # solution
+
+    return Xs, convergence, i
\ No newline at end of file
diff --git a/test/convergence.png b/test/convergence.png
new file mode 100644
index 0000000..7e2c67a
Binary files /dev/null and b/test/convergence.png differ
diff --git a/test/minimize_2d_rosenbrock.py b/test/minimize_2d_rosenbrock.py
new file mode 100644
index 0000000..0a20387
--- /dev/null
+++ b/test/minimize_2d_rosenbrock.py
@@ -0,0 +1,37 @@
+from minimize import minimize
+from rosenbrock import rosenbrock
+import numpy as np
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 - registers the 3d projection
+
+# minimize rosenbrock
+length = 100
+X = np.array([-1, 0]).reshape(-1, 1)
+x, con, i = minimize(rosenbrock, X, length)
+
+# Get rosenbrock surface data for plotting
+X = np.arange(-1.1, 1.2, 0.05)
+Y = np.arange(-0.5, 1.2, 0.05)
+X, Y = np.meshgrid(X, Y)
+XY = np.hstack((X.reshape(-1, 1), Y.reshape(-1, 1)))
+N = XY.shape[0]
+n, m = X.shape
+Z = np.zeros((N,))
+for j, xy in enumerate(XY):  # use j so the linesearch count i is not clobbered
+    Z[j], _ = rosenbrock(xy)
+Z = Z.reshape(n, m)
+
+# plot the surface and the convergence path
+fig = plt.figure(figsize=(8, 6))
+ax = plt.axes(projection='3d')
+
+ax.view_init(elev=45., azim=45)
+ax.plot(con[:, 1], con[:, 2], con[:, 0], '-r*', markersize=12)
+ax.scatter(1, 1, 0, s=100)
+ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='viridis',
+                edgecolor='none', alpha=0.3, antialiased=False)
+ax.set_xlabel('$X_{1}$', fontsize=20)
+ax.set_ylabel('$X_{2}$', fontsize=20)
+ax.set_zlabel(r'$f(\mathbf{X})$', fontsize=20)
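+
+# write the figure embedded in the README (run this script from the test/ directory)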
+plt.savefig('../img/convergence.png', dpi=200)
+
+plt.show()
diff --git a/test/minimize_3d_rosenbrock.py b/test/minimize_3d_rosenbrock.py
new file mode 100644
index 0000000..9a02101
--- /dev/null
+++ b/test/minimize_3d_rosenbrock.py
@@ -0,0 +1,9 @@
+from minimize import minimize
+from rosenbrock import rosenbrock
+import numpy as np
+
+length = 1000
+X = np.array([0, 0, 0]).reshape(-1, 1)
+x, con, i = minimize(rosenbrock, X, length, verbose=True)
+
+print("Minimum at [%f %f %f] after %i linesearches" % (x[0], x[1], x[2], i))
diff --git a/test/minimize_3d_rosenbrock_func_evals.py b/test/minimize_3d_rosenbrock_func_evals.py
new file mode 100644
index 0000000..b45c311
--- /dev/null
+++ b/test/minimize_3d_rosenbrock_func_evals.py
@@ -0,0 +1,9 @@
+from minimize import minimize
+from rosenbrock import rosenbrock
+import numpy as np
+
+length = -1000  # negative length: limit by function evaluations
+X = np.array([0, 0, 0]).reshape(-1, 1)
+x, con, i = minimize(rosenbrock, X, length, verbose=True)
+
+print("Minimum at [%f %f %f] after %i function evaluations" % (x[0], x[1], x[2], i))
diff --git a/test/minimize_rosenbrock_multidim_random_init.py b/test/minimize_rosenbrock_multidim_random_init.py
new file mode 100644
index 0000000..050ec73
--- /dev/null
+++ b/test/minimize_rosenbrock_multidim_random_init.py
@@ -0,0 +1,24 @@
+from minimize import minimize
+from rosenbrock import rosenbrock
+import numpy as np
+import matplotlib.pyplot as plt
+
+# minimize rosenbrock in 2 to 24 dimensions, 10 random initialisations each
+length = 1000
+for j in range(2, 25):
+    for k in range(10):
+        target = np.ones((j, 1))  # the true minimiser; avoid shadowing the builtin min
+        X = np.random.normal(loc=0.0, scale=5.0, size=(j, 1))  # random initialisation
+        x, con, i = minimize(rosenbrock, X, length, verbose=False)
+
+        if np.all(np.round(x, 7) == target):
+            print("SUCCESS for %i-D trial number %i after %i linesearches" % (j, k, i))
+        else:
+            print("FAILED to find min for %i-D trial number %i after %i linesearches" % (j, k, i))
+            print("x: ", x.T)
+        y = np.arange(con.shape[0])
+        plt.figure(1)
+        plt.plot(y, con[:, 0])
+
+plt.yscale('log')
+plt.show()
\ No newline at end of file
diff --git a/test/minimize_rosenbrock_with_args.py b/test/minimize_rosenbrock_with_args.py
new file mode 100644
index 0000000..9ccd3e7
--- /dev/null
+++ b/test/minimize_rosenbrock_with_args.py
@@ -0,0 +1,9 @@
+from minimize import minimize
+from rosenbrock_with_args import rosenbrock_with_args
+import numpy as np
+
+length = 1000
+X = np.array([0, 0, 0]).reshape(-1, 1)
+x, con, i = minimize(rosenbrock_with_args, X, length, args=(100, 400), reduction=None, verbose=True)
+
+print("Minimum at [%f %f %f] after %i linesearches" % (x[0], x[1], x[2], i))
\ No newline at end of file
diff --git a/test/rosenbrock.py b/test/rosenbrock.py
new file mode 100644
index 0000000..75ffec0
--- /dev/null
+++ b/test/rosenbrock.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+def rosenbrock(x):
+    """
+    This function returns the function value and partial
+    derivatives of the general-dimension Rosenbrock function, given by:
+
+        f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1 - x(i))^2
+
+    where D is the dimension of x.
+
+    The true minimum of the function is 0 at x = (1 1 ... 1).
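+
+    Each coordinate appears in at most two summands, so the partial
+    derivatives (implemented below as two vectorised assignments) are
+
+        df/dx(i) = -400*x(i)*(x(i+1) - x(i)^2) - 2*(1 - x(i))   for i < D,
+        plus an extra term 200*(x(i) - x(i-1)^2)                for i > 1.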
+
+    Parameters
+    ----------
+
+    x : parameter of the function
+
+    Returns
+    -------
+
+    f : function value
+
+    df : partial derivatives wrt x_i
+
+    """
+    D = len(x)
+    # function value
+    f = np.sum(100 * (x[1:D] - x[:D-1] ** 2) ** 2 + (1 - x[:D-1]) ** 2)
+    # partial derivatives
+    df = np.zeros((D, 1))
+    df[:D-1] = -400 * x[:D-1] * (x[1:D] - x[:D-1] ** 2) - 2 * (1 - x[:D-1])
+    df[1:D] += 200 * (x[1:D] - x[:D-1] ** 2)
+
+    return f, df
\ No newline at end of file
diff --git a/test/rosenbrock_with_args.py b/test/rosenbrock_with_args.py
new file mode 100644
index 0000000..3e4f2dd
--- /dev/null
+++ b/test/rosenbrock_with_args.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+def rosenbrock_with_args(x, a, b):
+    """
+    This function returns the function value and partial
+    derivatives of the general-dimension Rosenbrock function with
+    parameterised coefficients, given by:
+
+        f(x) = sum_{i=1:D-1} a*(x(i+1) - x(i)^2)^2 + (1 - x(i))^2
+
+    where D is the dimension of x. Note that df matches the true gradient
+    of f only when b = 4*a, as for the standard Rosenbrock function
+    (a=100, b=400, the values passed in test/minimize_rosenbrock_with_args.py).
+
+    The true minimum of the function is 0 at x = (1 1 ... 1).
+
+    Parameters
+    ----------
+
+    x : parameter of the function
+
+    a, b : coefficients of the function (see note above)
+
+    Returns
+    -------
+
+    f : function value
+
+    df : partial derivatives wrt x_i
+
+    """
+    D = len(x)
+    # function value
+    f = np.sum(a * (x[1:D] - x[:D-1] ** 2) ** 2 + (1 - x[:D-1]) ** 2)
+    # partial derivatives
+    df = np.zeros((D, 1))
+    df[:D-1] = -b * x[:D-1] * (x[1:D] - x[:D-1] ** 2) - 2 * (1 - x[:D-1])
+    df[1:D] += 2 * a * (x[1:D] - x[:D-1] ** 2)  # 2*a (was hard-coded 200, correct only for a=100)
+
+    return f, df
\ No newline at end of file