Commit d5a986d: minimize, test function, readme, license

linesd committed Aug 23, 2019 (initial commit, 0 parents)
Showing 13 changed files with 407 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
.idea/*
__pycache__
12 changes: 12 additions & 0 deletions LICENCE.txt
@@ -0,0 +1,12 @@
(C) Copyright 1999 - 2006, Carl Edward Rasmussen

Permission is granted for anyone to copy, use, or modify these
programs and accompanying documents for purposes of research or
education, provided this copyright notice is retained, and note is
made of any changes that have been made.

These programs and documents are distributed without any warranty,
express or implied. As the programs were written for research
purposes only, they have not been tested to the degree that would be
advisable in any important application. All use of these programs is
entirely at the user's own risk.
56 changes: 56 additions & 0 deletions README.md
@@ -0,0 +1,56 @@
# Minimize [![Python 3.6+](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/)

This repository is a Python implementation of C.E. Rasmussen's [minimize](http://learning.eng.cam.ac.uk/carl/code/minimize/) function, which finds a (local) minimum of a (nonlinear) multivariate function. The function uses conjugate gradients and approximate linesearches based on polynomial interpolation with Wolfe-Powell conditions. The user supplies a function which returns the function value as well as the partial derivatives with respect to the variables to be minimized.

Notes:
- Tested for python >= 3.6
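
For example, a function with the required signature might look like this (a minimal sketch of the expected contract, using a toy quadratic that is not part of this repository):

```python
import numpy as np

def quadratic(x):
    """Toy objective: f(x) = sum(x_i^2), with gradient df/dx = 2x."""
    f = float(np.sum(x ** 2))
    df = 2 * x.reshape(-1)  # partial derivatives, shape (D,)
    return f, df
```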

## Usage

Here the two-dimensional [Rosenbrock](https://en.wikipedia.org/wiki/Rosenbrock_function) function is used to show how `minimize` works. The function returns the function value and the partial derivatives with respect to the variables to be minimized.

Starting from the initial guess `X0 = np.array([[-1], [0]])` and `length=100` linesearches:

```python
>> X, convergence, i = minimize(rosenbrock, X0, length=100)
X = array([[1.],
           [1.]])

convergence = array([[ 1.04000000e+02, -1.00000000e+00, 0.00000000e+00],
[ 2.37965977e+00, -5.16910495e-01, 2.39153220e-01],
[ 2.32706339e+00, -5.24606889e-01, 2.70076996e-01],
[ 2.29625954e+00, -5.10173722e-01, 2.72781176e-01],
[ 1.81191347e+00, -3.05092426e-01, 6.01197093e-02],
[ 1.71447070e+00, -2.19978689e-01, 8.38263203e-04],
[ 6.91491002e-01, 1.87118858e-01, 1.74877000e-02],
[ 6.76960265e-01, 1.78542297e-01, 2.72217021e-02],
[ 4.89960939e-01, 3.46292785e-01, 9.48931429e-02],
[ 3.51192555e-01, 4.84116914e-01, 2.05204619e-01],
[ 2.54267052e-01, 4.96129750e-01, 2.48098759e-01],
[ 1.69404850e-01, 6.14217154e-01, 3.62918220e-01],
[ 1.16369088e-01, 7.14147306e-01, 4.91389897e-01],
[ 6.91860437e-02, 7.37951393e-01, 5.46845078e-01],
[ 4.18391999e-02, 8.13230887e-01, 6.53003913e-01],
[ 3.04624305e-02, 8.66986583e-01, 7.40365353e-01],
[ 1.13408338e-02, 8.96263305e-01, 8.05695260e-01],
[ 3.87028837e-03, 9.46723116e-01, 8.93072397e-01],
[ 9.20534056e-04, 9.86675590e-01, 9.70802928e-01],
[ 2.32598777e-04, 9.84776380e-01, 9.69692858e-01],
[ 2.17647876e-06, 9.99792629e-01, 9.99439237e-01],
[ 2.39305242e-08, 9.99883820e-01, 9.99777867e-01],
[ 2.08976920e-08, 9.99932591e-01, 9.99877974e-01],
[ 3.49850571e-09, 9.99940971e-01, 9.99881570e-01],
[ 1.46880712e-15, 1.00000000e+00, 1.00000000e+00],
[ 3.28531626e-18, 1.00000000e+00, 1.00000000e+00],
[ 1.09458505e-20, 1.00000000e+00, 1.00000000e+00],
[ 3.45589993e-21, 1.00000000e+00, 1.00000000e+00],
[ 3.44606626e-21, 1.00000000e+00, 1.00000000e+00],
[ 3.38668373e-21, 1.00000000e+00, 1.00000000e+00],
[ 2.51712978e-24, 1.00000000e+00, 1.00000000e+00],
[ 4.43734259e-31, 1.00000000e+00, 1.00000000e+00]])

i = 33
```

The minimum of the function occurs at X = [1., 1.] with a function value of 0, and is found after 33 linesearches. The `convergence` array returned by `minimize` holds the function values in its first column and the parameters being minimized in the following D columns. If the `length` parameter is set to a negative value, the algorithm is limited by function evaluations rather than linesearches. The figure shows the result of the above minimization.
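
For example, to limit the run by function evaluations instead of linesearches (a small sketch using the same setup as above):

```python
X, convergence, i = minimize(rosenbrock, X0, length=-100)  # at most 100 function evaluations
```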

![](img/convergence.png)
Binary file added img/convergence.png
179 changes: 179 additions & 0 deletions minimize.py
@@ -0,0 +1,179 @@
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) # suppress np.sqrt Runtime warnings

def minimize(f, X, length, args=(), reduction=None, verbose=True):
    """
    Minimize a differentiable multivariate function.

    Parameters
    ----------
    f : function to minimize. The function must return the value
        of the function (float) and a numpy array of partial
        derivatives of shape (D,) with respect to X, where D is
        the dimensionality of the function.
    X : numpy array - Shape : (D, 1)
        Initial guess.
    length : int
        The length of the run. If positive, length gives the maximum
        number of linesearches; if negative, its absolute value gives
        the maximum number of function evaluations.
    args : tuple
        Tuple of parameters to be passed to the function f.
    reduction : float
        The expected reduction in the function value in the first
        linesearch (if None, defaults to 1.0).
    verbose : bool
        If True, prints the progress of minimize (default is True).

    Returns
    -------
    Xs : numpy array - Shape : (D, 1)
        The found solution.
    convergence : numpy array - Shape : (i, D+1)
        Convergence information. The first column holds the function values
        returned by the function being minimized. The next D columns hold
        the guesses of X during the minimization process.
    i : int
        Number of linesearches or function evaluations, depending on which
        was selected.

    The function returns when either its length is up, or when no further
    progress can be made (i.e., we are at a (local) minimum, or so close
    that, due to numerical problems, we cannot get any closer).

    Copyright (C) 2001 - 2006 by Carl Edward Rasmussen (2006-09-08).
    Converted to Python by David Lines (2019-08-23).
    """
    INT = 0.1    # don't reevaluate within 0.1 of the limit of the current bracket
    EXT = 3.0    # extrapolate maximum 3 times the current step size
    MAX = 20     # max 20 function evaluations per linesearch
    RATIO = 10   # maximum allowed slope ratio
    SIG = 0.1
    RHO = SIG / 2
    # SIG and RHO control the Wolfe-Powell conditions. SIG is the maximum
    # allowed absolute ratio between previous and new slopes (derivatives in
    # the search direction); setting SIG to low (positive) values forces
    # higher precision in the linesearches. RHO is the minimum allowed
    # fraction of the expected function decrease (estimated from the slope at
    # the initial point of the linesearch). The constants must satisfy
    # 0 < RHO < SIG < 1. Tuning SIG (depending on the nature of the function
    # to be optimized) may speed up the minimization; it is probably not
    # worth playing much with RHO.

print("Minimizing %s ..." % f)

if reduction is None:
red = 1.0
else:
red = reduction

S = 'Linesearch' if length > 0 else 'Function evaluation'

i = 0 # run length counter
is_failed = 0 # no previous line search has failed
f0, df0 = eval('f')(X, *list(args)) # get initial function value and gradient
df0 = df0.reshape(-1, 1)
fX=[]; fX.append(f0)
Xd=[]; Xd.append(X)
i += (length < 0) # count epochs
s = -df0 # get column vec
d0 = -s.T @ s # initial search direction (steepest) and slope
x3 = red / (1 - d0) # initial step is red/(|s|+1)

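    # Each iteration runs one linesearch in three stages: extrapolate until
    # the minimum is bracketed, interpolate inside the bracket, then either
    # accept the step (Polak-Ribiere conjugate gradient update) or revert to
    # steepest descent if the linesearch failed.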
    while i < abs(length):                       # while not finished
        i += (length > 0)                        # count iterations

        X0 = X; F0 = f0; dF0 = df0               # copy current values
        M = MAX if length > 0 else min(MAX, -length - i)

        while True:                              # extrapolate as long as necessary
            x2 = 0; f2 = f0; d2 = d0; f3 = f0; df3 = df0
            success = False

            while not success and M > 0:
                try:
                    M -= 1; i += (length < 0)    # count epochs
                    f3, df3 = f(X + x3 * s, *args)
                    df3 = df3.reshape(-1, 1)
                    if np.isnan(f3) or np.isinf(f3) or np.any(np.isnan(df3) | np.isinf(df3)):
                        raise Exception('Either nan or inf in function eval or gradients')
                    success = True
                except Exception:                # catch any error occurring in f
                    x3 = (x2 + x3) / 2           # bisect and try again

            if f3 < F0:
                X0 = X + x3 * s; F0 = f3; dF0 = df3  # keep best values

            d3 = df3.T @ s                       # new slope
            if d3 > SIG * d0 or f3 > f0 + x3 * RHO * d0 or M == 0:
                break                            # finished extrapolating

            x1 = x2; f1 = f2; d1 = d2            # move point 2 to point 1
            x2 = x3; f2 = f3; d2 = d3            # move point 3 to point 2
            A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1)  # make cubic extrapolation
            B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1)
            x3 = x1 - d1 * (x2 - x1) ** 2 / (B + np.sqrt(B * B - A * d1 * (x2 - x1)))  # num. error possible, ok!

            if np.iscomplex(x3) or np.isnan(x3) or np.isinf(x3) or x3 < 0:  # numerical problem or wrong sign
                x3 = x2 * EXT                    # maximum extrapolation
            elif x3 > x2 * EXT:
                x3 = x2 * EXT                    # maximum extrapolation
            elif x3 < x2 + INT * (x2 - x1):
                x3 = x2 + INT * (x2 - x1)        # don't get too close to the previous point

        while (abs(d3) > -SIG * d0 or f3 > f0 + x3 * RHO * d0) and M > 0:  # keep interpolating

            if d3 > 0 or f3 > f0 + x3 * RHO * d0:  # choose subinterval
                x4 = x3; f4 = f3; d4 = d3        # move point 3 to point 4
            else:
                x2 = x3; f2 = f3; d2 = d3        # move point 3 to point 2

            if f4 > f0:
                x3 = x2 - (0.5 * d2 * (x4 - x2) ** 2) / (f4 - f2 - d2 * (x4 - x2))  # quadratic interpolation
            else:
                A = 6 * (f2 - f4) / (x4 - x2) + 3 * (d4 + d2)  # cubic interpolation
                B = 3 * (f4 - f2) - (2 * d2 + d4) * (x4 - x2)
                x3 = x2 + (np.sqrt(B * B - A * d2 * (x4 - x2) ** 2) - B) / A  # num. error possible, ok!

            if np.isnan(x3) or np.isinf(x3):
                x3 = (x2 + x4) / 2               # if we had a numerical problem then bisect

            x3 = max(min(x3, x4 - INT * (x4 - x2)), x2 + INT * (x4 - x2))  # don't accept too close
            f3, df3 = f(X + x3 * s, *args)
            df3 = df3.reshape(-1, 1)
            if f3 < F0:
                X0 = X + x3 * s; F0 = f3; dF0 = df3  # keep best values

            M -= 1; i += (length < 0)            # count epochs
            d3 = df3.T @ s                       # new slope

        if abs(d3) < -SIG * d0 and f3 < f0 + x3 * RHO * d0:   # if linesearch succeeded
            X = X + x3 * s; f0 = f3; fX.append(f0); Xd.append(X)  # update variables
            if verbose:
                print('%s %6i; Value %4.6e\r' % (S, i, f0))
            # Polak-Ribiere direction: beta = df3.(df3 - df0) / (df0.df0)
            s = (df3.T @ df3 - df0.T @ df3) / (df0.T @ df0) * s - df3
            df0 = df3                                         # swap derivatives
            d3 = d0; d0 = df0.T @ s
            if d0 > 0:                                        # new slope must be negative
                s = -df0.reshape(-1, 1); d0 = -s.T @ s        # otherwise use steepest direction
            x3 = x3 * min(RATIO, d3 / (d0 - np.finfo(np.double).tiny))  # slope ratio, but at most RATIO
            ls_failed = False                                 # this linesearch did not fail
        else:
            X = X0; f0 = F0; df0 = dF0                        # restore best point so far
            if ls_failed or i > abs(length):                  # linesearch failed twice in a row,
                break                                         # or we ran out of time, so we give up
            s = -df0.reshape(-1, 1); d0 = -s.T @ s            # try steepest descent
            x3 = 1 / (1 - d0)
            ls_failed = True                                  # this linesearch failed

    convergence = np.hstack((np.array(fX).reshape(-1, 1), np.array(Xd)[:, :, 0]))  # bundle convergence info
    Xs = X  # solution

    return Xs, convergence, i
Binary file added test/convergence.png
37 changes: 37 additions & 0 deletions test/minimize_2d_rosenbrock.py
@@ -0,0 +1,37 @@
from minimize import minimize
from rosenbrock import rosenbrock
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# minimize rosenbrock
length = 100
X = np.array([-1, 0]).reshape(-1,1)
x, con, i = minimize(rosenbrock, X, length)

# Get rosenbrock data
X = np.arange(-1.1, 1.2, 0.05)
Y = np.arange(-0.5, 1.2, 0.05)
X, Y = np.meshgrid(X, Y)
XY = np.hstack((X.reshape(-1, 1), Y.reshape(-1, 1)))
N = XY.shape[0]; n,m = X.shape
Z = np.zeros((N,))
for idx, xy in enumerate(XY):   # idx avoids clobbering the linesearch count i returned above
    Z[idx], _ = rosenbrock(xy)
Z = Z.reshape(n, m)

# plot data
fig = plt.figure(figsize=(8,6))
ax = plt.axes(projection='3d')

ax.view_init(elev=45., azim=45)
ax.plot(con[:,1], con[:,2], con[:,0], '-r*', markersize=12)
ax.scatter(1, 1, 0, s=100)
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='viridis',
                edgecolor='none', alpha=0.3, antialiased=False)
ax.set_xlabel('$X_{1}$', fontsize=20)
ax.set_ylabel('$X_{2}$', fontsize=20)
ax.set_zlabel(r'$f(\mathbf{X})$', fontsize=20)
plt.savefig('../img/convergence.png', dpi=200)

plt.show()
9 changes: 9 additions & 0 deletions test/minimize_3d_rosenbrock.py
@@ -0,0 +1,9 @@
from minimize import minimize
from rosenbrock import rosenbrock
import numpy as np

length = 1000
X = np.array([0,0,0]).reshape(-1,1)
x, con, i = minimize(rosenbrock, X, length, verbose=True)

print("Minimum at [%f %f %f] after %i linesearches" % (x[0],x[1],x[2],i))
9 changes: 9 additions & 0 deletions test/minimize_3d_rosenbrock_func_evals.py
@@ -0,0 +1,9 @@
from minimize import minimize
from rosenbrock import rosenbrock
import numpy as np

length = -1000
X = np.array([0,0,0]).reshape(-1,1)
x, con, i = minimize(rosenbrock, X, length, verbose=True)

print("Minimum at [%f %f %f] after %i function evaluations" % (x[0],x[1],x[2],i))
24 changes: 24 additions & 0 deletions test/minimize_rosenbrock_multidim_random_init.py
@@ -0,0 +1,24 @@
from minimize import minimize
from rosenbrock import rosenbrock
import numpy as np
import matplotlib.pyplot as plt

# minimize rosenbrock
length = 1000
for j in range(2, 25):
    for k in range(10):
        x_min = np.ones((j, 1))  # known minimum of the Rosenbrock function
        X = np.random.normal(loc=0.0, scale=5.0, size=(j, 1))  # random initialisation
        x, con, i = minimize(rosenbrock, X, length, verbose=False)

        if np.all(np.round(x, 7) == x_min):
            print("SUCCESS for %i-D trial number %i after %i linesearches" % (j, k, i))
        else:
            print("FAILED to find min for %i-D trial number %i after %i linesearches" % (j, k, i))
            print("x: ", x.T)
        y = np.arange(con.shape[0])
        plt.figure(1)
        plt.plot(y, con[:, 0])

plt.yscale('log')
plt.show()
9 changes: 9 additions & 0 deletions test/minimize_rosenbrock_with_args.py
@@ -0,0 +1,9 @@
from minimize import minimize
from rosenbrock_with_args import rosenbrock_with_args
import numpy as np

length = 1000
X = np.array([0,0,0]).reshape(-1,1)
x, con, i = minimize(rosenbrock_with_args, X, length, args=(100,400), reduction=None, verbose=True)

print("Minimum at [%f %f %f] after %i linesearches" % (x[0],x[1],x[2],i))
35 changes: 35 additions & 0 deletions test/rosenbrock.py
@@ -0,0 +1,35 @@
import numpy as np

def rosenbrock(x):
    """
    This function returns the function value and partial derivatives
    of the general-dimension Rosenbrock function, given by:

        f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1 - x(i))^2

    where D is the dimension of x. The true minimum of the function
    is 0 at x = (1 1 ... 1).

    Parameters
    ----------
    x : parameter of the function

    Returns
    -------
    f : function value
    df : partial derivatives wrt x_i
    """
    D = len(x)
    # function value
    f = np.sum(100 * (x[1:D] - x[:D-1] ** 2) ** 2 + (1 - x[:D-1]) ** 2)
    # partial derivatives
    df = np.zeros((D, 1))
    df[:D-1] = -400 * x[:D-1] * (x[1:D] - x[:D-1] ** 2) - 2 * (1 - x[:D-1])
    df[1:D] += 200 * (x[1:D] - x[:D-1] ** 2)

    return f, df
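
if __name__ == "__main__":
    # Illustrative sketch (a hypothetical addition, not from the commit):
    # compare the analytic gradient against a central finite-difference
    # approximation at a random point.
    x0 = np.random.randn(5, 1)
    _, df = rosenbrock(x0)
    eps = 1e-6
    df_num = np.zeros_like(df)
    for i in range(len(x0)):
        e = np.zeros_like(x0)
        e[i] = eps
        f_plus, _ = rosenbrock(x0 + e)
        f_minus, _ = rosenbrock(x0 - e)
        df_num[i] = (f_plus - f_minus) / (2 * eps)
    print("max |df - df_num| =", float(np.max(np.abs(df - df_num))))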