Code refactoring + separate density estimation from pi estimation

vdumoulin · Apr 8, 2014 · 408141d · 408141d
1 parent 89e7ce4
commit 408141d
Showing 1 changed file with 77 additions and 43 deletions.
diff --git a/code/shotgun_monte_carlo/code.py b/code/shotgun_monte_carlo/code.py
@@ -1,11 +1,18 @@
 #!/usr/bin/env python
+__authors__ = ["Vincent Dumoulin"]
+__copyright__ = "Copyright 2014"
+__credits__ = ["Vincent Dumoulin"]
+__license__ = "3-clause BSD"
+__maintainer__ = "Vincent Dumoulin"
 """
-Monte Carlo estimator of pi.
+Source code for "A Ballistic Monte Carlo Approximation of pi".
 
-usage: code.py [-h] [-p PATH] [-v] [-s]
+usage: code.py [-h] [-e EXTRACT] [-p PATH] [-v] [-s]
 
 optional arguments:
   -h, --help            show this help message and exit
+  -e EXTRACT, --extract EXTRACT
+                        path to the directory containing sample images
   -p PATH, --path PATH  path to the directory containing sample images
   -v, --visualize       show figures being created
   -s, --save            save figures to disk
@@ -20,6 +27,9 @@
 from scipy import ndimage
 
 
+numpy.random.seed(122973)
+
+
 # Pretty font for figures
 rc('font', **{'family': 'serif', 'serif': ['Computer Modern Roman']})
 rc('text', usetex=True)
@@ -57,7 +67,8 @@ def generate_synthetic_samples(num_samples=25000):
     return samples
 
 
-def estimate_pi(samples=None, show_results=False, save_results=False):
+def estimate_pi(samples=None, split=10000, show_results=False,
+                save_results=False):
     """
     Return a Monte Carlo estimation of pi.
 
@@ -78,6 +89,8 @@ def estimate_pi(samples=None, show_results=False, save_results=False):
     samples : numpy.ndarray, optional
         Data used to estimate the value of pi. Defaults to None, in which case
         synthetic samples are used.
+    split : int, optional
+        End index of the data used to estimate the PDF. Defaults to 5000.
     show_results : bool, optional
         Whether to display figures on screen. Defaults to `False`.
     save_results : bool, optional
@@ -88,20 +101,26 @@ def estimate_pi(samples=None, show_results=False, save_results=False):
         print 'Generating synthetic samples...'
         samples = generate_synthetic_samples(num_samples=25000)
 
-    x = samples[:, 0]
-    y = samples[:, 1]
+    numpy.random.shuffle(samples)
+
+    x = samples[:split, 0]
+    y = samples[:split, 1]
+    samples_x = samples[split:, 0]
+    samples_y = samples[split:, 1]
 
-    g = x ** 2 + y ** 2 <= 1.0
+    g = samples_x ** 2 + samples_y ** 2 <= 1.0
     print 'Estimating sample PDF...'
-    densities, x_bins, y_bins, sample_densities = _histogram_pdf(x, y)
+    densities, x_bins, y_bins, sample_densities = histogram_pdf(x, y,
+                                                                samples_x,
+                                                                samples_y)
 
     pi_estimate = 4 * (g / sample_densities).mean()
     pi_error = numpy.abs((pi_estimate - numpy.pi) / numpy.pi)
 
     # Print the estimate and relevant information
     print "The estimate of pi is %(pi)6.5f, with an error of %(error)4.2f%%" % \
         {"pi": pi_estimate, "error": 100 * pi_error}
-    print "Estimation done with %(n)i data points" % {'n': samples.shape[0]}
+    print "Estimation done with %(n)i data points" % {'n': samples_x.shape[0]}
 
     # Plot the data points along with the quarter circle for reference
     # Plot data points
@@ -110,8 +129,8 @@ def estimate_pi(samples=None, show_results=False, save_results=False):
     pyplot.xlabel('$x$')
     pyplot.ylabel('$y$')
     pyplot.axes().set_aspect('equal')
-    pyplot.scatter(x, y, marker='.', color='0.5', edgecolor='0.5',
-                   linewidth=0.25)
+    pyplot.scatter(samples_x, samples_y, marker='.', color='0.5',
+                   edgecolor='0.5', linewidth=0.25)
     theta = numpy.linspace(start=0.0, stop=numpy.pi/2.0, num=100)
     pyplot.plot(numpy.cos(theta), numpy.sin(theta), color='k',
                 linewidth=2.5, linestyle='--')
@@ -136,36 +155,45 @@ def estimate_pi(samples=None, show_results=False, save_results=False):
         pyplot.show()
 
 
-def histogram_pdf(x, y, x_min=0 - 1e-5, x_max=1 + 1e-5, y_min=0 - 1e-5,
-                  y_max=1 + 1e-5, min_bins=1, max_bins=50, num_folds=20):
+def histogram_pdf(x, y, samples_x, samples_y, x_min=0 - 1e-5, x_max=1 + 1e-5,
+                  y_min=0 - 1e-5, y_max=1 + 1e-5, min_bins=1, max_bins=50,
+                  num_folds=20):
     """
-    WRITEME
+    Estimate the PDF of a set of (x, y) coordinates using k-fold
+    cross-validation and compute the likelihood of a set of (samples_x,
+    samples_y) coordinates using this estimate
 
     Parameters
     ----------
-    x : `numpy.ndarray`
-        x-coordinates of the points
-    y : `numpy.ndarray`
-        y-coordinates of the points
-    x_min : `float`, optional
+    x : numpy.ndarray
+        x-coordinates of the points used to estimate the PDf
+    y : numpy.ndarray
+        y-coordinates of the points used to estimate the PDf
+    sample_x : numpy.ndarray
+        x-coordinates of the points for which to compute the likelihood
+    sample_y : numpy.ndarray
+        y-coordinates of the points for which to compute the likelihood
+    x_min : float, optional
         Lower bound for the x bins of the histogram. Defaults to 0.
-    x_max : `float`, optional
+    x_max : float, optional
         Upper bound for the x bins of the histogram. Defaults to 1.
-    y_min : `float`, optional
+    y_min : float, optional
         Lower bound for the y bins of the histogram. Defaults to 0.
-    y_max : `float`, optional
+    y_max : float, optional
         Upper bound for the y bins of the histogram. Defaults to 1.
-    min_bins : `int`, optional
-        WRITEME
-    max_bins : `int`, optional
-        WRITEME
+    min_bins : int, optional
+        Minimal number of bins to try
+    max_bins : int, optional
+        Maximal number of bins to try
+    num_folds : int, optional
+        Number of folds for cross-validation
 
     Returns
     -------
-    pdf : `numpy.ndarray`
-    x_bins : `numpy.ndarray`
-    y_bins : `numpy.ndarray`
-    densities : `numpy.ndarray`
+    pdf : numpy.ndarray
+    x_bins : numpy.ndarray
+    y_bins : numpy.ndarray
+    densities : numpy.ndarray
     """
     fold_indexes = []
     fold_length = x.shape[0] / num_folds
@@ -196,7 +224,13 @@ def histogram_pdf(x, y, x_min=0 - 1e-5, x_max=1 + 1e-5, y_min=0 - 1e-5,
                 optimal_log_likelihood = mean_log_likelihood
                 optimal_num_bins = num_bins
 
-    return _histogram_pdf(x, y, x_min, x_max, y_min, y_max, optimal_num_bins)
+    print "Optimal number of bins is " + str(optimal_num_bins)
+    pdf, x_bins, y_bins, __ = _histogram_pdf(x, y, x_min, x_max, y_min, y_max,
+                                             optimal_num_bins)
+    x_indexes = (samples_x / x_max * optimal_num_bins).astype(int)
+    y_indexes = (samples_y / y_max * optimal_num_bins).astype(int)
+    sample_densities = pdf[[x_indexes, y_indexes]]
+    return pdf, x_bins, y_bins, sample_densities
 
 
 def _histogram_pdf(x, y, x_min=0 - 1e-5, x_max=1 + 1e-5, y_min=0 - 1e-5,
@@ -243,7 +277,7 @@ def _histogram_pdf(x, y, x_min=0 - 1e-5, x_max=1 + 1e-5, y_min=0 - 1e-5,
     return pdf, x_bins, y_bins, densities
 
 
-def extract_samples(directory_path=None, show_results=False):
+def extract_samples(directory_path, show_results=False):
     """
     Extract data points from target scans located in `directory_path`
 
@@ -264,24 +298,18 @@ def extract_samples(directory_path=None, show_results=False):
     directory_path = abspath(directory_path)
     samples = []
     paths_list = []
-    if directory_path is not None:
-        paths_list.extend([join(directory_path, f) for f in
-                           listdir(directory_path) if
-                           isfile(join(directory_path, f))
-                           and f.endswith('jpg')])
-    else:
-        paths_list.append('/Users/vdumoulin/Development/sheldon/' +
-                          'presentations/pi-gun/test_image.jpg')
+    paths_list.extend([join(directory_path, f) for f in
+                       listdir(directory_path) if
+                       isfile(join(directory_path, f))
+                       and f.endswith('jpg')])
 
     for path in paths_list:
         # Load image
         image = scipy.misc.imread(path, flatten=True)
-        # Smooth the image (to remove small variations)
-        smoothed_image = ndimage.gaussian_filter(image, 0)
         # Set the threshold
         threshold = 50
         # Binarize the image according to the threshold
-        binarized_image = smoothed_image > threshold
+        binarized_image = image > threshold
         # Find connected components
         labeled_image, num_objects = ndimage.label(binarized_image)
         # Find all centers of mass
@@ -323,8 +351,10 @@ def extract_samples(directory_path=None, show_results=False):
 if __name__ == "__main__":
     # Argument parsing
     parser = argparse.ArgumentParser()
-    parser.add_argument("-p", "--path", default=None,
+    parser.add_argument("-e", "--extract", default=None,
                         help="path to the directory containing sample images")
+    parser.add_argument("-p", "--path", default=None,
+                        help="path to the sample file")
     parser.add_argument("-v", "--visualize", action="store_true",
                         help="show figures being created")
     parser.add_argument("-s", "--save", action="store_true",
@@ -333,6 +363,10 @@ def extract_samples(directory_path=None, show_results=False):
 
     samples = None
     data_path = args.path
+    extract_path = args.extract
+    if extract_path is not None:
+        extract_samples(directory_path=extract_path,
+                        show_results=args.visualize)
     if data_path is not None:
         print 'Retrieving samples...'
         samples = numpy.load(data_path)