- from __future__ import print_function
-
import tensorflow as tf
import numpy as np
- import scipy.optimize as opt
+ from my_param_collection import ParamCollection
+ # from rl import Serializable
+ # from categorical import cat_sample, cat_entropy
+ # from ppo import PPOPolicy

# Helper functions.
def weight_variable(shape, stddev=0.1, initial=None):
    if initial is None:
-       initial = tf.truncated_normal(shape, stddev=stddev,dtype=tf.float64)
+       initial = tf.truncated_normal(shape, stddev=stddev, dtype=tf.float64)
    return tf.Variable(initial)

- def bias_variable(shape, init_bias=0.1,initial=None):
+ def bias_variable(shape, init_bias=0.1, initial=None):
    if initial is None:
-       initial = tf.constant(init_bias, shape=shape,dtype=tf.float64)
+       initial = tf.constant(init_bias, shape=shape, dtype=tf.float64)
    return tf.Variable(initial)

- class AtariRAMPolicy(object):
+ class AtariRAMPolicy(object):  # PPOPolicy, Serializable):
    """
    TensorFlow policy to play Atari.
    adapted from cgt version in cs294 @ http://rll.berkeley.edu/deeprlcourse/
    """
    def __init__(self, n_actions):
-
+       # Serializable.__init__(self, n_actions)
        n_in = 128
        n_hid = 64

-       # Attach placeholders to self so they're in the scope of the feed_dict
-       # and sess.run() for later functions that use the model.

        # Observations placeholder. batch_size samples with 128 features.
        self.o_no = tf.placeholder(tf.float64, shape=[None, n_in])
@@ -51,35 +50,38 @@ def __init__(self, n_actions):

        # Initialize weights and bias from input to hidden layer.
        self.W_01 = weight_variable([n_in, n_hid])
+       self.W_12 = weight_variable([n_hid, n_actions], stddev=0.01)
        self.b_01 = bias_variable([n_hid])
+       self.b_12 = bias_variable([n_actions], init_bias=0.01)

        # Map input to hidden layer.
        h1 = tf.nn.tanh(tf.matmul(h0, self.W_01) + self.b_01)

        # Initialize weights and biases from hidden layer to action space.
-       self.W_12 = weight_variable([n_hid, n_actions], stddev=0.01)
-       self.b_12 = bias_variable([n_actions], init_bias=0.01)

        # Map hidden layer activations to probabilities of actions.
        self.probs_na = tf.nn.softmax(tf.matmul(h1, self.W_12) + self.b_12)

        logprobs_na = tf.log(self.probs_na)

        # This works.
-       n_batch = tf.shape(self.a_n)[0]
+       self.n_batch = tf.shape(self.a_n)[0]

        # Gather from a flattened version of the matrix since gather_nd does
        # not work on the gpu at this time.
-       idx_flattened = tf.range(0, n_batch) * n_actions + tf.cast(self.a_n, tf.int32)
+       idx_flattened = tf.range(0, self.n_batch) * n_actions + tf.cast(self.a_n, tf.int32)

        # The modeled log probability of the choice taken for whole batch.
        logps_n = tf.gather(tf.reshape(logprobs_na, [-1]), idx_flattened)

        # Product of modeled log probability for chosen action and return.
        self.surr = tf.reduce_mean(tf.mul(logps_n, self.q_n))

+       params = tf.trainable_variables()
+
        # Compute gradients of surrogate objective function.
-       self.surr_grads = tf.gradients(self.surr, [self.W_01, self.W_12, self.b_01, self.b_12])
+       # self.surr_grads = tf.gradients(self.surr, [self.W_01, self.W_12, self.b_01, self.b_12])
+       self.surr_grads = tf.gradients(self.surr, params)

        # Kullback-Leibler divergence of new vs. old transition probabilities.
        self.kl = tf.reduce_mean(
@@ -90,21 +92,23 @@ def __init__(self, n_actions):
        penobj = tf.sub(self.surr, tf.mul(self.lam, self.kl))

        # Compute gradients of KLD-constrained objective function.
-       self.penobj_grads = tf.gradients(penobj, [self.W_01, self.W_12, self.b_01, self.b_12])
+       self.penobj_grads = tf.gradients(penobj, params)

        # Attach a session with initialized variables to the class.
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.initialize_all_variables())

+       self.pc = ParamCollection(self.sess, params)

    def step(self, X):
        feed_dict = {
            self.o_no: X,
        }
-       pdist_na = self.sess.run(self.probs_na,feed_dict=feed_dict)
-       # acts_n = cat_sample(pdist_na)
+       pdist_na = self.sess.run(self.probs_na, feed_dict=feed_dict)
+       # pdist_na = self.f_probs(X)
+       acts_n = cat_sample(pdist_na)
        return {
-           # "action": acts_n,
+           "action": acts_n,
            "pdist": pdist_na
        }

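Note on the `step()` change above: the new `acts_n = cat_sample(pdist_na)` call depends on the `categorical` helpers whose import is still commented out at the top of the file. The original `cat_sample` is not shown in this diff; as a rough sketch only (not the project's implementation), a per-row categorical sampler compatible with that call could look like:

    import numpy as np

    def cat_sample(pdist_na, rng=np.random):
        # pdist_na: [n_batch, n_actions] array of per-row action probabilities.
        # Invert each row's CDF at a uniform draw to pick one action index per row.
        cumdist = np.cumsum(pdist_na, axis=1)
        u = rng.rand(pdist_na.shape[0], 1)
        return (u > cumdist).sum(axis=1)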
@@ -117,6 +121,7 @@ def compute_gradient(self, pdist_np, o_no, a_n, q_n):
        }
        [surr_grads] = self.sess.run([self.surr_grads], feed_dict=feed_dict)
        return np.concatenate([p.flatten() for p in surr_grads], 0)
+       # return surr_grads

    def compute_surr_kl(self, pdist_np, o_no, a_n, q_n):
        feed_dict = {
@@ -141,34 +146,32 @@ def compute_grad_lagrangian(self, lam, pdist_np, o_no, a_n, q_n):


    def compute_entropy(self, pdist_np):
+       raise NotImplementedError
        # return cat_entropy(pdist_np)
-       assert NotImplementedError

    def get_parameters_flat(self):
-       W_01 = self.sess.run(self.W_01)
-       W_12 = self.sess.run(self.W_12)
-       b_01 = self.sess.run(self.b_01)
-       b_12 = self.sess.run(self.b_12)
-       return np.concatenate([p.flatten() for p in [W_01, W_12, b_01, b_12]], 0)
+       return self.pc.get_values_flat()
+       # W_01 = self.sess.run(self.W_01)
+       # W_12 = self.sess.run(self.W_12)
+       # b_01 = self.sess.run(self.b_01)
+       # b_12 = self.sess.run(self.b_12)
+       # return np.concatenate([p.flatten() for p in [W_01, W_12, b_01, b_12]], 0)

    def set_parameters_flat(self, th):
-       self.sess.run(tf.initialize_all_variables())
-       # Get shape of parameters from the class.
-       n_in = self.n_in
-       n_hid = self.n_hid
-       n_actions = self.n_actions
-       # Grab and reshape weight matrices.
-       W_01 = th[:n_hid*n_in].reshape(n_in, n_hid)
-       W_12 = th[n_hid*n_in:n_hid*n_in + n_hid*n_actions].reshape(n_hid, n_actions)
-       # Pull the biases off the end of th.
-       b_01 = th[-n_hid - n_actions:-n_actions]
-       b_12 = th[-n_actions:]
-       # Assign the variables the values passed through th.
-       self.sess.run(tf.assign(self.W_01, W_01))
-       self.sess.run(tf.assign(self.W_12, W_12))
-       self.sess.run(tf.assign(self.b_01, b_01))
-       self.sess.run(tf.assign(self.b_12, b_12))
-
+       return self.pc.set_values_flat(th)
+       # self.sess.run(tf.initialize_all_variables())
+       # n_in = self.n_in
+       # n_hid = self.n_hid
+       # n_actions = self.n_actions
+       # W_01 = th[:n_hid*n_in].reshape(n_in, n_hid)
+       # W_12 = th[n_hid*n_in:n_hid*n_in + n_hid*n_actions].reshape(n_hid, n_actions)
+       # b_01 = th[-n_hid - n_actions:-n_actions]
+       # b_12 = th[-n_actions:]
+       # self.sess.run(tf.assign(self.W_01, W_01))
+       # self.sess.run(tf.assign(self.W_12, W_12))
+       # self.sess.run(tf.assign(self.b_01, b_01))
+       # self.sess.run(tf.assign(self.b_12, b_12))
+       # self.pc = ParamCollection(th)


def test_AtariRAMPolicy():
@@ -180,7 +183,6 @@ def test_AtariRAMPolicy():
    n_features = 128
    n_actions = 9
    lam = 1.0
-   penalty_coeff = 1.0

    # Go ahead and initialize the policy.
    policy = AtariRAMPolicy(n_actions=n_actions)
@@ -196,22 +198,16 @@ def test_AtariRAMPolicy():

    # Now for some tests.

-   n_train_paths = int(0.75 * n_batch)
+   # # Test set_n_batch()
+   # policy.set_n_batch(n_batch)
+   # print(policy.n_batch)

-   train_sli = slice(0, n_train_paths)
-   test_sli = slice(train_sli.stop, None)
-
-   poar_train, poar_test = [tuple(arr[sli] for arr in (probs_na, obs, a_n, q_n)) for sli in (train_sli, test_sli)]
-   print(len(poar_train))
-   print(type(poar_train))
    # testing get_parameters_flat()
-   theta = policy.get_parameters_flat()
+   th = policy.get_parameters_flat() + 1.
    #
    # testing set_parameters_flat()
-   policy.set_parameters_flat(theta)
+   policy.set_parameters_flat(th)

-   # testing step()
-   policy.step(obs)
    # testing compute_gradient()
    policy.compute_gradient(probs_na, obs, a_n, q_n)

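Aside: the flattened-gather indexing built in `__init__` (the `gather_nd` GPU workaround) is plain row-major indexing, so a quick NumPy sanity check along these lines could accompany the tests; the shapes below are illustrative only:

    import numpy as np

    n_batch, n_actions = 4, 9
    logprobs_na = np.random.randn(n_batch, n_actions)
    a_n = np.random.randint(n_actions, size=n_batch)

    # Element [i, a_n[i]] of a row-major matrix lives at flat index i * n_actions + a_n[i].
    idx_flattened = np.arange(n_batch) * n_actions + a_n
    assert np.allclose(logprobs_na.reshape(-1)[idx_flattened],
                       logprobs_na[np.arange(n_batch), a_n])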
@@ -223,36 +219,10 @@ def test_AtariRAMPolicy():

    # Make sure we still have the same parameters
    th_new = policy.get_parameters_flat()
-   assert not np.any(th_new - theta)
-
-   # We define a new function in test_AtariRAMPolicy() so we can hand in the
-   # data needed and leave the parameters of the model as the only input.
-   def fpen(th):
-       thprev = policy.get_parameters_flat()
-       policy.set_parameters_flat(th)
-       surr, kl = policy.compute_surr_kl(*poar_train)
-       out = penalty_coeff * kl - surr
-       policy.set_parameters_flat(thprev)
-       return out
-
-   # Quick check it works.
-   fpen(theta)
-
-   # Do the same thing for the gradient of fpen.
-   def fgradpen(th):
-       thprev = policy.get_parameters_flat()
-       policy.set_parameters_flat(th)
-       out = -policy.compute_grad_lagrangian(penalty_coeff, *poar_train)
-       policy.set_parameters_flat(thprev)
-       return out
-
-   # Testing fgradpen()
-   fgradpen(theta)
-
-   # Test out our functions in the context of lbfgs-b minimization with scipy.
-   res = opt.fmin_l_bfgs_b(fpen, theta, fprime=fgradpen, maxiter=20)
-
-   print(res)
+
+   # assert not np.any(th_new - th)
+   print(th_new - th)
+

if __name__ == "__main__":
    test_AtariRAMPolicy()
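`ParamCollection` comes from the local `my_param_collection` module, which is not part of this diff. Judging from the commented-out code it replaces in `get_parameters_flat()` and `set_parameters_flat()`, a minimal sketch of the interface used above (names and details assumed, not the actual implementation) might be:

    import numpy as np
    import tensorflow as tf

    class ParamCollection(object):
        """Flatten and restore a list of tf.Variables through a session (sketch)."""

        def __init__(self, sess, params):
            self.sess = sess
            self.params = params

        def get_values_flat(self):
            # Evaluate every variable and concatenate into one flat vector.
            values = self.sess.run(self.params)
            return np.concatenate([v.flatten() for v in values], 0)

        def set_values_flat(self, th):
            # Slice the flat vector back into each variable's shape and assign it.
            start = 0
            for p in self.params:
                shape = p.get_shape().as_list()
                size = int(np.prod(shape))
                self.sess.run(tf.assign(p, th[start:start + size].reshape(shape)))
                start += size

The per-call tf.assign mirrors the commented-out set_parameters_flat code; a real implementation would more likely cache placeholders and assign ops in __init__ rather than adding graph ops on every call.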