
Commit 084b19a

Added param_collection.py
1 parent 4bc62f1 commit 084b19a

3 files changed, +260 -84 lines changed

my_atari_ram_policy.py

+54 -84
@@ -1,32 +1,31 @@
-from __future__ import print_function
-
 import tensorflow as tf
 import numpy as np
-import scipy.optimize as opt
+from my_param_collection import ParamCollection
+# from rl import Serializable
+# from categorical import cat_sample, cat_entropy
+# from ppo import PPOPolicy

 # Helper functions.
 def weight_variable(shape, stddev=0.1, initial=None):
     if initial is None:
-        initial = tf.truncated_normal(shape, stddev=stddev,dtype=tf.float64)
+        initial = tf.truncated_normal(shape, stddev=stddev, dtype=tf.float64)
     return tf.Variable(initial)

-def bias_variable(shape, init_bias=0.1,initial=None):
+def bias_variable(shape, init_bias=0.1, initial=None):
     if initial is None:
-        initial = tf.constant(init_bias, shape=shape,dtype=tf.float64)
+        initial = tf.constant(init_bias, shape=shape, dtype=tf.float64)
     return tf.Variable(initial)

-class AtariRAMPolicy(object):
+class AtariRAMPolicy(object): #PPOPolicy, Serializable):
     """
     TensorFlow policy to play Atari.
     adapted from cgt version in cs294 @ http://rll.berkeley.edu/deeprlcourse/
     """
     def __init__(self, n_actions):
-
+        # Serializable.__init__(self, n_actions)
         n_in = 128
         n_hid = 64

-        # Attach placeholders to self so they're in the scope of the feed_dict
-        # and sess.run() for later functions that use the model.

         # Observations placeholder. batch_size samples with 128 features.
         self.o_no = tf.placeholder(tf.float64, shape=[None, n_in])
@@ -51,35 +50,38 @@ def __init__(self, n_actions):

         # Initialize weights and bias from input to hidden layer.
         self.W_01 = weight_variable([n_in, n_hid])
+        self.W_12 = weight_variable([n_hid, n_actions], stddev=0.01)
         self.b_01 = bias_variable([n_hid])
+        self.b_12 = bias_variable([n_actions], init_bias=0.01)

         # Map input to hidden layer.
         h1 = tf.nn.tanh(tf.matmul(h0, self.W_01) + self.b_01)

         # Initialize weights and biases from hidden layer to action space.
-        self.W_12 = weight_variable([n_hid, n_actions], stddev=0.01)
-        self.b_12 = bias_variable([n_actions], init_bias=0.01)

         # Map hidden layer activations to probabilities of actions.
         self.probs_na = tf.nn.softmax(tf.matmul(h1, self.W_12) + self.b_12)

         logprobs_na = tf.log(self.probs_na)

         # This works.
-        n_batch = tf.shape(self.a_n)[0]
+        self.n_batch = tf.shape(self.a_n)[0]

         # Gather from a flattened version of the matrix since gather_nd does
         # not work on the gpu at this time.
-        idx_flattened = tf.range(0, n_batch) * n_actions + tf.cast(self.a_n, tf.int32)
+        idx_flattened = tf.range(0, self.n_batch) * n_actions + tf.cast(self.a_n, tf.int32)

         # The modeled log probability of the choice taken for whole batch.
         logps_n = tf.gather(tf.reshape(logprobs_na, [-1]), idx_flattened)

         # Product of modeled log probability for chosen action and return.
         self.surr = tf.reduce_mean(tf.mul(logps_n , self.q_n))

+        params = tf.trainable_variables()
+
         # Compute gradients of surrogate objective function.
-        self.surr_grads = tf.gradients(self.surr, [self.W_01, self.W_12, self.b_01, self.b_12])
+        # self.surr_grads = tf.gradients(self.surr, [self.W_01, self.W_12, self.b_01, self.b_12])
+        self.surr_grads = tf.gradients(self.surr, params)

         # Kullback-Liebler Divergence of new vs old transition probabilities.
         self.kl = tf.reduce_mean(
@@ -90,21 +92,23 @@ def __init__(self, n_actions):
         penobj = tf.sub(self.surr, tf.mul(self.lam, self.kl))

         # Compute gradients of KLD-constrained objective function.
-        self.penobj_grads = tf.gradients(penobj, [self.W_01, self.W_12, self.b_01, self.b_12])
+        self.penobj_grads = tf.gradients(penobj, params)

         # Attach a session with initialized variables to the class.
         self.sess = tf.InteractiveSession()
         self.sess.run(tf.initialize_all_variables())

+        self.pc = ParamCollection(self.sess, params)

     def step(self, X):
         feed_dict={
             self.o_no : X,
         }
-        pdist_na = self.sess.run(self.probs_na,feed_dict=feed_dict)
-        # acts_n = cat_sample(pdist_na)
+        pdist_na = self.sess.run(self.probs_na, feed_dict=feed_dict)
+        # pdist_na = self.f_probs(X)
+        acts_n = cat_sample(pdist_na)
         return {
-            # "action" : acts_n,
+            "action" : acts_n,
             "pdist" : pdist_na
         }

@@ -117,6 +121,7 @@ def compute_gradient(self, pdist_np, o_no, a_n, q_n):
         }
         [surr_grads] = self.sess.run([self.surr_grads],feed_dict=feed_dict)
         return np.concatenate([p.flatten() for p in surr_grads],0)
+        # return surr_grads

     def compute_surr_kl(self, pdist_np, o_no, a_n, q_n):
         feed_dict={
@@ -141,34 +146,32 @@ def compute_grad_lagrangian(self, lam, pdist_np, o_no, a_n, q_n):


     def compute_entropy(self, pdist_np):
+        raise NotImplementedError
         # return cat_entropy(pdist_np)
-        assert NotImplementedError

     def get_parameters_flat(self):
-        W_01 = self.sess.run(self.W_01)
-        W_12 = self.sess.run(self.W_12)
-        b_01 = self.sess.run(self.b_01)
-        b_12 = self.sess.run(self.b_12)
-        return np.concatenate([p.flatten() for p in [W_01, W_12, b_01, b_12]],0)
+        return self.pc.get_values_flat()
+        # W_01 = self.sess.run(self.W_01)
+        # W_12 = self.sess.run(self.W_12)
+        # b_01 = self.sess.run(self.b_01)
+        # b_12 = self.sess.run(self.b_12)
+        # return np.concatenate([p.flatten() for p in [W_01, W_12, b_01, b_12]],0)

     def set_parameters_flat(self, th):
-        self.sess.run(tf.initialize_all_variables())
-        # Get shape of parameters from the class.
-        n_in = self.n_in
-        n_hid = self.n_hid
-        n_actions = self.n_actions
-        # Grab and reshape weight matrices.
-        W_01 = th[:n_hid*n_in].reshape(n_in,n_hid)
-        W_12 = th[n_hid*n_in:n_hid*n_in+n_hid*n_actions].reshape(n_hid,n_actions)
-        # Pull the biases off the end of th.
-        b_01 = th[-n_hid-n_actions:-n_actions]
-        b_12 = th[-n_actions:]
-        # Assign the variables the values passed through th.
-        self.sess.run(tf.assign(self.W_01, W_01))
-        self.sess.run(tf.assign(self.W_12, W_12))
-        self.sess.run(tf.assign(self.b_01, b_01))
-        self.sess.run(tf.assign(self.b_12, b_12))
-
+        return self.pc.set_values_flat(th)
+        # self.sess.run(tf.initialize_all_variables())
+        # n_in = self.n_in
+        # n_hid = self.n_hid
+        # n_actions = self.n_actions
+        # W_01 = th[:n_hid*n_in].reshape(n_in,n_hid)
+        # W_12 = th[n_hid*n_in:n_hid*n_in+n_hid*n_actions].reshape(n_hid,n_actions)
+        # b_01 = th[-n_hid-n_actions:-n_actions]
+        # b_12 = th[-n_actions:]
+        # self.sess.run(tf.assign(self.W_01, W_01))
+        # self.sess.run(tf.assign(self.W_12, W_12))
+        # self.sess.run(tf.assign(self.b_01, b_01))
+        # self.sess.run(tf.assign(self.b_12, b_12))
+        # self.pc = ParamCollection(th)


 def test_AtariRAMPolicy():
@@ -180,7 +183,6 @@ def test_AtariRAMPolicy():
     n_features = 128
     n_actions = 9
     lam = 1.0
-    penalty_coeff = 1.0

     # Go ahead and initialize the policy.
     policy = AtariRAMPolicy(n_actions=n_actions)
@@ -196,22 +198,16 @@ def test_AtariRAMPolicy():

     # Now for some tests.

-    n_train_paths = int(0.75 * n_batch)
+    # # Test set_n_batch()
+    # policy.set_n_batch(n_batch)
+    # print(policy.n_batch)

-    train_sli = slice(0, n_train_paths)
-    test_sli = slice(train_sli.stop, None)
-
-    poar_train, poar_test = [tuple(arr[sli] for arr in (probs_na, obs, a_n, q_n)) for sli in (train_sli, test_sli)]
-    print(len(poar_train))
-    print(type(poar_train))
     # testing get_parameters_flat()
-    theta = policy.get_parameters_flat()
+    th = policy.get_parameters_flat() + 1.
     #
     # testing set_parameters_flat()
-    policy.set_parameters_flat(theta)
+    policy.set_parameters_flat(th)

-    # testing step()
-    policy.step(obs)
     # testing compute_gradient()
     policy.compute_gradient(probs_na, obs, a_n, q_n)

@@ -223,36 +219,10 @@ def test_AtariRAMPolicy():

     # Make sure we still have the same parameters
     th_new = policy.get_parameters_flat()
-    assert not np.any(th_new - theta)
-
-    # We define a new function in test_AtariRAMPolicy() so we can hand in the
-    # data needed and leave the parameters of the model as the only input.
-    def fpen(th):
-        thprev = policy.get_parameters_flat()
-        policy.set_parameters_flat(th)
-        surr, kl = policy.compute_surr_kl(*poar_train)
-        out = penalty_coeff * kl - surr
-        policy.set_parameters_flat(thprev)
-        return out
-
-    # Quick check it works.
-    fpen(theta)
-
-    # Do the same thing for the gradient of fpen.
-    def fgradpen(th):
-        thprev = policy.get_parameters_flat()
-        policy.set_parameters_flat(th)
-        out = - policy.compute_grad_lagrangian(penalty_coeff, *poar_train)
-        policy.set_parameters_flat(thprev)
-        return out
-
-    # Testing fgradpen()
-    fgradpen(theta)
-
-    # Test out our functions in the context of lbfgs-b minimization with scipy.
-    res = opt.fmin_l_bfgs_b(fpen, theta, fprime=fgradpen, maxiter=20)
-
-    print(res)
+
+    # assert not np.any(th_new - th)
+    print(th_new - th)
+

 if __name__ == "__main__":
     test_AtariRAMPolicy()

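Aside (not part of the commit): the `idx_flattened` line kept in `__init__` above picks one log-probability per sample by indexing a row-major flattening of `logprobs_na`, which is why the in-code comment notes it avoids `gather_nd` on the GPU. A minimal NumPy sketch of the same indexing, with made-up sizes, would look like this:

import numpy as np

# Hypothetical sizes: 5 samples, 9 actions (the test below also uses n_actions = 9).
n_batch, n_actions = 5, 9
logprobs_na = np.log(np.full((n_batch, n_actions), 1.0 / n_actions))  # dummy log-probs
a_n = np.array([0, 3, 8, 1, 4])                                       # chosen action per sample

# In row-major order, element (i, a_n[i]) sits at flat index i * n_actions + a_n[i].
idx_flattened = np.arange(n_batch) * n_actions + a_n
logps_n = logprobs_na.reshape(-1)[idx_flattened]

# Equivalent to the 2-D fancy indexing logprobs_na[np.arange(n_batch), a_n].
assert np.allclose(logps_n, logprobs_na[np.arange(n_batch), a_n])
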
my_param_collection.py

+91
@@ -0,0 +1,91 @@
+import tensorflow as tf
+import numpy as np
+
+class ParamCollection(object):
+
+    def __init__(self, sess, params):
+        """
+        params should be a list of TensorFlow nodes.
+        """
+        self._params = params
+        # Have to import the session to get the values being used.
+        self.sess = sess
+
+    @property
+    def params(self):
+        return self._params
+
+    def get_values(self):
+        """
+        Returns list of values of parameter arrays
+        """
+        return [self.sess.run(param) for param in self._params]
+
+    def get_shapes(self):
+        """
+        Shapes of parameter arrays
+        """
+        return [param.get_shape().as_list() for param in self._params]
+
+    def get_total_size(self):
+        """
+        Total number of parameters
+        """
+        return sum(np.prod(shape) for shape in self.get_shapes())
+
+    def num_vars(self):
+        """
+        Number of parameter arrays
+        """
+        return len(self._params)
+
+    def set_values(self, parvals):
+        """
+        Set values of parameter arrays given list of values `parvals`
+        """
+        assert len(parvals) == len(self._params)
+        for (param, newval) in zip(self._params, parvals):
+            self.sess.run(tf.assign(param, newval))
+            assert tuple(param.get_shape().as_list()) == newval.shape
+
+    def set_values_flat(self, theta):
+        """
+        Set parameters using a vector which represents all of the parameters
+        flattened and concatenated.
+        """
+        arrs = []
+        n = 0
+        for shape in self.get_shapes():
+            size = np.prod(shape)
+            arrs.append(theta[n:n+size].reshape(shape))
+            n += size
+        assert theta.size == n
+        self.set_values(arrs)
+
+    def get_values_flat(self):
+        """
+        Flatten all parameter arrays into one vector and return it as a numpy array.
+        """
+        theta = np.empty(self.get_total_size())
+        n = 0
+        for param in self._params:
+            s = np.prod(param.get_shape().as_list())
+            theta[n:n+s] = self.sess.run(param).flatten()
+            n += s
+        assert theta.size == n
+        return theta
+
+    def _params_names(self):
+        return [(param, param.name) for param in self._params]
+
+    def to_h5(self, grp):
+        """
+        Save parameter arrays to hdf5 group `grp`
+        """
+        for (param, name) in self._params_names():
+            arr = self.sess.run(param)
+            grp[name] = arr
+
+    def from_h5(self, grp):
+        parvals = [grp[name] for(_, name) in self._params_names()]
+        self.set_values(parvals)

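To make the new helper concrete, here is a small usage sketch. It is not part of the commit, and it assumes the same pre-1.0 TensorFlow session-era API the commit itself uses (tf.InteractiveSession, tf.initialize_all_variables, tf.assign); the variables are toy stand-ins for a policy's weights and biases.

import numpy as np
import tensorflow as tf
from my_param_collection import ParamCollection

# Two toy variables standing in for a policy's weights and biases.
W = tf.Variable(tf.zeros([3, 2], dtype=tf.float64))
b = tf.Variable(tf.zeros([2], dtype=tf.float64))

sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())

pc = ParamCollection(sess, [W, b])
print(pc.get_shapes())      # [[3, 2], [2]]
print(pc.get_total_size())  # 8

theta = pc.get_values_flat()     # current values as one length-8 vector
pc.set_values_flat(theta + 1.0)  # write back a perturbed copy
assert np.allclose(pc.get_values_flat(), theta + 1.0)

This flat get/set round-trip is what AtariRAMPolicy.get_parameters_flat() and set_parameters_flat() now delegate to via self.pc.
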
0 commit comments
