- from __future__ import print_function
-
import tensorflow as tf
import numpy as np
- import scipy.optimize as opt
+ from my_param_collection import ParamCollection
+ # from rl import Serializable
+ # from categorical import cat_sample, cat_entropy
+ # from ppo import PPOPolicy

# Helper functions.
def weight_variable(shape, stddev=0.1, initial=None):
    if initial is None:
-       initial = tf.truncated_normal(shape, stddev=stddev,dtype=tf.float64)
+       initial = tf.truncated_normal(shape, stddev=stddev, dtype=tf.float64)
    return tf.Variable(initial)

- def bias_variable(shape, init_bias=0.1,initial=None):
+ def bias_variable(shape, init_bias=0.1, initial=None):
    if initial is None:
-       initial = tf.constant(init_bias, shape=shape,dtype=tf.float64)
+       initial = tf.constant(init_bias, shape=shape, dtype=tf.float64)
    return tf.Variable(initial)

- class AtariRAMPolicy(object):
+ class AtariRAMPolicy(object):  # PPOPolicy, Serializable):
    """
    TensorFlow policy to play Atari.
    adapted from cgt version in cs294 @ http://rll.berkeley.edu/deeprlcourse/
    """
    def __init__(self, n_actions):
-
+       # Serializable.__init__(self, n_actions)
        n_in = 128
        n_hid = 64

-       # Attach placeholders to self so they're in the scope of the feed_dict
-       # and sess.run() for later functions that use the model.

        # Observations placeholder. batch_size samples with 128 features.
        self.o_no = tf.placeholder(tf.float64, shape=[None, n_in])
@@ -51,35 +50,38 @@ def __init__(self, n_actions):

        # Initialize weights and bias from input to hidden layer.
        self.W_01 = weight_variable([n_in, n_hid])
+       self.W_12 = weight_variable([n_hid, n_actions], stddev=0.01)
        self.b_01 = bias_variable([n_hid])
+       self.b_12 = bias_variable([n_actions], init_bias=0.01)

        # Map input to hidden layer.
        h1 = tf.nn.tanh(tf.matmul(h0, self.W_01) + self.b_01)

        # Initialize weights and biases from hidden layer to action space.
-       self.W_12 = weight_variable([n_hid, n_actions], stddev=0.01)
-       self.b_12 = bias_variable([n_actions], init_bias=0.01)

        # Map hidden layer activations to probabilities of actions.
        self.probs_na = tf.nn.softmax(tf.matmul(h1, self.W_12) + self.b_12)

        logprobs_na = tf.log(self.probs_na)

        # This works.
-       n_batch = tf.shape(self.a_n)[0]
+       self.n_batch = tf.shape(self.a_n)[0]

        # Gather from a flattened version of the matrix since gather_nd does
        # not work on the gpu at this time.
-       idx_flattened = tf.range(0, n_batch) * n_actions + tf.cast(self.a_n, tf.int32)
+       idx_flattened = tf.range(0, self.n_batch) * n_actions + tf.cast(self.a_n, tf.int32)

        # The modeled log probability of the choice taken for whole batch.
        logps_n = tf.gather(tf.reshape(logprobs_na, [-1]), idx_flattened)

        # Product of modeled log probability for chosen action and return.
        self.surr = tf.reduce_mean(tf.mul(logps_n, self.q_n))

+       params = tf.trainable_variables()
+
        # Compute gradients of surrogate objective function.
-       self.surr_grads = tf.gradients(self.surr, [self.W_01, self.W_12, self.b_01, self.b_12])
+       # self.surr_grads = tf.gradients(self.surr, [self.W_01, self.W_12, self.b_01, self.b_12])
+       self.surr_grads = tf.gradients(self.surr, params)

        # Kullback-Leibler divergence of new vs. old transition probabilities.
        self.kl = tf.reduce_mean(
@@ -90,21 +92,23 @@ def __init__(self, n_actions):
        penobj = tf.sub(self.surr, tf.mul(self.lam, self.kl))

        # Compute gradients of KLD-constrained objective function.
-       self.penobj_grads = tf.gradients(penobj, [self.W_01, self.W_12, self.b_01, self.b_12])
+       self.penobj_grads = tf.gradients(penobj, params)

        # Attach a session with initialized variables to the class.
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.initialize_all_variables())

+       self.pc = ParamCollection(self.sess, params)

    def step(self, X):
        feed_dict = {
            self.o_no: X,
        }
-       pdist_na = self.sess.run(self.probs_na,feed_dict=feed_dict)
-       # acts_n = cat_sample(pdist_na)
+       pdist_na = self.sess.run(self.probs_na, feed_dict=feed_dict)
+       # pdist_na = self.f_probs(X)
+       acts_n = cat_sample(pdist_na)
        return {
-           # "action": acts_n,
+           "action": acts_n,
            "pdist": pdist_na
        }

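Note on the `step()` change above: the new `acts_n = cat_sample(pdist_na)` call depends on the `categorical` helpers whose import is still commented out at the top of the file. The original `cat_sample` is not shown in this diff; as a rough sketch only (not the project's implementation), a per-row categorical sampler compatible with that call could look like:

    import numpy as np

    def cat_sample(pdist_na, rng=np.random):
        # pdist_na: [n_batch, n_actions] array of per-row action probabilities.
        # Invert each row's CDF at a uniform draw to pick one action index per row.
        cumdist = np.cumsum(pdist_na, axis=1)
        u = rng.rand(pdist_na.shape[0], 1)
        return (u > cumdist).sum(axis=1)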
@@ -117,6 +121,7 @@ def compute_gradient(self, pdist_np, o_no, a_n, q_n):
        }
        [surr_grads] = self.sess.run([self.surr_grads], feed_dict=feed_dict)
        return np.concatenate([p.flatten() for p in surr_grads], 0)
+       # return surr_grads

    def compute_surr_kl(self, pdist_np, o_no, a_n, q_n):
        feed_dict = {
@@ -141,34 +146,32 @@ def compute_grad_lagrangian(self, lam, pdist_np, o_no, a_n, q_n):


    def compute_entropy(self, pdist_np):
+       raise NotImplementedError
        # return cat_entropy(pdist_np)
-       assert NotImplementedError

    def get_parameters_flat(self):
-       W_01 = self.sess.run(self.W_01)
-       W_12 = self.sess.run(self.W_12)
-       b_01 = self.sess.run(self.b_01)
-       b_12 = self.sess.run(self.b_12)
-       return np.concatenate([p.flatten() for p in [W_01, W_12, b_01, b_12]], 0)
+       return self.pc.get_values_flat()
+       # W_01 = self.sess.run(self.W_01)
+       # W_12 = self.sess.run(self.W_12)
+       # b_01 = self.sess.run(self.b_01)
+       # b_12 = self.sess.run(self.b_12)
+       # return np.concatenate([p.flatten() for p in [W_01, W_12, b_01, b_12]], 0)

    def set_parameters_flat(self, th):
-       self.sess.run(tf.initialize_all_variables())
-       # Get shape of parameters from the class.
-       n_in = self.n_in
-       n_hid = self.n_hid
-       n_actions = self.n_actions
-       # Grab and reshape weight matrices.
-       W_01 = th[:n_hid*n_in].reshape(n_in, n_hid)
-       W_12 = th[n_hid*n_in:n_hid*n_in + n_hid*n_actions].reshape(n_hid, n_actions)
-       # Pull the biases off the end of th.
-       b_01 = th[-n_hid - n_actions:-n_actions]
-       b_12 = th[-n_actions:]
-       # Assign the variables the values passed through th.
-       self.sess.run(tf.assign(self.W_01, W_01))
-       self.sess.run(tf.assign(self.W_12, W_12))
-       self.sess.run(tf.assign(self.b_01, b_01))
-       self.sess.run(tf.assign(self.b_12, b_12))
-
+       return self.pc.set_values_flat(th)
+       # self.sess.run(tf.initialize_all_variables())
+       # n_in = self.n_in
+       # n_hid = self.n_hid
+       # n_actions = self.n_actions
+       # W_01 = th[:n_hid*n_in].reshape(n_in, n_hid)
+       # W_12 = th[n_hid*n_in:n_hid*n_in + n_hid*n_actions].reshape(n_hid, n_actions)
+       # b_01 = th[-n_hid - n_actions:-n_actions]
+       # b_12 = th[-n_actions:]
+       # self.sess.run(tf.assign(self.W_01, W_01))
+       # self.sess.run(tf.assign(self.W_12, W_12))
+       # self.sess.run(tf.assign(self.b_01, b_01))
+       # self.sess.run(tf.assign(self.b_12, b_12))
+       # self.pc = ParamCollection(th)


def test_AtariRAMPolicy():
@@ -180,7 +183,6 @@ def test_AtariRAMPolicy():
    n_features = 128
    n_actions = 9
    lam = 1.0
-   penalty_coeff = 1.0

    # Go ahead and initialize the policy.
    policy = AtariRAMPolicy(n_actions=n_actions)
@@ -196,22 +198,16 @@ def test_AtariRAMPolicy():

    # Now for some tests.

-   n_train_paths = int(0.75 * n_batch)
+   # # Test set_n_batch()
+   # policy.set_n_batch(n_batch)
+   # print(policy.n_batch)

-   train_sli = slice(0, n_train_paths)
-   test_sli = slice(train_sli.stop, None)
-
-   poar_train, poar_test = [tuple(arr[sli] for arr in (probs_na, obs, a_n, q_n)) for sli in (train_sli, test_sli)]
-   print(len(poar_train))
-   print(type(poar_train))
    # testing get_parameters_flat()
-   theta = policy.get_parameters_flat()
+   th = policy.get_parameters_flat() + 1.
    #
    # testing set_parameters_flat()
-   policy.set_parameters_flat(theta)
+   policy.set_parameters_flat(th)

-   # testing step()
-   policy.step(obs)
    # testing compute_gradient()
    policy.compute_gradient(probs_na, obs, a_n, q_n)

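Aside: the flattened-gather indexing built in `__init__` (the `gather_nd` GPU workaround) is plain row-major indexing, so a quick NumPy sanity check along these lines could accompany the tests; the shapes below are illustrative only:

    import numpy as np

    n_batch, n_actions = 4, 9
    logprobs_na = np.random.randn(n_batch, n_actions)
    a_n = np.random.randint(n_actions, size=n_batch)

    # Element [i, a_n[i]] of a row-major matrix lives at flat index i * n_actions + a_n[i].
    idx_flattened = np.arange(n_batch) * n_actions + a_n
    assert np.allclose(logprobs_na.reshape(-1)[idx_flattened],
                       logprobs_na[np.arange(n_batch), a_n])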
@@ -223,36 +219,10 @@ def test_AtariRAMPolicy():

    # Make sure we still have the same parameters
    th_new = policy.get_parameters_flat()
-   assert not np.any(th_new - theta)
-
-   # We define a new function in test_AtariRAMPolicy() so we can hand in the
-   # data needed and leave the parameters of the model as the only input.
-   def fpen(th):
-       thprev = policy.get_parameters_flat()
-       policy.set_parameters_flat(th)
-       surr, kl = policy.compute_surr_kl(*poar_train)
-       out = penalty_coeff * kl - surr
-       policy.set_parameters_flat(thprev)
-       return out
-
-   # Quick check it works.
-   fpen(theta)
-
-   # Do the same thing for the gradient of fpen.
-   def fgradpen(th):
-       thprev = policy.get_parameters_flat()
-       policy.set_parameters_flat(th)
-       out = -policy.compute_grad_lagrangian(penalty_coeff, *poar_train)
-       policy.set_parameters_flat(thprev)
-       return out
-
-   # Testing fgradpen()
-   fgradpen(theta)
-
-   # Test out our functions in the context of lbfgs-b minimization with scipy.
-   res = opt.fmin_l_bfgs_b(fpen, theta, fprime=fgradpen, maxiter=20)
-
-   print(res)
+
+   # assert not np.any(th_new - th)
+   print(th_new - th)
+

if __name__ == "__main__":
    test_AtariRAMPolicy()
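`ParamCollection` comes from the local `my_param_collection` module, which is not part of this diff. Judging from the commented-out code it replaces in `get_parameters_flat()` and `set_parameters_flat()`, a minimal sketch of the interface used above (names and details assumed, not the actual implementation) might be:

    import numpy as np
    import tensorflow as tf

    class ParamCollection(object):
        """Flatten and restore a list of tf.Variables through a session (sketch)."""

        def __init__(self, sess, params):
            self.sess = sess
            self.params = params

        def get_values_flat(self):
            # Evaluate every variable and concatenate into one flat vector.
            values = self.sess.run(self.params)
            return np.concatenate([v.flatten() for v in values], 0)

        def set_values_flat(self, th):
            # Slice the flat vector back into each variable's shape and assign it.
            start = 0
            for p in self.params:
                shape = p.get_shape().as_list()
                size = int(np.prod(shape))
                self.sess.run(tf.assign(p, th[start:start + size].reshape(shape)))
                start += size

The per-call tf.assign mirrors the commented-out set_parameters_flat code; a real implementation would more likely cache placeholders and assign ops in __init__ rather than adding graph ops on every call.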