
Commit dae45e8

Fixed up randomness for environment and models.
In general we want the largest possible diversity between processes, to prevent learning from degenerating. Randomness is present in the environment through the random seed of the Atari emulator and the number of no-ops at the beginning of the game, and in the model through the sampling of discrete actions. This patch ensures there is a training-level random seed, which is saved to the args.txt file (even if it had to be generated). That seed is in turn used to create process-level random seeds, which seed both the environment and the model. The environment's random seed is also passed to the emulator.
1 parent: 7d6045e
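The seeding hierarchy the patch introduces can be summarized in a short sketch (a minimal, standalone illustration with made-up values; the real variable names appear in the diffs below):

    import numpy as np

    train_seed = 42                                          # training-level seed, saved to args.txt
    train_randstate = np.random.RandomState(train_seed)      # top of the hierarchy
    process_seeds = train_randstate.randint(0, 2 ** 16, 8)   # one seed per worker process

    # Each process then seeds both its emulator (ALE's 'random_seed' option)
    # and its model (the policy's sampling RandomState) from its own entry
    # in process_seeds, so the whole run is reproducible from one seed while
    # the processes stay decorrelated.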

File tree: 6 files changed, +64 −20 lines


a3c_ale.py

+29 −11

@@ -28,10 +28,10 @@
 
 class A3CFF(chainer.ChainList, a3c.A3CModel):
 
-    def __init__(self, n_actions):
+    def __init__(self, n_actions, seed):
         self.head = dqn_head.NIPSDQNHead()
         self.pi = policy.FCSoftmaxPolicy(
-            self.head.n_output_channels, n_actions)
+            self.head.n_output_channels, n_actions, seed)
         self.v = v_function.FCVFunction(self.head.n_output_channels)
         if sys.version_info < (3,0):
             super(A3CFF, self).__init__(self.head, self.pi, self.v)

@@ -46,10 +46,10 @@ def pi_and_v(self, state, keep_same_state=False):
 
 class A3CLSTM(chainer.ChainList, a3c.A3CModel):
 
-    def __init__(self, n_actions):
+    def __init__(self, n_actions, seed):
         self.head = dqn_head.NIPSDQNHead()
         self.pi = policy.FCSoftmaxPolicy(
-            self.head.n_output_channels, n_actions)
+            self.head.n_output_channels, n_actions, seed)
         self.v = v_function.FCVFunction(self.head.n_output_channels)
         self.lstm = L.LSTM(self.head.n_output_channels,
                            self.head.n_output_channels)

@@ -214,20 +214,32 @@ def main():
     parser.set_defaults(use_lstm=False)
     args = parser.parse_args()
 
-    if args.seed is not None:
-        random_seed.set_random_seed(args.seed)
+    if args.seed is None:
+        args.seed = np.random.randint(0, 2 ** 16)
+
+    # I suggest using train_randstate instead of np.random because it
+    # probably behaves better for async use.
+    train_randstate = np.random.RandomState(args.seed)
+
+    # Choose the random seeds before async execution, in order to ensure
+    # that we obtain different seeds for each process. This can be checked
+    # by making sure each emulator has a different seed, which works because
+    # each emulator is set to have the same random seed as its process (the
+    # ALE Python class); see ale.py for details.
+    process_seeds = train_randstate.randint(0, 2 ** 16, args.processes)
 
     args.outdir = prepare_output_dir(args, args.outdir)
 
     print('Output files are saved in {}'.format(args.outdir))
 
     n_actions = ale.ALE(args.rom).number_of_actions
 
-    def model_opt():
+    def model_opt(seed=args.seed):
         if args.use_lstm:
-            model = A3CLSTM(n_actions)
+            model = A3CLSTM(n_actions, seed=seed)
         else:
-            model = A3CFF(n_actions)
+            model = A3CFF(n_actions, seed=seed)
         opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-1, alpha=0.99)
         opt.setup(model)
         opt.add_hook(chainer.optimizer.GradientClipping(40))

@@ -249,9 +261,15 @@ def model_opt():
         column_names = ('steps', 'elapsed', 'mean', 'median', 'stdev')
         print('\t'.join(column_names), file=f)
 
+    # Convert np.int64 to plain Python int for JSON serialization
+    process_seeds = [int(x) for x in process_seeds]
+
     def run_func(process_idx):
-        env = ale.ALE(args.rom, use_sdl=args.use_sdl)
-        model, opt = model_opt()
+        env = ale.ALE(args.rom,
+                      seed=process_seeds[process_idx],
+                      use_sdl=args.use_sdl)
+
+        model, opt = model_opt(seed=process_seeds[process_idx])
         async.set_shared_params(model, shared_params)
         async.set_shared_states(opt, shared_states)
ale.py

+16 −3

@@ -27,9 +27,22 @@ def __init__(self, rom_filename, seed=None, use_sdl=False, n_last_screens=4,
             assert seed >= 0 and seed < 2 ** 16, \
                 "ALE's random seed must be represented by unsigned int"
         else:
-            # Use numpy's random state
+            # Warning: starting ALE without an explicit random seed can lead
+            # to all processes sharing the same initial state. Please check
+            # args.txt if you are concerned about this.
             seed = np.random.randint(0, 2 ** 16)
-        ale.setInt(b'random_seed', seed)
+
+        # Remember our (per-process) random seed
+        self.seed = seed
+
+        # Initialize a random state for this process. If we always call
+        # self.randstate instead of np.random, the process should be
+        # deterministic.
+        self.randstate = np.random.RandomState(self.seed)
+
+        # Use the random seed for the ALE emulator too
+        ale.setInt(b'random_seed', self.seed)
+
         ale.setFloat(b'repeat_action_probability', 0.0)
         ale.setBool(b'color_averaging', False)
         if record_screen_dir is not None:

@@ -142,7 +155,7 @@ def initialize(self):
         self.ale.reset_game()
 
         if self.max_start_nullops > 0:
-            n_nullops = np.random.randint(0, self.max_start_nullops + 1)
+            n_nullops = self.randstate.randint(0, self.max_start_nullops + 1)
             for _ in range(n_nullops):
                 self.ale.act(0)
async.py

+1 −0

@@ -75,6 +75,7 @@ def run_async(n_process, run_func):
 
     processes = []
 
+    # It is not clear to me that this does what it should. --max
     def set_seed_and_run(process_idx, run_func):
         random_seed.set_random_seed(np.random.randint(0, 2 ** 32))
         run_func(process_idx)
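The doubt in that comment is well founded for fork-based multiprocessing: every child inherits a copy of the parent's global np.random state, so an unseeded draw in each child can return the same value and all processes end up seeded identically. A small sketch of the failure mode (illustrative only; run_async's actual process setup may differ):

    import multiprocessing as mp
    import numpy as np

    def report(idx):
        # With the fork start method, every child inherits the same global
        # np.random state, so this prints the same 'random' number in each.
        print(idx, np.random.randint(0, 2 ** 32))

    if __name__ == '__main__':
        ps = [mp.Process(target=report, args=(i,)) for i in range(4)]
        for p in ps:
            p.start()
        for p in ps:
            p.join()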

policy.py

+13 −2

@@ -4,6 +4,7 @@
 import chainer
 from chainer import functions as F
 from chainer import links as L
+import numpy as np
 
 import policy_output

@@ -26,19 +27,29 @@ def compute_logits(self, state):
         raise NotImplementedError
 
     def __call__(self, state):
-        return policy_output.SoftmaxPolicyOutput(self.compute_logits(state))
+        # The SoftmaxPolicyOutput is not persistent, so it cannot hold its
+        # own random state; rely instead on the policy's randstate, passed
+        # as a reference.
+        return policy_output.SoftmaxPolicyOutput(
+            self.compute_logits(state),
+            self.policy_randstate)
 
 
 class FCSoftmaxPolicy(chainer.ChainList, SoftmaxPolicy):
     """Softmax policy that consists of FC layers and rectifiers"""
 
-    def __init__(self, n_input_channels, n_actions,
+    def __init__(self, n_input_channels, n_actions, seed,
                  n_hidden_layers=0, n_hidden_channels=None):
         self.n_input_channels = n_input_channels
         self.n_actions = n_actions
         self.n_hidden_layers = n_hidden_layers
         self.n_hidden_channels = n_hidden_channels
 
+        # Keep a per-policy randstate; this should provide diversity in the
+        # face of similar environments.
+        self.model_seed = seed
+        self.policy_randstate = np.random.RandomState(seed)
+
         layers = []
         if n_hidden_layers > 0:
             layers.append(L.Linear(n_input_channels, n_hidden_channels))

policy_output.py

+4 −3

@@ -9,7 +9,7 @@ class PolicyOutput(object):
     pass
 
 
-def _sample_discrete_actions(batch_probs):
+def _sample_discrete_actions(batch_probs, randstate):
     """Sample a batch of actions from a batch of action probabilities.
 
     Args:

@@ -31,8 +31,9 @@ def _sample_discrete_actions(batch_probs):
 
 class SoftmaxPolicyOutput(PolicyOutput):
 
-    def __init__(self, logits):
+    def __init__(self, logits, randstate):
         self.logits = logits
+        self.policy_output_randstate = randstate
 
     @cached_property
     def most_probable_actions(self):

@@ -48,7 +49,7 @@ def log_probs(self):
 
     @cached_property
     def action_indices(self):
-        return _sample_discrete_actions(self.probs.data)
+        return _sample_discrete_actions(self.probs.data, self.policy_output_randstate)
 
     @cached_property
     def sampled_actions_log_probs(self):
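The body of _sample_discrete_actions is not shown in this diff; a plausible shape of the function after the patch, using the passed-in randstate instead of the global np.random (an assumption, not the repository's exact code):

    def _sample_discrete_actions(batch_probs, randstate):
        """Sample a batch of action indices from a batch of probabilities."""
        action_indices = []
        for probs in batch_probs:
            # Draw from this process's own stream so that action sampling
            # stays reproducible per process.
            action_indices.append(randstate.choice(len(probs), p=probs))
        return action_indices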

prepare_output_dir.py

+1 −1

@@ -36,7 +36,7 @@ def prepare_output_dir(args, user_specified_dir=None):
 
     # Save all the arguments
     with open(os.path.join(outdir, 'args.txt'), 'w') as f:
-        f.write(json.dumps(vars(args)))
+        f.write(json.dumps(vars(args)) + "\n\n")
 
     # Save `git status`
     with open(os.path.join(outdir, 'git-status.txt'), 'w') as f:
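Since the (possibly generated) seed now lands in args.txt, a finished run can be reproduced by reading it back; a hedged sketch (the path is illustrative, it is whatever prepare_output_dir chose):

    import json

    with open('results/args.txt') as f:  # path is illustrative
        saved_args = json.loads(f.readline())
    print('reproduce with --seed', saved_args['seed'])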
