Commit a4e7140

Author: Deric Pang
Curriculum learning reward thresholding bug fix (Unity-Technologies#1141)
1 parent dc6b78b commit a4e7140

File tree

14 files changed: +294, -220 lines


UnitySDK/Assets/ML-Agents/Examples/WallJump/Scenes/WallJump.unity

Lines changed: 96 additions & 142 deletions
Large diffs are not rendered by default.

config/curricula/push-block/PushBlockBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "reward",
     "thresholds" : [0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
-    "min_lesson_length" : 2,
+    "min_lesson_length" : 100,
     "signal_smoothing" : true,
     "parameters" :
     {

config/curricula/test/TestBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "reward",
     "thresholds" : [10, 20, 50],
-    "min_lesson_length" : 3,
+    "min_lesson_length" : 100,
     "signal_smoothing" : true,
     "parameters" :
     {

config/curricula/wall-jump/BigWallBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "progress",
     "thresholds" : [0.1, 0.3, 0.5],
-    "min_lesson_length" : 2,
+    "min_lesson_length": 100,
     "signal_smoothing" : true,
     "parameters" :
     {

config/curricula/wall-jump/SmallWallBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "progress",
     "thresholds" : [0.1, 0.3, 0.5],
-    "min_lesson_length" : 2,
+    "min_lesson_length": 100,
     "signal_smoothing" : true,
     "parameters" :
     {

docs/Migrating.md

Lines changed: 10 additions & 4 deletions
@@ -41,10 +41,16 @@
 [trainer_config.yaml](../config/trainer_config.yaml). An example of passing
 a trainer configuration to `mlagents-learn` is shown above.
 * The environment name is now passed through the `--env` option.
-* Curriculum files must now be placed into a folder and be named appropriately.
-  Refer to the
-  [Curriculum training documentation](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-Curriculum-Learning.md)
-  for more information.
+* Curriculum learning has been changed. Refer to the
+  [curriculum learning documentation](Training-Curriculum-Learning.md)
+  for detailed information. In summary:
+  * Curriculum files for the same environment must now be placed into a folder.
+    Each curriculum file should be named after the brain whose curriculum it
+    specifies.
+  * `min_lesson_length` now specifies the minimum number of episodes in a lesson
+    and affects reward thresholding.
+  * It is no longer necessary to specify the `Max Steps` of the Academy to use
+    curriculum learning.
 
 ## Migrating from ML-Agents toolkit v0.3 to v0.4
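
For reference, the curriculum folder layout described in this migration note matches the config files touched by this commit: one folder per environment, one JSON file per brain, each file named after the brain it configures.

    config/curricula/
        push-block/
            PushBlockBrain.json     # curriculum for the brain named "PushBlockBrain"
        wall-jump/
            BigWallBrain.json
            SmallWallBrain.json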

docs/Training-Curriculum-Learning.md

Lines changed: 14 additions & 5 deletions
@@ -59,7 +59,7 @@ the BigWallBrain in the Wall Jump environment.
 {
     "measure" : "progress",
     "thresholds" : [0.1, 0.3, 0.5],
-    "min_lesson_length" : 2,
+    "min_lesson_length" : 100,
     "signal_smoothing" : true,
     "parameters" :
     {
@@ -74,8 +74,18 @@ the BigWallBrain in the Wall Jump environment.
   * `progress` - Uses ratio of steps/max_steps.
 * `thresholds` (float array) - Points in value of `measure` where lesson should
   be increased.
-* `min_lesson_length` (int) - How many times the progress measure should be
-  reported before incrementing the lesson.
+* `min_lesson_length` (int) - The minimum number of episodes that should be
+  completed before the lesson can change. If `measure` is set to `reward`, the
+  average cumulative reward of the last `min_lesson_length` episodes will be
+  used to determine if the lesson should change. Must be nonnegative.
+
+  __Important__: the average reward that is compared to the thresholds is
+  different than the mean reward that is logged to the console. For example,
+  if `min_lesson_length` is `100`, the lesson will increment after the average
+  cumulative reward of the last `100` episodes exceeds the current threshold.
+  The mean reward logged to the console is dictated by the `summary_freq`
+  parameter in the
+  [trainer configuration file](Training-ML-Agents.md#training-config-file).
 * `signal_smoothing` (true/false) - Whether to weight the current progress
   measure by previous values.
   * If `true`, weighting will be 0.75 (new) 0.25 (old).
@@ -107,5 +117,4 @@ agents in the Wall Jump environment with curriculum learning, we can run
 mlagents-learn config/trainer_config.yaml --curriculum=curricula/wall-jump/ --run-id=wall-jump-curriculum --train
 ```
 
-We can then keep track of the current
-lessons and progresses via TensorBoard.
+We can then keep track of the current lessons and progresses via TensorBoard.
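
To make the reward thresholding above concrete, here is a minimal sketch (not the toolkit's own code; all names are illustrative, and signal smoothing is omitted) of a reward-based lesson check: the mean cumulative reward of the last `min_lesson_length` completed episodes is compared against the current lesson's threshold.

    from collections import deque

    min_lesson_length = 100
    thresholds = [0.75, 0.75, 0.75]       # one entry per lesson transition
    lesson_num = 0

    # Cumulative reward of the most recently completed episodes.
    reward_buffer = deque(maxlen=min_lesson_length)

    def maybe_increment_lesson():
        """Advance the lesson once min_lesson_length episodes have finished
        and their average cumulative reward clears the current threshold."""
        global lesson_num
        if len(reward_buffer) < min_lesson_length:
            return False                  # not enough episodes completed yet
        mean_reward = sum(reward_buffer) / len(reward_buffer)
        if lesson_num < len(thresholds) and mean_reward > thresholds[lesson_num]:
            lesson_num += 1
            return True
        return False

    # Example: record a finished episode's cumulative reward, then try to advance.
    reward_buffer.append(0.9)
    maybe_increment_lesson()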

ml-agents/mlagents/trainers/curriculum.py

Lines changed: 26 additions & 19 deletions
@@ -1,5 +1,6 @@
 import os
 import json
+import math
 
 from .exception import CurriculumError
 
@@ -13,9 +14,9 @@ def __init__(self, location, default_reset_parameters):
         """
         Initializes a Curriculum object.
         :param location: Path to JSON defining curriculum.
-        :param default_reset_parameters: Set of reset parameters for environment.
+        :param default_reset_parameters: Set of reset parameters for
+        environment.
         """
-        self.lesson_length = 0
         self.max_lesson_num = 0
         self.measure = None
         self._lesson_num = 0
@@ -30,15 +31,18 @@ def __init__(self, location, default_reset_parameters):
             raise CurriculumError(
                 'The file {0} could not be found.'.format(location))
         except UnicodeDecodeError:
-            raise CurriculumError('There was an error decoding {}'.format(location))
+            raise CurriculumError('There was an error decoding {}'
+                                  .format(location))
         self.smoothing_value = 0
         for key in ['parameters', 'measure', 'thresholds',
                     'min_lesson_length', 'signal_smoothing']:
             if key not in self.data:
                 raise CurriculumError("{0} does not contain a "
-                                      "{1} field.".format(location, key))
+                                      "{1} field."
+                                      .format(location, key))
         self.smoothing_value = 0
         self.measure = self.data['measure']
+        self.min_lesson_length = self.data['min_lesson_length']
         self.max_lesson_num = len(self.data['thresholds'])
 
         parameters = self.data['parameters']
@@ -51,32 +55,31 @@ def __init__(self, location, default_reset_parameters):
                 raise CurriculumError(
                     'The parameter {0} in Curriculum {1} must have {2} values '
                     'but {3} were found'.format(key, location,
-                                                self.max_lesson_num + 1, len(parameters[key])))
+                                                self.max_lesson_num + 1,
+                                                len(parameters[key])))
 
     @property
     def lesson_num(self):
         return self._lesson_num
 
     @lesson_num.setter
     def lesson_num(self, lesson_num):
-        self.lesson_length = 0
         self._lesson_num = max(0, min(lesson_num, self.max_lesson_num))
 
-    def increment_lesson(self, progress):
+    def increment_lesson(self, measure_val):
         """
         Increments the lesson number depending on the progress given.
-        :param progress: Measure of progress (either reward or percentage steps completed).
+        :param measure_val: Measure of progress (either reward or percentage
+        steps completed).
+        :return Whether the lesson was incremented.
         """
-        if self.data is None or progress is None:
-            return
+        if not self.data or not measure_val or math.isnan(measure_val):
+            return False
         if self.data['signal_smoothing']:
-            progress = self.smoothing_value * 0.25 + 0.75 * progress
-            self.smoothing_value = progress
-        self.lesson_length += 1
+            measure_val = self.smoothing_value * 0.25 + 0.75 * measure_val
+            self.smoothing_value = measure_val
         if self.lesson_num < self.max_lesson_num:
-            if ((progress > self.data['thresholds'][self.lesson_num]) and
-                    (self.lesson_length > self.data['min_lesson_length'])):
-                self.lesson_length = 0
+            if measure_val > self.data['thresholds'][self.lesson_num]:
                 self.lesson_num += 1
                 config = {}
                 parameters = self.data['parameters']
@@ -85,15 +88,19 @@ def increment_lesson(self, progress):
                 logger.info('{0} lesson changed. Now in lesson {1}: {2}'
                             .format(self._brain_name,
                                     self.lesson_num,
-                                    ', '.join([str(x) + ' -> ' + str(config[x]) for x in config])))
+                                    ', '.join([str(x) + ' -> ' + str(config[x])
+                                               for x in config])))
+                return True
+        return False
 
     def get_config(self, lesson=None):
         """
         Returns reset parameters which correspond to the lesson.
-        :param lesson: The lesson you want to get the config of. If None, the current lesson is returned.
+        :param lesson: The lesson you want to get the config of. If None, the
+        current lesson is returned.
         :return: The configuration of the reset parameters.
         """
-        if self.data is None:
+        if not self.data:
             return {}
         if lesson is None:
             lesson = self.lesson_num
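
A short usage sketch of the reworked Curriculum class (the JSON path comes from this commit; the reset-parameter names are illustrative assumptions, and the file must exist on disk for this to run). Note that increment_lesson now returns whether the lesson changed and no longer tracks lesson length itself; that bookkeeping moves to MetaCurriculum via min_lesson_length.

    from mlagents.trainers.curriculum import Curriculum

    # Illustrative default reset parameters; in practice these come from the
    # environment and must cover the parameters listed in the curriculum JSON.
    default_reset_params = {'big_wall_min_height': 0.0, 'big_wall_max_height': 4.0}

    curriculum = Curriculum('config/curricula/wall-jump/BigWallBrain.json',
                            default_reset_params)

    # The caller decides *when* to attempt an increment (based on
    # min_lesson_length); increment_lesson itself only checks the threshold.
    changed = curriculum.increment_lesson(0.6)
    if changed:
        print(curriculum.get_config())  # reset parameters for the new lesson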

ml-agents/mlagents/trainers/meta_curriculum.py

Lines changed: 53 additions & 11 deletions
@@ -10,8 +10,8 @@
 
 
 class MetaCurriculum(object):
-    """A MetaCurriculum holds curriculums. Each curriculum is associated to a particular
-    brain in the environment.
+    """A MetaCurriculum holds curriculums. Each curriculum is associated to a
+    particular brain in the environment.
     """
 
     def __init__(self, curriculum_folder, default_reset_parameters):
@@ -33,10 +33,12 @@ def __init__(self, curriculum_folder, default_reset_parameters):
             brain_name = curriculum_filename.split('.')[0]
             curriculum_filepath = \
                 os.path.join(curriculum_folder, curriculum_filename)
-            curriculum = Curriculum(curriculum_filepath, default_reset_parameters)
+            curriculum = Curriculum(curriculum_filepath,
+                                    default_reset_parameters)
 
             # Check if any two curriculums use the same reset params.
-            if any([(parameter in curriculum.get_config().keys()) for parameter in used_reset_parameters]):
+            if any([(parameter in curriculum.get_config().keys())
+                    for parameter in used_reset_parameters]):
                 logger.warning('Two or more curriculums will '
                                'attempt to change the same reset '
                                'parameter. The result will be '
@@ -69,18 +71,57 @@ def lesson_nums(self, lesson_nums):
         for brain_name, lesson in lesson_nums.items():
             self.brains_to_curriculums[brain_name].lesson_num = lesson
 
-    def increment_lessons(self, progresses):
-        """Increments all the lessons of all the curriculums in this MetaCurriculum.
+    def _lesson_ready_to_increment(self, brain_name, reward_buff_size):
+        """Determines whether the curriculum of a specified brain is ready
+        to attempt an increment.
 
         Args:
-            progresses (dict): A dict of brain name to progress.
+            brain_name (str): The name of the brain whose curriculum will be
+                checked for readiness.
+            reward_buff_size (int): The size of the reward buffer of the trainer
+                that corresponds to the specified brain.
+
+        Returns:
+            Whether the curriculum of the specified brain should attempt to
+            increment its lesson.
+        """
+        return reward_buff_size >= (self.brains_to_curriculums[brain_name]
+                                    .min_lesson_length)
+
+    def increment_lessons(self, measure_vals, reward_buff_sizes=None):
+        """Attempts to increments all the lessons of all the curriculums in this
+        MetaCurriculum. Note that calling this method does not guarantee the
+        lesson of a curriculum will increment. The lesson of a curriculum will
+        only increment if the specified measure threshold defined in the
+        curriculum has been reached and the minimum number of episodes in the
+        lesson have been completed.
+
+        Args:
+            measure_vals (dict): A dict of brain name to measure value.
+            reward_buff_sizes (dict): A dict of brain names to the size of their
+                corresponding reward buffers.
+
+        Returns:
+            A dict from brain name to whether that brain's lesson number was
+            incremented.
         """
-        for brain_name, progress in progresses.items():
-            self.brains_to_curriculums[brain_name].increment_lesson(progress)
+        ret = {}
+        if reward_buff_sizes:
+            for brain_name, buff_size in reward_buff_sizes.items():
+                if self._lesson_ready_to_increment(brain_name, buff_size):
+                    measure_val = measure_vals[brain_name]
+                    ret[brain_name] = (self.brains_to_curriculums[brain_name]
+                                       .increment_lesson(measure_val))
+        else:
+            for brain_name, measure_val in measure_vals.items():
+                ret[brain_name] = (self.brains_to_curriculums[brain_name]
+                                   .increment_lesson(measure_val))
+        return ret
 
 
     def set_all_curriculums_to_lesson_num(self, lesson_num):
-        """Sets all the curriculums in this meta curriculum to a specified lesson number.
+        """Sets all the curriculums in this meta curriculum to a specified
+        lesson number.
 
         Args:
             lesson_num (int): The lesson number which all the curriculums will
@@ -91,7 +132,8 @@ def set_all_curriculums_to_lesson_num(self, lesson_num):
 
 
     def get_config(self):
-        """Get the combined configuration of all curriculums in this MetaCurriculum.
+        """Get the combined configuration of all curriculums in this
+        MetaCurriculum.
 
         Returns:
             A dict from parameter to value.
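
Similarly, a sketch of calling the new increment_lessons signature (brain names, measure values, and buffer sizes are made up for illustration; env_reset_params stands in for the environment's actual default reset parameters):

    from mlagents.trainers.meta_curriculum import MetaCurriculum

    # Placeholder for the environment's default reset parameters.
    env_reset_params = {'big_wall_min_height': 0.0, 'big_wall_max_height': 4.0,
                        'small_wall_height': 1.5}
    meta_curriculum = MetaCurriculum('config/curricula/wall-jump/',
                                     env_reset_params)

    measure_vals = {'BigWallBrain': 0.82, 'SmallWallBrain': 0.41}
    reward_buff_sizes = {'BigWallBrain': 120, 'SmallWallBrain': 37}

    # Only brains whose reward buffer already holds at least min_lesson_length
    # episodes attempt an increment; the others are skipped on this call.
    incremented = meta_curriculum.increment_lessons(
        measure_vals, reward_buff_sizes=reward_buff_sizes)
    print(incremented)  # e.g. {'BigWallBrain': True}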

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 14 additions & 1 deletion
@@ -4,6 +4,7 @@
 
 import logging
 import os
+from collections import deque
 
 import numpy as np
 import tensorflow as tf
@@ -19,7 +20,7 @@
 class PPOTrainer(Trainer):
     """The PPOTrainer is an implementation of the PPO algorithm."""
 
-    def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
+    def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id):
         """
         Responsible for collecting experiences and training PPO model.
         :param sess: Tensorflow session.
@@ -57,6 +58,7 @@ def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
 
         self.training_buffer = Buffer()
         self.cumulative_rewards = {}
+        self._reward_buffer = deque(maxlen=reward_buff_cap)
        self.episode_steps = {}
         self.summary_path = trainer_parameters['summary_path']
         if not os.path.exists(self.summary_path):
@@ -91,6 +93,16 @@ def get_step(self):
         """
         return self.step
 
+    @property
+    def reward_buffer(self):
+        """
+        Returns the reward buffer. The reward buffer contains the cumulative
+        rewards of the most recent episodes completed by agents using this
+        trainer.
+        :return: the reward buffer.
+        """
+        return self._reward_buffer
+
     def increment_step_and_update_last_reward(self):
         """
         Increment the step count of the trainer and Updates the last reward
@@ -281,6 +293,7 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo):
                 if info.local_done[l]:
                     self.stats['cumulative_reward'].append(
                         self.cumulative_rewards.get(agent_id, 0))
+                    self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
                     self.stats['episode_length'].append(
                         self.episode_steps.get(agent_id, 0))
                     self.cumulative_rewards[agent_id] = 0
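
Finally, a rough sketch of how a caller (for example the trainer controller, which is not part of this diff) could turn the new reward_buffer into the arguments MetaCurriculum.increment_lessons expects when the curriculum measure is reward. The trainers dict is an assumption for illustration.

    import numpy as np

    def curriculum_measures(trainers):
        """Build the measure values and reward-buffer sizes that
        MetaCurriculum.increment_lessons expects, assuming a 'reward' measure.
        trainers: dict of brain name -> PPOTrainer (assumed caller state)."""
        measure_vals = {}
        reward_buff_sizes = {}
        for brain_name, trainer in trainers.items():
            buff = trainer.reward_buffer
            # Mean cumulative reward over the most recent episodes; None when
            # no episode has finished yet (increment_lesson skips None).
            measure_vals[brain_name] = float(np.mean(buff)) if len(buff) > 0 else None
            reward_buff_sizes[brain_name] = len(buff)
        return measure_vals, reward_buff_sizes

    # measure_vals, buff_sizes = curriculum_measures(trainers)
    # meta_curriculum.increment_lessons(measure_vals, reward_buff_sizes=buff_sizes)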
