Commit a4e7140

Author: Deric Pang
Curriculum learning reward thresholding bug fix (Unity-Technologies#1141)
1 parent dc6b78b commit a4e7140

File tree

14 files changed: +294, -220 lines


UnitySDK/Assets/ML-Agents/Examples/WallJump/Scenes/WallJump.unity

Lines changed: 96 additions & 142 deletions
Large diffs are not rendered by default.

config/curricula/push-block/PushBlockBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "reward",
     "thresholds" : [0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
-    "min_lesson_length" : 2,
+    "min_lesson_length" : 100,
     "signal_smoothing" : true,
     "parameters" :
     {

config/curricula/test/TestBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "reward",
     "thresholds" : [10, 20, 50],
-    "min_lesson_length" : 3,
+    "min_lesson_length" : 100,
     "signal_smoothing" : true,
     "parameters" :
     {

config/curricula/wall-jump/BigWallBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "progress",
     "thresholds" : [0.1, 0.3, 0.5],
-    "min_lesson_length" : 2,
+    "min_lesson_length": 100,
     "signal_smoothing" : true,
     "parameters" :
     {

config/curricula/wall-jump/SmallWallBrain.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "measure" : "progress",
     "thresholds" : [0.1, 0.3, 0.5],
-    "min_lesson_length" : 2,
+    "min_lesson_length": 100,
     "signal_smoothing" : true,
     "parameters" :
     {

docs/Migrating.md

Lines changed: 10 additions & 4 deletions
@@ -41,10 +41,16 @@
 [trainer_config.yaml](../config/trainer_config.yaml). An example of passing
 a trainer configuration to `mlagents-learn` is shown above.
 * The environment name is now passed through the `--env` option.
-* Curriculum files must now be placed into a folder and be named appropriately.
-  Refer to the
-  [Curriculum training documentation](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Training-Curriculum-Learning.md)
-  for more information.
+* Curriculum learning has been changed. Refer to the
+  [curriculum learning documentation](Training-Curriculum-Learning.md)
+  for detailed information. In summary:
+  * Curriculum files for the same environment must now be placed into a folder.
+    Each curriculum file should be named after the brain whose curriculum it
+    specifies.
+  * `min_lesson_length` now specifies the minimum number of episodes in a lesson
+    and affects reward thresholding.
+  * It is no longer necessary to specify the `Max Steps` of the Academy to use
+    curriculum learning.
 
 ## Migrating from ML-Agents toolkit v0.3 to v0.4
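
For reference, the curriculum folder layout described in this migration note matches the config files touched by this commit: one folder per environment, one JSON file per brain, each file named after the brain it configures.

    config/curricula/
        push-block/
            PushBlockBrain.json     # curriculum for the brain named "PushBlockBrain"
        wall-jump/
            BigWallBrain.json
            SmallWallBrain.json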

docs/Training-Curriculum-Learning.md

Lines changed: 14 additions & 5 deletions
@@ -59,7 +59,7 @@ the BigWallBrain in the Wall Jump environment.
 {
     "measure" : "progress",
     "thresholds" : [0.1, 0.3, 0.5],
-    "min_lesson_length" : 2,
+    "min_lesson_length" : 100,
     "signal_smoothing" : true,
     "parameters" :
     {
@@ -74,8 +74,18 @@ the BigWallBrain in the Wall Jump environment.
   * `progress` - Uses ratio of steps/max_steps.
 * `thresholds` (float array) - Points in value of `measure` where lesson should
   be increased.
-* `min_lesson_length` (int) - How many times the progress measure should be
-  reported before incrementing the lesson.
+* `min_lesson_length` (int) - The minimum number of episodes that should be
+  completed before the lesson can change. If `measure` is set to `reward`, the
+  average cumulative reward of the last `min_lesson_length` episodes will be
+  used to determine if the lesson should change. Must be nonnegative.
+
+  __Important__: the average reward that is compared to the thresholds is
+  different than the mean reward that is logged to the console. For example,
+  if `min_lesson_length` is `100`, the lesson will increment after the average
+  cumulative reward of the last `100` episodes exceeds the current threshold.
+  The mean reward logged to the console is dictated by the `summary_freq`
+  parameter in the
+  [trainer configuration file](Training-ML-Agents.md#training-config-file).
 * `signal_smoothing` (true/false) - Whether to weight the current progress
   measure by previous values.
   * If `true`, weighting will be 0.75 (new) 0.25 (old).
@@ -107,5 +117,4 @@ agents in the Wall Jump environment with curriculum learning, we can run
 mlagents-learn config/trainer_config.yaml --curriculum=curricula/wall-jump/ --run-id=wall-jump-curriculum --train
 ```
 
-We can then keep track of the current
-lessons and progresses via TensorBoard.
+We can then keep track of the current lessons and progresses via TensorBoard.
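
To make the reward thresholding above concrete, here is a minimal sketch (not the toolkit's own code; all names are illustrative, and signal smoothing is omitted) of a reward-based lesson check: the mean cumulative reward of the last `min_lesson_length` completed episodes is compared against the current lesson's threshold.

    from collections import deque

    min_lesson_length = 100
    thresholds = [0.75, 0.75, 0.75]       # one entry per lesson transition
    lesson_num = 0

    # Cumulative reward of the most recently completed episodes.
    reward_buffer = deque(maxlen=min_lesson_length)

    def maybe_increment_lesson():
        """Advance the lesson once min_lesson_length episodes have finished
        and their average cumulative reward clears the current threshold."""
        global lesson_num
        if len(reward_buffer) < min_lesson_length:
            return False                  # not enough episodes completed yet
        mean_reward = sum(reward_buffer) / len(reward_buffer)
        if lesson_num < len(thresholds) and mean_reward > thresholds[lesson_num]:
            lesson_num += 1
            return True
        return False

    # Example: record a finished episode's cumulative reward, then try to advance.
    reward_buffer.append(0.9)
    maybe_increment_lesson()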

ml-agents/mlagents/trainers/curriculum.py

Lines changed: 26 additions & 19 deletions
@@ -1,5 +1,6 @@
 import os
 import json
+import math
 
 from .exception import CurriculumError
 
@@ -13,9 +14,9 @@ def __init__(self, location, default_reset_parameters):
         """
         Initializes a Curriculum object.
         :param location: Path to JSON defining curriculum.
-        :param default_reset_parameters: Set of reset parameters for environment.
+        :param default_reset_parameters: Set of reset parameters for
+        environment.
         """
-        self.lesson_length = 0
         self.max_lesson_num = 0
         self.measure = None
         self._lesson_num = 0
@@ -30,15 +31,18 @@ def __init__(self, location, default_reset_parameters):
             raise CurriculumError(
                 'The file {0} could not be found.'.format(location))
         except UnicodeDecodeError:
-            raise CurriculumError('There was an error decoding {}'.format(location))
+            raise CurriculumError('There was an error decoding {}'
+                                  .format(location))
         self.smoothing_value = 0
         for key in ['parameters', 'measure', 'thresholds',
                     'min_lesson_length', 'signal_smoothing']:
             if key not in self.data:
                 raise CurriculumError("{0} does not contain a "
-                                      "{1} field.".format(location, key))
+                                      "{1} field."
+                                      .format(location, key))
         self.smoothing_value = 0
         self.measure = self.data['measure']
+        self.min_lesson_length = self.data['min_lesson_length']
         self.max_lesson_num = len(self.data['thresholds'])
 
         parameters = self.data['parameters']
@@ -51,32 +55,31 @@ def __init__(self, location, default_reset_parameters):
                 raise CurriculumError(
                     'The parameter {0} in Curriculum {1} must have {2} values '
                     'but {3} were found'.format(key, location,
-                                                self.max_lesson_num + 1, len(parameters[key])))
+                                                self.max_lesson_num + 1,
+                                                len(parameters[key])))
 
     @property
     def lesson_num(self):
         return self._lesson_num
 
     @lesson_num.setter
     def lesson_num(self, lesson_num):
-        self.lesson_length = 0
         self._lesson_num = max(0, min(lesson_num, self.max_lesson_num))
 
-    def increment_lesson(self, progress):
+    def increment_lesson(self, measure_val):
         """
         Increments the lesson number depending on the progress given.
-        :param progress: Measure of progress (either reward or percentage steps completed).
+        :param measure_val: Measure of progress (either reward or percentage
+        steps completed).
+        :return Whether the lesson was incremented.
         """
-        if self.data is None or progress is None:
-            return
+        if not self.data or not measure_val or math.isnan(measure_val):
+            return False
         if self.data['signal_smoothing']:
-            progress = self.smoothing_value * 0.25 + 0.75 * progress
-            self.smoothing_value = progress
-        self.lesson_length += 1
+            measure_val = self.smoothing_value * 0.25 + 0.75 * measure_val
+            self.smoothing_value = measure_val
         if self.lesson_num < self.max_lesson_num:
-            if ((progress > self.data['thresholds'][self.lesson_num]) and
-                    (self.lesson_length > self.data['min_lesson_length'])):
-                self.lesson_length = 0
+            if measure_val > self.data['thresholds'][self.lesson_num]:
                 self.lesson_num += 1
                 config = {}
                 parameters = self.data['parameters']
@@ -85,15 +88,19 @@ def increment_lesson(self, progress):
                 logger.info('{0} lesson changed. Now in lesson {1}: {2}'
                             .format(self._brain_name,
                                     self.lesson_num,
-                                    ', '.join([str(x) + ' -> ' + str(config[x]) for x in config])))
+                                    ', '.join([str(x) + ' -> ' + str(config[x])
+                                               for x in config])))
+                return True
+        return False
 
     def get_config(self, lesson=None):
         """
         Returns reset parameters which correspond to the lesson.
-        :param lesson: The lesson you want to get the config of. If None, the current lesson is returned.
+        :param lesson: The lesson you want to get the config of. If None, the
+        current lesson is returned.
         :return: The configuration of the reset parameters.
         """
-        if self.data is None:
+        if not self.data:
             return {}
         if lesson is None:
             lesson = self.lesson_num
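
A short usage sketch of the reworked Curriculum class (the JSON path comes from this commit; the reset-parameter names are illustrative assumptions, and the file must exist on disk for this to run). Note that increment_lesson now returns whether the lesson changed and no longer tracks lesson length itself; that bookkeeping moves to MetaCurriculum via min_lesson_length.

    from mlagents.trainers.curriculum import Curriculum

    # Illustrative default reset parameters; in practice these come from the
    # environment and must cover the parameters listed in the curriculum JSON.
    default_reset_params = {'big_wall_min_height': 0.0, 'big_wall_max_height': 4.0}

    curriculum = Curriculum('config/curricula/wall-jump/BigWallBrain.json',
                            default_reset_params)

    # The caller decides *when* to attempt an increment (based on
    # min_lesson_length); increment_lesson itself only checks the threshold.
    changed = curriculum.increment_lesson(0.6)
    if changed:
        print(curriculum.get_config())  # reset parameters for the new lesson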

ml-agents/mlagents/trainers/meta_curriculum.py

Lines changed: 53 additions & 11 deletions
@@ -10,8 +10,8 @@
 
 
 class MetaCurriculum(object):
-    """A MetaCurriculum holds curriculums. Each curriculum is associated to a particular
-    brain in the environment.
+    """A MetaCurriculum holds curriculums. Each curriculum is associated to a
+    particular brain in the environment.
     """
 
     def __init__(self, curriculum_folder, default_reset_parameters):
@@ -33,10 +33,12 @@ def __init__(self, curriculum_folder, default_reset_parameters):
             brain_name = curriculum_filename.split('.')[0]
             curriculum_filepath = \
                 os.path.join(curriculum_folder, curriculum_filename)
-            curriculum = Curriculum(curriculum_filepath, default_reset_parameters)
+            curriculum = Curriculum(curriculum_filepath,
+                                    default_reset_parameters)
 
             # Check if any two curriculums use the same reset params.
-            if any([(parameter in curriculum.get_config().keys()) for parameter in used_reset_parameters]):
+            if any([(parameter in curriculum.get_config().keys())
+                    for parameter in used_reset_parameters]):
                 logger.warning('Two or more curriculums will '
                                'attempt to change the same reset '
                                'parameter. The result will be '
@@ -69,18 +71,57 @@ def lesson_nums(self, lesson_nums):
         for brain_name, lesson in lesson_nums.items():
             self.brains_to_curriculums[brain_name].lesson_num = lesson
 
-    def increment_lessons(self, progresses):
-        """Increments all the lessons of all the curriculums in this MetaCurriculum.
+    def _lesson_ready_to_increment(self, brain_name, reward_buff_size):
+        """Determines whether the curriculum of a specified brain is ready
+        to attempt an increment.
 
         Args:
-            progresses (dict): A dict of brain name to progress.
+            brain_name (str): The name of the brain whose curriculum will be
+                checked for readiness.
+            reward_buff_size (int): The size of the reward buffer of the trainer
+                that corresponds to the specified brain.
+
+        Returns:
+            Whether the curriculum of the specified brain should attempt to
+            increment its lesson.
+        """
+        return reward_buff_size >= (self.brains_to_curriculums[brain_name]
+                                    .min_lesson_length)
+
+    def increment_lessons(self, measure_vals, reward_buff_sizes=None):
+        """Attempts to increments all the lessons of all the curriculums in this
+        MetaCurriculum. Note that calling this method does not guarantee the
+        lesson of a curriculum will increment. The lesson of a curriculum will
+        only increment if the specified measure threshold defined in the
+        curriculum has been reached and the minimum number of episodes in the
+        lesson have been completed.
+
+        Args:
+            measure_vals (dict): A dict of brain name to measure value.
+            reward_buff_sizes (dict): A dict of brain names to the size of their
+                corresponding reward buffers.
+
+        Returns:
+            A dict from brain name to whether that brain's lesson number was
+            incremented.
         """
-        for brain_name, progress in progresses.items():
-            self.brains_to_curriculums[brain_name].increment_lesson(progress)
+        ret = {}
+        if reward_buff_sizes:
+            for brain_name, buff_size in reward_buff_sizes.items():
+                if self._lesson_ready_to_increment(brain_name, buff_size):
+                    measure_val = measure_vals[brain_name]
+                    ret[brain_name] = (self.brains_to_curriculums[brain_name]
+                                       .increment_lesson(measure_val))
+        else:
+            for brain_name, measure_val in measure_vals.items():
+                ret[brain_name] = (self.brains_to_curriculums[brain_name]
+                                   .increment_lesson(measure_val))
+        return ret
 
 
     def set_all_curriculums_to_lesson_num(self, lesson_num):
-        """Sets all the curriculums in this meta curriculum to a specified lesson number.
+        """Sets all the curriculums in this meta curriculum to a specified
+        lesson number.
 
         Args:
             lesson_num (int): The lesson number which all the curriculums will
@@ -91,7 +132,8 @@ def set_all_curriculums_to_lesson_num(self, lesson_num):
 
 
     def get_config(self):
-        """Get the combined configuration of all curriculums in this MetaCurriculum.
+        """Get the combined configuration of all curriculums in this
+        MetaCurriculum.
 
         Returns:
             A dict from parameter to value.
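
Similarly, a sketch of calling the new increment_lessons signature (brain names, measure values, and buffer sizes are made up for illustration; env_reset_params stands in for the environment's actual default reset parameters):

    from mlagents.trainers.meta_curriculum import MetaCurriculum

    # Placeholder for the environment's default reset parameters.
    env_reset_params = {'big_wall_min_height': 0.0, 'big_wall_max_height': 4.0,
                        'small_wall_height': 1.5}
    meta_curriculum = MetaCurriculum('config/curricula/wall-jump/',
                                     env_reset_params)

    measure_vals = {'BigWallBrain': 0.82, 'SmallWallBrain': 0.41}
    reward_buff_sizes = {'BigWallBrain': 120, 'SmallWallBrain': 37}

    # Only brains whose reward buffer already holds at least min_lesson_length
    # episodes attempt an increment; the others are skipped on this call.
    incremented = meta_curriculum.increment_lessons(
        measure_vals, reward_buff_sizes=reward_buff_sizes)
    print(incremented)  # e.g. {'BigWallBrain': True}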

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 14 additions & 1 deletion
@@ -4,6 +4,7 @@
 
 import logging
 import os
+from collections import deque
 
 import numpy as np
 import tensorflow as tf
@@ -19,7 +20,7 @@
 class PPOTrainer(Trainer):
     """The PPOTrainer is an implementation of the PPO algorithm."""
 
-    def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
+    def __init__(self, sess, brain, reward_buff_cap, trainer_parameters, training, seed, run_id):
         """
         Responsible for collecting experiences and training PPO model.
         :param sess: Tensorflow session.
@@ -57,6 +58,7 @@ def __init__(self, sess, brain, trainer_parameters, training, seed, run_id):
 
         self.training_buffer = Buffer()
         self.cumulative_rewards = {}
+        self._reward_buffer = deque(maxlen=reward_buff_cap)
        self.episode_steps = {}
         self.summary_path = trainer_parameters['summary_path']
         if not os.path.exists(self.summary_path):
@@ -91,6 +93,16 @@ def get_step(self):
         """
         return self.step
 
+    @property
+    def reward_buffer(self):
+        """
+        Returns the reward buffer. The reward buffer contains the cumulative
+        rewards of the most recent episodes completed by agents using this
+        trainer.
+        :return: the reward buffer.
+        """
+        return self._reward_buffer
+
     def increment_step_and_update_last_reward(self):
         """
         Increment the step count of the trainer and Updates the last reward
@@ -281,6 +293,7 @@ def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo):
                 if info.local_done[l]:
                     self.stats['cumulative_reward'].append(
                         self.cumulative_rewards.get(agent_id, 0))
+                    self.reward_buffer.appendleft(self.cumulative_rewards.get(agent_id, 0))
                     self.stats['episode_length'].append(
                         self.episode_steps.get(agent_id, 0))
                     self.cumulative_rewards[agent_id] = 0
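
Finally, a rough sketch of how a caller (for example the trainer controller, which is not part of this diff) could turn the new reward_buffer into the arguments MetaCurriculum.increment_lessons expects when the curriculum measure is reward. The trainers dict is an assumption for illustration.

    import numpy as np

    def curriculum_measures(trainers):
        """Build the measure values and reward-buffer sizes that
        MetaCurriculum.increment_lessons expects, assuming a 'reward' measure.
        trainers: dict of brain name -> PPOTrainer (assumed caller state)."""
        measure_vals = {}
        reward_buff_sizes = {}
        for brain_name, trainer in trainers.items():
            buff = trainer.reward_buffer
            # Mean cumulative reward over the most recent episodes; None when
            # no episode has finished yet (increment_lesson skips None).
            measure_vals[brain_name] = float(np.mean(buff)) if len(buff) > 0 else None
            reward_buff_sizes[brain_name] = len(buff)
        return measure_vals, reward_buff_sizes

    # measure_vals, buff_sizes = curriculum_measures(trainers)
    # meta_curriculum.increment_lessons(measure_vals, reward_buff_sizes=buff_sizes)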
