
Commit e4966dd

Merge pull request #57 from Farama-Foundation/extend-bpd-ig
Extend IG with reward modes, make terms uniform across IG and BPD
2 parents 2be31ad + 70a15fe commit e4966dd

8 files changed: +48 -135 lines

momaland/envs/beach/beach.py

+15 -11
@@ -5,6 +5,7 @@

 import functools
 import random
+import warnings
 from typing_extensions import override

 import numpy as np
@@ -69,11 +70,11 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
     The action space is a Discrete space [0, 1, 2], corresponding to moving left, moving right, staying in place.

     ## Reward Space
-    The reward space is a 2D vector containing rewards for two different schemes ('local' or 'global') for:
+    The reward space is a 2D vector containing rewards for two different modes ('individual' or 'team') for:
     - the occupation level
     - the mixture level
-    If the scheme is 'local', the reward is given for the currently occupied section.
-    If the scheme is 'global', the reward is summed over all sections.
+    If the mode is 'individual', the reward is given for the currently occupied section.
+    If the mode is 'team', the reward is summed over all sections.

     ## Starting State
     The initial position is a uniform random distribution of agents over the sections. This can be changed via the
@@ -90,7 +91,7 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
     ## Arguments
     - 'num_timesteps (int)': number of timesteps in the domain. Default: 1
     - 'num_agents (int)': number of agents in the domain. Default: 100
-    - 'reward_scheme (str)': the reward scheme to use ('local', or 'global'). Default: local
+    - 'reward_mode (str)': the reward mode to use ('individual', or 'team'). Default: individual
     - 'sections (int)': number of beach sections in the domain. Default: 6
     - 'capacity (int)': capacity of each beach section. Default: 7
     - 'type_distribution (tuple)': the distribution of agent types in the domain. Default: 2 types equally distributed (0.3, 0.7).
@@ -104,7 +105,7 @@ def __init__(
         self,
         num_timesteps=1,
         num_agents=100,
-        reward_scheme="local",
+        reward_mode="individual",
         sections=6,
         capacity=7,
         type_distribution=(0.3, 0.7),
@@ -117,26 +118,29 @@ def __init__(
             sections: number of beach sections in the domain
             capacity: capacity of each beach section
             num_agents: number of agents in the domain
+            reward_mode: the reward mode to use ('individual', or 'team'). Default: individual
             type_distribution: the distribution of agent types in the domain. Default: 2 types equally distributed.
             position_distribution: the initial distribution of agents in the domain. Default: uniform over all sections.
             num_timesteps: number of timesteps in the domain
             render_mode: render mode
-            reward_scheme: the reward scheme to use ('local', or 'global'). Default: local
         """
         EzPickle.__init__(
             self,
             num_timesteps,
             num_agents,
-            reward_scheme,
+            reward_mode,
             sections,
             capacity,
             type_distribution,
             position_distribution,
             render_mode,
         )
-        self.reward_scheme = reward_scheme
+        if reward_mode not in ["individual", "team"]:
+            self.reward_mode = "individual"
+            warnings.warn("Invalid reward_mode. Must be either 'individual' or 'team'. Defaulting to 'individual'.")
+        else:
+            self.reward_mode = reward_mode
         self.sections = sections
-        # TODO Extend to distinct capacities per section?
         self.resource_capacities = [capacity for _ in range(sections)]
         self.num_timesteps = num_timesteps
         self.episode_num = 0
@@ -296,13 +300,13 @@ def step(self, actions):
         reward_per_section = np.zeros((self.sections, NUM_OBJECTIVES), dtype=np.float32)

         if env_termination:
-            if self.reward_scheme == "local":
+            if self.reward_mode == "individual":
                 for i in range(self.sections):
                     lr_capacity = _local_capacity_reward(self.resource_capacities[i], section_consumptions[i])
                     lr_mixture = _local_mixture_reward(section_agent_types[i])
                     reward_per_section[i] = np.array([lr_capacity, lr_mixture])

-            elif self.reward_scheme == "global":
+            elif self.reward_mode == "team":
                 g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
                 g_mixture = _global_mixture_reward(section_agent_types)
                 reward_per_section = np.array([[g_capacity, g_mixture]] * self.sections)
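For orientation, a minimal usage sketch of the renamed argument. The alias mobeach_v0 and the parallel_env keyword values mirror sa_env_factory.py further down in this commit; the import path and the PettingZoo-style reset call are assumptions, not part of the diff.

from momaland.envs.beach import mobeach_v0  # assumed import path for the beach domain

# 'reward_mode' replaces the old 'reward_scheme': "individual" corresponds to the former
# "local" scheme, "team" to the former "global" scheme.
env = mobeach_v0.parallel_env(
    num_timesteps=5,
    num_agents=10,
    reward_mode="team",  # was: reward_scheme="global"
    sections=3,
    capacity=2,
    type_distribution=(0.7, 0.3),
)
observations, infos = env.reset(seed=42)  # assuming the standard PettingZoo parallel API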

momaland/envs/item_gathering/item_gathering.py

+16 -3
@@ -6,18 +6,19 @@

 Notes:
 - In contrast to the original environment, the observation space is a 2D array of integers, i.e.,
-  the map of the environment, where each integer represents either agents (1 for the agent receiving the observation,
-  2 for the other agents) or items (3, 4, etc., depending on the number of items).
+  the map of the environment, with 0 for empty cells, negative integers for agents, positive integers for items.
 - The number of agents and items is configurable, by providing an initial map.
 - If no initial map is provided, the environment uses a default map

 Central observation:
-- If the central_observation flag is set to True, then the environment implements:
+- If the central_observation flag is set to True, then the environment includes in the implementation:
    - a central observation space: self.central_observation_space
    - a central observation function: self.state()
+The central_observation flag and the associated methods described above are used by the CentralisedAgent wrapper
 """

 import random
+import warnings
 from copy import deepcopy
 from os import path
 from typing_extensions import override
@@ -102,6 +103,7 @@ class MOItemGathering(MOParallelEnv, EzPickle):
     - 'num_timesteps': number of timesteps to run the environment for. Default: 10
     - 'initial_map': map of the environment. Default: 8x8 grid, 2 agents, 3 objectives (Källström and Heintz, 2019)
     - 'randomise': whether to randomise the map, at each episode. Default: False
+    - 'reward_mode': reward mode for the environment ('individual' or 'team'). Default: 'individual'
     - 'render_mode': render mode for the environment. Default: None
     """

@@ -118,6 +120,7 @@ def __init__(
         num_timesteps=10,
         initial_map=DEFAULT_MAP,
         randomise=False,
+        reward_mode="individual",
         render_mode=None,
     ):
         """Initializes the item gathering domain.
@@ -126,19 +129,26 @@ def __init__(
             num_timesteps: number of timesteps to run the environment for
             initial_map: map of the environment
             randomise: whether to randomise the map, at each episode
+            reward_mode: reward mode for the environment, 'individual' or 'team'. Default: 'individual'
             render_mode: render mode for the environment
         """
         EzPickle.__init__(
             self,
             num_timesteps,
             initial_map,
             randomise,
+            reward_mode,
             render_mode,
         )
         self.num_timesteps = num_timesteps
         self.current_timestep = 0
         self.render_mode = render_mode
         self.randomise = randomise
+        if reward_mode not in ["individual", "team"]:
+            self.reward_mode = "individual"
+            warnings.warn("reward_mode must be either 'individual' or 'team', defaulting to 'individual'.")
+        else:
+            self.reward_mode = reward_mode

         # check if the initial map has any entries equal to 1
         assert len(np.argwhere(initial_map == 1).flatten()) > 0, "The initial map does not contain any agents (1s)."
@@ -391,6 +401,9 @@ def step(self, actions):
                 if value_in_cell > 0:
                     rewards[self.agents[i]][self.item_dict[value_in_cell]] += 1
                     self.env_map[self.agent_positions[i][0], self.agent_positions[i][1]] = 0
+        # if reward mode is teams, sum the rewards for all agents
+        if self.reward_mode == "team":
+            rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in self.agents}

         map_obs = self.state()
         observations = {agent: (-(i + 1), map_obs) for i, agent in enumerate(self.agents)}
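The added block at the end of step() is the whole 'team' mode: each agent's objective vector is replaced by the element-wise sum over all agents, so every agent receives the same team reward. A self-contained sketch of that dict comprehension, with hypothetical agent names and reward vectors:

import numpy as np

# Hypothetical per-agent objective vectors, as accumulated earlier in step().
rewards = {
    "agent_0": np.array([1.0, 0.0, 0.0]),
    "agent_1": np.array([0.0, 2.0, 1.0]),
}
agents = list(rewards.keys())

# Same construct as the added lines: sum over agents, then broadcast back to every agent.
team_rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in agents}
# Every agent now receives the identical vector [1., 2., 1.].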

momaland/learning/iql/tabular_bpd.py

+4 -4
@@ -65,9 +65,9 @@ def normalize_objective_rewards(self, reward, reward_scheme):
             np.array: the normalized reward
         """
         # Set the normalization constants
-        if reward_scheme == "local":
+        if reward_scheme == "individual":
             cap_min, cap_max, mix_min, mix_max = self.l_cap_min, self.l_cap_max, self.l_mix_min, self.l_mix_max
-        elif reward_scheme == "global":
+        elif reward_scheme == "team":
             cap_min, cap_max, mix_min, mix_max = self.g_cap_min, self.g_cap_max, self.g_mix_min, self.g_mix_max
         else:
             raise ValueError(f"Unknown reward scheme: {reward_scheme}")
@@ -108,15 +108,15 @@ def step(self, actions):
             section_agent_types[self._state[i]][self._types[i]] += 1
         g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
         g_mixture = _global_mixture_reward(section_agent_types)
-        g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "global")
+        g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "team")
         infos = {
             agent: {"g_cap": g_capacity, "g_mix": g_mixture, "g_cap_norm": g_capacity_norm, "g_mix_norm": g_mixture_norm}
             for agent in self.possible_agents
         }

         # Normalize the rewards
         for agent in self.possible_agents:
-            rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_scheme)
+            rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_mode)

         return observations, rewards, terminations, truncations, infos
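The branch above only selects which normalization constants to use; the scaling itself lies outside the hunks shown here. As a rough illustration only, a per-objective min-max rescaling consistent with those (cap_min, cap_max, mix_min, mix_max) constants could look like the sketch below; the function name and the min-max assumption are hypothetical.

import numpy as np

def minmax_normalize(reward, cap_min, cap_max, mix_min, mix_max):
    """Hypothetical per-objective min-max scaling; the real body of
    normalize_objective_rewards is not shown in this diff."""
    capacity, mixture = reward
    return np.array(
        [
            (capacity - cap_min) / (cap_max - cap_min),
            (mixture - mix_min) / (mix_max - mix_min),
        ],
        dtype=np.float32,
    )

# Example with illustrative team-mode constants:
print(minmax_normalize(np.array([5.0, 0.5]), 0.0, 10.0, 0.0, 1.0))  # -> [0.5 0.5]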

momaland/learning/iql/train_iql_bpd.py

+6 -6
@@ -38,7 +38,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
     # Maximum local capacity is achieved when there are 'capacity' agents in the section
     max_cap_local = _local_capacity_reward(capacity, capacity)
     cap_min = 0.0
-    cap_max = max_cap_local if reward_scheme == "local" else max_cap_global
+    cap_max = max_cap_local if reward_scheme == "individual" else max_cap_global

     # Mixture
     # Maximum global mixture: one agent of each type in each section, except one where all other agents are
@@ -52,7 +52,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
     # Maximum local mixture is achieved when there is one agent of each type in the section
     max_mix_local = _local_mixture_reward([1, 1])
     mix_min = 0.0
-    mix_max = max_mix_local if reward_scheme == "local" else max_mix_global
+    mix_max = max_mix_local if reward_scheme == "individual" else max_mix_global

     return cap_min, cap_max, mix_min, mix_max

@@ -96,7 +96,7 @@ def parse_args():
     parser.add_argument('--position-distribution', type=float, nargs=5, default=[0., 0.5, 0., 0.5, 0.], )
     parser.add_argument('--sections', type=int, default=5, )
     parser.add_argument('--capacity', type=int, default=3, )
-    parser.add_argument('--reward-scheme', type=str, default="local", help="the reward scheme to use")
+    parser.add_argument('--reward-scheme', type=str, default="individual", help="the reward scheme to use")

     args = parser.parse_args()
     args.time = time.time()
@@ -114,13 +114,13 @@ def parse_args():
         "position_distribution": args.position_distribution,
         "sections": args.sections,
         "capacity": args.capacity,
-        "reward_scheme": args.reward_scheme,
+        "reward_mode": args.reward_scheme,
         # Normalization constants
         "local_constants": compute_normalization_constants(
-            args.num_agents, args.sections, args.capacity, args.type_distribution, "local"
+            args.num_agents, args.sections, args.capacity, args.type_distribution, "individual"
         ),
         "global_constants": compute_normalization_constants(
-            args.num_agents, args.sections, args.capacity, args.type_distribution, "global"
+            args.num_agents, args.sections, args.capacity, args.type_distribution, "team"
         ),
     }
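Note that the CLI flag keeps its old name (--reward-scheme) while the value it carries is now forwarded to the environment as reward_mode, and normalization constants are precomputed for both modes. A standalone sketch of the call that builds those constants, mirroring the two call sites above with illustrative values; the import path is assumed from the file location.

from momaland.learning.iql.train_iql_bpd import compute_normalization_constants  # assumed import path

# Illustrative values; the real script takes them from argparse.
local_constants = compute_normalization_constants(50, 5, 3, [0.3, 0.7], "individual")
global_constants = compute_normalization_constants(50, 5, 3, [0.3, 0.7], "team")
cap_min, cap_max, mix_min, mix_max = local_constants  # feeds normalize_objective_rewards in tabular_bpd.py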

momaland/learning/morl/random_centralised_agent_example.py

+5 -4
@@ -44,6 +44,7 @@ def train_random(moma_env):
        num_timesteps=50,
        initial_map=test_map,
        randomise=True,
+        reward_mode="individual",
        render_mode=None,
    )

@@ -54,11 +55,11 @@ def train_random(moma_env):
        type_distribution=[0.5, 0.5],
        position_distribution=[0.5, 1],
        num_timesteps=10,
-        reward_scheme="local",
+        reward_mode="individual",
    )

-    # train_random(ig_env)
-    # train_random(mobpd_env)
+    train_random(ig_env)
+    train_random(mobpd_env)

     # train_sa_random(ig_env)
-    train_sa_random(mobpd_env)
+    # train_sa_random(mobpd_env)

momaland/learning/morl/sa_env_factory.py

+2 -2
@@ -34,7 +34,7 @@ def make_single_agent_bpd_env(size="small"):
         bpd_env = mobeach_v0.parallel_env(
             num_timesteps=5,
             num_agents=10,
-            reward_scheme="global",
+            reward_mode="team",
             sections=3,
             capacity=2,
             type_distribution=(0.7, 0.3),
@@ -44,7 +44,7 @@ def make_single_agent_bpd_env(size="small"):
         bpd_env = moitem_gathering_v0.parallel_env(
             num_timesteps=1,
             num_agents=50,
-            reward_scheme="global",
+            reward_mode="team",
             sections=5,
             capacity=3,
             type_distribution=(0.7, 0.3),

momaland/learning/morl/train_bpd_GPILS.py

-58
This file was deleted.

momaland/learning/morl/train_bpd_PCN.py

-47
This file was deleted.
