
Commit e4966dd

Merge pull request #57 from Farama-Foundation/extend-bpd-ig
Extend IG with reward modes, make terms uniform across IG and BPD
2 parents 2be31ad + 70a15fe commit e4966dd

8 files changed: +48 -135 lines

momaland/envs/beach/beach.py

+15 -11
@@ -5,6 +5,7 @@

 import functools
 import random
+import warnings
 from typing_extensions import override

 import numpy as np
@@ -69,11 +70,11 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
     The action space is a Discrete space [0, 1, 2], corresponding to moving left, moving right, staying in place.

     ## Reward Space
-    The reward space is a 2D vector containing rewards for two different schemes ('local' or 'global') for:
+    The reward space is a 2D vector containing rewards for two different modes ('individual' or 'team') for:
     - the occupation level
     - the mixture level
-    If the scheme is 'local', the reward is given for the currently occupied section.
-    If the scheme is 'global', the reward is summed over all sections.
+    If the mode is 'individual', the reward is given for the currently occupied section.
+    If the mode is 'team', the reward is summed over all sections.

     ## Starting State
     The initial position is a uniform random distribution of agents over the sections. This can be changed via the
@@ -90,7 +91,7 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
     ## Arguments
     - 'num_timesteps (int)': number of timesteps in the domain. Default: 1
     - 'num_agents (int)': number of agents in the domain. Default: 100
-    - 'reward_scheme (str)': the reward scheme to use ('local', or 'global'). Default: local
+    - 'reward_mode (str)': the reward mode to use ('individual', or 'team'). Default: individual
     - 'sections (int)': number of beach sections in the domain. Default: 6
     - 'capacity (int)': capacity of each beach section. Default: 7
     - 'type_distribution (tuple)': the distribution of agent types in the domain. Default: 2 types equally distributed (0.3, 0.7).
@@ -104,7 +105,7 @@ def __init__(
         self,
         num_timesteps=1,
         num_agents=100,
-        reward_scheme="local",
+        reward_mode="individual",
         sections=6,
         capacity=7,
         type_distribution=(0.3, 0.7),
@@ -117,26 +118,29 @@ def __init__(
             sections: number of beach sections in the domain
             capacity: capacity of each beach section
             num_agents: number of agents in the domain
+            reward_mode: the reward mode to use ('individual', or 'team'). Default: individual
             type_distribution: the distribution of agent types in the domain. Default: 2 types equally distributed.
             position_distribution: the initial distribution of agents in the domain. Default: uniform over all sections.
             num_timesteps: number of timesteps in the domain
             render_mode: render mode
-            reward_scheme: the reward scheme to use ('local', or 'global'). Default: local
         """
         EzPickle.__init__(
             self,
             num_timesteps,
             num_agents,
-            reward_scheme,
+            reward_mode,
             sections,
             capacity,
             type_distribution,
             position_distribution,
             render_mode,
         )
-        self.reward_scheme = reward_scheme
+        if reward_mode not in ["individual", "team"]:
+            self.reward_mode = "individual"
+            warnings.warn("Invalid reward_mode. Must be either 'individual' or 'team'. Defaulting to 'individual'.")
+        else:
+            self.reward_mode = reward_mode
         self.sections = sections
-        # TODO Extend to distinct capacities per section?
         self.resource_capacities = [capacity for _ in range(sections)]
         self.num_timesteps = num_timesteps
         self.episode_num = 0
@@ -296,13 +300,13 @@ def step(self, actions):
         reward_per_section = np.zeros((self.sections, NUM_OBJECTIVES), dtype=np.float32)

         if env_termination:
-            if self.reward_scheme == "local":
+            if self.reward_mode == "individual":
                 for i in range(self.sections):
                     lr_capacity = _local_capacity_reward(self.resource_capacities[i], section_consumptions[i])
                     lr_mixture = _local_mixture_reward(section_agent_types[i])
                     reward_per_section[i] = np.array([lr_capacity, lr_mixture])

-            elif self.reward_scheme == "global":
+            elif self.reward_mode == "team":
                 g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
                 g_mixture = _global_mixture_reward(section_agent_types)
                 reward_per_section = np.array([[g_capacity, g_mixture]] * self.sections)
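For orientation, a minimal usage sketch of the renamed argument. The alias mobeach_v0 and the parallel_env keyword values mirror sa_env_factory.py further down in this commit; the import path and the PettingZoo-style reset call are assumptions, not part of the diff.

from momaland.envs.beach import mobeach_v0  # assumed import path for the beach domain

# 'reward_mode' replaces the old 'reward_scheme': "individual" corresponds to the former
# "local" scheme, "team" to the former "global" scheme.
env = mobeach_v0.parallel_env(
    num_timesteps=5,
    num_agents=10,
    reward_mode="team",  # was: reward_scheme="global"
    sections=3,
    capacity=2,
    type_distribution=(0.7, 0.3),
)
observations, infos = env.reset(seed=42)  # assuming the standard PettingZoo parallel API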

momaland/envs/item_gathering/item_gathering.py

+16 -3
@@ -6,18 +6,19 @@

 Notes:
 - In contrast to the original environment, the observation space is a 2D array of integers, i.e.,
-  the map of the environment, where each integer represents either agents (1 for the agent receiving the observation,
-  2 for the other agents) or items (3, 4, etc., depending on the number of items).
+  the map of the environment, with 0 for empty cells, negative integers for agents, positive integers for items.
 - The number of agents and items is configurable, by providing an initial map.
 - If no initial map is provided, the environment uses a default map

 Central observation:
-- If the central_observation flag is set to True, then the environment implements:
+- If the central_observation flag is set to True, then the environment includes in the implementation:
    - a central observation space: self.central_observation_space
    - a central observation function: self.state()
+The central_observation flag and the associated methods described above are used by the CentralisedAgent wrapper
 """

 import random
+import warnings
 from copy import deepcopy
 from os import path
 from typing_extensions import override
@@ -102,6 +103,7 @@ class MOItemGathering(MOParallelEnv, EzPickle):
     - 'num_timesteps': number of timesteps to run the environment for. Default: 10
     - 'initial_map': map of the environment. Default: 8x8 grid, 2 agents, 3 objectives (Källström and Heintz, 2019)
     - 'randomise': whether to randomise the map, at each episode. Default: False
+    - 'reward_mode': reward mode for the environment ('individual' or 'team'). Default: 'individual'
     - 'render_mode': render mode for the environment. Default: None
     """

@@ -118,6 +120,7 @@ def __init__(
         num_timesteps=10,
         initial_map=DEFAULT_MAP,
         randomise=False,
+        reward_mode="individual",
         render_mode=None,
     ):
         """Initializes the item gathering domain.
@@ -126,19 +129,26 @@ def __init__(
             num_timesteps: number of timesteps to run the environment for
             initial_map: map of the environment
             randomise: whether to randomise the map, at each episode
+            reward_mode: reward mode for the environment, 'individual' or 'team'. Default: 'individual'
             render_mode: render mode for the environment
         """
         EzPickle.__init__(
             self,
             num_timesteps,
             initial_map,
             randomise,
+            reward_mode,
             render_mode,
         )
         self.num_timesteps = num_timesteps
         self.current_timestep = 0
         self.render_mode = render_mode
         self.randomise = randomise
+        if reward_mode not in ["individual", "team"]:
+            self.reward_mode = "individual"
+            warnings.warn("reward_mode must be either 'individual' or 'team', defaulting to 'individual'.")
+        else:
+            self.reward_mode = reward_mode

         # check if the initial map has any entries equal to 1
         assert len(np.argwhere(initial_map == 1).flatten()) > 0, "The initial map does not contain any agents (1s)."
@@ -391,6 +401,9 @@ def step(self, actions):
                 if value_in_cell > 0:
                     rewards[self.agents[i]][self.item_dict[value_in_cell]] += 1
                     self.env_map[self.agent_positions[i][0], self.agent_positions[i][1]] = 0
+        # if reward mode is teams, sum the rewards for all agents
+        if self.reward_mode == "team":
+            rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in self.agents}

         map_obs = self.state()
         observations = {agent: (-(i + 1), map_obs) for i, agent in enumerate(self.agents)}
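The added block at the end of step() is the whole 'team' mode: each agent's objective vector is replaced by the element-wise sum over all agents, so every agent receives the same team reward. A self-contained sketch of that dict comprehension, with hypothetical agent names and reward vectors:

import numpy as np

# Hypothetical per-agent objective vectors, as accumulated earlier in step().
rewards = {
    "agent_0": np.array([1.0, 0.0, 0.0]),
    "agent_1": np.array([0.0, 2.0, 1.0]),
}
agents = list(rewards.keys())

# Same construct as the added lines: sum over agents, then broadcast back to every agent.
team_rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in agents}
# Every agent now receives the identical vector [1., 2., 1.].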

momaland/learning/iql/tabular_bpd.py

+4 -4
@@ -65,9 +65,9 @@ def normalize_objective_rewards(self, reward, reward_scheme):
             np.array: the normalized reward
         """
         # Set the normalization constants
-        if reward_scheme == "local":
+        if reward_scheme == "individual":
             cap_min, cap_max, mix_min, mix_max = self.l_cap_min, self.l_cap_max, self.l_mix_min, self.l_mix_max
-        elif reward_scheme == "global":
+        elif reward_scheme == "team":
             cap_min, cap_max, mix_min, mix_max = self.g_cap_min, self.g_cap_max, self.g_mix_min, self.g_mix_max
         else:
             raise ValueError(f"Unknown reward scheme: {reward_scheme}")
@@ -108,15 +108,15 @@ def step(self, actions):
             section_agent_types[self._state[i]][self._types[i]] += 1
         g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
         g_mixture = _global_mixture_reward(section_agent_types)
-        g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "global")
+        g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "team")
         infos = {
             agent: {"g_cap": g_capacity, "g_mix": g_mixture, "g_cap_norm": g_capacity_norm, "g_mix_norm": g_mixture_norm}
             for agent in self.possible_agents
         }

         # Normalize the rewards
         for agent in self.possible_agents:
-            rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_scheme)
+            rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_mode)

         return observations, rewards, terminations, truncations, infos
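The branch above only selects which normalization constants to use; the scaling itself lies outside the hunks shown here. As a rough illustration only, a per-objective min-max rescaling consistent with those (cap_min, cap_max, mix_min, mix_max) constants could look like the sketch below; the function name and the min-max assumption are hypothetical.

import numpy as np

def minmax_normalize(reward, cap_min, cap_max, mix_min, mix_max):
    """Hypothetical per-objective min-max scaling; the real body of
    normalize_objective_rewards is not shown in this diff."""
    capacity, mixture = reward
    return np.array(
        [
            (capacity - cap_min) / (cap_max - cap_min),
            (mixture - mix_min) / (mix_max - mix_min),
        ],
        dtype=np.float32,
    )

# Example with illustrative team-mode constants:
print(minmax_normalize(np.array([5.0, 0.5]), 0.0, 10.0, 0.0, 1.0))  # -> [0.5 0.5]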

momaland/learning/iql/train_iql_bpd.py

+6 -6
@@ -38,7 +38,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
     # Maximum local capacity is achieved when there are 'capacity' agents in the section
     max_cap_local = _local_capacity_reward(capacity, capacity)
     cap_min = 0.0
-    cap_max = max_cap_local if reward_scheme == "local" else max_cap_global
+    cap_max = max_cap_local if reward_scheme == "individual" else max_cap_global

     # Mixture
     # Maximum global mixture: one agent of each type in each section, except one where all other agents are
@@ -52,7 +52,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
     # Maximum local mixture is achieved when there is one agent of each type in the section
     max_mix_local = _local_mixture_reward([1, 1])
     mix_min = 0.0
-    mix_max = max_mix_local if reward_scheme == "local" else max_mix_global
+    mix_max = max_mix_local if reward_scheme == "individual" else max_mix_global

     return cap_min, cap_max, mix_min, mix_max

@@ -96,7 +96,7 @@ def parse_args():
     parser.add_argument('--position-distribution', type=float, nargs=5, default=[0., 0.5, 0., 0.5, 0.], )
     parser.add_argument('--sections', type=int, default=5, )
     parser.add_argument('--capacity', type=int, default=3, )
-    parser.add_argument('--reward-scheme', type=str, default="local", help="the reward scheme to use")
+    parser.add_argument('--reward-scheme', type=str, default="individual", help="the reward scheme to use")

     args = parser.parse_args()
     args.time = time.time()
@@ -114,13 +114,13 @@ def parse_args():
         "position_distribution": args.position_distribution,
         "sections": args.sections,
         "capacity": args.capacity,
-        "reward_scheme": args.reward_scheme,
+        "reward_mode": args.reward_scheme,
         # Normalization constants
         "local_constants": compute_normalization_constants(
-            args.num_agents, args.sections, args.capacity, args.type_distribution, "local"
+            args.num_agents, args.sections, args.capacity, args.type_distribution, "individual"
         ),
         "global_constants": compute_normalization_constants(
-            args.num_agents, args.sections, args.capacity, args.type_distribution, "global"
+            args.num_agents, args.sections, args.capacity, args.type_distribution, "team"
         ),
     }
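Note that the CLI flag keeps its old name (--reward-scheme) while the value it carries is now forwarded to the environment as reward_mode, and normalization constants are precomputed for both modes. A standalone sketch of the call that builds those constants, mirroring the two call sites above with illustrative values; the import path is assumed from the file location.

from momaland.learning.iql.train_iql_bpd import compute_normalization_constants  # assumed import path

# Illustrative values; the real script takes them from argparse.
local_constants = compute_normalization_constants(50, 5, 3, [0.3, 0.7], "individual")
global_constants = compute_normalization_constants(50, 5, 3, [0.3, 0.7], "team")
cap_min, cap_max, mix_min, mix_max = local_constants  # feeds normalize_objective_rewards in tabular_bpd.py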

momaland/learning/morl/random_centralised_agent_example.py

+5 -4
@@ -44,6 +44,7 @@ def train_random(moma_env):
        num_timesteps=50,
        initial_map=test_map,
        randomise=True,
+        reward_mode="individual",
        render_mode=None,
    )

@@ -54,11 +55,11 @@ def train_random(moma_env):
        type_distribution=[0.5, 0.5],
        position_distribution=[0.5, 1],
        num_timesteps=10,
-        reward_scheme="local",
+        reward_mode="individual",
    )

-    # train_random(ig_env)
-    # train_random(mobpd_env)
+    train_random(ig_env)
+    train_random(mobpd_env)

     # train_sa_random(ig_env)
-    train_sa_random(mobpd_env)
+    # train_sa_random(mobpd_env)

momaland/learning/morl/sa_env_factory.py

+2 -2
@@ -34,7 +34,7 @@ def make_single_agent_bpd_env(size="small"):
         bpd_env = mobeach_v0.parallel_env(
             num_timesteps=5,
             num_agents=10,
-            reward_scheme="global",
+            reward_mode="team",
             sections=3,
             capacity=2,
             type_distribution=(0.7, 0.3),
@@ -44,7 +44,7 @@ def make_single_agent_bpd_env(size="small"):
         bpd_env = moitem_gathering_v0.parallel_env(
             num_timesteps=1,
             num_agents=50,
-            reward_scheme="global",
+            reward_mode="team",
             sections=5,
             capacity=3,
             type_distribution=(0.7, 0.3),

momaland/learning/morl/train_bpd_GPILS.py

-58
This file was deleted.

momaland/learning/morl/train_bpd_PCN.py

-47
This file was deleted.
