@@ -47,7 +47,6 @@ class CrazyflowBaseEnv(VectorEnv):
     def __init__(
         self,
         *,
-        jax_random_key: int,  # required for jax random number generator
         num_envs: int = 1,  # required for VectorEnv
         max_episode_steps: int = 1000,
         return_datatype: Literal["numpy", "jax"] = "jax",
@@ -56,7 +55,6 @@ def __init__(
         """Initializes the CrazyflowEnv.

         Args:
-            jax_random_key: The random key for the jax random number generator.
             num_envs: The number of environments to run in parallel.
             max_episode_steps: The maximum number of steps per episode.
             return_datatype: The data type for returned arrays, either "numpy" or "jax". If "numpy",
@@ -66,12 +64,14 @@ def __init__(
         """
         assert num_envs == kwargs["n_worlds"], "num_envs must be equal to n_worlds"

-        self.jax_key = jax.random.key(jax_random_key)
+        # Set a random initial JAX seed; for deterministic seeding, use the reset function
+        jax_seed = int(self.np_random.random() * 2**32)
+        self.jax_key = jax.random.key(jax_seed)

         self.num_envs = num_envs
         self.return_datatype = return_datatype
         self.device = jax.devices(kwargs["device"])[0]
-        self.max_episode_steps = jnp.array(max_episode_steps, dtype=jnp.int32, device=self.device)
+        self.max_episode_steps = max_episode_steps

         self.sim = Sim(**kwargs)

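Note on the change above: the constructor no longer takes a `jax_random_key`; the initial JAX key is derived from Gymnasium's `self.np_random`, and explicit seeding moves to the reset function. A minimal, self-contained sketch of the derivation pattern (the generator here stands in for the env's `np_random`; it is not taken from the PR):

```python
import jax
import numpy as np

np_random = np.random.default_rng()  # stand-in for the env's Gymnasium RNG

# Derive a 32-bit seed from the numpy generator, then build the JAX key from it.
jax_seed = int(np_random.random() * 2**32)
jax_key = jax.random.key(jax_seed)

# Typical use: split before sampling so the stored key stays fresh.
jax_key, subkey = jax.random.split(jax_key)
sample = jax.random.uniform(subkey, (3,))
```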
@@ -83,7 +83,7 @@ def __init__(
                 "Simulation frequency should be a multiple of the control frequency. We can handle the other case, but we highly recommend changing the simulation frequency to a multiple of the control frequency."
             )

-        self.n_substeps = jnp.array(self.sim.freq // self.sim.control_freq)
+        self.n_substeps = self.sim.freq // self.sim.control_freq

         self.prev_done = jnp.zeros((self.sim.n_worlds), dtype=jnp.bool_, device=self.device)

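For intuition on the substep count: the simulation advances `freq // control_freq` physics steps per control step. A worked example with assumed frequencies (500 Hz physics, 100 Hz control):

```python
sim_freq = 500      # physics steps per second (assumed example value)
control_freq = 100  # control decisions per second (assumed example value)

assert sim_freq % control_freq == 0, "sim freq should be a multiple of control freq"
n_substeps = sim_freq // control_freq  # -> 5 physics steps per env step
```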
@@ -111,44 +111,40 @@ def __init__(

     def step(self, action: Array) -> tuple[Array, Array, Array, Array, dict]:
         assert self.action_space.contains(action), f"{action!r} ({type(action)}) invalid"
-        action = jnp.array(action, device=self.device).reshape(
-            (self.sim.n_worlds, self.sim.n_drones, -1)
-        )
-
+        action = self._sanitize_action(action, self.sim.n_worlds, self.sim.n_drones, self.device)
         action = self._rescale_action(action, self.sim.control)

-        if self.sim.control == Control.state:
-            raise NotImplementedError(
-                "Possibly you want to control state differences instead of absolute states"
-            )
-            self.sim.state_control(action)
-        elif self.sim.control == Control.attitude:
-            self.sim.attitude_control(action)
-        elif self.sim.control == Control.thrust:
-            self.sim.thrust_control(action)
-        else:
-            raise ValueError(f"Invalid control type {self.sim.control}")
+        match self.sim.control:
+            case Control.state:
+                raise NotImplementedError(
+                    "Possibly you want to control state differences instead of absolute states"
+                )
+            case Control.attitude:
+                self.sim.attitude_control(action)
+            case Control.thrust:
+                self.sim.thrust_control(action)
+            case _:
+                raise ValueError(f"Invalid control type {self.sim.control}")

         for _ in range(self.n_substeps):
             self.sim.step()
-
         # Reset all environments which terminated or were truncated in the last step
         if jnp.any(self.prev_done):
             self.reset(mask=self.prev_done)

-        reward = self.reward
         terminated = self.terminated
         truncated = self.truncated
+        self.prev_done = self._done(terminated, truncated)

-        self.prev_done = jnp.logical_or(terminated, truncated)
+        convert = self.return_datatype == "numpy"
+        terminated = maybe_to_numpy(terminated, convert)
+        truncated = maybe_to_numpy(truncated, convert)
+        return self._obs(), self.reward, terminated, truncated, {}

-        return (
-            self._get_obs(),
-            reward,
-            maybe_to_numpy(terminated, self.return_datatype == "numpy"),
-            maybe_to_numpy(truncated, self.return_datatype == "numpy"),
-            {},
-        )
+    @staticmethod
+    @partial(jax.jit, static_argnames=["n_worlds", "n_drones", "device"])
+    def _sanitize_action(action: Array, n_worlds: int, n_drones: int, device: str) -> Array:
+        return jnp.array(action, device=device).reshape((n_worlds, n_drones, -1))

     @staticmethod
     @partial(jax.jit, static_argnames=["control_type"])
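The jitted `_sanitize_action` marks `n_worlds`, `n_drones`, and `device` as static because reshape targets must be compile-time constants under `jax.jit`; each distinct value triggers one retrace. A standalone sketch of the same pattern (names are illustrative, not from the PR):

```python
from functools import partial

import jax
import jax.numpy as jnp

@partial(jax.jit, static_argnames=["n_worlds", "n_drones"])
def reshape_batch(action: jax.Array, n_worlds: int, n_drones: int) -> jax.Array:
    # The static ints are baked into the traced computation, so the
    # output shape is known at compile time.
    return jnp.asarray(action).reshape((n_worlds, n_drones, -1))

flat = jnp.ones(24)
batched = reshape_batch(flat, n_worlds=2, n_drones=3)  # shape (2, 3, 4)
```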
@@ -167,14 +163,19 @@ def _rescale_action(action: Array, control_type: str) -> Array:
             raise NotImplementedError(
                 f"Rescaling not implemented for control type '{control_type}'"
             )
-
         return action * params.scale_factor + params.mean

+    @staticmethod
+    @jax.jit
+    def _done(terminated: Array, truncated: Array) -> Array:
+        return jnp.logical_or(terminated, truncated)
+
     def reset_all(
         self, *, seed: int | None = None, options: dict | None = None
     ) -> tuple[dict[str, Array], dict]:
         super().reset(seed=seed)
-
+        if seed is not None:
+            self.jax_key = jax.random.key(seed)
         # Resets ALL (!) environments
         if options is None:
             options = {}
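With seeding moved into `reset_all`, a reproducible setup would look roughly like this (constructor kwargs are assumptions about the `Sim` configuration, not taken from the PR):

```python
# Hypothetical usage of the revised seeding path.
env = CrazyflowBaseEnv(num_envs=4, n_worlds=4, n_drones=1, device="cpu")

# Seeds the Gymnasium np_random generator and, via the new branch above,
# replaces self.jax_key, so JAX-side randomness is reproducible too.
obs, info = env.reset_all(seed=42)
obs2, info2 = env.reset_all(seed=42)  # same seed -> same initial observations
```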
@@ -183,7 +184,7 @@ def reset_all(

         self.prev_done = jnp.zeros((self.sim.n_worlds), dtype=jnp.bool_)

-        return self._get_obs(), {}
+        return self._obs(), {}

     def reset(self, mask: Array) -> None:
         self.sim.reset(mask=mask)
@@ -241,26 +242,21 @@ def _terminated(dones: Array, states: SimState, contacts: Array) -> Array:
         return jnp.where(dones, False, terminated)

     @staticmethod
-    @jax.jit
-    def _truncated(
-        dones: Array, steps: Array, max_episode_steps: Array, n_substeps: Array
-    ) -> Array:
+    @partial(jax.jit, static_argnames=["max_episode_steps", "n_substeps"])
+    def _truncated(dones: Array, steps: Array, max_episode_steps: int, n_substeps: int) -> Array:
         truncated = steps / n_substeps >= max_episode_steps
         return jnp.where(dones, False, truncated)

     def render(self):
         self.sim.render()

-    def _get_obs(self) -> dict[str, Array]:
-        obs = {
-            state: maybe_to_numpy(
-                getattr(self.sim.states, state)[..., 2]
-                if state == "pos"
-                else getattr(self.sim.states, state),
-                self.return_datatype == "numpy",
-            )
-            for state in self.states_to_include_in_obs
-        }
+    def _obs(self) -> dict[str, Array]:
+        convert = self.return_datatype == "numpy"
+        fields = self.states_to_include_in_obs
+        states = [maybe_to_numpy(getattr(self.sim.states, field), convert) for field in fields]
+        obs = {k: v for k, v in zip(fields, states)}
+        if "pos" in obs:
+            obs["pos"] = obs["pos"][..., 2]
         return obs

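The truncation test divides the raw simulator step counter by the substep count, so episode length is measured in control steps. A quick check with the example numbers from above (5 substeps, 1000 control steps per episode):

```python
import jax.numpy as jnp

n_substeps = 5            # physics steps per control step (assumed)
max_episode_steps = 1000  # control steps per episode (assumed)

steps = jnp.array([4995, 5000, 5010])  # raw simulator step counters
truncated = steps / n_substeps >= max_episode_steps
# -> [False, True, True]: 5000 physics steps equal 1000 control steps
```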
@@ -276,8 +272,7 @@ def __init__(self, **kwargs: dict):
             -jnp.inf, jnp.inf, shape=(self._obs_size,), dtype=jnp.float32
         )
         self.observation_space = batch_space(self.single_observation_space, self.sim.n_worlds)
-
-        self.goal = jnp.zeros((kwargs["n_worlds"], 3), dtype=jnp.float32)
+        self.goal = jnp.zeros((kwargs["n_worlds"], 3), dtype=jnp.float32, device=self.device)

     @property
     def reward(self) -> Array:
@@ -303,8 +298,8 @@ def reset(self, mask: Array) -> None:
         )
         self.goal = self.goal.at[mask].set(new_goals[mask])

-    def _get_obs(self) -> dict[str, Array]:
-        obs = super()._get_obs()
+    def _obs(self) -> dict[str, Array]:
+        obs = super()._obs()
         obs["difference_to_goal"] = [self.goal - self.sim.states.pos]
         return obs

@@ -348,7 +343,7 @@ def reset(self, mask: Array) -> None:
         )
         self.target_vel = self.target_vel.at[mask].set(new_target_vel[mask])

-    def _get_obs(self) -> dict[str, Array]:
-        obs = super()._get_obs()
+    def _obs(self) -> dict[str, Array]:
+        obs = super()._obs()
         obs["difference_to_target_vel"] = [self.target_vel - self.sim.states.vel]
         return obs
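Putting the pieces together, a minimal interaction loop against the revised API might look as follows; the constructor kwargs and concrete task behavior are assumptions, not taken from this diff:

```python
import numpy as np

# Hypothetical setup mirroring the constructor shown above.
env = CrazyflowBaseEnv(
    num_envs=2, n_worlds=2, n_drones=1, device="cpu", return_datatype="numpy"
)
obs, info = env.reset_all(seed=0)

for _ in range(10):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    # With return_datatype="numpy", terminated/truncated arrive as numpy
    # arrays; done envs are auto-reset on the next step() via prev_done.
    assert isinstance(terminated, np.ndarray)
```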