@@ -49,7 +49,7 @@ def __init__(
         *,
         jax_random_key: int,  # required for jax random number generator
         num_envs: int = 1,  # required for VectorEnv
-        max_episode_steps: int = 1000,
+        time_horizon_in_seconds: int = 10,
         return_datatype: Literal["numpy", "jax"] = "jax",
         **kwargs: dict,
     ):
@@ -58,7 +58,7 @@ def __init__(
         Args:
             jax_random_key: The random key for the jax random number generator.
             num_envs: The number of environments to run in parallel.
-            max_episode_steps: The maximum number of steps per episode.
+            time_horizon_in_seconds: The time horizon after which episodes are truncated.
             return_datatype: The data type for returned arrays, either "numpy" or "jax". If "numpy",
                 the returned arrays will be numpy arrays on the CPU. If "jax", the returned arrays
                 will be jax arrays on the "device" specified for the simulation.
@@ -71,7 +71,9 @@ def __init__(
         self.num_envs = num_envs
         self.return_datatype = return_datatype
         self.device = jax.devices(kwargs["device"])[0]
-        self.max_episode_steps = jnp.array(max_episode_steps, dtype=jnp.int32, device=self.device)
+        self.time_horizon_in_seconds = jnp.array(
+            time_horizon_in_seconds, dtype=jnp.int32, device=self.device
+        )
 
         self.sim = Sim(**kwargs)
 
@@ -224,7 +226,7 @@ def terminated(self) -> Array:
     @property
     def truncated(self) -> Array:
         return self._truncated(
-            self.prev_done, self.sim.steps, self.max_episode_steps, self.n_substeps
+            self.prev_done, self.sim.time, self.time_horizon_in_seconds, self.n_substeps
         )
 
     def _reward() -> None:
@@ -235,17 +237,18 @@ def _reward() -> None:
     def _terminated(dones: Array, states: SimState, contacts: Array) -> Array:
         contact = jnp.any(contacts, axis=1)
         z_coords = states.pos[..., 2]
-        # Sanity check if we are below the ground. Should not be triggered due to collision checking
-        below_ground = jnp.any(z_coords < -0.1, axis=1)
-        terminated = jnp.logical_or(below_ground, contact)  # no termination condition
+        below_ground = jnp.any(
+            z_coords < -0.1, axis=1
+        )  # Sanity check if we are below the ground. Should not be triggered due to collision checking
+        terminated = jnp.logical_or(below_ground, contact)
         return jnp.where(dones, False, terminated)
 
     @staticmethod
     @jax.jit
     def _truncated(
-        dones: Array, steps: Array, max_episode_steps: Array, n_substeps: Array
+        dones: Array, time: Array, time_horizon_in_seconds: Array, n_substeps: Array
     ) -> Array:
-        truncated = steps / n_substeps >= max_episode_steps
+        truncated = time >= time_horizon_in_seconds
         return jnp.where(dones, False, truncated)
 
     def render(self):
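
Note: with this change, truncation is driven by simulated time rather than by step counts divided by substeps. A minimal standalone sketch of the new check, using made-up values rather than the repository's API:

import jax.numpy as jnp

dones = jnp.array([False, True, False])       # worlds already done in the previous step
sim_time = jnp.array([10.0, 12.0, 3.0])       # simulated seconds per world
horizon = jnp.array(10, dtype=jnp.int32)      # time_horizon_in_seconds
# Truncate once the horizon is reached, but never re-flag worlds that were already done.
truncated = jnp.where(dones, False, sim_time >= horizon)  # -> [True, False, False]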
@@ -352,3 +355,40 @@ def _get_obs(self) -> dict[str, Array]:
         obs = super()._get_obs()
         obs["difference_to_target_vel"] = [self.target_vel - self.sim.states.vel]
         return obs
+
+
+class CrazyflowEnvLanding(CrazyflowBaseEnv):
+    """JAX Gymnasium environment for landing a simulated Crazyflie."""
+
+    def __init__(self, **kwargs: dict):
+        assert kwargs["n_drones"] == 1, "Currently only supported for one drone"
+
+        super().__init__(**kwargs)
+        self._obs_size += 3  # difference to goal position
+        self.single_observation_space = spaces.Box(
+            -jnp.inf, jnp.inf, shape=(self._obs_size,), dtype=jnp.float32
+        )
+        self.observation_space = batch_space(self.single_observation_space, self.sim.n_worlds)
+
+        self.goal = jnp.zeros((kwargs["n_worlds"], 3), dtype=jnp.float32)
+        self.goal = self.goal.at[..., 2].set(0.1)  # 10cm above ground
+
+    @property
+    def reward(self) -> Array:
+        return self._reward(self.terminated, self.sim.states, self.goal)
+
+    @staticmethod
+    @jax.jit
+    def _reward(terminated: Array, states: SimState, goal: Array) -> Array:
+        norm_distance = jnp.linalg.norm(states.pos - goal, axis=2)
+        speed = jnp.linalg.norm(states.vel, axis=2)
+        reward = jnp.exp(-2.0 * norm_distance) * jnp.exp(-2.0 * speed)
+        return jnp.where(terminated, -1.0, reward)
+
+    def reset(self, mask: Array) -> None:
+        super().reset(mask)
+
+    def _get_obs(self) -> dict[str, Array]:
+        obs = super()._get_obs()
+        obs["difference_to_goal"] = [self.goal - self.sim.states.pos]
+        return obs
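
The landing reward can be sanity-checked in isolation. A minimal sketch with made-up values; the (n_worlds, n_drones, 3) state layout is an assumption for illustration, not the repository's API:

import jax.numpy as jnp

pos = jnp.array([[[0.0, 0.0, 0.1]], [[0.5, 0.0, 1.0]]])   # hypothetical positions, shape (2, 1, 3)
vel = jnp.zeros((2, 1, 3))
goal = jnp.array([0.0, 0.0, 0.1])                          # 10 cm above ground
terminated = jnp.array([[False], [False]])

norm_distance = jnp.linalg.norm(pos - goal, axis=2)        # distance to goal per drone
speed = jnp.linalg.norm(vel, axis=2)
reward = jnp.exp(-2.0 * norm_distance) * jnp.exp(-2.0 * speed)
reward = jnp.where(terminated, -1.0, reward)               # terminated worlds get -1
# Hovering exactly at the goal with zero velocity yields the maximum reward of 1.0;
# the reward decays exponentially with both distance to the goal and speed.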