Reinforcement Learning Template #276

Draft · wants to merge 8 commits into base: main
7 changes: 7 additions & 0 deletions functions/colab.js
@@ -37,6 +37,13 @@ exports.handler = async function (event, _) {
    )
  }

  if (title === 'Template Reinforcement Learning') {
    specific_commands.push(
      '!pip install swig\n',
      '!pip install gymnasium[box2d]'
    )
  }

  const md_cell = [
    `# ${title} by PyTorch-Ignite Code-Generator\n\n`,
    'Please, run the cell below to execute your code.'
35 changes: 35 additions & 0 deletions src/templates/template-reinforcement-learning/README.md
@@ -0,0 +1,35 @@
[![Code-Generator](https://badgen.net/badge/Template%20by/Code-Generator/ee4c2c?labelColor=eaa700)](https://github.com/pytorch-ignite/code-generator)

# Reinforcement Learning Template

This is the Reinforcement Learning template by Code-Generator. It trains an A2C (Advantage Actor-Critic) agent on the Gymnasium (formerly OpenAI Gym) `CarRacing-v2` environment.
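
For reference, here is a minimal sketch of how the `CarRacing-v2` environment can be created directly with Gymnasium. This is illustrative only; the template itself builds its environments inside `a2c_model_env.py`:

```python
import gymnasium as gym

# CarRacing-v2 requires the Box2D extra: pip install swig gymnasium[box2d]
env = gym.make("CarRacing-v2", continuous=True)

obs, info = env.reset(seed=0)
print(obs.shape)         # (96, 96, 3) RGB frame
print(env.action_space)  # Box(3,): steering, gas, brake

obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
env.close()
```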

## Getting Started

Install the dependencies with `pip`:

```sh
pip install -r requirements.txt --progress-bar off -U
```

### Code structure

```
|
|- README.md
|
|- a2c.py : main script to run
|- a2c_model_env.py : factories for the A2C models, environment, data collector, loss, and optimizer
|- utils.py : module with various helper functions
|- requirements.txt : dependencies to install with pip
|
|- config_a2c.yaml : global configuration YAML file
```

## Training

### 1 GPU Training

```sh
python a2c.py config_a2c.yaml
```
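
`a2c.py` reads its hyperparameters from `config_a2c.yaml`. The field names below are the ones the script actually accesses; the values are hypothetical placeholders (shown as the Python namespace the script would see after parsing, not as the template's real defaults):

```python
from types import SimpleNamespace

# Illustrative values only; the real defaults live in config_a2c.yaml.
config = SimpleNamespace(
    seed=42,                 # used as manual_seed(config.seed + rank)
    num_envs=2,              # number of parallel environments
    total_frames=1_000_000,  # total environment frames to collect
    frames_per_batch=2_048,  # frames returned by the collector per iteration
    frame_skip=1,            # multiplier applied when counting collected frames
    lr_scheduler=True,       # enable the LinearLR schedule
    log_every_episodes=10,   # iteration interval for logging train metrics
)
```
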
109 changes: 109 additions & 0 deletions src/templates/template-reinforcement-learning/a2c.py
@@ -0,0 +1,109 @@
from pprint import pformat
from shutil import copy
from typing import Any

import ignite.distributed as idist
import torch
from ignite.engine import Engine, Events
from ignite.handlers import LRScheduler

from ignite.utils import manual_seed

from utils import *

from a2c_model_env import make_a2c_models, make_collector, make_loss, make_optim, make_test_env


def main():
    config = setup_config()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.device = f"{device}"

    rank = idist.get_rank()
    manual_seed(config.seed + rank)
    config.output_dir = setup_output_dir(config, rank)
    if rank == 0:
        save_config(config, config.output_dir)

    actor, critic = make_a2c_models(config)
    actor = actor.to(device)
    critic = critic.to(device)

    collector = make_collector(config, policy=actor)
    loss_module, adv_module = make_loss(config, actor_network=actor, value_network=critic)
    optim = make_optim(config, actor_network=actor, value_network=critic)

    # One optimizer update is performed per collected batch, so the LinearLR
    # schedule spans total_frames / frames_per_batch iterations (the same value
    # used as epoch_length below).
    total_network_updates = config.total_frames // config.frames_per_batch

    scheduler = None
    if config.lr_scheduler:
        scheduler = torch.optim.lr_scheduler.LinearLR(optim, total_iters=total_network_updates)
        scheduler = LRScheduler(scheduler)

    test_env = make_test_env(config)

    def run_single_timestep(engine, _):
        frames_in_batch = engine.state.data.numel()
        trainer.state.collected_frames += frames_in_batch * config.frame_skip
        data_view = engine.state.data.reshape(-1)

        with torch.no_grad():
            batch = adv_module(data_view)

        # Normalize advantage
        adv = batch.get("advantage")

        # mean of the advantage values
        loc = adv.mean().item()
        # standard deviation of the advantage values
        scale = adv.std().clamp_min(1e-6).item()
        # normalizing the advantage values
        adv = (adv - loc) / scale
        batch.set("advantage", adv)

        # Forward pass A2C loss
        batch = batch.to(device)
        loss = loss_module(batch)
        loss_sum = loss["loss_critic"] + loss["loss_objective"] + loss["loss_entropy"]

        # Backward pass + learning step
        loss_sum.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(
            list(actor.parameters()) + list(critic.parameters()), max_norm=0.5
        )
        engine.state.metrics = {
            "loss_sum": loss_sum.item(),
            "grad_norm": grad_norm.item(),
        }
        optim.step()
        optim.zero_grad()

    trainer = Engine(run_single_timestep)

    logger = setup_logging(config)
    logger.info("Configuration: \n%s", pformat(vars(config)))
    trainer.logger = logger

    if config.lr_scheduler:
        trainer.add_event_handler(Events.ITERATION_COMPLETED, scheduler)

    trainer.add_event_handler(
        Events.ITERATION_COMPLETED(every=config.log_every_episodes),
        log_metrics,
        tag="train",
    )

    @trainer.on(Events.ITERATION_STARTED)
    def update_data():
        # Pull the next batch of collected frames from the collector.
        trainer.state.data = next(iter(collector))
        trainer.state.collected_frames = 0

    @trainer.on(Events.ITERATION_COMPLETED)
    def update_collector_weights():
        # Sync the collector's copy of the policy with the freshly updated actor.
        collector.update_policy_weights_()

    trainer.run(epoch_length=int(config.total_frames / config.frames_per_batch), max_epochs=1)


if __name__ == "__main__":
    main()