
Commit 196f02a

Update maddpg and the report (#470)
* update trajectory_extension
* update maddpg
* update the experiment
* update the report
* update the report
1 parent 6766c8f commit 196f02a

File tree: 4 files changed, +67 −21 lines

docs/experiments/experiments/Policy Gradient/JuliaRL_MADDPG_KuhnPoker.jl
Lines changed: 4 additions & 3 deletions

@@ -2,7 +2,7 @@
 # title: JuliaRL\_MADDPG\_KuhnPoker
 # cover: assets/JuliaRL_MADDPG_KuhnPoker.png
 # description: MADDPG applied to KuhnPoker
-# date: 2021-08-09
+# date: 2021-08-18
 # author: "[Peter Chen](https://github.com/peterchen96)"
 # ---

@@ -43,7 +43,7 @@ function RL.Experiment(
         state_space_mapping = ss -> [[findfirst(==(s), state_space(env))] for s in state_space(env)]
     ),
     ## drop the dummy action of the other agent.
-    action_mapping = x -> length(x) == 1 ? x : Int(x[current_player(env)] + 1),
+    action_mapping = x -> length(x) == 1 ? x : Int(ceil(x[current_player(env)]) + 1),
 )
 ns, na = 1, 1 # dimension of the state and action.
 n_players = 2 # number of players

@@ -101,9 +101,10 @@ function RL.Experiment(
         policy = NamedPolicy(player, deepcopy(policy)),
         trajectory = deepcopy(trajectory),
     )) for player in players(env) if player != chance_player(env)),
+    SARTS, # traces
     128, # batch_size
     128, # update_freq
-    0, # step_counter
+    0, # initial update_step
     rng
 )
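The `action_mapping` change above wraps the actor's output in `ceil` before the integer cast, presumably because `Int` on a non-integral continuous output would throw an `InexactError`. A minimal sketch of the assumed behaviour (not part of the commit; the output range and the `x[1]` indexing are illustrative stand-ins for `x[current_player(env)]`):

```Julia
## Hypothetical standalone version of the new mapping, assuming the DDPG actor
## emits a continuous value in (-1, 1] for the acting player.
action_mapping = x -> length(x) == 1 ? x : Int(ceil(x[1]) + 1)

action_mapping([3])          # a single (chance-player) action passes through unchanged
action_mapping([-0.4, 0.0])  # ceil(-0.4) = -0.0 -> discrete action 1
action_mapping([0.7, 0.0])   # ceil(0.7)  =  1.0 -> discrete action 2
```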

docs/homepage/blog/ospp_report_210370190/index.md
Lines changed: 5 additions & 4 deletions

@@ -301,7 +301,7 @@ As for updating the policy, the process is mainly the same as the [`DDPGPolicy`]

 #### Usage

-Here `MADDPGManager` is used for simultaneous games, or you can add an [action-related wrapper](https://juliareinforcementlearning.org/docs/rlenvs/#ReinforcementLearningEnvironments.ActionTransformedEnv-Tuple{Any}) to the sequential game to drop the dummy action of other players. And there is one [experiment](https://juliareinforcementlearning.org/docs/experiments/experiments/Policy%20Gradient/JuliaRL_MADDPG_KuhnPoker/#JuliaRL\\_MADDPG\\_KuhnPoker) `JuliaRL_MADDPG_KuhnPoker` as one usage example, which tests the algorithm on the Kuhn Poker game. Since the Kuhn Poker is one sequential game, I wrap the game just like the following:
+Here `MADDPGManager` is used for [`SIMULTANEOUS`](https://juliareinforcementlearning.org/docs/rlbase/#ReinforcementLearningBase.SIMULTANEOUS) environments with a continuous action space (see the blog [Diagonal Gaussian Policies](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#stochastic-policies)); otherwise you can add an [action-related wrapper](https://juliareinforcementlearning.org/docs/rlenvs/#ReinforcementLearningEnvironments.ActionTransformedEnv-Tuple{Any}) to the environment to make it work with the algorithm. The [experiment](https://juliareinforcementlearning.org/docs/experiments/experiments/Policy%20Gradient/JuliaRL_MADDPG_KuhnPoker/#JuliaRL\\_MADDPG\\_KuhnPoker) `JuliaRL_MADDPG_KuhnPoker` serves as a usage example, testing the algorithm on the Kuhn Poker game. Since Kuhn Poker is a [`SEQUENTIAL`](ReinforcementLearningBase.SEQUENTIAL) game with a discrete action space (see also [Diagonal Gaussian Policies](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#stochastic-policies)), I wrap the environment as follows:
 ```Julia
 wrapped_env = ActionTransformedEnv(
     StateTransformedEnv(

@@ -310,7 +310,7 @@ wrapped_env = ActionTransformedEnv(
         state_space_mapping = ss -> [[findfirst(==(s), state_space(env))] for s in state_space(env)]
     ),
     ## drop the dummy action of the other agent.
-    action_mapping = x -> length(x) == 1 ? x : Int(x[current_player(env)] + 1),
+    action_mapping = x -> length(x) == 1 ? x : Int(ceil(x[current_player(env)]) + 1),
 )
 ```

@@ -376,9 +376,10 @@ agents = MADDPGManager(
         policy = NamedPolicy(player, deepcopy(policy)),
         trajectory = deepcopy(trajectory),
     )) for player in players(env) if player != chance_player(env)),
+    SARTS, # traces
     128, # batch_size
     128, # update_freq
-    0, # update_step
+    0, # initial update_step
     rng
 )
 ```

@@ -387,4 +388,4 @@ Plus on the [`stop_condition`](https://github.com/JuliaReinforcementLearning/Rei

 \dfig{body;JuliaRL_MADDPG_KuhnPoker.png;Result of the experiment.}

-**Note that** the current `MADDPGManager` still only works on the envs of [`MINIMAL_ACTION_SET`](https://juliareinforcementlearning.org/docs/rlbase/#ReinforcementLearningBase.MINIMAL_ACTION_SET). And since **MADDPG** is one deterministic algorithm, i.e., the state's response is one deterministic action, the Kuhn Poker game may not be suitable for testing the performance. In the next weeks, I'll update the algorithm and try to test it on other games.
+**Note that** since **MADDPG** is a deterministic algorithm, i.e., the response to a state is a single deterministic action, the Kuhn Poker game may not be suitable for testing its performance. In the coming weeks, I'll update the algorithm and try to test it on other games.
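For context, a hedged sketch of how the wrapped environment and the `MADDPGManager` above are assumed to be wired together when the experiment runs; `StopAfterEpisode`, `EmptyHook`, and the episode count are illustrative stand-ins, not taken from this commit:

```Julia
## Assumed wiring (sketch): train the MADDPG agents on the wrapped Kuhn Poker env.
stop_condition = StopAfterEpisode(100_000)  # illustrative stop condition
hook = EmptyHook()                          # placeholder; the experiment defines its own hook
run(agents, wrapped_env, stop_condition, hook)
```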

src/ReinforcementLearningCore/src/policies/agents/trajectories/trajectory_extension.jl
Lines changed: 16 additions & 5 deletions

@@ -85,11 +85,22 @@ function fetch!(s::BatchSampler, t::AbstractTrajectory, inds::Vector{Int})
     end
 end

-function fetch!(s::BatchSampler{SARTS}, t::CircularArraySARTTrajectory, inds::Vector{Int})
-    batch = NamedTuple{SARTS}((
-        (consecutive_view(t[x], inds) for x in SART)...,
-        consecutive_view(t[:state], inds .+ 1),
-    ))
+function fetch!(s::BatchSampler{traces}, t::Union{CircularArraySARTTrajectory, CircularArraySLARTTrajectory}, inds::Vector{Int}) where {traces}
+    if traces == SARTS
+        batch = NamedTuple{SARTS}((
+            (consecutive_view(t[x], inds) for x in SART)...,
+            consecutive_view(t[:state], inds .+ 1),
+        ))
+    elseif traces == SLARTSL
+        batch = NamedTuple{SLARTSL}((
+            (consecutive_view(t[x], inds) for x in SLART)...,
+            consecutive_view(t[:state], inds .+ 1),
+            consecutive_view(t[:legal_actions_mask], inds .+ 1),
+        ))
+    else
+        @error "unsupported traces $traces"
+    end
+
     if isnothing(s.cache)
         s.cache = map(batch) do x
             convert(Array, x)
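The new `fetch!` dispatches on the sampler's trace names. For reference, a sketch of the trace-name tuples this assumes (following the usual RLCore naming convention; not taken from this diff), which is why the `SLARTSL` branch appends two extra views at `inds .+ 1` — the next state and the next legal-actions mask:

```Julia
## Assumed trace layouts (hedged; defined in RLCore, reproduced here for reference).
const SART    = (:state, :action, :reward, :terminal)
const SARTS   = (:state, :action, :reward, :terminal, :next_state)
const SLART   = (:state, :legal_actions_mask, :action, :reward, :terminal)
const SLARTSL = (:state, :legal_actions_mask, :action, :reward, :terminal,
                 :next_state, :next_legal_actions_mask)
```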

src/ReinforcementLearningZoo/src/algorithms/policy_gradient/maddpg.jl
Lines changed: 42 additions & 9 deletions

@@ -6,26 +6,28 @@ Multi-agent Deep Deterministic Policy Gradient(MADDPG) implemented in Julia. Her
 See the paper https://arxiv.org/abs/1706.02275 for more details.

 # Keyword arguments
-- `agents::Dict{<:Any, <:NamedPolicy{<:Agent{<:DDPGPolicy, <:AbstractTrajectory}, <:Any}}`, here each agent collects its own information. While updating the policy, each `critic` will assemble all agents' trajectory to update its own network.
+- `agents::Dict{<:Any, <:NamedPolicy{<:Agent{<:DDPGPolicy, <:AbstractTrajectory}, <:Any}}`, here each agent collects its own information. While updating the policy, each **critic** will assemble all agents' trajectories to update its own network.
+- `traces`, set to `SARTS` when applying the algorithm to an environment of `MINIMAL_ACTION_SET`, or `SLARTSL` when applying it to an environment of `FULL_ACTION_SET`.
 - `batch_size::Int`
 - `update_freq::Int`
 - `update_step::Int`, count the step.
 - `rng::AbstractRNG`.
 """
 mutable struct MADDPGManager{P<:DDPGPolicy, T<:AbstractTrajectory, N<:Any} <: AbstractPolicy
     agents::Dict{<:N, <:Agent{<:NamedPolicy{<:P, <:N}, <:T}}
+    traces
     batch_size::Int
     update_freq::Int
     update_step::Int
     rng::AbstractRNG
 end

-# for simultaneous game with a discrete action space.
+# used for simultaneous environments.
 function (π::MADDPGManager)(env::AbstractEnv)
     while current_player(env) == chance_player(env)
         env |> legal_action_space |> rand |> env
     end
-    Dict((player, ceil(agent.policy(env))) for (player, agent) in π.agents)
+    Dict((player, agent.policy(env)) for (player, agent) in π.agents)
 end

 function (π::MADDPGManager)(stage::Union{PreEpisodeStage, PostActStage}, env::AbstractEnv)

@@ -42,7 +44,7 @@ function (π::MADDPGManager)(stage::PreActStage, env::AbstractEnv, actions)
     end

     # update policy
-    update!(π)
+    update!(π, env)
 end

 function (π::MADDPGManager)(stage::PostEpisodeStage, env::AbstractEnv)

@@ -52,11 +54,11 @@ function (π::MADDPGManager)(stage::PostEpisodeStage, env::AbstractEnv)
     end

     # update policy
-    update!(π)
+    update!(π, env)
 end

 # update policy
-function RLBase.update!(π::MADDPGManager)
+function RLBase.update!(π::MADDPGManager, env::AbstractEnv)
     π.update_step += 1
     π.update_step % π.update_freq == 0 || return

@@ -69,7 +71,7 @@ function RLBase.update!(π::MADDPGManager)
     temp_player = collect(keys(π.agents))[1]
     t = π.agents[temp_player].trajectory
     inds = rand(π.rng, 1:length(t), π.batch_size)
-    batches = Dict((player, RLCore.fetch!(BatchSampler{SARTS}(π.batch_size), agent.trajectory, inds))
+    batches = Dict((player, RLCore.fetch!(BatchSampler{π.traces}(π.batch_size), agent.trajectory, inds))
         for (player, agent) in π.agents)

     # get s, a, s′ for critic

@@ -95,7 +97,8 @@ function RLBase.update!(π::MADDPGManager)
     )

     for (player, agent) in π.agents
-        p = agent.policy.policy # get DDPGPolicy struct
+        p = agent.policy.policy # get agent's concrete DDPGPolicy.
+
         A = p.behavior_actor
         C = p.behavior_critic
         Aₜ = p.target_actor

@@ -104,6 +107,28 @@ function RLBase.update!(π::MADDPGManager)
         γ = p.γ
         ρ = p.ρ

+        if π.traces == SLARTSL
+            # Note that by default **MADDPG** is used for environments with a continuous action space, while
+            # `legal_action_space_mask` is defined for environments with a discrete action space. So we need
+            # `env.action_mapping` to transform the actions taken from the trajectory.
+            @assert env isa ActionTransformedEnv
+
+            mask = batches[player][:next_legal_actions_mask]
+            mu_actions, new_actions = send_to_host((mu_actions, new_actions)) # make sure the actions are on the cpu.
+            mu_l′ = Flux.batch(
+                (begin
+                    actions = env.action_mapping(mu_actions[:, i])
+                    mask[actions[player]]
+                end for i = 1:π.batch_size)
+            )
+            new_l′ = Flux.batch(
+                (begin
+                    actions = env.action_mapping(new_actions[:, i])
+                    mask[actions[player]]
+                end for i = 1:π.batch_size)
+            )
+        end
+
         _device(x) = send_to_device(device(A), x)

         # Note that here default A, C, Aₜ, Cₜ on the same device.

@@ -114,6 +139,10 @@ function RLBase.update!(π::MADDPGManager)
         t = _device(batches[player][:terminal])

         qₜ = Cₜ(vcat(s′, new_actions)) |> vec
+        if π.traces == SLARTSL
+            mu_l′, new_l′ = _device((mu_l′, new_l′))
+            qₜ .+= ifelse.(new_l′, 0.0f0, typemin(Float32))
+        end
         y = r .+ γ .* (1 .- t) .* qₜ

         gs1 = gradient(Flux.params(C)) do

@@ -128,7 +157,11 @@ function RLBase.update!(π::MADDPGManager)
         update!(C, gs1)

         gs2 = gradient(Flux.params(A)) do
-            loss = -mean(C(vcat(s, mu_actions)))
+            v = C(vcat(s, mu_actions)) |> vec
+            if π.traces == SLARTSL
+                v .+= ifelse.(mu_l′, 0.0f0, typemin(Float32))
+            end
+            loss = -mean(v)
             ignore() do
                 p.actor_loss = loss
             end
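A minimal numeric sketch (not part of the commit) of the broadcasting used in the two `SLARTSL` branches above: entries of the Q-value / critic-output vector whose sampled action is illegal are pushed down to roughly `typemin(Float32)`.

```Julia
## Broadcasting mechanics of the legality masking (standalone, Base Julia only).
q    = Float32[0.3, -0.1, 0.7]      # per-sample critic outputs
mask = Bool[true, false, true]      # legality of each sample's chosen action
q .+= ifelse.(mask, 0.0f0, typemin(Float32))
q  # ≈ Float32[0.3, typemin(Float32), 0.7]: the illegal entry becomes a huge negative value
```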
