
Commit 2393a1d: "first commit"
0 parents · 12 files changed, +1317 −0 lines

A2C_CartPole.py

(new file, +76 lines)
# %%
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.optim as optim

# %% Define the network architecture

class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, state):
        probs = self.actor(state)
        value = self.critic(state)
        return probs, value


# A2C update function
def train(model, optimizer, state, action, reward, next_state, done, gamma=0.99):
    state = torch.FloatTensor(state)
    next_state = torch.FloatTensor(next_state)
    reward = torch.FloatTensor([reward])
    action = torch.LongTensor([action])

    probs, value = model(state)
    _, next_value = model(next_state)

    # Compute the advantage (one-step TD error)
    td_target = reward + gamma * next_value * (1 - done)
    delta = td_target - value

    # Actor and critic losses
    actor_loss = -torch.log(probs[action]) * delta.detach()
    critic_loss = delta ** 2

    # Combine the losses and backpropagate
    loss = actor_loss + critic_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


# %% Training
# Main program: discrete actions, continuous states
if __name__ == "__main__":
    env = gym.make("CartPole-v1")
    model = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for episode in range(1000):
        state, info = env.reset()
        episode_reward = 0

        while True:
            probs, _ = model(torch.FloatTensor(state))
            action = np.random.choice(env.action_space.n, p=probs.detach().numpy())
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated  # end the episode on termination or time-limit truncation
            train(model, optimizer, state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

            if done:
                print(f"Episode {episode}, Reward: {episode_reward}")
                break
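The training loop above never saves the weights, so a quick sanity check is easiest in the same session. Below is a minimal evaluation sketch that acts greedily instead of sampling; the checkpoint name and the preceding torch.save call are hypothetical, not part of this commit.

# Minimal sketch: greedy evaluation of a trained ActorCritic policy.
# Assumes the script above is importable as A2C_CartPole and that the weights
# were saved beforehand with torch.save(model.state_dict(), "a2c_cartpole.pt")
# (a hypothetical checkpoint; the original script does not save one).
import gym
import torch
from A2C_CartPole import ActorCritic

env = gym.make("CartPole-v1")
model = ActorCritic(env.observation_space.shape[0], env.action_space.n)
model.load_state_dict(torch.load("a2c_cartpole.pt"))
model.eval()

state, _ = env.reset()
total_reward, done = 0.0, False
while not done:
    with torch.no_grad():
        probs, _ = model(torch.FloatTensor(state))
    action = int(probs.argmax())  # act greedily instead of sampling
    state, reward, terminated, truncated, _ = env.step(action)
    total_reward += reward
    done = terminated or truncated
print(f"Greedy episode reward: {total_reward}")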

A3C_CartPole.py

(new file, +79 lines)
# %%
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as multiprocessing  # drop-in for the stdlib module; needed so the shared model tensors work across worker processes

# %%
# Define the network architecture
class ActorCritic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, state):
        probs = self.actor(state)
        value = self.critic(state)
        return probs, value

# %%
# A3C update function
def train(global_model, optimizer, state, action, reward, next_state, done, gamma=0.99):
    state = torch.FloatTensor(state)
    next_state = torch.FloatTensor(next_state)
    reward = torch.FloatTensor([reward])
    action = torch.LongTensor([action])

    probs, value = global_model(state)
    _, next_value = global_model(next_state)

    td_target = reward + gamma * next_value * (1 - done)
    delta = td_target - value

    actor_loss = -torch.log(probs[action]) * delta.detach()
    critic_loss = delta ** 2

    loss = actor_loss + critic_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Worker process
def worker(global_model, optimizer, worker_id):
    env = gym.make("CartPole-v1")
    state, info = env.reset()
    while True:
        action_probs, _ = global_model(torch.FloatTensor(state))
        action = np.random.choice(env.action_space.n, p=action_probs.detach().numpy())
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        train(global_model, optimizer, state, action, reward, next_state, done)
        state = next_state
        if done:
            state, _ = env.reset()

# %%
if __name__ == "__main__":
    global_model = ActorCritic(4, 2)
    global_model.share_memory()  # let the worker processes share the model parameters
    optimizer = optim.Adam(global_model.parameters(), lr=0.001)

    processes = []
    for i in range(multiprocessing.cpu_count()):  # use all available CPU cores
        p = multiprocessing.Process(target=worker, args=(global_model, optimizer, i))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
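Note that each worker above backpropagates through the shared global model directly, so updates from different processes can interleave. The original A3C recipe instead keeps a local copy per worker, computes gradients locally, and pushes them onto the global parameters before the optimizer step. A minimal sketch of that push step follows; the helper name and structure are illustrative, not part of this commit.

def push_gradients_and_sync(local_model, global_model, optimizer):
    # Hand the worker's locally computed gradients to the global model,
    # step the optimizer on the shared parameters, then pull the updated
    # weights back into the local copy.
    optimizer.zero_grad()
    for local_p, global_p in zip(local_model.parameters(), global_model.parameters()):
        global_p.grad = local_p.grad
    optimizer.step()
    local_model.load_state_dict(global_model.state_dict())

A worker would call loss.backward() on its local model and then push_gradients_and_sync(local_model, global_model, optimizer) instead of stepping the optimizer inside train.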

DDPG_Pendulum.py

(new file, +126 lines)
# %%
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Actor network for a continuous action space
class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Tanh()  # squash the output to [-1, 1]
        )

    def forward(self, state):
        return self.net(state)

# Critic network: outputs the Q-value of a (state, action) pair
class Critic(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(Critic, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim + action_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=1))

# Experience replay
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)

# %%
# DDPG update function
def train(actor, critic, actor_target, critic_target, actor_optimizer, critic_optimizer, replay_buffer, gamma=0.99, tau=0.005):
    state, action, reward, next_state, done = replay_buffer.sample(64)

    state = torch.FloatTensor(state)
    action = torch.FloatTensor(action)
    reward = torch.FloatTensor(reward).unsqueeze(1)
    next_state = torch.FloatTensor(next_state)
    done = torch.FloatTensor(done).unsqueeze(1)

    # Critic update
    with torch.no_grad():
        next_action = actor_target(next_state)
        target_q = reward + (1 - done) * gamma * critic_target(next_state, next_action)
    current_q = critic(state, action)
    critic_loss = nn.MSELoss()(current_q, target_q)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor update
    actor_loss = -critic(state, actor(state)).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Soft update of the target networks
    for target_param, param in zip(actor_target.parameters(), actor.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
    for target_param, param in zip(critic_target.parameters(), critic.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

# %%
# Main program
if __name__ == "__main__":
    env = gym.make("Pendulum-v1")

    actor = Actor(env.observation_space.shape[0], env.action_space.shape[0])
    critic = Critic(env.observation_space.shape[0], env.action_space.shape[0])

    actor_target = Actor(env.observation_space.shape[0], env.action_space.shape[0])
    critic_target = Critic(env.observation_space.shape[0], env.action_space.shape[0])

    actor_target.load_state_dict(actor.state_dict())  # initialize the targets as copies of the online networks
    critic_target.load_state_dict(critic.state_dict())

    actor_optimizer = optim.Adam(actor.parameters(), lr=0.001)
    critic_optimizer = optim.Adam(critic.parameters(), lr=0.001)

    replay_buffer = ReplayBuffer(1000000)

    max_action = float(env.action_space.high[0])  # Pendulum's torque range is [-2, 2]

    for episode in range(1000):
        state, _ = env.reset()
        episode_reward = 0

        for step in range(200):  # Pendulum-v1's default episode length is 200 steps
            # Scale the tanh output to the torque range and add Gaussian exploration noise
            action = actor(torch.FloatTensor(state)).detach().numpy() * max_action
            action = np.clip(action + np.random.normal(0, 0.1 * max_action, size=action.shape), -max_action, max_action)
            next_state, reward, done, _, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

            if len(replay_buffer) > 1000:
                train(actor, critic, actor_target, critic_target, actor_optimizer, critic_optimizer, replay_buffer)

            if done:
                break

        print(f"Episode {episode}, Reward: {episode_reward}")
