# %% Nash Q-learning
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
# Define the payoff matrix of the game: it determines the reward function
R = np.array([[3, 0], [5, 1]])
# Initialize the Q-table
num_states = 2
num_actions = 2
Q = np.zeros((num_states, num_actions))  # the agents share a single Q-table / policy
# Algorithm hyperparameters
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration rate
# Reward tracking for visualization
episode_rewards = []
# Nash Q-learning main loop
num_episodes = 10000
for episode in range(num_episodes):
    state = np.random.choice(num_states)
    # Explore randomly with probability epsilon, otherwise act greedily
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.choice(num_actions)
    else:
        action = np.argmax(Q[state, :])
    next_state = state
    while next_state == state:
        next_state = np.random.choice(num_states)  # draw a different random next state
    # Q-value update
    reward = R[state, action]
    Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state, :]))
    episode_rewards.append(reward)
    # Plot the reward curve every 1000 episodes
    if episode % 1000 == 0:
        clear_output(wait=True)
        plt.figure(figsize=(10, 5))
        plt.plot(episode_rewards)
        plt.title("Episode Rewards")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.show()
# Print the learned greedy policy
optimal_policy = np.argmax(Q, axis=1)
print("Optimal Policy:", optimal_policy)
# %% MiniMax Q-Learning
import numpy as np
# Define the payoff matrix of the game
R = np.array([[3, -1], [0, 2]])
# Initialize the Q-table
num_states = 2
num_actions = 2
Q = np.zeros((num_states, num_actions))
# Algorithm hyperparameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
# Minimax Q-learning main loop
num_episodes = 10000
for episode in range(num_episodes):
    state = np.random.choice(num_states)
    action = np.argmax(Q[state, :])  # purely greedy: pick the action with the largest Q-value (no exploration step here)
    next_state = state
    while next_state == state:
        next_state = np.random.choice(num_states)
    # Q-value update: bootstrap on the worst-case (minimum) value of the next state
    reward = R[state, action]
    Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma * np.min(Q[next_state, :]))
# Print the learned greedy policy
optimal_policy = np.argmax(Q, axis=1)
print("Optimal Policy:", optimal_policy)