
Commit c762cad

Author: ryan.osgar
Commit message: reloaded older model
0 parents, commit c762cad

35 files changed: +1512 -0 lines

Diff for: .DS_Store (6 KB, binary file not shown)

Diff for: .idea/DeepRlTrex.iml (+12, generated file not rendered)

Diff for: .idea/dictionaries/ryan_osgar.xml (+7, generated file not rendered)

Diff for: .idea/encodings.xml (+4, generated file not rendered)

Diff for: .idea/misc.xml (+4, generated file not rendered)

Diff for: .idea/modules.xml (+9, generated file not rendered)

Diff for: .idea/vcs.xml (+7, generated file not rendered)

Diff for: .idea/workspace.xml (+662, generated file not rendered)

Diff for: README.md (+32)

# DeepRlTrex

Reinforcement learning implementation of double deep Q-learning, a dueling network architecture, and prioritized
experience replay (PER) to play the Google Chrome T-Rex game:

- Double Deep Q-Network: https://arxiv.org/pdf/1509.06461.pdf
- Dueling Network Architecture: https://arxiv.org/pdf/1511.06581.pdf
- Prioritized Experience Replay: https://arxiv.org/pdf/1511.05952.pdf

![](/assets/trex_demo.gif)

**Dependencies:**
- mss==5.1.0
- numpy==1.18.1
- tensorflow==2.2.0
- seaborn==0.10.1
- pandas==1.0.3
- Keras==2.4.3
- selenium==3.141.0
- PyAutoGUI==0.9.50
- matplotlib==3.1.3
- Pillow==7.2.0
- progressbar33==2.4

To run a demo of a working agent, use:

```bash
python demo.py
```
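The double-DQN update named in the README is implemented in `agent.py` (`_get_target`): the online network selects the next action and the target network evaluates it. A minimal numpy sketch of that target, with an illustrative function name and toy Q-values that are not part of the repository:

```python
import numpy as np

def double_dqn_target(reward, terminated, q_next_online, q_next_target, gamma=0.99):
    # Double-DQN target: select the next action with the online net,
    # evaluate it with the target net.
    if terminated:
        return reward
    best_action = np.argmax(q_next_online)               # action selection (online network)
    return reward + gamma * q_next_target[best_action]   # action evaluation (target network)

# toy example with made-up Q-values
print(double_dqn_target(1.0, False, np.array([0.2, 0.5]), np.array([0.3, 0.4])))  # 1.0 + 0.99 * 0.4
```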

Diff for: __pycache__/action_space.cpython-37.pyc (1.33 KB, binary file not shown)

Diff for: __pycache__/agent.cpython-37.pyc (5.92 KB, binary file not shown)

Diff for: __pycache__/environment.cpython-37.pyc (4.19 KB, binary file not shown)

Diff for: __pycache__/logger.cpython-37.pyc (971 Bytes, binary file not shown)

Diff for: __pycache__/memory.cpython-37.pyc (3.03 KB, binary file not shown)

Diff for: __pycache__/sum_tree.cpython-37.pyc (1.83 KB, binary file not shown)

Diff for: __pycache__/utils.cpython-37.pyc (3.73 KB, binary file not shown)

Diff for: action_space.py (+35)

```python
import pyautogui
import random
import time


# Wraps a single keyboard action for the T-Rex game
class Action:
    def __init__(self, action):
        self.action = action

    def __repr__(self):
        return f'ActionObj("{self.action}")'

    def act(self):
        # 'space' jumps: press the key, then wait for the jump to play out
        if self.action == 'space':
            pyautogui.press('space')
            time.sleep(.47)
            return 0

        # 'none' does nothing: short sleep to keep step timing consistent
        if self.action == 'none':
            time.sleep(0.02)
            return 1


# Discrete action space: index 0 -> jump ('space'), index 1 -> do nothing ('none')
class ActionSpace:
    def __init__(self):
        self.space = Action('space')
        self.none = Action('none')

        self.actions = [self.space, self.none]

    def sample(self):
        # return a random action index
        action = random.choice(list(range(len(self.actions))))
        return action
```
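A quick usage sketch for the classes above, assuming the Chrome T-Rex tab already has keyboard focus (pyautogui sends real key presses); the loop itself is hypothetical and only shows how `sample()` indexes into `actions`:

```python
from action_space import ActionSpace

action_space = ActionSpace()
for _ in range(5):
    idx = action_space.sample()          # 0 -> 'space' (jump), 1 -> 'none' (wait)
    action_space.actions[idx].act()      # executes the keypress or pause via pyautogui
```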

Diff for: agent.py (+188)

```python
import numpy as np
import tensorflow as tf
import datetime
from memory import ReplayMemory
import progressbar
import math


class Agent:
    def __init__(self,
                 environment,
                 optimizer,
                 memory_length,
                 dueling=True,
                 loss='mse',
                 load_weights=None,
                 save_weights=None,
                 verbose_action=False):

        self.environment = environment
        self._optimizer = optimizer
        self._loss = loss
        self.memory = ReplayMemory(memory_length)
        self.dueling = dueling

        # Initialize discount, exploration rate, soft-update rate, etc.
        self.total_steps = 0
        self.gamma = 0.99
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.00005
        self.tau = 0.05
        self.pretraining_steps = 0

        # Build online and target networks, then copy weights across
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model(how='hard')

        if load_weights:
            self.load_weights(load_weights)

        self.save_weights_fp = save_weights
        self.start_time = datetime.datetime.now()
        self.verbose_action = verbose_action

    def load_weights(self, weights_fp):
        if weights_fp:
            print('loading weights...')
            self.q_network.load_weights(weights_fp)
            self.align_target_model(how='hard')

    def save_weights(self, weights_fp):
        if weights_fp:
            self.q_network.save_weights(weights_fp)

    def set_epsilon_decay_schedule(self, epsilon, epsilon_min, annealed_steps):
        # exponential decay rate so epsilon reaches epsilon_min after annealed_steps
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = math.log(self.epsilon / self.epsilon_min) / annealed_steps

    def set_beta_schedule(self, beta_start, beta_max, annealed_samplings):
        # anneal the PER importance-sampling exponent beta toward beta_max
        self.memory.beta = beta_start
        self.memory.beta_max = beta_max
        self.memory.beta_increment_per_sampling = (self.memory.beta_max - self.memory.beta) / annealed_samplings

    def predict(self, state, use_target=False):
        if use_target:
            return self.target_network.predict(state)
        else:
            return self.q_network.predict(state)

    def _decay_epsilon(self):
        self.epsilon = self.epsilon * np.exp(-self.epsilon_decay)

    def store(self, state, action, reward, next_state, terminated):
        self.memory.add((state, action, reward, next_state, terminated))
        self.total_steps += 1

        # start decaying epsilon only once the memory has passed the pretraining threshold
        if (self.epsilon > self.epsilon_min) and (self.memory.length > self.pretraining_steps):
            self._decay_epsilon()

    def batch_store(self, batch_load):
        batch_load[-2][2] = -0.1  # custom reward altering: set the second-to-last transition's reward to -0.1
        for row in batch_load:
            self.store(*row)

    def _build_compile_model(self):
        # convolutional feature extractor over inputs of shape (32, 290, 4)
        inputs = tf.keras.layers.Input(shape=(32, 290, 4))
        conv1 = tf.keras.layers.Conv2D(32, (8, 8), strides=4, padding='same', activation='relu')(inputs)
        conv2 = tf.keras.layers.Conv2D(64, (4, 4), strides=2, padding='same', activation='relu')(conv1)
        conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, padding='same', activation='relu')(conv2)
        conv3 = tf.keras.layers.Flatten()(conv3)

        # advantage stream (two actions)
        advt = tf.keras.layers.Dense(256, activation='relu')(conv3)
        final = tf.keras.layers.Dense(2)(advt)

        if self.dueling:
            # value stream, combined with mean-centred advantages: Q = V + (A - mean(A))
            value = tf.keras.layers.Dense(256, activation='relu')(conv3)
            value = tf.keras.layers.Dense(1)(value)

            advt = tf.keras.layers.Lambda(lambda x: x - tf.reduce_mean(x, axis=1, keepdims=True))(final)
            final = tf.keras.layers.Add()([value, advt])

        model = tf.keras.models.Model(inputs=inputs, outputs=final)
        model.compile(optimizer=self._optimizer,
                      loss=self._loss,
                      metrics=['accuracy'])
        return model

    def align_target_model(self, how):
        # 'hard': copy the online weights outright; 'soft': Polyak-average toward the online network
        assert how in ('hard', 'soft'), '"how" must be either "hard" or "soft"'

        if how == 'hard':
            self.target_network.set_weights(self.q_network.get_weights())

        elif how == 'soft':
            for t, e in zip(self.target_network.trainable_variables, self.q_network.trainable_variables):
                t.assign(t * (1 - self.tau) + (e * self.tau))

    def choose_action(self, state):
        # epsilon-greedy: explore with a random action, otherwise act greedily on Q-values
        if np.random.rand() <= self.epsilon:
            action = self.environment.action_space.sample()
            if self.verbose_action:
                print(f'action: {action}, q: random')
            return action

        q_values = self.predict(state, use_target=False)
        action = np.argmax(q_values[0])
        if self.verbose_action:
            print(f'action: {action}, q: {q_values}')
        return action

    def train(self, batch, is_weights):
        td_errors = np.zeros(len(batch))
        states = np.zeros((len(batch), 32, 290, 4))
        targets = np.zeros((len(batch), 2))

        for i, (state, action, reward, next_state, terminated) in enumerate(batch):
            target, td_error = self._get_target(state, action, reward, next_state, terminated)
            states[i] = state.reshape(32, 290, 4)
            targets[i] = target
            td_errors[i] = td_error

        # PER importance-sampling weights correct the bias of prioritized sampling
        self.q_network.fit(states, targets, sample_weight=is_weights, batch_size=32, epochs=1, verbose=0)
        self.align_target_model(how='soft')

        return td_errors

    def replay(self, batch_size, epoch_steps=None):
        num_batches = 1

        if epoch_steps:
            num_batches = int(np.max([np.floor(epoch_steps / 4), 1]))

        bar = progressbar.ProgressBar(maxval=num_batches,
                                      widgets=[f'training - ', progressbar.widgets.Counter(), f'/{num_batches} ',
                                               progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        for i in range(num_batches):
            leaf_idx, batch, is_weights = self.memory.get_batch(batch_size)  # prioritized experience replay
            td_errors = self.train(batch, is_weights)
            self.memory.update_sum_tree(leaf_idx, td_errors)  # refresh priorities with the new TD errors

            bar.update(i + 1)

        bar.finish()
        self.save_weights(self.save_weights_fp)

    def _get_target(self, state, action, reward, next_state, terminated):
        target = self.predict(state, use_target=False)
        prev_target = target[0][action]

        if terminated:
            target[0][action] = reward
        else:
            a = np.argmax(self.predict(next_state, use_target=False)[0])
            target[0][action] = reward + (self.gamma * self.predict(next_state, use_target=True)[0][a])  # double Q Network

        td_error = abs(prev_target - target[0][action])

        return target, td_error
```
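The dueling head built in `_build_compile_model` combines a scalar state value with mean-centred advantages. A tiny numpy illustration of that aggregation, using made-up numbers rather than repository code:

```python
import numpy as np

# Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)),
# mirroring the Lambda + Add layers in _build_compile_model.
value = np.array([[1.5]])            # V(s), shape (batch, 1)
advantage = np.array([[0.2, -0.4]])  # A(s, .), shape (batch, n_actions)

q_values = value + (advantage - advantage.mean(axis=1, keepdims=True))
print(q_values)  # [[1.8 1.2]] -- the action ranking comes from A, the overall scale from V
```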

Diff for: assets/G_game_over.png (71 Bytes, image)

Diff for: assets/trex_demo.gif (293 KB, image)

Diff for: demo.py (+26)

```python
import tensorflow as tf
from environment import Environment
from agent import Agent


if __name__ == '__main__':

    # create environment object
    env = Environment()

    load_path = 'model/model-weights'
    save_path = 'model/model-weights'

    agent = Agent(env,
                  tf.keras.optimizers.Adam(learning_rate=0.0001),
                  memory_length=50000,
                  dueling=True,
                  loss='mse',
                  load_weights=load_path,
                  save_weights=None,
                  verbose_action=False)

    env.init_game()

    # run the pretrained agent in demo mode indefinitely
    for episode in range(10000000):
        env.demo(agent)
```
