qlearning_predator.py
from animat import *
import random


class PredatorState:
    PreyDetected = 0
    Hungry = 1
    NotHungry = 2


class PredatorAction:
    EatPrey = 0
    MoveForward = 1
    MoveTowardsPrey = 2


class QLearning_Predator:
    def __init__(self, epsilon=0.1, alpha=0.2, gamma=0.9):
        # The Q-table is a dictionary mapping a state (or a tuple of states)
        # to a flat list of actions, each followed by its q-value, so one
        # state can offer several candidate actions.
        self.table = {}
        self.epsilon = epsilon  # exploration rate (unused in this implementation)
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.prev_state = None
        self.prev_max_index = None
        self.current_action = None
        self.chosen_action = None
        self.settable()

    # --- Initialise the Q-table.
    # Each key is a single state or a tuple of states.
    # Each value is a list of actions, each followed by its q-value.
    def settable(self):
        self.table[(PredatorState.Hungry, PredatorState.PreyDetected)] = [
            PredatorAction.MoveTowardsPrey, self.rand(),
            PredatorAction.EatPrey, self.rand(),
        ]
        self.table[(PredatorState.NotHungry, PredatorState.PreyDetected)] = [
            PredatorAction.MoveForward, self.rand(),
        ]
        self.table[PredatorState.Hungry] = [PredatorAction.EatPrey, self.rand()]
        self.table[PredatorState.NotHungry] = [PredatorAction.MoveForward, self.rand()]
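
    # Example layout after initialisation (the q-values shown are made-up,
    # illustrative numbers; the real ones come from self.rand()):
    #   self.table[(PredatorState.Hungry, PredatorState.PreyDetected)]
    #       == [PredatorAction.MoveTowardsPrey, 0.42, PredatorAction.EatPrey, 0.17]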

    # --- Choose the action with the maximum q-value for the current state.
    def choose_action(self, current_state):
        if len(current_state) == 1:
            self.current_action = self.table.get(current_state[0])
        else:
            self.current_action = self.table.get(tuple(current_state))
        if self.current_action is None:
            return None
        # Walk the [action, q-value, action, q-value, ...] list two entries at
        # a time and remember the index of the action with the highest q-value.
        max_qvalue = float('-inf')
        max_index = 0
        index = 0
        while index < len(self.current_action) - 1:
            if max_qvalue < self.current_action[index + 1]:
                max_index = index
                max_qvalue = self.current_action[index + 1]
            index += 2
        self.prev_state = current_state
        self.prev_max_index = max_index
        self.chosen_action = self.current_action[max_index]
        # Return the best action and its q-value.
        return self.current_action[max_index], self.current_action[max_index + 1]

    # --- Perform one Q-learning update.
    def doQLearning(self, reward, state):
        # On the first iteration of a generation there is no previous state to update.
        if self.prev_state is None:
            return
        prev_action = self.current_action
        prev_max_index = self.prev_max_index
        # Q(t-1): q-value of the action chosen in the previous state.
        oldq = prev_action[prev_max_index + 1]
        # Q(t): maximum q-value available in the new state.
        newqtemp = self.choose_action(state)  # (best action, its q-value)
        if newqtemp is None:
            return
        newq = newqtemp[1]
        # Q-learning update:
        # Q(t-1) <- Q(t-1) + alpha * [ r + gamma * max(Q(t)) - Q(t-1) ]
        oldq += self.alpha * (reward + (self.gamma * newq) - oldq)
        # Write the updated q-value back; prev_action is the same list object
        # stored in self.table, so the table reflects the change as well.
        prev_action[prev_max_index + 1] = oldq
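
    # Worked example of the update rule (illustrative numbers, not from a real
    # run): with alpha = 0.2, gamma = 0.9, oldq = 0.5, reward = 1.0 and
    # newq = 0.8, the new value is 0.5 + 0.2 * (1.0 + 0.9 * 0.8 - 0.5) = 0.744.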

    # --- Return a random weight between 0 and 1.
    def rand(self):
        return random.uniform(0.0, 1.0)
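

# --- Minimal usage sketch (illustrative only, not part of the original
# simulation). The animat environment normally supplies the state list and the
# reward each tick; the hard-coded state and reward below are assumptions made
# purely to show the call order: choose_action() picks the greedy action, then
# doQLearning() updates the q-value of the previous choice once the reward is
# known.
if __name__ == "__main__":
    predator = QLearning_Predator()
    state = [PredatorState.Hungry, PredatorState.PreyDetected]
    for step in range(5):
        result = predator.choose_action(state)
        if result is None:
            break
        action, qvalue = result
        # Assumed reward scheme for this sketch: only eating prey is rewarded.
        reward = 1.0 if action == PredatorAction.EatPrey else 0.0
        predator.doQLearning(reward, state)
        print("step %d: action %d, q-value %.3f" % (step, action, qvalue))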