# -*- coding: utf-8 -*-
from abc import ABC, abstractmethod
from collections import OrderedDict
import copy
from enum import Enum
import re
import sys
import html2text
import markdown
# "Orientation" of the graph: left-right or top-bottom.
RANKDIR = "LR" # LR or TB
# Edge style for weak connection between nodes
WEAK_LINK = 'dashed'
# Sometimes we add a connection between nodes just to maintain ordering.
# This is the edge style for such a connection.
INVIS = "invis"
# Style for edges from the root node
ROOT_EDGE = "solid"
# Style for ordering edge
ORDER_EDGE = "invis"
FONT_NAME = "sans-serif"
CLUSTER_FONT_NAME = "arial black"
NODE_FONT_NAME = "helvetica-bold"
NODE_FONT_SIZE = 12
TIMELINE_FONT_NAME = NODE_FONT_NAME
TIMELINE_COLOR = "white"
TIMELINE_FONT_SIZE = 14
TIMELINE_FILL_COLOR = '#707070'
EDGE_FONT_COLOR = 'black'
EDGE_FONT_SIZE = 10
# Useful graphviz links:
# - https://graphviz.readthedocs.io/en/stable/index.html
# - http://www.graphviz.org/pdf/dotguide.pdf
# - https://graphviz.org/doc/info/attrs.html
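# A minimal, hypothetical sketch (not part of the original file) of how the
# settings above are expected to flow into the graphviz package when the graph
# is rendered:
#
#   import graphviz
#   g = graphviz.Digraph('rl_taxonomy')
#   g.attr(rankdir=RANKDIR, fontname=FONT_NAME)
#   g.edge('Q-learning', 'DQN')                # normal (solid) connection
#   g.edge('DQN', 'DDPG', style=WEAK_LINK)     # weak (dashed) connection
#   g.edge('A2C', 'ACKTR', style=ORDER_EDGE)   # invisible edge, ordering only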
class Flag(Enum):
"""
These are various flags that can be attributed to an algorithm
"""
# MF = "Model-Free" # is the default for all algorithms
MB = "Model-Based"
MC = "Monte Carlo"
# TD = "Temporal Difference" # is the default for all algorithms
# Policy:
ONP = "On-Policy"
OFP = "Off-Policy"
# Action space:
DA = "Discrete action space"
CA = "Continuous action space"
# State space:
DS = "Discrete state space"
CS = "Continuous state space"
# Policy space
SP = "Stochastic Policy"
DP = "Deterministic Policy"
# Operator:
ADV = "Advantage"
# Miscellaneous:
RB = "Replay Buffer"
RNN = "Recurrent Neural Network"
DI = "Distributional"
MG = "Model is Given" # model-based flag
    ML = "Model is Learnt" # model-based flag
def url2md(url):
"""
    Get the Markdown representation of url. url can be a string or a (title, url) sequence
"""
if isinstance(url, str):
o = re.search('//([^/]+)/', url)
if not o:
name = 'link'
else:
parts = o.group(1).split('.')
name = parts[-2] + '.' + parts[-1]
return f'[{name}]({url})'
else:
return f'[{url[0]}]({url[1]})'
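# Hypothetical usage of url2md() (illustration only, not from the original file):
#
#   url2md('https://arxiv.org/abs/1312.5602')
#       -> '[arxiv.org](https://arxiv.org/abs/1312.5602)'
#   url2md(('DQN paper', 'https://arxiv.org/abs/1312.5602'))
#       -> '[DQN paper](https://arxiv.org/abs/1312.5602)'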
def md2txt(md):
html = markdown.markdown(md)
cvt = html2text.HTML2Text()
cvt.ignore_emphasis = True
cvt.ignore_links = True
txt = cvt.handle(html)
return txt
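# Hypothetical usage of md2txt() (illustration only): because emphasis and links
# are ignored, something like
#
#   md2txt('**DQN** is Q-Learning with a [neural network](https://example.com)')
#
# should come back as roughly 'DQN is Q-Learning with a neural network'
# (plus trailing newlines from html2text).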
class Edge:
"""
An Edge is a connection between Nodes/Groups
"""
def __init__(self, dest, **attrs):
self.dest = dest
# self.label = label # to prevent label from being displayed in graph
self.attrs = attrs
@property
def label(self):
return self.attrs.get('label', '')
@property
def invisible(self):
return self.attrs.get('style', '') == INVIS
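# A tiny illustration (assumed, not from the original file) of how Edge is used by
# NodeBase.connect() below: the graphviz edge attributes are stored verbatim and
# the two properties just peek into them, e.g.
#
#   e = Edge(some_node, style=WEAK_LINK, label='replay buffer')
#   e.label      # -> 'replay buffer'
#   e.invisible  # -> False (True only when style == INVIS)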
class NodeBase(ABC):
"""
Base class for Nodes and Groups.
"""
def __init__(self, title, description, group, flags=[], authors=None, year=None, url=None,
videos=[], links=[], graph_type="node", output_md=True, **attrs):
        assert graph_type in ['node', 'cluster']
self.title = title
self.description = description
self.group = group
self.flags = flags
self.authors = authors
self.year = year
self.url = url
self.videos = videos
self.links = links
self.graph_type = graph_type
self.output_md = output_md
self.attrs = attrs
if group:
group.nodes.append(self)
self.out_edges = []
self.in_edges = []
def __str__(self):
return self.title
@property
def md_title(self):
"""Normalized title for markdown"""
return self.title.replace('\\n', ' ')
@property
def name(self):
"""
Suitable name to be used as HTML anchor etc.
"""
return re.sub('[^0-9a-zA-Z]+', '', self.md_title)
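    # Illustration (not from the original file): a title written as
    # 'Policy Gradient\\n/Actor-Critic' has md_title 'Policy Gradient /Actor-Critic'
    # and name 'PolicyGradientActorCritic', which is what the markdown anchors and
    # node URLs rely on.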
@property
def graph_name(self):
"""
Identification of this node in the graph. If type is cluster, the name needs to be
prefixed with "cluster" (graphviz convention)
"""
return ('cluster' + self.title) if self.graph_type == 'cluster' else self.title
@property
def tooltip(self):
lines = self.description.split('\n')
if lines:
return md2txt(lines[0]) + (f'({self.year})' if self.year else '')
else:
return ''
@property
def graph_rank(self):
"""
The rank of this node in the graph/cluster.
"""
if self.year:
if self.year >= 1980 and self.year < 2000:
return '1980-90s'
elif self.year >= 2000 and self.year < 2010:
return '2000s'
elif self.year >= 2010 and self.year <= 2015:
return '2010-2015'
else:
return str(self.year)
else:
return ''
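    # Worked illustration (not from the original file) of the bucketing above:
    # year=1994 -> '1980-90s', year=2005 -> '2000s', year=2013 -> '2010-2015',
    # year=2017 -> '2017', and no year -> ''.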
def connect(self, other_node, **attrs):
"""
Add connection from this node to other node. attrs are edge attributes.
"""
self.out_edges.append(Edge(other_node, **attrs))
other_node.in_edges.append(Edge(self, **attrs))
def get_parents(self):
"""
Get list of parents from root up to immediate parent.
"""
p = self
parents = []
while p.group:
parents.append(p.group)
p = p.group
parents.reverse()
return parents
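    # Illustration (assumed, not from the original file): for the DQN node defined
    # later in this file, get_parents() returns
    # [Reinforcement Learning, Model Free, Value Gradient], i.e. the root first and
    # the immediate parent (the 'Value Gradient' group) last.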
@abstractmethod
def collect_nodes(self):
pass
@abstractmethod
def export_node(self, graph):
pass
@abstractmethod
def export_connections(self, graph, cluster):
pass
@abstractmethod
def export_graph(self, graph, cluster):
pass
@abstractmethod
def export_md(self):
pass
def _export_node(self, graph):
if self.graph_type != "node":
return
attrs = copy.copy(self.attrs)
attrs['fontname'] = NODE_FONT_NAME
if 'fontsize' not in attrs:
attrs['fontsize'] = str(NODE_FONT_SIZE)
if 'shape' not in attrs:
attrs['shape'] = 'box'
if 'style' not in attrs:
attrs['style'] = 'rounded,bold,filled'
if 'fillcolor' not in attrs:
attrs['fillcolor'] = '#dae8fc'
if 'tooltip' not in attrs:
attrs['tooltip'] = self.tooltip
if True:
#url = self.url
url = f'https://github.com/bennylp/RL-Taxonomy#{self.name}'
            url = url.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;")
attrs['URL'] = url
graph.node(self.graph_name, label=f'{self.title}', **attrs)
def _export_connections(self, graph, cluster):
for edge in self.out_edges:
attrs = copy.copy(edge.attrs)
attrs['fontcolor'] = EDGE_FONT_COLOR
attrs['fontsize'] = str(EDGE_FONT_SIZE)
attrs['fontname'] = FONT_NAME
if attrs.get('style', '') == WEAK_LINK:
attrs['color'] = 'darkgray'
attrs['fontcolor'] = 'darkgray'
if edge.dest.group == self.group:
cluster.edge(self.graph_name, edge.dest.graph_name, **attrs)
else:
graph.edge(self.graph_name, edge.dest.graph_name, **attrs)
def _export_md(self):
if not self.output_md:
return f' <a name="{self.name}"></a>\n'
parents = self.get_parents()
md = ('#' * min(len(parents) + 2, 5)) + f' <a name="{self.name}"></a>{self.md_title}\n'
if parents:
paths = parents + [self]
md += f'(Path: '
md += ' --> '.join([f'[{p.md_title}](#{p.name})' for p in paths]) + ')\n\n'
if self.description:
md += f'{self.description}\n\n'
if self.url:
md += f'- Paper: {self.url}\n'
if self.authors:
md += f'- Authors: {self.authors}\n'
if self.year:
md += f'- Year: {self.year}\n'
if self.flags:
md += '- Flags:\n'
for f in self.flags:
md += f' - {f.value} ({f.name})\n'
if self.in_edges:
md += f'- Related to prior idea{"s" if len(self.in_edges)>1 else ""}:\n'
for e in self.in_edges:
if e.invisible:
continue
md += f' - [{e.dest.md_title}](#{e.dest.name})'
if e.label:
md += f' ({e.label})'
md += '\n'
if self.out_edges:
md += f'- Related to subsequent idea{"s" if len(self.out_edges)>1 else ""}:\n'
for e in self.out_edges:
if e.invisible:
continue
md += f' - [{e.dest.md_title}](#{e.dest.name})'
if e.label:
md += f' ({e.label})'
md += '\n'
if self.links:
md += '- Useful links:\n' + '\n'.join([f' - {url2md(l)}' for l in self.links]) + '\n'
if self.videos:
md += '- Videos:\n' + '\n'.join([f' - {url2md(l)}' for l in self.videos]) + '\n'
md += '\n'
return md
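    # Rough shape of the markdown produced above for a typical leaf node
    # (illustration only; the exact content depends on the node's fields):
    #
    #   ##### <a name="SARSA"></a>SARSA
    #   (Path: [Reinforcement Learning](#ReinforcementLearning) --> ... --> [SARSA](#SARSA))
    #
    #   <description>
    #
    #   - Paper: <url>
    #   - Authors: <authors>
    #   - Year: 1994
    #   - Flags: ...
    #   - Related to prior idea: ...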
class Group(NodeBase):
"""
Group is a "container" for other nodes.
"""
def __init__(self, title, description, group, flags=[], authors=None, year=None, url=None,
videos=[], links=[], graph_type="cluster", output_md=True,
**attrs):
super().__init__(title, description, group, flags=flags, authors=authors, year=year, url=url,
videos=videos, links=links, graph_type=graph_type, output_md=output_md, **attrs)
self.nodes = []
def collect_nodes(self):
nodes = [self]
for node in self.nodes:
nodes += node.collect_nodes()
return nodes
def export_node(self, graph):
self._export_node(graph)
for child in self.nodes:
child.export_node(graph)
def export_connections(self, graph, cluster):
if self.graph_type == "cluster":
with cluster.subgraph(name=self.graph_name) as c:
c.attr(label=self.title)
c.attr(color='black')
# c.node_attr['style'] = 'filled'
for child in self.nodes:
child.export_connections(graph, c)
else:
for child in self.nodes:
child.export_connections(graph, cluster)
self._export_connections(graph, cluster)
def export_graph(self, graph, cluster):
if self.graph_type == "cluster":
with cluster.subgraph(name=self.graph_name) as c:
c.attr(label=self.title)
c.attr(color='black')
if 'style' not in self.attrs:
c.attr(style='dashed')
c.attr(fontname=CLUSTER_FONT_NAME)
c.attr(fontsize='16')
#c.attr(tooltip=self.tooltip) # probably not. tooltip for large area is confusing
c.attr(**self.attrs)
self._export_node(c)
for child in self.nodes:
child.export_graph(graph, c)
else:
self._export_node(cluster)
for child in self.nodes:
child.export_graph(graph, cluster)
self._export_connections(graph, cluster)
def export_md(self):
md = self._export_md()
for child in self.nodes:
md += child.export_md()
return md
class Node(NodeBase):
"""
A Node represents an algorithm. The relevant properties can be initialized from
the constructor.
"""
def __init__(self, title, description, group, flags=[], authors=None, year=None, url=None,
videos=[], links=[], graph_type="node", output_md=True, **attrs):
super().__init__(title, description, group, flags=flags, authors=authors, year=year, url=url,
videos=videos, links=links, graph_type=graph_type, output_md=output_md, **attrs)
def collect_nodes(self):
return [self]
def export_node(self, graph):
self._export_node(graph)
def export_connections(self, graph, cluster):
self._export_connections(graph, cluster)
def export_graph(self, graph, cluster):
self._export_node(cluster)
self._export_connections(graph, cluster)
def export_md(self):
return self._export_md()
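# A minimal, hypothetical end-to-end sketch (not part of the original file) showing
# how the classes above compose into a graph and a markdown section:
#
#   import graphviz
#   demo_group = Group('Demo', 'A demo group', None, graph_type='node')
#   demo_algo = Node('Demo-Algo', 'A demo algorithm', demo_group, year=2020)
#   demo_group.connect(demo_algo, style=ROOT_EDGE)
#   g = graphviz.Digraph('demo', graph_attr={'rankdir': RANKDIR})
#   demo_group.export_graph(g, g)     # emit nodes and edges into the Digraph
#   print(demo_group.export_md())     # emit the markdown description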
#
# The nodes. Note: Within their group, keep nodes relatively sorted by their publication year
#
rl = Group('Reinforcement\\nLearning',
'Reinforcement learning (RL) is an area of machine learning concerned with how software agents ought to take actions in an environment in order to maximize the notion of cumulative reward [from [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning)]',
None, graph_type="node", shape='plaintext', style='', fontsize='18',
links=[('A (Long) Peek into Reinforcement Learning', 'https://lilianweng.github.io/lil-log/2018/02/19/a-long-peek-into-reinforcement-learning.html'),
('(book) Reinforcement Learning: An Introduction - 2nd Edition - Richard S. Sutton and Andrew G. Barto', 'http://incompleteideas.net/book/the-book.html')
],
videos=[('(playlist) Introduction to Reinforcement learning with David Silver', 'https://www.youtube.com/playlist?list=PLqYmG7hTraZBiG_XpjnPrSNw-1XQaM_gB'),
('(playlist) Reinforcement Learning Course | DeepMind & UCL', 'https://www.youtube.com/playlist?list=PLqYmG7hTraZBKeNJ-JE_eyJHZ7XgBoAyb'),
('(playlist) Reinforcement Learning Tutorials', 'https://www.youtube.com/playlist?list=PLWzQK00nc192L7UMJyTmLXaHa3KcO0wBT'),
('(playlist) Deep RL Bootcamp 2017', 'https://www.youtube.com/playlist?list=PLAdk-EyP1ND8MqJEJnSvaoUShrAWYe51U'),
('(playlist) CS885 Reinforcement Learning - Spring 2018 - University of Waterloo', 'https://www.youtube.com/playlist?list=PLdAoL1zKcqTXFJniO3Tqqn6xMBBL07EDc'),
('(playlist) CS234: Reinforcement Learning | Winter 2019', 'https://www.youtube.com/playlist?list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u'),
])
model_free = Group('Model Free',
                   'In model-free reinforcement learning, the agent directly tries to predict the value/policy without having or trying to model the environment',
rl, style='rounded,filled', fillcolor='#f7fdff')
root_model_free = Node('Model Free', model_free.description, model_free, output_md=False, fillcolor='#ffe6cc', weight='10')
rl.connect(root_model_free)
model_based = Group('Model Based',
'In model-based reinforcement learning, the agent uses the experience to try to model the environment, and then uses the model to predict the value/policy',
rl, style='rounded,filled', fillcolor='#dafdda',
links=[('Model-Based Reinforcement Learning: Theory and Practice', 'https://bair.berkeley.edu/blog/2019/12/12/mbpo/'),
])
root_model_based = Node('Model Based', model_based.description, model_based, output_md=False, fillcolor='#ffe6cc')
rl.connect(root_model_based)
meta_rl = Group('Meta-RL',
                'In meta reinforcement learning, the agent is trained over a distribution of tasks, and with that knowledge it tries to solve new, unseen but related tasks.',
rl, style='rounded,filled', fillcolor='#f5f5da',
links=[('Meta Reinforcement Learning', 'https://lilianweng.github.io/lil-log/2019/06/23/meta-reinforcement-learning.html')
])
root_meta_rl = Node('Meta-RL', meta_rl.description, meta_rl, year=2001, output_md=False, fillcolor='#ffe6cc')
rl.connect(root_meta_rl)
value_gradient = Group('Value Gradient',
                       'The algorithm learns the value function of each state or state-action pair. The policy is implicit, usually obtained by just selecting the action with the best value',
model_free, style='rounded,dashed,filled', fillcolor='#daf0f6')
policy_gradient = Group('Policy Gradient/Actor-Critic',
                        'The algorithm works directly to optimize the policy, with or without a value function. If the value function is learned in addition to the policy, we get an Actor-Critic algorithm. Most policy gradient algorithms are Actor-Critic. The *Critic* updates value function parameters *w*, and depending on the algorithm it could be action-value ***Q(a|s;w)*** or state-value ***V(s;w)***. The *Actor* updates policy parameters θ in the direction suggested by the critic, ***π(a|s;θ)***. [from [Lilian Weng\'s blog](https://lilianweng.github.io/lil-log/2018/02/19/a-long-peek-into-reinforcement-learning.html)]',
model_free, style='rounded,dashed,filled', fillcolor='#daf0f6',
links=[
('Policy Gradient Algorithms', 'https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html'),
('RL — Policy Gradient Explained', 'https://medium.com/@jonathan_hui/rl-policy-gradients-explained-9b13b688b146'),
('Going Deeper Into Reinforcement Learning: Fundamentals of Policy Gradients', 'https://danieltakeshi.github.io/2017/03/28/going-deeper-into-reinforcement-learning-fundamentals-of-policy-gradients/'),
('An introduction to Policy Gradients with Cartpole and Doom', 'https://www.freecodecamp.org/news/an-introduction-to-policy-gradients-with-cartpole-and-doom-495b5ef2207f/')
])
root_value_gradient = Node('Value Gradient', value_gradient.description, value_gradient, output_md=False, fillcolor='#ffe6cc')
root_policy_gradient = Node('Policy Gradient\\n/Actor-Critic', policy_gradient.description, policy_gradient, output_md=False, fillcolor='#ffe6cc')
root_model_free.connect(root_value_gradient)
root_model_free.connect(root_policy_gradient)
#
# VALUE GRADIENT
#
sarsa = Node('SARSA',
'SARSA (State-Action-Reward-State-Action) is an on-policy TD control method',
value_gradient,
flags=[Flag.ONP, Flag.DA],
authors='G. A. Rummery, M. Niranjan',
year=1994,
url='http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.17.2539&rep=rep1&type=pdf')
root_value_gradient.connect(sarsa, style=ROOT_EDGE)
qlearning = Node('Q-learning',
                 'Q-learning is an off-policy TD control method. Unlike SARSA, it doesn\'t follow the policy to find the next action but rather chooses the most optimal action in a greedy fashion',
value_gradient,
flags=[Flag.OFP, Flag.DA],
authors='Chris Watkins',
year=1989,
url='http://www.cs.rhul.ac.uk/~chrisw/new_thesis.pdf',
links=[('Diving deeper into Reinforcement Learning with Q-Learning', 'https://www.freecodecamp.org/news/diving-deeper-into-reinforcement-learning-with-q-learning-c18d0db58efe/'),
('Simple Reinforcement Learning with Tensorflow Part 0: Q-Learning with Tables and Neural Networks', 'https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0')]
)
root_value_gradient.connect(qlearning, style=ROOT_EDGE)
td_gammon = Node('TD-Gammon',
                 'TD-Gammon is a model-free reinforcement learning algorithm similar to Q-learning, and uses a multi-layer perceptron with one hidden layer as the value function approximator. It learns the game entirely by playing against itself and achieves a superhuman level of play.',
value_gradient,
flags=[],
authors='Gerald Tesauro',
year=1995,
url='https://dl.acm.org/doi/10.1145/203330.203343',
links=[]
)
root_value_gradient.connect(td_gammon, style=ROOT_EDGE)
dqn = Node('DQN',
           'Deep Q Network (DQN) is Q-Learning with a deep neural network as the state-action value estimator and uses a replay buffer to sample experiences from previous trajectories to make learning more stable.',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Volodymyr Mnih, Koray Kavukcuoglu, David Silver, Alex Graves, Ioannis Antonoglou, Daan Wierstra, Martin Riedmiller',
year=2013,
url='https://arxiv.org/abs/1312.5602',
links=[('(tutorial) Deep Q Learning for the CartPole', 'https://towardsdatascience.com/deep-q-learning-for-the-cartpole-44d761085c2f'),
('An introduction to Deep Q-Learning: let’s play Doom', 'https://www.freecodecamp.org/news/an-introduction-to-deep-q-learning-lets-play-doom-54d02d8017d8/')])
qlearning.connect(dqn)
drqn = Node('DRQN',
'Deep Recurrent Q-Learning. Adding recurrency to a Deep Q-Network (DQN) by replacing the first post-convolutional fully-connected layer with a recurrent LSTM',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB, Flag.RNN],
authors='Matthew Hausknecht, Peter Stone',
year=2015,
url='https://arxiv.org/abs/1507.06527',
links=[])
dqn.connect(drqn)
ddqn = Node('DDQN',
            'Double DQN adds another neural network, making separate networks for policy and target. The target network is only updated after a certain number of steps/episodes. This makes the learning more stable.',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA],
authors='Hado van Hasselt, Arthur Guez, David Silver',
year=2015,
url='https://arxiv.org/abs/1509.06461',
links=[('(tutorial) Deep Q Learning for the CartPole', 'https://towardsdatascience.com/deep-q-learning-for-the-cartpole-44d761085c2f')])
dqn.connect(ddqn)
dqn_per = Node('PER',
               'Prioritized Experience Replay (PER) improves data efficiency by more frequently replaying transitions from which there is more to learn',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Tom Schaul, John Quan, Ioannis Antonoglou, David Silver',
year=2015,
url='https://arxiv.org/abs/1511.05952',
links=[])
dqn.connect(dqn_per)
duel_dqn = Node('Duelling-DQN',
'Duelling DQN represents two separate estimators: one for the state value function and one for the state-dependent action advantage function. The main benefit of this factoring is to generalize learning across actions without imposing any change to the underlying reinforcement learning algorithm.',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA],
authors='Ziyu Wang, Tom Schaul, Matteo Hessel, Hado van Hasselt, Marc Lanctot, Nando de Freitas',
year=2016,
url='https://arxiv.org/abs/1511.06581')
ddqn.connect(duel_dqn)
qr_dqn = Node('QR-DQN',
              'Distributional Reinforcement Learning with Quantile Regression (QR-DQN). In QR-DQN, a distribution of values is used for each state-action pair instead of a single mean value',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB, Flag.DI],
authors='Will Dabney, Mark Rowland, Marc G. Bellemare, Rémi Munos',
year=2017,
url='https://arxiv.org/abs/1710.10044',
links=[('(GitHub) Quantile Regression DQN', 'https://github.com/senya-ashukha/quantile-regression-dqn-pytorch')])
dqn.connect(qr_dqn)
c51 = Node('C51',
'C51 Algorithm. The core idea of Distributional Bellman is to ask the following questions. If we can model the Distribution of the total future rewards, why restrict ourselves to the expected value (i.e. Q function)? There are several benefits to learning an approximate distribution rather than its approximate expectation. [[source: flyyufelix\'s blog](https://flyyufelix.github.io/2017/10/24/distributional-bellman.html)]',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB, Flag.DI],
authors='Marc G. Bellemare, Will Dabney, Rémi Munos',
year=2017,
url='https://arxiv.org/abs/1707.06887',
links=[('Distributional Bellman and the C51 Algorithm', 'https://flyyufelix.github.io/2017/10/24/distributional-bellman.html')])
dqn.connect(c51)
# dqn_per.connect(c51, style=INVIS)
rainbow = Node('RAINBOW',
               'Combines six DQN extensions, namely Double Q-Learning, prioritized replay, dueling networks, multi-step learning, distributional DQN, and noisy DQN, into a single model to achieve state-of-the-art performance',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Matteo Hessel, Joseph Modayil, Hado van Hasselt, Tom Schaul, Georg Ostrovski, Will Dabney, Dan Horgan, Bilal Piot, Mohammad Azar, David Silver',
year=2017,
url='https://arxiv.org/abs/1710.02298',
links=[])
ddqn.connect(rainbow, style=WEAK_LINK)
dqn_per.connect(rainbow, style=WEAK_LINK)
duel_dqn.connect(rainbow, style=WEAK_LINK)
qr_dqn.connect(rainbow, style=WEAK_LINK)
dqn_her = Node('DQN+HER',
'DQN with Hindsight Experience Replay (HER)',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, Wojciech Zaremba',
year=2017,
url='https://arxiv.org/abs/1707.01495',
links=[('Learning from mistakes with Hindsight Experience Replay', 'https://becominghuman.ai/learning-from-mistakes-with-hindsight-experience-replay-547fce2b3305')])
dqn.connect(dqn_her)
iqn = Node('IQN',
"""Implicit Quantile Networks (IQN). From the abstract: In this work, we build on recent advances in distributional reinforcement learning to give a generally applicable, flexible, and state-of-the-art distributional variant of DQN. We achieve this by using quantile regression to approximate the full quantile function for the state-action return distribution. By reparameterizing a distribution over the sample space, this yields an implicitly defined return distribution and gives rise to a large class of risk-sensitive policies. We demonstrate improved performance on the 57 Atari 2600 games in the ALE, and use our algorithm's implicitly defined distributions to study the effects of risk-sensitive policies in Atari games.
""",
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB, Flag.DI],
authors='Will Dabney, Georg Ostrovski, David Silver, Rémi Munos',
year=2018,
url='https://arxiv.org/abs/1806.06923',
links=[('(StackExchange) How does Implicit Quantile-Regression Network (IQN) differ from QR-DQN?', 'https://datascience.stackexchange.com/questions/40874/how-does-implicit-quantile-regression-network-iqn-differ-from-qr-dqn')])
dqn.connect(iqn)
# dqn_per.connect(c51, style=INVIS)
apex_dqn = Node('APE-X DQN',
'DQN with Distributed Prioritized Experience Replay',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Dan Horgan, John Quan, David Budden, Gabriel Barth-Maron, Matteo Hessel, Hado van Hasselt, David Silver',
year=2018,
url='https://arxiv.org/abs/1803.00933',
links=[('Understanding and Implementing Distributed Prioritized Experience Replay (Horgan et al., 2018)', 'https://towardsdatascience.com/understanding-and-implementing-distributed-prioritized-experience-replay-horgan-et-al-2018-d2c1640e0520')])
dqn.connect(apex_dqn)
# dqn_per.connect(c51, style=INVIS)
r2d2 = Node('R2D2',
"""Recurrent Replay Distributed DQN (R2D2). (from the abstract) Building on the recent successes of distributed training of RL agents, in this paper we investigate the training of RNN-based RL agents from distributed prioritized experience replay. We study the effects of parameter lag resulting in representational drift and recurrent state staleness and empirically derive an improved training strategy. Using a single network architecture and fixed set of hyper-parameters, the resulting agent, Recurrent Replay Distributed DQN, quadruples the previous state of the art on Atari-57, and matches the state of the art on DMLab-30. It is the first agent to exceed human-level performance in 52 of the 57 Atari games.""",
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Steven Kapturowski, Georg Ostrovski, John Quan, Remi Munos, Will Dabney',
year=2019,
url='https://openreview.net/forum?id=r1lyTjAqYX',
links=[])
dqn.connect(r2d2)
ngu = Node('NGU',
'Never Give Up (NGU). (from the abstract) We propose a reinforcement learning agent to solve hard exploration games by learning a range of directed exploratory policies. We construct an episodic memory-based intrinsic reward using k-nearest neighbors over the agent\'s recent experience to train the directed exploratory policies, thereby encouraging the agent to repeatedly revisit all states in its environment. A self-supervised inverse dynamics model is used to train the embeddings of the nearest neighbour lookup, biasing the novelty signal towards what the agent can control. We employ the framework of Universal Value Function Approximators (UVFA) to simultaneously learn many directed exploration policies with the same neural network, with different trade-offs between exploration and exploitation. By using the same neural network for different degrees of exploration/exploitation, transfer is demonstrated from predominantly exploratory policies yielding effective exploitative policies. The proposed method can be incorporated to run with modern distributed RL agents that collect large amounts of experience from many actors running in parallel on separate environment instances. Our method doubles the performance of the base agent in all hard exploration in the Atari-57 suite while maintaining a very high score across the remaining games, obtaining a median human normalised score of 1344.0%. Notably, the proposed method is the first algorithm to achieve non-zero rewards (with a mean score of 8,400) in the game of Pitfall! without using demonstrations or hand-crafted features.',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Adrià Puigdomènech Badia, Pablo Sprechmann, Alex Vitvitskyi, Daniel Guo, Bilal Piot, Steven Kapturowski, Olivier Tieleman, Martín Arjovsky, Alexander Pritzel, Andew Bolt, Charles Blundell',
year=2020,
url='https://arxiv.org/abs/2002.06038',
links=[])
r2d2.connect(ngu)
agent57 = Node('Agent57',
'(from the abstract) Atari games have been a long-standing benchmark in the reinforcement learning (RL) community for the past decade. This benchmark was proposed to test general competency of RL algorithms. Previous work has achieved good average performance by doing outstandingly well on many games of the set, but very poorly in several of the most challenging games. We propose Agent57, the first deep RL agent that outperforms the standard human benchmark on all 57 Atari games. To achieve this result, we train a neural network which parameterizes a family of policies ranging from very exploratory to purely exploitative. We propose an adaptive mechanism to choose which policy to prioritize throughout the training process. Additionally, we utilize a novel parameterization of the architecture that allows for more consistent and stable learning.',
value_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Adrià Puigdomènech Badia, Bilal Piot, Steven Kapturowski, Pablo Sprechmann, Alex Vitvitskyi, Daniel Guo, Charles Blundell',
year=2020,
url='https://arxiv.org/abs/2003.13350',
links=[('DeepMind Unveils Agent57, the First AI Agents that Outperforms Human Benchmarks in 57 Atari Games', 'https://towardsdatascience.com/deepmind-unveils-agent57-the-first-ai-agents-that-outperforms-human-benchmarks-in-57-atari-games-35db4282dab3'),
])
ngu.connect(agent57)
#
# POLICY GRADIENT / ACTOR-CRITIC
#
reinforce = Node('REINFORCE',
'REINFORCE (Monte-Carlo policy gradient) is a pure policy gradient algorithm that works without a value function. The agent collects a trajectory of one episode using its current policy, and uses the returns to update the policy parameter',
policy_gradient,
flags=[Flag.MC, Flag.ONP, Flag.CS, Flag.DA],
authors='Ronald J. Williams',
year=1992,
url='https://people.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf',
                 links=[('Learning Reinforcement Learning by Learning REINFORCE (PDF)', 'http://www.cs.toronto.edu/~tingwuwang/REINFORCE.pdf'),
('An introduction to Policy Gradients with Cartpole and Doom', 'https://www.freecodecamp.org/news/an-introduction-to-policy-gradients-with-cartpole-and-doom-495b5ef2207f/')
]
)
root_policy_gradient.connect(reinforce, style=ROOT_EDGE)
"""
vpg = Node('VPG',
'Vanilla Policy Gradient',
policy_gradient,
flags=[Flag.MC, Flag.ONP, Flag.CS, Flag.DA, Flag.CA],
authors='Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour',
year=2000,
url='https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf',
links=['https://spinningup.openai.com/en/latest/algorithms/vpg.html']
)
"""
dpg = Node('DPG',
'Deterministic Policy Gradient. Abstract: In this paper we consider deterministic policy gradient algorithms for reinforcement learning with continuous actions. The deterministic policy gradient has a particularly appealing form: it is the expected gradient of the action-value function. This simple form means that the deterministic policy gradient can be estimated much more efficiently than the usual stochastic policy gradient. To ensure adequate exploration, we introduce an off-policy actor-critic algorithm that learns a deterministic target policy from an exploratory behaviour policy. We demonstrate that deterministic policy gradient algorithms can significantly outperform their stochastic counterparts in high-dimensional action spaces.',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.DP],
authors='David Silver, Guy Lever, Nicolas Heess, Thomas Degris, Daan Wierstra, Martin Riedmiller',
year=2014,
url='http://proceedings.mlr.press/v32/silver14.pdf',
links=[]
)
root_policy_gradient.connect(dpg, style=ROOT_EDGE)
ddpg = Node('DDPG',
'Deep Deterministic Policy Gradient (DDPG).',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.DP, Flag.RB],
authors='Timothy P. Lillicrap, Jonathan J. Hunt, Alexander Pritzel, Nicolas Heess, Tom Erez, Yuval Tassa, David Silver, Daan Wierstra',
year=2015,
url='https://arxiv.org/abs/1509.02971',
links=[('Deep Deterministic Policy Gradient - Spinning Up', 'https://spinningup.openai.com/en/latest/algorithms/ddpg.html')
]
)
dpg.connect(ddpg)
dqn.connect(ddpg, style=WEAK_LINK, label='replay buffer', constraint="false")
trpo = Node('TRPO',
'Trust Region Policy Optimization (TRPO) improves training stability by enforcing a KL divergence constraint to avoid parameter updates that change the policy too much at one step.',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.CA, Flag.ADV],
authors='John Schulman, Sergey Levine, Philipp Moritz, Michael I. Jordan, Pieter Abbeel',
year=2015,
url='https://arxiv.org/pdf/1502.05477',
links=[('RL — Trust Region Policy Optimization (TRPO) Explained', 'https://medium.com/@jonathan_hui/rl-trust-region-policy-optimization-trpo-explained-a6ee04eeeee9'),
('RL — Trust Region Policy Optimization (TRPO) Part 2', 'https://medium.com/@jonathan_hui/rl-trust-region-policy-optimization-trpo-part-2-f51e3b2e373a')]
)
root_policy_gradient.connect(trpo, style=ROOT_EDGE)
gae = Node('GAE',
'Generalized Advantage Estimation',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.CA],
authors='John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, Pieter Abbeel',
year=2015,
url='https://arxiv.org/abs/1506.02438',
links=[('Generalized Advantage Estimator Explained', 'https://notanymike.github.io/GAE/'),
('Notes on the Generalized Advantage Estimation Paper', 'https://danieltakeshi.github.io/2017/04/02/notes-on-the-generalized-advantage-estimation-paper/')]
)
root_policy_gradient.connect(gae, style=ROOT_EDGE)
trpo.connect(gae, style=WEAK_LINK)
a3c = Node('A3C',
           'Asynchronous Advantage Actor-Critic (A3C) is a classic policy gradient method with a special focus on parallel training. In A3C, the critics learn the state-value function, ***V(s;w)***, while multiple actors are trained in parallel and get synced with global parameters from time to time. Hence, A3C is good for parallel training by default, i.e. on one machine with a multi-core CPU. [from [Lilian Weng\'s blog](https://lilianweng.github.io/lil-log/2018/02/19/a-long-peek-into-reinforcement-learning.html)]',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.CA, Flag.ADV, Flag.SP],
authors='Volodymyr Mnih, Adrià Puigdomènech Badia, Mehdi Mirza, Alex Graves, Timothy P. Lillicrap, Tim Harley, David Silver, Koray Kavukcuoglu',
year=2016,
url='https://arxiv.org/abs/1602.01783',
links=[('Simple Reinforcement Learning with Tensorflow Part 8: Asynchronous Actor-Critic Agents (A3C)', 'https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2'),
('An implementation of A3C', 'https://github.com/dennybritz/reinforcement-learning/tree/master/PolicyGradient/a3c')]
)
root_policy_gradient.connect(a3c, style=ROOT_EDGE)
a3c.connect(rainbow, style=WEAK_LINK, constraint='false')
ddpg_her = Node('DDPG+HER',
                'DDPG with Hindsight Experience Replay (HER)',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.DP, Flag.RB],
authors='Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, Wojciech Zaremba',
year=2017,
url='https://arxiv.org/abs/1707.01495',
links=['https://becominghuman.ai/learning-from-mistakes-with-hindsight-experience-replay-547fce2b3305'])
ddpg.connect(ddpg_her, style=WEAK_LINK)
dqn_her.connect(ddpg_her, style=WEAK_LINK, label='HER', constraint='false', arrowhead='none')
maddpg = Node('MADDPG',
'Multi-agent DDPG (MADDPG) extends DDPG to an environment where multiple agents are coordinating to complete tasks with only local information. In the viewpoint of one agent, the environment is non-stationary as policies of other agents are quickly upgraded and remain unknown. MADDPG is an actor-critic model redesigned particularly for handling such a changing environment and interactions between agents (from [Lilian Weng\'s blog](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html#maddpg))',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.DP, Flag.RB],
authors='Ryan Lowe, Yi Wu, Aviv Tamar, Jean Harb, Pieter Abbeel, Igor Mordatch',
year=2017,
url='https://arxiv.org/abs/1706.02275',
links=[]
)
ddpg.connect(maddpg)
a2c = Node('A2C',
'A2C is a synchronous, deterministic variant of Asynchronous Advantage Actor Critic (A3C). It uses multiple workers to avoid the use of a replay buffer.',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.CA, Flag.ADV, Flag.SP],
authors='OpenAI',
year=2017,
url='https://openai.com/blog/baselines-acktr-a2c/',
links=[
('OpenAI Baselines: ACKTR & A2C', 'https://openai.com/blog/baselines-acktr-a2c/'),
('An intro to Advantage Actor Critic methods: let’s play Sonic the Hedgehog!', 'https://www.freecodecamp.org/news/an-intro-to-advantage-actor-critic-methods-lets-play-sonic-the-hedgehog-86d6240171d/'),
('Stable Baselines: A2C', 'https://stable-baselines.readthedocs.io/en/master/modules/a2c.html')
]
)
a3c.connect(a2c)
acer = Node('ACER',
'Actor-Critic with Experience Replay (ACER) combines several ideas of previous algorithms: it uses multiple workers (as A2C), implements a replay buffer (as in DQN), uses Retrace for Q-value estimation, importance sampling and a trust region. ACER is A3C\'s off-policy counterpart. ACER proposes several designs to overcome the major obstacle to making A3C off policy, that is how to control the stability of the off-policy estimator. (source: [Lilian Weng\'s blog](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html#acer))',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.ADV, Flag.RB],
authors='Ziyu Wang, Victor Bapst, Nicolas Heess, Volodymyr Mnih, Remi Munos, Koray Kavukcuoglu, Nando de Freitas',
year=2017,
url='https://arxiv.org/abs/1611.01224',
links=[
]
)
a3c.connect(acer)
dqn.connect(acer, style=WEAK_LINK, label='replay buffer')
# a2c.connect(acer, style=WEAK_LINK, label='multiple workers')
a2c.connect(acer, style=ORDER_EDGE)
trpo.connect(acer, style=WEAK_LINK, label='TRPO technique')
acktr = Node('ACKTR',
             'Actor Critic using Kronecker-Factored Trust Region (ACKTR) applies trust region optimization to deep reinforcement learning using a recently proposed Kronecker-factored approximation to the curvature.',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.CA, Flag.ADV],
authors='Yuhuai Wu, Elman Mansimov, Shun Liao, Roger Grosse, Jimmy Ba',
year=2017,
url='https://arxiv.org/abs/1708.05144',
links=[
]
)
root_policy_gradient.connect(acktr, style=ROOT_EDGE)
a2c.connect(acktr, style=ORDER_EDGE) # just to maintain relative timeline order
ppo = Node('PPO',
           'Proximal Policy Optimization (PPO) is similar to [TRPO](#TRPO) but uses a simpler mechanism while retaining similar performance.',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.DA, Flag.CA, Flag.ADV],
authors='John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov',
year=2017,
url='https://arxiv.org/abs/1707.06347',
links=['https://spinningup.openai.com/en/latest/algorithms/ppo.html',
'https://openai.com/blog/openai-baselines-ppo/'],
videos=[('Policy Gradient methods and Proximal Policy Optimization (PPO): diving into Deep RL!', 'https://www.youtube.com/watch?v=5P7I-xPq8u8')]
)
trpo.connect(ppo, style=WEAK_LINK)
svpg = Node('SVPG',
'Stein Variational Policy Gradient (SVPG)',
policy_gradient,
flags=[Flag.ONP, Flag.CS, Flag.DA, Flag.CA],
authors='Yang Liu, Prajit Ramachandran, Qiang Liu, Jian Peng',
year=2017,
url='https://arxiv.org/abs/1704.02399',
links=[('Policy Gradient Algorithms', 'https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html#svpg'),
]
)
root_policy_gradient.connect(svpg, style=ROOT_EDGE)
a2c.connect(svpg, style=ORDER_EDGE) # just to maintain relative timeline order
reactor = Node('Reactor',
'From the abstract: In this work we present a new agent architecture, called Reactor, which combines multiple algorithmic and architectural contributions to produce an agent with higher sample-efficiency than Prioritized Dueling DQN (Wang et al., 2016) and Categorical DQN (Bellemare et al., 2017), while giving better run-time performance than A3C (Mnih et al., 2016). Our first contribution is a new policy evaluation algorithm called Distributional Retrace, which brings multi-step off-policy updates to the distributional reinforcement learning setting. The same approach can be used to convert several classes of multi-step policy evaluation algorithms designed for expected value evaluation into distributional ones. Next, we introduce the β-leave-one-out policy gradient algorithm which improves the trade-off between variance and bias by using action values as a baseline. Our final algorithmic contribution is a new prioritized replay algorithm for sequences, which exploits the temporal locality of neighboring observations for more efficient replay prioritization. Using the Atari 2600 benchmarks, we show that each of these innovations contribute to both the sample efficiency and final agent performance. Finally, we demonstrate that Reactor reaches state-of-the-art performance after 200 million frames and less than a day of training.',
policy_gradient,
flags=[Flag.OFP, Flag.RNN, Flag.RB, Flag.DI],
authors='Audrunas Gruslys, Will Dabney, Mohammad Gheshlaghi Azar, Bilal Piot, Marc Bellemare, Remi Munos',
year=2017,
url='https://arxiv.org/abs/1704.04651',
links=[]
)
root_policy_gradient.connect(reactor, style=ROOT_EDGE)
d4pg = Node('D4PG',
'Distributed Distributional Deep Deterministic Policy Gradient (D4PG) adopts the very successful distributional perspective on reinforcement learning and adapts it to the continuous control setting. It combines this within a distributed framework. It also combines this technique with a number of additional, simple improvements such as the use of N-step returns and prioritized experience replay [from the paper\'s abstract]',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.DP, Flag.RB],
authors='Gabriel Barth-Maron, Matthew W. Hoffman, David Budden, Will Dabney, Dan Horgan, Dhruva TB, Alistair Muldal, Nicolas Heess, Timothy Lillicrap',
year=2018,
url='https://arxiv.org/abs/1804.08617',
links=[]
)
ddpg.connect(d4pg)
apex_ddpg = Node('APE-X DDPG',
'DDPG with Distributed Prioritized Experience Replay',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.RB],
authors='Dan Horgan, John Quan, David Budden, Gabriel Barth-Maron, Matteo Hessel, Hado van Hasselt, David Silver',
year=2018,
url='https://arxiv.org/abs/1803.00933',
links=[('Understanding and Implementing Distributed Prioritized Experience Replay (Horgan et al., 2018)', 'https://towardsdatascience.com/understanding-and-implementing-distributed-prioritized-experience-replay-horgan-et-al-2018-d2c1640e0520')])
ddpg.connect(apex_ddpg)
apex_dqn.connect(apex_ddpg, label='APE-X', style=WEAK_LINK, constraint='false', arrowhead='none')
sac = Node('SAC',
'Soft Actor Critic (SAC) is an algorithm that optimizes a stochastic policy in an off-policy way, forming a bridge between stochastic policy optimization and DDPG-style approaches.',
policy_gradient,
           flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.SP],
authors='Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine',
year=2018,
url='https://arxiv.org/abs/1801.01290',
links=[('Spinning Up SAC page', 'https://spinningup.openai.com/en/latest/algorithms/sac.html'),
('(GitHub) SAC code by its author', 'https://github.com/haarnoja/sac')])
root_policy_gradient.connect(sac, style=ROOT_EDGE)
ppo.connect(sac, style=ORDER_EDGE) # just to maintain relative timeline order
td3 = Node('TD3',
           'Twin Delayed DDPG (TD3). TD3 addresses function approximation error in DDPG by introducing a twin Q-value approximation network and less frequent updates',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.DA, Flag.DP, Flag.RB],
authors='Scott Fujimoto, Herke van Hoof, David Meger',
year=2018,
url='https://arxiv.org/abs/1802.09477',
links=[('Twin Delayed DDPG (Spinning Up)', 'https://spinningup.openai.com/en/latest/algorithms/td3.html')])
ddpg.connect(td3)
ddqn.connect(td3, style=WEAK_LINK, label='double Q-learning')
mpo = Node('MPO',
'Maximum a Posteriori Policy Optimization (MPO) is an RL method that combines the sample efficiency of off-policy methods with the scalability and hyperparameter robustness of on-policy methods. It is an EM style method, which alternates an E-step that re-weights state-action samples with an M step that updates a deep neural network with supervised training. MPO achieves state of the art results on many continuous control tasks while using an order of magnitude fewer samples when compared with PPO',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA, Flag.DA],
authors='Abbas Abdolmaleki, Jost Tobias Springenberg, Yuval Tassa, Remi Munos, Nicolas Heess, Martin Riedmiller',
year=2018,
url='https://arxiv.org/abs/1806.06920',
links=[])
root_policy_gradient.connect(mpo, style=ROOT_EDGE)
impala = Node('IMPALA',
'Importance Weighted Actor-Learner Architecture (IMPALA)',
policy_gradient,
flags=[Flag.OFP, Flag.CS, Flag.CA],
authors='Lasse Espeholt, Hubert Soyer, Remi Munos, Karen Simonyan, Volodymir Mnih, Tom Ward, Yotam Doron, Vlad Firoiu, Tim Harley, Iain Dunning, Shane Legg, Koray Kavukcuoglu',
year=2018,
url='https://arxiv.org/abs/1802.01561',
links=[('Policy Gradient Algorithms', 'https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html')])
root_policy_gradient.connect(impala, style=ROOT_EDGE)
a2c.connect(impala, style=ORDER_EDGE) # just to maintain relative timeline order
#
# MODEL BASED
#
dyna_q = Node('Dyna-Q',
'Dyna-Q uses the experience drawn from real interaction with the environment to improve the value function/policy (called direct RL, using Q-learning) and the model of the environment (called model learning). The model is then used to create experiences (called planning) to improve the value function/policy.',
model_based,
flags=[Flag.OFP, Flag.MB],
authors='Richard S. Sutton, Andrew G. Barto',
year=1990,
links=[('(book) Reinforcement Learning: An Introduction - 2nd Edition - Richard S. Sutton and Andrew G. Barto - Section 8.2', 'http://incompleteideas.net/book/the-book.html'),
])
root_model_based.connect(dyna_q, style=ROOT_EDGE)
prio_sweep = Node('Prioritized Sweeping',
                  'Prioritized Sweeping/Queue-Dyna is similar to Dyna, and it improves on Dyna by updating values based on priority rather than randomly. Values are also associated with states rather than state-action pairs.',
model_based,
flags=[Flag.MB],
authors='Moore, Atkeson, Peng, Williams',
year=1993,
links=[('(book) Reinforcement Learning: An Introduction - 2nd Edition - Richard S. Sutton and Andrew G. Barto - Section 8.4', 'http://incompleteideas.net/book/the-book.html')])
dyna_q.connect(prio_sweep, style=ROOT_EDGE)
mcts = Node('MCTS',
            'Monte Carlo Tree Search (MCTS) selects the next action by performing a rollout algorithm, which estimates action values for a given policy by averaging the returns of many simulated trajectories that start with each possible action and then follow the given policy. Unlike Monte Carlo control, the goal of a rollout algorithm is not to estimate a complete optimal action-value function, q-star, or a complete action-value function, q-pi, for a given policy pi. Instead, it produces Monte Carlo estimates of action values only for the current state, and once an action is selected, this estimation is discarded and a fresh calculation is performed on the next state. MCTS enhances this rollout algorithm by adding a means for accumulating value estimates obtained from the Monte Carlo simulations in order to successively direct simulations toward more highly-rewarding trajectories.',
model_based,
flags=[Flag.MB],
authors='Rémi Coulom, L. Kocsis, Cs. Szepesvári',
year=2006,
links=[('(book) Reinforcement Learning: An Introduction - 2nd Edition - Richard S. Sutton and Andrew G. Barto - Section 8.11', 'http://incompleteideas.net/book/the-book.html'),
('(Wikipedia) MCTS', 'https://en.wikipedia.org/wiki/Monte_Carlo_tree_search')])
root_model_based.connect(mcts, style=ROOT_EDGE)
pilco = Node('PILCO',
             '(from the abstract) In this paper, we introduce PILCO, a practical, data-efficient model-based policy search method. PILCO reduces model bias, one of the key problems of model-based reinforcement learning, in a principled way. By learning a probabilistic dynamics model and explicitly incorporating model uncertainty into long-term planning, PILCO can cope with very little data and facilitates learning from scratch in only a few trials. Policy evaluation is performed in closed form using state-of-the-art approximate inference. Furthermore, policy gradients are computed analytically for policy improvement. We report unprecedented learning efficiency on challenging and high-dimensional control tasks.',
model_based,
flags=[],
authors='Marc Peter Deisenroth, Carl Edward Rasmussen',
year=2011,
url='https://www.ias.informatik.tu-darmstadt.de/uploads/Publications/Deisenroth_ICML_2011.pdf',
links=[('PILCO website', 'http://mlg.eng.cam.ac.uk/pilco/'),
])
root_model_based.connect(pilco, style=ROOT_EDGE)
i2a = Node('I2A',
'(from the abstract) We introduce Imagination-Augmented Agents (I2As), a novel architecture for deep reinforcement learning combining model-free and model-based aspects. In contrast to most existing model-based reinforcement learning and planning methods, which prescribe how a model should be used to arrive at a policy, I2As learn to interpret predictions from a learned environment model to construct implicit plans in arbitrary ways, by using the predictions as additional context in deep policy networks. I2As show improved data efficiency, performance, and robustness to model misspecification compared to several baselines.',
model_based,
flags=[Flag.ML],
authors='Théophane Weber, Sébastien Racanière, David P. Reichert, Lars Buesing, Arthur Guez, Danilo Jimenez Rezende, Adria Puigdomènech Badia, Oriol Vinyals, Nicolas Heess, Yujia Li, Razvan Pascanu, Peter Battaglia, Demis Hassabis, David Silver, Daan Wierstra',
year=2017,
url='https://arxiv.org/abs/1707.06203',
links=[])
root_model_based.connect(i2a, style=ROOT_EDGE)
mbmf = Node('MBMF',
'(from the abstract) Neural Network Dynamics for Model-Based Deep Reinforcement Learning with Model-Free Fine-Tuning. We demonstrate that medium-sized neural network models can in fact be combined with model predictive control (MPC) to achieve excellent sample complexity in a model-based reinforcement learning algorithm, producing stable and plausible gaits to accomplish various complex locomotion tasks. We also propose using deep neural network dynamics models to initialize a model-free learner, in order to combine the sample efficiency of model-based approaches with the high task-specific performance of model-free methods. We empirically demonstrate on MuJoCo locomotion tasks that our pure model-based approach trained on just random action data can follow arbitrary trajectories with excellent sample efficiency, and that our hybrid algorithm can accelerate model-free learning on high-speed benchmark tasks, achieving sample efficiency gains of 3-5x on swimmer, cheetah, hopper, and ant agents.',
model_based,
flags=[Flag.ML],
authors='Anusha Nagabandi, Gregory Kahn, Ronald S. Fearing, Sergey Levine',
year=2017,
url='https://arxiv.org/abs/1708.02596',
links=[('Algorithm\'s site', 'https://sites.google.com/view/mbmf'),
('(GitHub) Code', 'https://github.com/nagaban2/nn_dynamics'), ])
root_model_based.connect(mbmf, style=ROOT_EDGE)
exit_algo = Node('Exit',
'Expert Iteration (ExIt) is a novel reinforcement learning algorithm which decomposes the problem into separate planning and generalisation tasks. Planning new policies is performed by tree search, while a deep neural network generalises those plans. Subsequently, tree search is improved by using the neural network policy to guide search, increasing the strength of new plans. In contrast, standard deep Reinforcement Learning algorithms rely on a neural network not only to generalise plans, but to discover them too. We show that ExIt outperforms REINFORCE for training a neural network to play the board game Hex, and our final tree search agent, trained tabula rasa, defeats MoHex 1.0, the most recent Olympiad Champion player to be publicly released. (from the abstract)',
model_based,
flags=[Flag.MG],
authors='Thomas Anthony, Zheng Tian, David Barber',
year=2017,
url='https://arxiv.org/abs/1705.08439',
links=[])
root_model_based.connect(exit_algo, style=ROOT_EDGE)
alpha_zero = Node('AlphaZero',
                  'AlphaZero generalises the tabula rasa reinforcement learning from self-play approach to multiple games. Starting from random play, and given no domain knowledge except the game rules, AlphaZero achieved within 24 hours a superhuman level of play in the games of chess and shogi (Japanese chess) as well as Go, and convincingly defeated a world-champion program in each case. (from the abstract)',
model_based,
flags=[Flag.MG],
authors='David Silver, Thomas Hubert, Julian Schrittwieser, Ioannis Antonoglou, Matthew Lai, Arthur Guez, Marc Lanctot, Laurent Sifre, Dharshan Kumaran, Thore Graepel, Timothy Lillicrap, Karen Simonyan, Demis Hassabis',
year=2017,
url='https://arxiv.org/abs/1712.01815',
links=[])
root_model_based.connect(alpha_zero, style=ROOT_EDGE)
mve = Node('MVE',
'(from the abstract) Recent model-free reinforcement learning algorithms have proposed incorporating learned dynamics models as a source of additional data with the intention of reducing sample complexity. Such methods hold the promise of incorporating imagined data coupled with a notion of model uncertainty to accelerate the learning of continuous control tasks. Unfortunately, they rely on heuristics that limit usage of the dynamics model. We present model-based value expansion, which controls for uncertainty in the model by only allowing imagination to fixed depth. By enabling wider use of learned dynamics models within a model-free reinforcement learning algorithm, we improve value estimation, which, in turn, reduces the sample complexity of learning.',
model_based,
flags=[Flag.ML],