<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description"
content="PCF-Grasp">
<meta name="keywords" content="6-DoF Grasping, Point Completion, Manipulation">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>PCF-Grasp</title>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-FV4ZJ9PVSV"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-FV4ZJ9PVSV');
</script>
<script>
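// Rebuild the simulation-rollout video path from the three dropdown selections
// (demos per task, task name, episode) and replay it at 1.75x speed.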
function updateSingleVideo() {
var demo = document.getElementById("single-menu-demos").value;
var task = document.getElementById("single-menu-tasks").value;
var inst = document.getElementById("single-menu-instances").value;
console.log("single", demo, task, inst)
var video = document.getElementById("multi-task-result-video");
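// Resulting path, e.g.: media/results/sim_rollouts/n100-open_drawer-s2.mp4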
video.src = "media/results/sim_rollouts/" +
"n" +
demo +
"-" +
task +
"-" +
inst +
".mp4"
video.playbackRate = 1.75;
video.play();
}
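// Swap the Q-prediction video to match the selected language instruction.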
function updateQpredVideo() {
var task = document.getElementById("single-menu-qpred").value;
console.log("qpred", task)
var video = document.getElementById("q-pred-video");
video.src = "media/results/qpred/" +
task +
".mp4"
video.playbackRate = 1.75;
video.play();
}
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet"
href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body onload="updateSingleVideo(); updateQpredVideo();">
<nav class="navbar" role="navigation" aria-label="main navigation">
<div class="navbar-brand">
<a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu">
<div class="navbar-start" style="flex-grow: 1; justify-content: center;">
<a class="navbar-item" target="_blank" href="https://mohitshridhar.com">
<span class="icon">
<i class="fas fa-home"></i>
</span>
</a>
<div class="navbar-item has-dropdown is-hoverable">
<a class="navbar-link">
More Research
</a>
<div class="navbar-dropdown">
<a class="navbar-item" target="_blank" href="https://cliport.github.io">
CLIPort
</a>
<a class="navbar-item" target="_blank" href="https://askforalfred.com/">
ALFRED
</a>
<a class="navbar-item" target="_blank" href="http://alfworld.github.io/">
ALFWorld
</a>
<a class="navbar-item" target="_blank" href="https://arxiv.org/pdf/1806.03831.pdf">
INGRESS
</a>
</div>
</div>
</div>
</div>
</nav>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">PCF-Grasp: Point Completion to Feature<br>for 6-DoF Grasp</h1>
<h3 class="title is-4 conference-authors"><a target="_blank" href="https://www.robot-learning.org/">CoRL 2022</a></h3>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a target="_blank" href="https://mohitshridhar.com/">Yaofeng Cheng</a><sup>1</sup>,</span>
<span class="author-block">
<a target="_blank" href="http://lucasmanuelli.com/">Fusheng Zha</a><sup>2</sup>,</span>
<span class="author-block">
<a target="_blank" href="https://homes.cs.washington.edu/~fox/">Jiansong Zhao</a><sup>1, 2</sup>
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>State Key Laboratory of Robotics and System,</span>
<span class="author-block"><sup>2</sup>Harbin Institute of Technology</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a target="_blank" href="paper/peract_corl2022.pdf"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Paper</span>
</a>
</span>
<!-- Arxiv Link. -->
<span class="link-block">
<a target="_blank" href="https://arxiv.org/abs/2209.05451"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file"></i>
</span>
<span>ArXiv</span>
</a>
</span>
<!-- Video Link. -->
<span class="link-block">
<a target="_blank" href="https://www.youtube.com/watch?v=TB0g52N-3_Y"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span>
<!-- Code Link. -->
<span class="link-block">
<a target="_blank" href="https://github.com/peract/peract"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
</div>
</div>
<!-- <br>
<br> -->
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-fullhd">
<div class="hero-body">
<div class="container">
<div class="columns is-vcentered is-centered">
<video id="teaser" autoplay muted loop height="100%">
<source src="media/intro/sim_rolling_v2.mp4"
type="video/mp4">
</video>
<br>
</div>
<br>
<h2 class="subtitle has-text-centered">
<span class="dperact">PerAct</span> can learn <br><b>a single language-conditioned policy</b> for 18 RLBench tasks with <b>249 unique task variations</b>
</h2>
</div>
</div>
</div>
</section>
<section class="hero is-light is-small">
<div class="hero-body">
<div class="container">
<div id="results-carousel" class="carousel results-carousel">
<div class="item item-steve">
<video poster="" id="steve" autoplay muted loop height="100%">
<source src="media/intro/1_handsan.mp4"
type="video/mp4">
</video>
</div>
<div class="item item-fullbody">
<video poster="" id="fullbody" autoplay muted loop height="100%">
<source src="media/intro/2_food_bin.mp4"
type="video/mp4">
</video>
</div>
<div class="item item-shiba">
<video poster="" id="shiba" autoplay muted loop height="100%">
<source src="media/intro/4_drawer.mp4"
type="video/mp4">
</video>
</div>
<div class="item item-blueshirt">
<video poster="" id="blueshirt" autoplay muted loop height="100%">
<source src="media/intro/3_marker.mp4"
type="video/mp4">
</video>
</div>
<div class="item item-shiba">
<video poster="" id="shiba" autoplay muted loop height="100%">
<source src="media/intro/5_stick.mp4"
type="video/mp4">
</video>
</div>
<div class="item item-shiba">
<video poster="" id="shiba" autoplay muted loop height="100%">
<source src="media/intro/7_sweeping.mp4"
type="video/mp4">
</video>
</div>
<div class="item item-chair-tp">
<video poster="" id="chair-tp" autoplay muted loop height="100%">
<source src="media/intro/6_blocks.mp4"
type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
</section>
<h2 class="subtitle has-text-centered">
<br>
We also train <b>one multi-task Transformer from scratch</b> on 7 real-world tasks with just <b>53 demos</b> in total.
</h2>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Transformers have revolutionized vision and natural language processing
with their ability to scale with large datasets. But in robotic manipulation, data is both limited and expensive. Can we still benefit from Transformers with the right problem formulation?
</p>
<p>
We investigate this question with <span class="dperact">PerAct</span>, a
language-conditioned behavior-cloning agent for multi-task 6-DoF manipulation.
<span class="dperact">PerAct</span> encodes language goals and RGB-D voxel observations with a
<a target="_blank" href="https://www.deepmind.com/blog/building-architectures-that-can-handle-the-worlds-data">Perceiver Transformer</a>, and outputs discretized
actions by “detecting the next best voxel action”. Unlike frameworks that operate on 2D images, the voxelized observation
and action space provides a strong structural prior for efficiently learning 6-DoF
policies.
</p>
<p>
With this formulation, we train <b>a single multi-task Transformer</b> for 18
<a target="_blank" href="https://sites.google.com/view/rlbench">RLBench</a> tasks (with 249 unique variations) and 7 real-world tasks. Our
results show that <span class="dperact">PerAct</span> significantly outperforms unstructured image-to-action agents and 3D ConvNet baselines for a wide range of tabletop tasks.
</p>
</div>
</div>
</div>
<br>
<br>
<!--/ Abstract. -->
</div>
<!-- Paper video. -->
<div class="columns is-centered has-text-centered">
<div class="column is-two-thirds">
<h2 class="title is-3">Video</h2>
<div class="publication-video">
<iframe src="https://www.youtube.com/embed/TB0g52N-3_Y?rel=0&showinfo=0"
frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-widescreen">
<div class="rows">
<!-- Animation. -->
<div class="rows is-centered ">
<div class="row is-full-width">
<h2 class="title is-3"><span class="dperact">PerAct</span></h2>
<!-- Interpolating. -->
<h3 class="title is-4">A Transformer for Detecting Actions</h3>
<div class="content has-text-justified">
<!-- <br> -->
</div>
<p>
<span class="dperact">PerAct</span> is a language-conditioned behavior-cloning agent trained with supervised learning to <i>detect actions</i>. Instead of using object-detectors, instance-segmentors, or pose-estimators to represent a scene and then learning a policy, <span class="dperact">PerAct</span> directly learns <b>perceptual representations of actions</b> conditioned on language goals. This <a target="_blank" href="https://en.wikipedia.org/wiki/Ecological_psychology">action-centric approach</a> with a unified observation and action space makes <span class="dperact">PerAct</span> applicable to a broad range of tasks involving articulated objects, deformable objects, granular media, and even some non-prehensile interactions with tools.
</p>
<br>
<br>
<img src="media/figures/arch.png" class="interpolation-image"
alt="Interpolate start reference image." />
<br>
<br>
<p>
<span class="dperact">PerAct</span> takes as input a language goal and a voxel grid reconstructed from RGB-D sensors. The voxels are split into 3D patches (like <a target="_blank" href="https://arxiv.org/abs/2010.11929">vision transformers</a> split images into 2D patches), and the language goal is encoded with a pre-trained language model. The language and voxel features are appended together as a sequence and encoded with a <a target="_blank" href="https://www.deepmind.com/blog/building-architectures-that-can-handle-the-worlds-data">PerceiverIO Transformer</a> to learn per-voxel features. These features are then reshaped with linear layers to predict a discretized translation, rotation, gripper open, and collision avoidance action, which can be executed with a motion-planner.
<!-- This action is executed with a motion-planner after which the new observation is used to predict the next discrete action in an observe-act loop until termination. -->
Overall, the voxelized observation and action space provides a strong structural prior for efficiently learning 6-DoF policies. Check out our <a target="_blank" href="https://colab.research.google.com/drive/1HAqemP4cE81SQ6QO1-N85j5bF4C0qLs0?usp=sharing">Colab Tutorial</a> for an annotated guide on implementing <span class="dperact">PerAct</span> and training it from scratch on a single GPU.
</p>
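<p>
The snippet below is a rough PyTorch sketch of this pipeline, not the authors' released implementation: the module name <code>PerActSketch</code>, the plain Transformer encoder used as a stand-in for PerceiverIO, and all layer sizes are illustrative assumptions. It patchifies the voxel grid, appends projected language tokens, encodes the joint sequence, and decodes a per-voxel Q-map plus discretized rotation, gripper, and collision outputs.
</p>
<pre><code># Minimal, illustrative sketch (not the authors' code) of a PerAct-style forward pass.
import torch
import torch.nn as nn

class PerActSketch(nn.Module):
    def __init__(self, voxel_size=100, patch=5, in_ch=10, dim=128, lang_dim=512, rot_bins=72):
        super().__init__()
        self.patch, self.grid = patch, voxel_size // patch    # 100 / 5 = 20 patches per axis
        self.patch_embed = nn.Conv3d(in_ch, dim, patch, stride=patch)   # 3D "patchify"
        self.lang_proj = nn.Linear(lang_dim, dim)             # project pre-trained language features
        layer = nn.TransformerEncoderLayer(dim, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)       # stand-in for PerceiverIO
        self.q_head = nn.Linear(dim, patch ** 3)              # per-voxel translation Q-values
        self.aux_head = nn.Linear(dim, 3 * rot_bins + 2 + 2)  # discrete rotation, gripper, collision

    def forward(self, voxels, lang):
        # voxels: (B, C, V, V, V); lang: (B, L, lang_dim)
        p = self.patch_embed(voxels)                          # (B, dim, G, G, G)
        B, G = p.shape[0], p.shape[2]
        seq = torch.cat([self.lang_proj(lang), p.flatten(2).transpose(1, 2)], dim=1)
        feats = self.encoder(seq)[:, lang.shape[1]:]          # keep only the per-patch voxel features
        q = self.q_head(feats).view(B, G, G, G, self.patch, self.patch, self.patch)
        q = q.permute(0, 1, 4, 2, 5, 3, 6).reshape(B, G * self.patch, G * self.patch, G * self.patch)
        return q, self.aux_head(feats.mean(dim=1))            # Q-map + pooled auxiliary action heads

# Smoke test on a small 40^3 grid: "detecting the next best voxel" = argmax over the Q-map.
q, aux = PerActSketch(voxel_size=40)(torch.randn(1, 10, 40, 40, 40), torch.randn(1, 16, 512))
best_voxel = q.flatten(1).argmax(dim=1)
</code></pre>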
<br>
<br>
<h3 class="title is-4">Encoding High-Dimensional Input</h3>
<p class="justify">
<img src="media/figures/perceiver.png" class="interpolation-image" width="480" align="right"
style="margin:0% 4% "
alt="Interpolate start reference image." />
The input grid is 100×100×100 = 1 million voxels. After extracting 5×5×5 patches, the input is 20×20×20 = 8000 embeddings long. Despite this long sequence, Perceiver uses a small set of latent vectors to encode the input. These latent vectors are randomly initialized and trained end-to-end. This approach decouples the depth of the Transformer self-attention layers from the dimensionality of the input space, which allows us to train <span class="dperact">PerAct</span> on very large input voxel grids. Perceiver has been deployed in several domains like <a target="_blank" href="https://www.deepmind.com/publications/perceiver-ar-general-purpose-long-context-autoregressive-generation">long-context auto-regressive generation</a>, <a target="_blank" href="https://arxiv.org/abs/2204.14198">vision-language models for few-shot learning</a>, <a target="_blank" href="https://arxiv.org/abs/2107.14795">image and audio classification, and optical flow prediction</a>.
</p>
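<p>
The snippet below sketches this latent bottleneck in PyTorch (an illustration of the idea, not the PerceiverIO release; the class name <code>LatentBottleneck</code> and all sizes are assumptions): a small set of learned latents cross-attends over the roughly 8000 patch embeddings, self-attention then runs only over the latents, and a final cross-attention decodes one feature back out per input element.
</p>
<pre><code># Illustrative latent-bottleneck sketch (not the PerceiverIO release code).
import torch
import torch.nn as nn

class LatentBottleneck(nn.Module):
    def __init__(self, dim=128, num_latents=512, heads=8, depth=4):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(num_latents, dim) * 0.02)  # randomly initialized, trained end-to-end
        self.encode_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        layer = nn.TransformerEncoderLayer(dim, heads, batch_first=True)
        self.latent_transformer = nn.TransformerEncoder(layer, num_layers=depth)
        self.decode_attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                           # x: (B, N, dim); N can be ~8000 voxel patches
        lat = self.latents.expand(x.shape[0], -1, -1)
        lat, _ = self.encode_attn(lat, x, x)        # latents query the full input once
        lat = self.latent_transformer(lat)          # self-attention cost depends on the 512 latents, not on N
        out, _ = self.decode_attn(x, lat, lat)      # per-input queries read the processed latents back out
        return out                                  # (B, N, dim) per-voxel-patch features

feats = LatentBottleneck()(torch.randn(1, 8000, 128))   # 20*20*20 = 8000 patch embeddings
</code></pre>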
<br/>
<br/>
<!--/ Re-rendering. -->
<h2 class="title is-3">Results</h2>
<h3 class="title is-4">Simulation Results</h3>
<div class="columns">
<div class="column has-text-centered">
<h3 class="title is-5">One Multi-Task Transformer</h3>
Trained with
<div class="select is-small">
<select id="single-menu-demos" onchange="updateSingleVideo()">
<option value="10">10</option>
<option value="100" selected="selected">100</option>
</select>
</div>
demos per task, evaluated on
<div class="select is-small">
<select id="single-menu-tasks" onchange="updateSingleVideo()">
<option value="open_drawer" selected="selected">open drawer</option>
<option value="slide_block">slide block</option>
<option value="sweep_to_dustpan">sweep to dustpan</option>
<option value="meat_off_grill">meat off grill</option>
<option value="turn_tap">turn tap</option>
<option value="put_in_drawer">put in drawer</option>
<option value="close_jar">close jar</option>
<option value="drag_stick">drag stick</option>
<option value="stack_blocks">stack blocks</option>
<option value="screw_bulb">screw bulb</option>
<option value="put_in_safe">put in safe</option>
<option value="place_wine">place wine</option>
<option value="put_in_cupboard">put in cupboard</option>
<option value="sort_shape">sort shape</option>
<option value="push_buttons">push buttons</option>
<option value="insert_peg">insert peg</option>
<option value="stack_cups">stack cups</option>
<option value="place_cups">place cups</option>
</select>
</div>
episode
<div class="select is-small">
<select id="single-menu-instances" onchange="updateSingleVideo()">
<option value="s1">01</option>
<option value="s2" selected="selected">02</option>
<option value="s3">03</option>
<option value="s4">04</option>
<option value="s5">05</option>
</select>
</div>
<br/>
<br/>
<video id="multi-task-result-video"
muted
autoplay
loop
width="100%">
<source src="media/results/sim_rollouts/n10-open_drawer-s2.mp4"
type="video/mp4">
</video>
</div>
</div>
<br>
<br>
<h3 class="title is-4">Action Predictions</h3>
<div class="columns">
<div class="column has-text-centered">
<h3 class="title is-5">Q-Prediction Examples</h3>
Visualize predictions for
<div class="select is-small is-rounded">
<select id="single-menu-qpred" onchange="updateQpredVideo()">
<option value="tomato" selected="selected">"put the tomatoes in the top bin"</option>
<option value="stick">"hit the green ball with the stick"</option>
<option value="handsan">"press the hand san"</option>
<option value="tape">"put the tape in the top drawer"</option>
</select>
</div>
</div>
</div>
</div>
</div>
</section>
<video id="q-pred-video"
muted
autoplay
loop
width="100%">
<source src="media/results/qpred/tomato.mp4"
type="video/mp4">
</video>
<br>
<br>
<section class="section">
<div class="container is-max-widescreen">
<div class="rows">
<h2 class="title is-3">Emergent Properties</h2>
<h3 class="title is-4">Tracking Objects</h3>
A selected example of tracking an unseen hand sanitizer instance with an agent that was trained on a single object with 5 "press the handsan" demos. Since <span class="dperact">PerAct</span> focuses on actions, it doesn't need a complete representation of the bottle, and only has to predict <b><i>where</i> to press</b> the sanitizer.
<video id="tracking-objects"
muted
autoplay
loop
width="99%">
<source src="media/results/animations/handsan_tracking_v2.mp4"
type="video/mp4">
</video>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-widescreen content">
<h2 class="title">BibTeX</h2>
<pre><code>@inproceedings{shridhar2022peract,
title = {Perceiver-Actor: A Multi-Task Transformer for Robotic Manipulation},
author = {Shridhar, Mohit and Manuelli, Lucas and Fox, Dieter},
booktitle = {Proceedings of the 6th Conference on Robot Learning (CoRL)},
year = {2022},
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column">
<div class="content has-text-centered">
<p>
Website template borrowed from <a href="https://github.com/nerfies/nerfies.github.io">NeRFies</a> made by the amazing <a href="https://keunhong.com/">Keunhong Park</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>