<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description" content="🌌: Neurosymbolic Grounding for Compositional World Models">
<meta name="keywords" content="COSMOS, World Models, Neurosymbolic Learning, Compositional Generalization">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>COSMOS</title>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-PYVRSFMDRL');
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="stylesheet" href="./static/css/scrollytelling.css">
<link rel="icon" href="https://fav.farm/🌌">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title"><span class="cosmos">Cosmos</span>: Neurosymbolic
Grounding for Compositional World Models</h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://atharvas.net">Atharva Sehgal</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://www.linkedin.com/in/aryagrayeli">Arya Grayeli</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://jenjsun.com">Jennifer J. Sun</a><sup>2</sup>,
</span>
<span class="author-block">
<a href="https://www.cs.utexas.edu/~swarat/">Swarat Chaudhuri</a><sup>1</sup>,
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>UT Austin,</span>
<span class="author-block"><sup>2</sup>Caltech</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a href="https://arxiv.org/abs/2310.12690"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Video Link. -->
<!-- <span class="link-block">
<a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span> -->
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/trishullab/cosmos"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<span class="link-block">
<a href="static/presentation.pdf"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Brief Slide Deck</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body">
<img src="./static/images/overview-animation.svg" style="max-width: 100%; height: auto;" loading="eager"
<h2 class="subtitle has-text-centered">
<span class="cosmos">Cosmos</span> studies a new form of compositional generalization and
uses vision-language foundation models to learn world models from unsupervised environment interactions.
</h2>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
<span class="cosmos">Cosmos</span> is a framework for object-centric world modeling
that is designed for compositional generalization (CG), i.e., high performance on
unseen input scenes obtained through the composition of known visual "atoms."
</p>
<p>
The central insight behind Cosmos is the use of a novel form of neurosymbolic
grounding. Specifically, the framework introduces two new tools:
(i) neurosymbolic scene encodings, which represent each entity in a scene
using a real vector computed using a neural encoder, as well as a vector
of composable symbols describing attributes of the entity, and
(ii) a neurosymbolic attention mechanism that binds these entities to learned
rules of interaction.
</p>
<p>
Cosmos is end-to-end differentiable; also, unlike traditional
neurosymbolic methods that require representations to be manually mapped to symbols,
it computes an entity's symbolic attributes using vision-language foundation models.
Through an evaluation that considers two different forms of CG on an established
blocks-pushing domain,
we show that the framework establishes a new state-of-the-art for CG in world modeling.
</p>
</div>
</div>
</div>
<!--/ Abstract. -->
<div class="container is-hidden-tablet is-max-desktop">
<!-- Abstract. -->
<h2 class="title has-text-centered is-3">⚠️Warning⚠️</h2>
<div class="content has-text-justified">
<p>
The next sections might not render correctly on mobile devices. Please view this page on a desktop or enable "desktop mode" for the best experience!
</p>
</div>
</div>
</div>
</section>
<section class="section">
</section>
<section class="section">
<div class="container">
<h2 class="title is-2">Types of Compositions</h2>
<!-- Method. -->
<div class="columns is-centered" id="types-of-compositions-scroll">
<div class="column is-max-mobile is-two-fifth-tablet is-one-quarter-desktop is-one-fifth-widescreen article">
<!-- <h2 class="title is-3">Types of Compositions</h2> -->
<!-- Method SubSection A. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Distribution of Atoms</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our dataset contains 2D objects of various shapes and colors. An image is generated by
sampling a fixed number of objects. We refer to each object as an <em>atom</em>. Atoms can be composed
together in many ways; we study two types of composition in this work.
</p>
</div>
<br />
<!--/ Method SubSection A. -->
<!-- Method SubSection B. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Entity composition involves sampling atoms (entities) such that the specific combination of atoms is not
seen during training.
</p>
</div>
<br />
<!--/ Method SubSection B. -->
<!-- Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Sampling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Specifically, here, the model has seen a red square, a purple triangle, and a blue circle in
other contexts, but never together.
</p>
</div>
<br />
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Rendering State</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The atoms are placed at random positions in the rendered scene.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Sampling Actions</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
In this environment, each atom can be moved North, East, South, or West. Actions are sampled uniformly.
</p>
</div>
<br />
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Prediction</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The next state is derived by applying the action to the current state. Each atom also has a pre-specified
"weight" which influences whether it can push other shapes (if heavier) or be pushed (if
lighter). The weights can be inferred from the shape. The white arrow provides emphasis and isn't part of the state.
</p>
<p>
<strong>Some Observations:</strong>
<ol>
<li>To perform well in this task, the internal representation of the model must be able to distinguish between the different atoms.</li>
<li>While the atoms have changed, the rules by which the atoms interact have not. So, while the
task of perceiving a new composition of atoms is challenging, the task of predicting the next state is
relatively easy.</li>
</ol>
</p>
</div>
<br />
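<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
For concreteness, the Python sketch below implements one pushing step consistent with the rules above. The grid representation, weight table, and function names are illustrative assumptions, not the environment's actual code.
</p>
<pre><code># Illustrative-only sketch of the pushing rule described above.
DIRS = {"N": (0, -1), "E": (1, 0), "S": (0, 1), "W": (-1, 0)}

def step(positions, weights, idx, action):
    """positions: atom id -> (x, y) cell; weights: atom id -> weight; idx: acted atom."""
    dx, dy = DIRS[action]
    x, y = positions[idx]
    target = (x + dx, y + dy)
    blocker = next((a for a, p in positions.items() if p == target), None)
    if blocker is None:
        positions[idx] = target                       # free cell: simply move
    elif weights[idx] > weights[blocker]:             # heavier atoms push lighter ones
        positions[blocker] = (target[0] + dx, target[1] + dy)
        positions[idx] = target
    return positions                                  # heavier blocker: no movement
</code></pre>
</div>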
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Entity composition has been the traditional way of studying compositional generalization. However,
in entity composition, the movement of each object is independent of the movement of the other objects.
In relational composition, we compose at the level of <em>relations</em> between objects: in addition to
sampling new objects, objects that share a certain attribute also share dynamics. In this case, the
attribute is color, so a composition occurs when two objects share the same color and, consequently,
the same dynamics.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Sampling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Specifically, here, the square and the circle are both red, while the triangle is green.
Like before, the model has seen each shape in other contexts, but never together, i.e., a
red square and a red circle have never been seen together.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Subtypes</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
There are many ways in which we can select the relation to compose on. In this work, we study two cases.
<ul>
<li><strong>Sticky Composition:</strong> Dynamics are shared based on two relations: color and adjacency.</li>
<li><strong>Team Composition:</strong> Dynamics are shared based on one relation: color.</li>
</ul>
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Sticky Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
An action is applied to a single object, and the dynamics are shared with all objects of the same color that are adjacent to it.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Sticky Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Consequently, the action on the square moves both the square and the circle northwards.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Team Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
An action is applied to a single object, and the dynamics are shared with all objects of the same color.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Team Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
This time, the action on the square also moves the circle northwards, even though the
circle and the square are far apart.
</p>
</div>
</div>
<!-- Image. -->
<!-- Make sure image fits in the container. -->
<div class="column content">
<img src="static/types-of-compositions/1.svg" id="updateableFigure">
</div>
</div>
</div>
</section>
<!-- Section title? -->
<section class="section">
<div class="container">
<h2 class="title is-2">A single step of <span class="cosmos">Cosmos</span></h2>
<!-- Method. -->
<div class="columns is-centered" id="onepass-inference-scroll">
<div class="column is-max-mobile is-two-fifth-tablet is-one-quarter-desktop is-one-fifth-widescreen article">
<!-- Method SubSection A. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Input</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The input to the model is an image of the current state, and the object-factorized action on
which to condition the next state.
</p>
</div>
<br />
<!--/ Method SubSection A. -->
<!-- Method SubSection B. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Encoder</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We process the image into a set of entities, using a pretrained <a href="https://segment-anything.com">SAM</a> to obtain a segmentation for
each entity and a finetuned <a href="https://arxiv.org/abs/1512.03385">ResNet</a> to obtain a latent vector for each entity.
</p>
</div>
<br />
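<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A minimal sketch of this segment-then-encode step, assuming the <code>segment_anything</code> and <code>torchvision</code> packages; the checkpoint path, backbone choice, and 128-dimensional latent size are illustrative.
</p>
<pre><code>import torch
import torchvision
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")    # hypothetical checkpoint path
mask_generator = SamAutomaticMaskGenerator(sam)

resnet = torchvision.models.resnet18(weights="IMAGENET1K_V1")
resnet.fc = torch.nn.Linear(resnet.fc.in_features, 128)          # 128-d entity latent (illustrative)
resnet.eval()

def encode_entities(image_np):
    """image_np: (H, W, 3) uint8 RGB frame; returns (num_entities, 128) latents."""
    masks = mask_generator.generate(image_np)                    # one mask per entity
    latents = []
    for m in masks:
        crop = image_np * m["segmentation"][..., None]           # zero out the background
        x = torch.from_numpy(crop).permute(2, 0, 1).float().unsqueeze(0) / 255.0
        with torch.no_grad():
            latents.append(resnet(x).squeeze(0))
    return torch.stack(latents)
</code></pre>
</div>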
<!--/ Method SubSection B. -->
<!-- Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Preprocessing</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Each vector is decoded to an image using a spatial decoder. The spatial decoder is trained in conjunction with the entity encoder.
In practice, we warm-start the encoder and decoder to ensure good auto-encoder reconstructions.
</p>
</div>
<br />
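<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
The decoder's architecture is not detailed here; the sketch below assumes a spatial broadcast-style decoder, a common choice for slot latents, with illustrative layer sizes.
</p>
<pre><code>import torch
import torch.nn as nn

class SpatialDecoder(nn.Module):
    """Broadcasts an entity latent over a coordinate grid and decodes it to RGBA."""
    def __init__(self, latent_dim=128, out_size=64):
        super().__init__()
        self.out_size = out_size
        # latent + 2 coordinate channels in; RGBA out (alpha acts as an entity mask)
        self.net = nn.Sequential(
            nn.Conv2d(latent_dim + 2, 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 4, 3, padding=1))

    def forward(self, z):                                   # z: (batch, latent_dim)
        b, h = z.shape[0], self.out_size
        grid = torch.linspace(-1, 1, h, device=z.device)
        ys, xs = torch.meshgrid(grid, grid, indexing="ij")
        coords = torch.stack([xs, ys]).expand(b, -1, -1, -1)       # (b, 2, h, h)
        z_tiled = z[:, :, None, None].expand(-1, -1, h, h)         # broadcast latent
        return self.net(torch.cat([z_tiled, coords], dim=1))       # (b, 4, h, h)
</code></pre>
</div>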
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Zero-shot labelling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We use a pretrained <a href="https://pypi.org/project/open-clip-torch/">CLIP</a> model to predict the
symbolic attribute that most likely describes the entity. Notice that the attribute labelled here (<code>C_shape = ⚫️</code>)
ignores all other attributes of the entity.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Zero-shot labelling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The process is repeated for other attributes as well. Notice that each attribute is labelled independently of the others,
allowing the model to trivially generalize to different compositions of attributes.
</p>
</div>
<br />
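<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A hedged sketch of the per-attribute zero-shot labelling step with <code>open_clip</code>; the prompt templates and attribute vocabularies below are illustrative, not the exact ones used by Cosmos.
</p>
<pre><code>import torch
import open_clip
from PIL import Image

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k")
tokenizer = open_clip.get_tokenizer("ViT-B-32")

ATTRIBUTES = {                                # each attribute is labelled independently
    "shape": ["circle", "square", "triangle"],
    "color": ["red", "green", "blue", "purple"],
}

def label_entity(entity_img: Image.Image) -> dict:
    image = preprocess(entity_img).unsqueeze(0)
    labels = {}
    with torch.no_grad():
        img_feat = model.encode_image(image)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        for attr, values in ATTRIBUTES.items():
            text = tokenizer([f"a {v}" for v in values])
            txt_feat = model.encode_text(text)
            txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
            # pick the value whose text embedding best matches the entity image
            labels[attr] = values[int((img_feat @ txt_feat.T).argmax())]
    return labels
</code></pre>
</div>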
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Label Relaxation</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Each label is represented as a one-hot vector. In practice, this discrete representation does not
align well with downstream attention-based modules. Hence, the one-hot vector is used to select a
learnable vector from a set of learnable vectors.
</p>
</div>
<br />
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Concatenation</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Thus, the resultant <em>symbol vector</em> is a composition of learnable latent vectors
distinct to each attribute value. Furthermore, we can ensure a canonical ordering of the
symbols, making downstream attention-based computations invariant to permutations of attributes.
</p>
</div>
<br />
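<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A minimal sketch of label relaxation and concatenation: each one-hot label indexes a learnable codebook, and the per-attribute embeddings are concatenated in a fixed (canonical) attribute order. Vocabulary sizes and dimensions are illustrative.
</p>
<pre><code>import torch
import torch.nn as nn

class SymbolVector(nn.Module):
    def __init__(self, attr_vocab, sym_dim=32):
        """attr_vocab: e.g. {"shape": 3, "color": 4}, the number of values per attribute."""
        super().__init__()
        self.attr_names = sorted(attr_vocab)                  # canonical attribute order
        self.codebooks = nn.ModuleDict(                       # one learnable codebook per attribute
            {a: nn.Embedding(n, sym_dim) for a, n in attr_vocab.items()})

    def forward(self, labels):
        # labels maps attribute name -> integer index chosen by the zero-shot labeller
        parts = [self.codebooks[a](torch.tensor(labels[a])) for a in self.attr_names]
        return torch.cat(parts, dim=-1)                       # (num_attrs * sym_dim,)

# sym = SymbolVector({"shape": 3, "color": 4})({"shape": 0, "color": 2})
</code></pre>
</div>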
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Encoding</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The symbolic labelling process is repeated for each entity in the scene. The resultant symbolic
vectors are stacked to form a <em>symbolic encoding</em> of the scene.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Strawman #1</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Following <a href="https://arxiv.org/abs/2103.01937">NPS</a>, we will break down the transition function into two parts: Learning to select a module and learning to specialize the module to a task. However, how should we employ the neural and symbolic encodings?
</p>
<p>
The <strong style="color: #fb8c00;">symbolic encoding</strong> will help the selection module be robust to attribute compositions. However, if we just use the symbolic encoding, we will risk bottlenecking the model's ability to learn fine-grained dynamics-relevant attributes that may not be known ahead of time.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Strawman #2</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The <strong style="color: #417be4;">neural encoding</strong>, on the other hand, captures rich dynamics-relevant attributes which will enable good reconstruction. However, we will risk overfitting to attribute compositions seen during training.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Motivating a Neurosymbolic Encoding</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We solve this problem by employing a hybrid approach. We'll use the symbolic encoding to select a module and the neural encoding to predict the next state.
</p>
<p>
We'll now describe the rest of the architecture.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Module Selection</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The symbolic encoding is concatenated with the action vector after the encoding and action are
reordered to match the canonical ordering of the symbols. The concatenated vector is used to select a
learnable module, which is then used to predict the next state.
</p>
<p>
Note that the symbolic encoding
only selects the module, which leaves room for the neural encoding to learn fine-grained
dynamics-relevant attributes that may not be known ahead of time.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Module Application</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The selected module is applied to the neural encoding to predict the next state.
</p>
</div>
<br />
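<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A sketch of module selection and application under these assumptions; a hard argmax selection is shown for readability, whereas the actual selection mechanism is attention-based so the model remains end-to-end differentiable. The module count and layer sizes are illustrative.
</p>
<pre><code>import torch
import torch.nn as nn

class RuleModules(nn.Module):
    def __init__(self, sym_dim, act_dim, neural_dim, num_modules=4):
        super().__init__()
        # one learnable key per module, scored against the symbolic encoding + action
        self.module_keys = nn.Parameter(torch.randn(num_modules, sym_dim + act_dim))
        self.rules = nn.ModuleList(
            nn.Sequential(nn.Linear(neural_dim + act_dim, neural_dim),
                          nn.ReLU(),
                          nn.Linear(neural_dim, neural_dim))
            for _ in range(num_modules))

    def forward(self, sym, action, neural):
        # Selection: only the symbolic encoding (plus the action) chooses the rule.
        scores = self.module_keys @ torch.cat([sym, action], dim=-1)
        idx = int(scores.argmax())          # hard pick for readability only
        # Application: the chosen rule reads only the neural encoding (and action).
        return self.rules[idx](torch.cat([neural, action], dim=-1))
</code></pre>
</div>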
<h3 class="title is-size-6-mobile is-size-4-tablet">Spatial Decoding</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We re-use the spatial decoder to decode the predicted next state into an image.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Learning Objective</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The model is trained end-to-end using a mixture of the next-state reconstruction error (<code>MSE</code>) and the
auto-encoder reconstruction error (<code>AE-MSE</code>; not shown).
</p>
</div>
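<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A minimal sketch of this objective; the weighting term <code>lambda_ae</code> is an illustrative assumption.
</p>
<pre><code>import torch.nn.functional as F

def cosmos_loss(pred_next, true_next, ae_recon, obs, lambda_ae=1.0):
    next_state_mse = F.mse_loss(pred_next, true_next)   # next-state reconstruction error (MSE)
    ae_mse = F.mse_loss(ae_recon, obs)                   # auto-encoder reconstruction error (AE-MSE)
    return next_state_mse + lambda_ae * ae_mse
</code></pre>
</div>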
</div>
<!-- Image. -->
<div class="column content">
<img src="static/onepass-inference/1.svg" id="updateableFigure">
</div>
</div>
</div>
</section>
<section class="section">
<div class="container">
<h2 class="title is-2">Compositional Generalization in 2D Block Pushing</h2>
<!-- Method. -->
<div class="columns is-centered" id="results-frames-scroll">
<div class="column is-quarter-mobile is-two-fifth-tablet is-one-quarter-desktop is-one-fifth-widescreen article">
<!-- Method SubSection A. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - Ground Truth</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We first look at qualitative results in Entity Composition. A comprehensive analysis is available in the <a href="https://arxiv.org/abs/2310.12690">paper</a>.
Each row represents a random sample of objects and a randomly sampled action. The model has never seen this composition of objects before.
<br/>
We <strong>expect strong performance from baselines</strong> as the dynamics are invariant to the sampled composition.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - COSMOS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
COSMOS is able to predict the next state with high fidelity.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - Aligned NPS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our first baseline is a modified version of <a href="https://arxiv.org/abs/2103.01937">NPS</a> with a slot-action alignment attention mechanism.
This is equivalent to an ablation of COSMOS without the symbolic representation. NPS is also able to predict the next state with good fidelity.
Slight deviations are emphasized with a red dotted box.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - GNN</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our second baseline uses a GNN to model the interactions between objects.
This is related to <a href="https://arxiv.org/abs/2204.13661">HOWM</a> and <a href="https://arxiv.org/abs/1911.12247">G-SWM</a>.
It also serves as an ablation of COSMOS without the symbolic representation and module selection mechanism. The GNN achieves
strong performance. However, some entities are reconstructed with the wrong attributes, an indication of attribute overfitting.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - Ground Truth</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
In this environment, objects related by <code>position</code> (adjacency) and <code>color</code> will move together. As before, the sampled objects have never been
seen by the model.
<br/>
As the dynamics are no longer invariant to compositions, we expect <strong>strong performance from COSMOS</strong> and <strong>overfitting from baselines</strong>.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - COSMOS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
COSMOS is able to predict the next state with high fidelity.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - Aligned NPS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our first baseline mispredicts the next state. Notice how, in both samples, the model moves the actor and the <code>triangle</code>
even though the triangle doesn't share any attributes with the actor.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - GNN</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The GNN also mispredicts the next state. For more details and results, check out our <a href="https://arxiv.org/abs/2310.12690">paper</a>!
</p>
</div>
</div>
<!-- Image. -->
<div class="column content">
<img src="static/results-frames/1.svg" id="updateableFigure">
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column is-full-width">
<h2 class="title is-3">Related Links</h2>
<div class="content has-text-left">
<p>
This project would not be possible without the excellent work of the community. Below are some relevant papers for understanding the
premise of this work.
</p>
<ul>
<li><a href="https://arxiv.org/abs/2204.13661">Toward compositional generalization in object-oriented world modeling</a> </li>
<li><a href="https://arxiv.org/abs/2103.01937">Neural Production Systems: Learning Rule-Governed Visual Dynamics</a> </li>
<li><a href="https://arxiv.org/abs/1911.12247 ">Contrastive learning of structured world models</a> </li>
<li><a href="https://arxiv.org/abs/1803.10122">World Models</a> </li>
<li><a href="https://arxiv.org/abs/2107.13132">Unsupervised Learning of Neurosymbolic Encoders</a> </li>
</ul>
</div>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@misc{sehgal2023neurosymbolic,
title={Neurosymbolic Grounding for Compositional World Models},
author={Atharva Sehgal and Arya Grayeli and Jennifer J. Sun and Swarat Chaudhuri},
year={2023},
eprint={2310.12690},
archivePrefix={arXiv},
primaryClass={cs.LG}
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<a class="icon-link" href="https://arxiv.org/abs/2310.12690">
<i class="fas fa-file-pdf"></i>
</a>
<a class="icon-link" href="https://github.com/trishullab/cosmos" class="external-link" disabled>
<i class="fab fa-github"></i>
</a>
</div>
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This template is based on the <a href="https://nerfies.github.io/">Nerfies</a> project page.
The source code is available <a href="https://github.com/nerfies/nerfies.github.io">here</a> and is
licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>. I also make heavy use of the
<a href="https://github.com/russellsamora/scrollama">Scrollama.js</a> package. Please remember
to cite either the <a href="https://nerfies.github.io/">Nerfies</a> website or
<a href="https://github.com/trishullab/cosmos-web">this website</a> if you use this template!
</p>
</div>
</div>
</div>
</div>
</footer>
<script src="https://unpkg.com/[email protected]/dist/d3.min.js"></script>
<!-- Scrolly Storytelling -->
<script src="https://unpkg.com/scrollama"></script>
<script src="./static/js/scrollytelling.js"></script>
<script>
// Init scrollable sections.
init("#types-of-compositions-scroll");
init("#onepass-inference-scroll");
init("#results-frames-scroll");
</script>
</body>
</html>