<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="description" content="🌌: Neurosymbolic Grounding for Compositional World Models">
<meta name="keywords" content="COSMOS, World Models, Neurosymbolic Learning, Compositional Generalization">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>COSMOS</title>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'G-PYVRSFMDRL');
</script>
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<link rel="stylesheet" href="./static/css/scrollytelling.css">
<link rel="icon" href="https://fav.farm/🌌">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script defer src="./static/js/fontawesome.all.min.js"></script>
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title"><span class="cosmos">Cosmos</span>: Neurosymbolic
Grounding for Compositional World Models</h1>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a href="https://atharvas.net">Atharva Sehgal</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://www.linkedin.com/in/aryagrayeli">Arya Grayeli</a><sup>1</sup>,</span>
<span class="author-block">
<a href="https://jenjsun.com">Jennifer J. Sun</a><sup>2</sup>,
</span>
<span class="author-block">
<a href="https://www.cs.utexas.edu/~swarat/">Swarat Chaudhuri</a><sup>1</sup>,
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>UT Austin,</span>
<span class="author-block"><sup>2</sup>Caltech</span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- PDF Link. -->
<span class="link-block">
<a href="https://arxiv.org/abs/2310.12690"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Video Link. -->
<!-- <span class="link-block">
<a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-youtube"></i>
</span>
<span>Video</span>
</a>
</span> -->
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/trishullab/cosmos"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<span class="link-block">
<a href="static/presentation.pdf"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-file-pdf"></i>
</span>
<span>Brief Slide Deck</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="hero teaser">
<div class="container is-max-desktop">
<div class="hero-body">
<img src="./static/images/overview-animation.svg" style="max-width: 100%; height: auto;" loading="eager"
<h2 class="subtitle has-text-centered">
<span class="cosmos">Cosmos</span> studies a new form of compositional generalization and
uses vision-language foundation models to learn world models from unsupervised environment interactions.
</h2>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
<span class="cosmos">Cosmos</span> is a framework for object-centric world modeling
that is designed for compositional generalization (CG), i.e., high performance on
unseen input scenes obtained through the composition of known visual "atoms."
</p>
<p>
The central insight behind Cosmos is the use of a novel form of neurosymbolic
grounding. Specifically, the framework introduces two new tools:
(i) neurosymbolic scene encodings, which represent each entity in a scene
using a real vector computed using a neural encoder, as well as a vector
of composable symbols describing attributes of the entity, and
(ii) a neurosymbolic attention mechanism that binds these entities to learned
rules of interaction.
</p>
<p>
Cosmos is end-to-end differentiable; also, unlike traditional
neurosymbolic methods that require representations to be manually mapped to symbols,
it computes an entity's symbolic attributes using vision-language foundation models.
Through an evaluation that considers two different forms of CG on an established
blocks-pushing domain,
we show that the framework establishes a new state-of-the-art for CG in world modeling.
</p>
</div>
</div>
</div>
<!--/ Abstract. -->
<div class="container is-hidden-tablet is-max-desktop">
<!-- Abstract. -->
<h2 class="title has-text-centered is-3">⚠️Warning⚠️</h2>
<div class="content has-text-justified">
<p>
The next sections might not render correctly on mobile devices. Please view this page on a desktop or enable "desktop mode" for the best experience!
</p>
</div>
</div>
</div>
</section>
<section class="section">
</section>
<section class="section">
<div class="container">
<h2 class="title is-2">Types of Compositions</h2>
<!-- Method. -->
<div class="columns is-centered" id="types-of-compositions-scroll">
<div class="column is-max-mobile is-two-fifth-tablet is-one-quarter-desktop is-one-fifth-widescreen article">
<!-- <h2 class="title is-3">Types of Compositions</h2> -->
<!-- Method SubSection A. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Distribution of Atoms</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our dataset contains 2D objects of various shapes and colors. An image is generated by
sampling a fixed number of objects. We refer to each object as an <em>atom</em>. Atoms can be composed
together in many ways; we study two types of composition in this work.
</p>
</div>
<br />
<!--/ Method SubSection A. -->
<!-- Method SubSection B. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Entity composition involves sampling atoms (entities) such that the specific combination of atoms is not
seen during training.
</p>
</div>
<br />
<!--/ Method SubSection B. -->
<!-- Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Sampling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Specifically, here, the model has seen a red square, a purple triangle, and a blue circle in
other contexts, but never together.
</p>
</div>
<br />
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Rendering State</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The atoms are placed at random positions in the rendered scene.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Sampling Actions</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
In this environment, each atom can be moved North, East, South, or West. Actions are sampled uniformly.
</p>
</div>
<br />
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition: Prediction</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The next state is derived by applying the action to the current state. Each atom also has a pre-specified
"weight" which influences whether it can push other shapes (if heavier) or be pushed (if
lighter). The weights can be inferred from the shape. The white arrow provides emphasis and isn't part of the state.
</p>
<p>
<strong>Some Observations:</strong>
<ol>
<li>To perform well in this task, the internal representation of the model must be able to distinguish between the different atoms.</li>
<li>While the atoms have changed, the rules by which the atoms interact have not. So, while the
task of perceiving a new composition of atoms is challenging, the task of predicting the next state is
relatively easy.</li>
</ol>
</p>
</div>
<br />
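<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
For concreteness, the Python sketch below implements one pushing step consistent with the rules above. The grid representation, weight table, and function names are illustrative assumptions, not the environment's actual code.
</p>
<pre><code># Illustrative-only sketch of the pushing rule described above.
DIRS = {"N": (0, -1), "E": (1, 0), "S": (0, 1), "W": (-1, 0)}

def step(positions, weights, idx, action):
    """positions: atom id -> (x, y) cell; weights: atom id -> weight; idx: acted atom."""
    dx, dy = DIRS[action]
    x, y = positions[idx]
    target = (x + dx, y + dy)
    blocker = next((a for a, p in positions.items() if p == target), None)
    if blocker is None:
        positions[idx] = target                       # free cell: simply move
    elif weights[idx] > weights[blocker]:             # heavier atoms push lighter ones
        positions[blocker] = (target[0] + dx, target[1] + dy)
        positions[idx] = target
    return positions                                  # heavier blocker: no movement
</code></pre>
</div>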
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Entity composition has been the traditional way of studying compositional generalization. However,
in entity composition, the movement of each object is independent of the movement of the other objects.
In relational composition, we compose at the level of <em>relations</em> between objects: in addition to
sampling new objects, objects that share a certain attribute also share dynamics. In this case, the
attribute is color, so a composition occurs when two objects share the same color and, consequently,
the same dynamics.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Sampling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Specifically, here, the square and the circle are both red, while the triangle is green.
Like before, the model has seen each shape in other contexts, but never together, i.e., a
red square and a red circle have never been seen together.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Subtypes</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
There are many ways in which we can select the relation to compose on. In this work, we study two cases.
<ul>
<li><strong>Sticky Composition:</strong> Dynamics are shared based on two relations: color and adjacency.</li>
<li><strong>Team Composition:</strong> Dynamics are shared based on one relation: color.</li>
</ul>
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Sticky Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
An action is applied to a single object, and the dynamics are shared with all objects of the same color that are adjacent to it.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Sticky Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Consequently, the action on the square moves both the square and the circle northwards.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Team Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
An action is applied to a single object, and the dynamics are shared with all objects of the same color.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition: Team Composition</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
This time, the action on the square also moves the circle northwards, even though the
circle and the square are far apart.
</p>
</div>
</div>
<!-- Image. -->
<!-- Make sure image fits in the container. -->
<div class="column content">
<img src="static/types-of-compositions/1.svg" id="updateableFigure">
</div>
</div>
</div>
</section>
<!-- Section title? -->
<section class="section">
<div class="container">
<h2 class="title is-2">A single step of <span class="cosmos">Cosmos</span></h2>
<!-- Method. -->
<div class="columns is-centered" id="onepass-inference-scroll">
<div class="column is-max-mobile is-two-fifth-tablet is-one-quarter-desktop is-one-fifth-widescreen article">
<!-- Method SubSection A. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Input</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The input to the model is an image of the current state, and the object-factorized action on
which to condition the next state.
</p>
</div>
<br />
<!--/ Method SubSection A. -->
<!-- Method SubSection B. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Encoder</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We process the image into a set of entities, using a pretrained <a href="https://segment-anything.com">SAM</a> to obtain a segmentation for
each entity and a finetuned <a href="https://arxiv.org/abs/1512.03385">ResNet</a> to obtain a latent vector for each entity.
</p>
</div>
<br />
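<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A minimal sketch of this segment-then-encode step, assuming the <code>segment_anything</code> and <code>torchvision</code> packages; the checkpoint path, backbone choice, and 128-dimensional latent size are illustrative.
</p>
<pre><code>import torch
import torchvision
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")    # hypothetical checkpoint path
mask_generator = SamAutomaticMaskGenerator(sam)

resnet = torchvision.models.resnet18(weights="IMAGENET1K_V1")
resnet.fc = torch.nn.Linear(resnet.fc.in_features, 128)          # 128-d entity latent (illustrative)
resnet.eval()

def encode_entities(image_np):
    """image_np: (H, W, 3) uint8 RGB frame; returns (num_entities, 128) latents."""
    masks = mask_generator.generate(image_np)                    # one mask per entity
    latents = []
    for m in masks:
        crop = image_np * m["segmentation"][..., None]           # zero out the background
        x = torch.from_numpy(crop).permute(2, 0, 1).float().unsqueeze(0) / 255.0
        with torch.no_grad():
            latents.append(resnet(x).squeeze(0))
    return torch.stack(latents)
</code></pre>
</div>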
<!--/ Method SubSection B. -->
<!-- Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Preprocessing</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Each vector is decoded to an image using a spatial decoder. The spatial decoder is trained in conjunction with the entity encoder.
In practice, we warm-start the encoder and decoder to ensure good auto-encoder reconstructions.
</p>
</div>
<br />
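<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
The decoder's architecture is not detailed here; the sketch below assumes a spatial broadcast-style decoder, a common choice for slot latents, with illustrative layer sizes.
</p>
<pre><code>import torch
import torch.nn as nn

class SpatialDecoder(nn.Module):
    """Broadcasts an entity latent over a coordinate grid and decodes it to RGBA."""
    def __init__(self, latent_dim=128, out_size=64):
        super().__init__()
        self.out_size = out_size
        # latent + 2 coordinate channels in; RGBA out (alpha acts as an entity mask)
        self.net = nn.Sequential(
            nn.Conv2d(latent_dim + 2, 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 4, 3, padding=1))

    def forward(self, z):                                   # z: (batch, latent_dim)
        b, h = z.shape[0], self.out_size
        grid = torch.linspace(-1, 1, h, device=z.device)
        ys, xs = torch.meshgrid(grid, grid, indexing="ij")
        coords = torch.stack([xs, ys]).expand(b, -1, -1, -1)       # (b, 2, h, h)
        z_tiled = z[:, :, None, None].expand(-1, -1, h, h)         # broadcast latent
        return self.net(torch.cat([z_tiled, coords], dim=1))       # (b, 4, h, h)
</code></pre>
</div>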
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Zero-shot labelling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We use a pretrained <a href="https://pypi.org/project/open-clip-torch/">CLIP</a> model to predict the
symbolic attribute that most likely describes the entity. Notice that the attribute labelled here (<code>C_shape = ⚫️</code>)
ignores all other attributes of the entity.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Zero-shot labelling</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The process is repeated for other attributes as well. Notice that each attribute is labelled independently of the others,
allowing the model to trivially generalize to different compositions of attributes.
</p>
</div>
<br />
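<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A hedged sketch of the per-attribute zero-shot labelling step with <code>open_clip</code>; the prompt templates and attribute vocabularies below are illustrative, not the exact ones used by Cosmos.
</p>
<pre><code>import torch
import open_clip
from PIL import Image

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k")
tokenizer = open_clip.get_tokenizer("ViT-B-32")

ATTRIBUTES = {                                # each attribute is labelled independently
    "shape": ["circle", "square", "triangle"],
    "color": ["red", "green", "blue", "purple"],
}

def label_entity(entity_img: Image.Image) -> dict:
    image = preprocess(entity_img).unsqueeze(0)
    labels = {}
    with torch.no_grad():
        img_feat = model.encode_image(image)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        for attr, values in ATTRIBUTES.items():
            text = tokenizer([f"a {v}" for v in values])
            txt_feat = model.encode_text(text)
            txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
            # pick the value whose text embedding best matches the entity image
            labels[attr] = values[int((img_feat @ txt_feat.T).argmax())]
    return labels
</code></pre>
</div>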
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Label Relaxation</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Each label is represented as a one-hot vector. In practice, this discrete representation does not
align well with downstream attention-based modules. Hence, the one-hot vector is used to select a
learnable vector from a set of learnable vectors.
</p>
</div>
<br />
<!--/ Method SubSection C. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Labelling: Concatenation</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Thus, the resultant <em>symbol vector</em> is a composition of learnable latent vectors
distinct to each attribute value. Furthermore, we can ensure a canonical ordering of the
symbols, making downstream attention-based computations invariant to permutations of attributes.
</p>
</div>
<br />
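<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A minimal sketch of label relaxation and concatenation: each one-hot label indexes a learnable codebook, and the per-attribute embeddings are concatenated in a fixed (canonical) attribute order. Vocabulary sizes and dimensions are illustrative.
</p>
<pre><code>import torch
import torch.nn as nn

class SymbolVector(nn.Module):
    def __init__(self, attr_vocab, sym_dim=32):
        """attr_vocab: e.g. {"shape": 3, "color": 4}, the number of values per attribute."""
        super().__init__()
        self.attr_names = sorted(attr_vocab)                  # canonical attribute order
        self.codebooks = nn.ModuleDict(                       # one learnable codebook per attribute
            {a: nn.Embedding(n, sym_dim) for a, n in attr_vocab.items()})

    def forward(self, labels):
        # labels maps attribute name -> integer index chosen by the zero-shot labeller
        parts = [self.codebooks[a](torch.tensor(labels[a])) for a in self.attr_names]
        return torch.cat(parts, dim=-1)                       # (num_attrs * sym_dim,)

# sym = SymbolVector({"shape": 3, "color": 4})({"shape": 0, "color": 2})
</code></pre>
</div>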
<h3 class="title is-size-6-mobile is-size-4-tablet">Symbolic Encoding</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The symbolic labelling process is repeated for each entity in the scene. The resultant symbolic
vectors are stacked to form a <em>symbolic encoding</em> of the scene.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Strawman #1</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Following <a href="https://arxiv.org/abs/2103.01937">NPS</a>, we will break down the transition function into two parts: Learning to select a module and learning to specialize the module to a task. However, how should we employ the neural and symbolic encodings?
</p>
<p>
The <strong style="color: #fb8c00;">symbolic encoding</strong> will help the selection module be robust to attribute compositions. However, if we just use the symbolic encoding, we will risk bottlenecking the model's ability to learn fine-grained dynamics-relevant attributes that may not be known ahead of time.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Strawman #2</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The <strong style="color: #417be4;">neural encoding</strong>, on the other hand, captures rich dynamics-relevant attributes which will enable good reconstruction. However, we will risk overfitting to attribute compositions seen during training.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Motivating a Neurosymbolic Encoding</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We solve this problem by employing a hybrid approach. We'll use the symbolic encoding to select a module and the neural encoding to predict the next state.
</p>
<p>
We'll now describe the rest of the architecture.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Module Selection</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The symbolic encoding is concatenated with the action vector after the encoding and action are
reordered to match the canonical ordering of the symbols. The concatenated vector is used to select a
learnable module, which is then used to predict the next state.
</p>
<p>
Note that the symbolic encoding
only selects the module, which leaves room for the neural encoding to learn fine-grained
dynamics-relevant attributes that may not be known ahead of time.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Module Application</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The selected module is applied to the neural encoding to predict the next state.
</p>
</div>
<br />
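<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A sketch of module selection and application under these assumptions; a hard argmax selection is shown for readability, whereas the actual selection mechanism is attention-based so the model remains end-to-end differentiable. The module count and layer sizes are illustrative.
</p>
<pre><code>import torch
import torch.nn as nn

class RuleModules(nn.Module):
    def __init__(self, sym_dim, act_dim, neural_dim, num_modules=4):
        super().__init__()
        # one learnable key per module, scored against the symbolic encoding + action
        self.module_keys = nn.Parameter(torch.randn(num_modules, sym_dim + act_dim))
        self.rules = nn.ModuleList(
            nn.Sequential(nn.Linear(neural_dim + act_dim, neural_dim),
                          nn.ReLU(),
                          nn.Linear(neural_dim, neural_dim))
            for _ in range(num_modules))

    def forward(self, sym, action, neural):
        # Selection: only the symbolic encoding (plus the action) chooses the rule.
        scores = self.module_keys @ torch.cat([sym, action], dim=-1)
        idx = int(scores.argmax())          # hard pick for readability only
        # Application: the chosen rule reads only the neural encoding (and action).
        return self.rules[idx](torch.cat([neural, action], dim=-1))
</code></pre>
</div>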
<h3 class="title is-size-6-mobile is-size-4-tablet">Spatial Decoding</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We re-use the spatial decoder to decode the predicted next state into an image.
</p>
</div>
<br />
<h3 class="title is-size-6-mobile is-size-4-tablet">Learning Objective</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The model is trained end-to-end using a mixture of the next-state reconstruction error (<code>MSE</code>) and the
auto-encoder reconstruction error (<code>AE-MSE</code>; not shown).
</p>
</div>
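<div class="content is-size-7-mobile is-size-6-tablet has-text-left">
<p>
A minimal sketch of this objective; the weighting term <code>lambda_ae</code> is an illustrative assumption.
</p>
<pre><code>import torch.nn.functional as F

def cosmos_loss(pred_next, true_next, ae_recon, obs, lambda_ae=1.0):
    next_state_mse = F.mse_loss(pred_next, true_next)   # next-state reconstruction error (MSE)
    ae_mse = F.mse_loss(ae_recon, obs)                   # auto-encoder reconstruction error (AE-MSE)
    return next_state_mse + lambda_ae * ae_mse
</code></pre>
</div>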
</div>
<!-- Image. -->
<div class="column content">
<img src="static/onepass-inference/1.svg" id="updateableFigure">
</div>
</div>
</div>
</section>
<section class="section">
<div class="container">
<h2 class="title is-2">Compositional Generalization in 2D Block Pushing</h2>
<!-- Method. -->
<div class="columns is-centered" id="results-frames-scroll">
<div class="column is-quarter-mobile is-two-fifth-tablet is-one-quarter-desktop is-one-fifth-widescreen article">
<!-- Method SubSection A. -->
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - Ground Truth</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
We first look at qualitative results in Entity Composition. A comprehensive analysis is available in the <a href="https://arxiv.org/abs/2310.12690">paper</a>.
Each row represents a random sample of objects and a randomly sampled action. The model has never seen this composition of objects before.
<br/>
We <strong>expect strong performance from baselines</strong> as the dynamics are invariant to the sampled composition.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - COSMOS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
COSMOS is able to predict the next state with high fidelity.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - Aligned NPS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our first baseline is a modified version of <a href="https://arxiv.org/abs/2103.01937">NPS</a> with a slot-action alignment attention mechanism.
This is equivalent to an ablation of COSMOS without the symbolic representation. NPS is also able to predict the next state with good fidelity.
Slight deviations are emphasized with a red dotted box.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Entity Composition - GNN</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our second baseline uses a GNN to model the interactions between objects.
This is related to <a href="https://arxiv.org/abs/2204.13661">HOWM</a> and <a href="https://arxiv.org/abs/1911.12247">G-SWM</a>.
It also serves as an ablation of COSMOS without the symbolic representation and module selection mechanism. The GNN achieves
strong performance. However, some entities are reconstructed with the wrong attributes, an indication of attribute overfitting.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - Ground Truth</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
In this environment, objects related by <code>position</code> (adjacency) and <code>color</code> will move together. As before, the sampled objects have never been
seen by the model.
<br/>
As the dynamics are no longer invariant to compositions, we expect <strong>strong performance from COSMOS</strong> and <strong>overfitting from baselines</strong>.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - COSMOS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
COSMOS is able to predict the next state with high fidelity.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - Aligned NPS</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
Our first baseline mispredicts the next state. Notice how, in both samples, the model moves the actor and the <code>triangle</code>
even though the triangle doesn't share any attributes with the actor.
</p>
</div>
<h3 class="title is-size-6-mobile is-size-4-tablet">Relational Composition - GNN</h3>
<div class="content is-size-7-mobile is-size-6-tablet has-text-left step">
<p>
The GNN also mispredicts the next state. For more details and results, check out our <a href="https://arxiv.org/abs/2310.12690">paper</a>!
</p>
</div>
</div>
<!-- Image. -->
<div class="column content">
<img src="static/results-frames/1.svg" id="updateableFigure">
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column is-full-width">
<h2 class="title is-3">Related Links</h2>
<div class="content has-text-left">
<p>
This project would not be possible without the excellent work of the community. Below are some relevant papers for understanding the
premise of this work.
</p>
<ul>
<li><a href="https://arxiv.org/abs/2204.13661">Toward compositional generalization in object-oriented world modeling</a> </li>
<li><a href="https://arxiv.org/abs/2103.01937">Neural Production Systems: Learning Rule-Governed Visual Dynamics</a> </li>
<li><a href="https://arxiv.org/abs/1911.12247 ">Contrastive learning of structured world models</a> </li>
<li><a href="https://arxiv.org/abs/1803.10122">World Models</a> </li>
<li><a href="https://arxiv.org/abs/2107.13132">Unsupervised Learning of Neurosymbolic Encoders</a> </li>
</ul>
</div>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@misc{sehgal2023neurosymbolic,
title={Neurosymbolic Grounding for Compositional World Models},
author={Atharva Sehgal and Arya Grayeli and Jennifer J. Sun and Swarat Chaudhuri},
year={2023},
eprint={2310.12690},
archivePrefix={arXiv},
primaryClass={cs.LG}
}</code></pre>
</div>
</section>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<a class="icon-link" href="https://arxiv.org/abs/2310.12690">
<i class="fas fa-file-pdf"></i>
</a>
<a class="icon-link" href="https://github.com/trishullab/cosmos" class="external-link" disabled>
<i class="fab fa-github"></i>
</a>
</div>
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
This template is based on the <a href="https://nerfies.github.io/">Nerfies</a> project page.
The source code is available <a href="https://github.com/nerfies/nerfies.github.io">here</a> and is
licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative
Commons Attribution-ShareAlike 4.0 International License</a>. I also make heavy use of the
<a href="https://github.com/russellsamora/scrollama">Scrollama.js</a> package. Please remember
to cite either the <a href="https://nerfies.github.io/">Nerfies</a> website or
<a href="https://github.com/trishullab/cosmos-web">this website</a> if you use this template!
</p>
</div>
</div>
</div>
</div>
</footer>
<script src="https://unpkg.com/[email protected]/dist/d3.min.js"></script>
<!-- Scrolly Storytelling -->
<script src="https://unpkg.com/scrollama"></script>
<script src="./static/js/scrollytelling.js"></script>
<script>
// Init scrollable sections.
init("#types-of-compositions-scroll");
init("#onepass-inference-scroll");
init("#results-frames-scroll");
</script>
</body>
</html>