
Commit f849dd4

add code
0 parents  commit f849dd4


52 files changed, +16002 -0 lines changed

.gitignore

+5
@@ -0,0 +1,5 @@
data/
*.egg-info
*.pyc
*.pyo
__pycache__

README.md

+105
@@ -0,0 +1,105 @@
SPAD : Spatially Aware Multiview Diffusers
===================================================
<h4>
Yash Kant, Ziyi Wu, Michael Vasilkovsky, Gordon Qian, Jian Ren, Riza Alp Guler, Bernard Ghanem, Sergey Tulyakov*, Igor Gilitschenski*, Aliaksandr Siarohin*
<br>
<span style="font-size: 14pt; color: #555555">
Published at CVPR, 2024
</span>
</h4>
<hr>

**Paper:** [https://arxiv.org/abs/2402.05235](https://arxiv.org/abs/2402.05235)

**Project Page:** [https://yashkant.github.io/spad/](https://yashkant.github.io/spad/)

<p align="center">
<img src="data/visuals/readme/spad_pipeline.png">
</p>

Model pipeline. (a) We fine-tune a pre-trained text-to-image diffusion model on multi-view renderings of 3D objects.
(b) Our model jointly denoises noisy multi-view images conditioned on text and relative camera poses. To enable cross-view interaction, we apply 3D self-attention by concatenating all views, and enforce epipolar constraints on the attention map.
(c) We further add Plücker embeddings to the attention layers as positional encodings to enhance camera control.
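
The epipolar constraint in (b) can be viewed as a boolean mask applied to the 3D self-attention scores. Below is a minimal, illustrative sketch of that idea in PyTorch; the function name, shapes, and mask construction are assumptions for exposition, not the repository's actual API (see `scripts/visualize_epipolar_mask.py` for the real masks).

```python
# Illustrative sketch of epipolar-masked 3D self-attention (not the repo's exact API).
import torch

def epipolar_masked_attention(q, k, v, epipolar_mask):
    """
    q, k, v: (batch, views * tokens, dim) -- all views concatenated along the token axis.
    epipolar_mask: (views * tokens, views * tokens) boolean, True where a key token lies
                   near the epipolar line of the query token (or within the same view).
    """
    scale = q.shape[-1] ** -0.5
    attn = torch.einsum("bqd,bkd->bqk", q, k) * scale
    attn = attn.masked_fill(~epipolar_mask, float("-inf"))  # block non-epipolar pairs
    attn = attn.softmax(dim=-1)
    return torch.einsum("bqk,bkd->bqd", attn, v)
```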

## Filtered High-Quality Objaverse
If you are looking for the Objaverse assets we used to train SPAD models, you can find the list here: [filtered_objaverse.txt](https://github.com/yashkant/spad/data/filtered_objaverse.txt).

To see how this list was generated, or to tweak its parameters, try this Colab notebook: [filter_objaverse.ipynb](https://colab.research.google.com/drive/1UJM4caaBJsYOkP7EmjPjBvoJ7U0qY4kq#scrollTo=sR28TydbQUuT)

## Visualizing and Creating Epipolar Masks
If you would like to visualize the epipolar masks and Plücker embeddings, or use them as a separate module, read and run the following script:

```
python scripts/visualize_epipolar_mask.py
```
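
For reference, a Plücker ray embedding encodes each pixel ray by its direction and moment. The sketch below shows the generic construction, assuming per-pixel ray origins and directions are already available; the exact normalization and how the embedding enters the attention layers may differ in this repository.

```python
# Generic Plücker ray embedding: a ray (origin o, unit direction d) is encoded as (d, o x d).
import torch
import torch.nn.functional as F

def plucker_embedding(ray_origins, ray_dirs):
    """ray_origins, ray_dirs: (..., 3) tensors in world coordinates; returns (..., 6)."""
    d = F.normalize(ray_dirs, dim=-1)        # unit ray direction
    m = torch.cross(ray_origins, d, dim=-1)  # moment vector o x d
    return torch.cat([d, m], dim=-1)
```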

## Repository Setup

Create a fresh conda environment and activate it:

```text
conda create -n spad python=3.8 -y
conda activate spad
```

Clone the repository with submodules using the following command:

```text
git clone --recursive https://github.com/yashkant/spad
cd spad
```

If you already have the repository cloned, you can update the submodules using the following command:
```text
git submodule update --init --recursive
```

Install the dependencies and PyTorch (tested with CUDA 11.8):
```
pip install -r requirements.txt
pip install --ignore-installed torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
```

<!-- (is this needed??) Install the `spad` package:
```text
pip install -e .
``` -->

## Download Files
Download the files from the [dropbox link](https://www.dropbox.com/sh/dk6oubjlt2x7w0h/AAAKExm33IKnVe8mkC4tOzUKa) and place them in the ``data/`` folder.
Ensure that the data paths match the directory structure provided in ``data/README.md``.
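
For orientation only, below is a partial, illustrative sketch of the expected layout, assembled from paths referenced elsewhere in this commit (the checkpoint path in the configs and the assets used above); ``data/README.md`` remains the authoritative reference.

```text
data/
├── v1-5-pruned.ckpt        # Stable Diffusion v1.5 checkpoint (resume_path in configs/)
├── filtered_objaverse.txt  # filtered Objaverse asset list
└── visuals/
    └── readme/
        └── spad_pipeline.png
```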

## Pretrained Models

We provide two pretrained models with the following specifications:
- `spad_two_views`: trained with learning rate 1e-4, relative cameras (between views), no intrinsics, two views, random viewpoints.
- `spad_four_views`: trained with learning rate 2e-5, absolute cameras (between views) with intrinsics, four views, random + orthogonal viewpoints.

You can test these models using:
```
python scripts/inference.py --model <model_name>
```

You can adjust the following hyperparameters for best results:
```
--cfg_scale: 3.0 to 9.0 (default 7.5)
--blob_sigma: 0.2 to 0.7 (default 0.5)
--ddim_steps: 50 to 1000 (default 100)
```
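
For example, assuming the hyperparameters above are accepted as flags of `scripts/inference.py`, a run of the four-view model at the default settings would look like:
```
python scripts/inference.py --model spad_four_views --cfg_scale 7.5 --blob_sigma 0.5 --ddim_steps 100
```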

## Citation
Consider citing our work:
```
@misc{kant2024spad,
      title={SPAD : Spatially Aware Multiview Diffusers},
      author={Yash Kant and Ziyi Wu and Michael Vasilkovsky and Guocheng Qian and Jian Ren and Riza Alp Guler and Bernard Ghanem and Sergey Tulyakov and Igor Gilitschenski and Aliaksandr Siarohin},
      year={2024},
      eprint={2402.05235},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```

configs/spad3.yaml

+126
@@ -0,0 +1,126 @@
model:
  base_learning_rate: 0.0001
  resume_path: data/v1-5-pruned.ckpt
  fast_attention: true
  target: nvs.nvs_ldm.TextViews
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: none
    cond_stage_key: none
    conditioning_key: hybrid-mv
    cc_type: timesteps_only_emb
    mv_timesteps_style: same
    oom_fix: true
    image_size: 32
    channels: 4
    cond_stage_trainable: false
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    cfg_conds:
    - txt
    cfg_scales:
    - 7.5
    skip_plucker: false
    skip_epi: false
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 100
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0
    unet_config:
      target: nvs.branch_unet.ManyStreamUnetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        - 4
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false
        denoise_channels: 4
        in_feat_channels: 6
        decode_cross: true
        post_init_type: manystream-plucker
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
      ckpt_path: null
data:
  target: nvs.z123_dataset.ManyViewsDataModuleFromConfig
  params:
    root_dir: /nfs/data/objaverse/rendering
    batch_size: 40
    views_per_sample_range:
    - 2
    - 2
    batch_size_dict:
      laion: 60
      mv:
        2: 30
        4: 12
    num_workers: 20
    total_view: 4
    laion_batch_prob: 0.1
    setup: polyview-random
    additional_setups: []
    add_text: true
    add_text_tok: true
    only_text_samples: true
    text_type: cap3d_no_3d
    use_internal_filter: meta_filtered
    laion_type: 625K
    mask_init: ones
    mv_datasets:
    - objaverse
    debug: false
    train:
      validation: false
      image_transforms:
        size: 256

configs/spad_four_views.yaml

+89
@@ -0,0 +1,89 @@
model:
  base_learning_rate: 2.0e-05
  resume_path: data/v1-5-pruned.ckpt
  fast_attention: true
  target: spad.spad.SPAD
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: none
    cond_stage_key: none
    conditioning_key: hybrid-mv
    use_intrinsic: true
    use_abs_extrinsics: true
    image_size: 32
    channels: 4
    cond_stage_trainable: false
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    cfg_conds:
    - txt
    cfg_scales:
    - 7.5
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 100
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0

    unet_config:
      target: spad.mv_unet.SPADUnetModel
      params:
        image_size: 32
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 4
        - 2
        - 1
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        - 4
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: true
        legacy: false

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
      ckpt_path: null
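
Both configs follow the latent-diffusion convention of `target`/`params` blocks. A minimal sketch of how such a file is typically turned into a model is shown below; the helper mirrors `ldm.util.instantiate_from_config`, and the repository's own training and inference scripts may wrap this differently.

```python
# Sketch: build the model described by a SPAD config from its `target`/`params` blocks,
# in the style of the latent-diffusion codebase. Entry points here are illustrative.
import importlib
from omegaconf import OmegaConf

def instantiate_from_config(config):
    module_name, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**config.get("params", dict()))

cfg = OmegaConf.load("configs/spad_four_views.yaml")
model = instantiate_from_config(cfg.model)           # e.g. spad.spad.SPAD
model.learning_rate = cfg.model.base_learning_rate   # ldm-style trainers set this separately
```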
