Commit 943b85c

add dual patchnorm as an option
1 parent 2a49baa commit 943b85c

File tree

3 files changed: +21 −3 lines changed


README.md

Lines changed: 12 additions & 0 deletions

````diff
@@ -129,3 +129,15 @@ sampled_images.shape # (4, 3, 128, 128)
     volume = {abs/2202.00512}
 }
 ```
+
+```bibtex
+@misc{https://doi.org/10.48550/arxiv.2302.01327,
+    doi = {10.48550/ARXIV.2302.01327},
+    url = {https://arxiv.org/abs/2302.01327},
+    author = {Kumar, Manoj and Dehghani, Mostafa and Houlsby, Neil},
+    title = {Dual PatchNorm},
+    publisher = {arXiv},
+    year = {2023},
+    copyright = {Creative Commons Attribution 4.0 International}
+}
+```
````

rin_pytorch/rin_pytorch.py

Lines changed: 8 additions & 2 deletions

```diff
@@ -64,6 +64,9 @@ def convert_image_to(img_type, image):
         return image.convert(img_type)
     return image
 
+def Sequential(*mods):
+    return nn.Sequential(*filter(exists, mods))
+
 # use layernorm without bias, more stable
 
 class LayerNorm(nn.Module):
@@ -347,6 +350,7 @@ def __init__(
         num_latents = 256, # they still had to use a fair amount of latents for good results (256), in line with the Perceiver line of papers from Deepmind
         learned_sinusoidal_dim = 16,
         latent_token_time_cond = False, # whether to use 1 latent token as time conditioning, or do it the adaptive layernorm way (which is highly effective as shown by some other papers "Paella" - Dominic Rampas et al.)
+        dual_patchnorm = True,
         **attn_kwargs
     ):
         super().__init__()
@@ -378,9 +382,11 @@ def __init__(
 
         # pixels to patch and back
 
-        self.to_patches = nn.Sequential(
+        self.to_patches = Sequential(
             Rearrange('b c (h p1) (w p2) -> b (h w) (c p1 p2)', p1 = patch_size, p2 = patch_size),
-            nn.Linear(pixel_patch_dim * 2, dim)
+            nn.LayerNorm(pixel_patch_dim * 2) if dual_patchnorm else None,
+            nn.Linear(pixel_patch_dim * 2, dim),
+            nn.LayerNorm(dim) if dual_patchnorm else None,
         )
 
         self.axial_pos_emb = nn.Parameter(torch.randn(2, patch_height_width, dim) * 0.02)
```
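
For clarity, below is a minimal, self-contained sketch of the dual-patchnorm patch embedding this diff produces: a `LayerNorm` before and after the linear patch projection, with the `Sequential` helper silently dropping `None` entries when the option is disabled. The hyperparameters, the `exists` helper, and the random input are illustrative assumptions, not part of the commit; the factor of 2 on `pixel_patch_dim` mirrors the existing code (presumably because the image is concatenated with a self-conditioning prediction before patching).

```python
import torch
from torch import nn
from einops.layers.torch import Rearrange

def exists(val):
    return val is not None

# Sequential that drops None entries, so optional layers can be toggled inline
def Sequential(*mods):
    return nn.Sequential(*filter(exists, mods))

# illustrative hyperparameters (not taken from the commit)
channels, patch_size, dim = 3, 8, 256
dual_patchnorm = True

pixel_patch_dim = channels * patch_size ** 2

# dual patchnorm: LayerNorm before and after the patch projection
to_patches = Sequential(
    Rearrange('b c (h p1) (w p2) -> b (h w) (c p1 p2)', p1 = patch_size, p2 = patch_size),
    nn.LayerNorm(pixel_patch_dim * 2) if dual_patchnorm else None,  # pre-projection norm
    nn.Linear(pixel_patch_dim * 2, dim),
    nn.LayerNorm(dim) if dual_patchnorm else None,                  # post-projection norm
)

# doubled channel count stands in for the image concatenated with its self-conditioning
x = torch.randn(1, channels * 2, 64, 64)
print(to_patches(x).shape)  # torch.Size([1, 64, 256])
```

With `dual_patchnorm = False`, the two `LayerNorm` entries become `None` and are filtered out, recovering the previous rearrange-then-linear embedding; note the commit defaults the flag to `True`.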

setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -3,7 +3,7 @@
 setup(
   name = 'RIN-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.5.2',
+  version = '0.5.3',
   license='MIT',
   description = 'RIN - Recurrent Interface Network - Pytorch',
   author = 'Phil Wang',
```
