code · pull · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/README.md b/README.md
@@ -77,6 +77,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
    - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/)
    - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
+   - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/)
 - Audio Models
    - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
    - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/)

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
@@ -457,6 +457,82 @@ def process_out(self, latent):
         latents_std = self.latents_std.to(latent.device, latent.dtype)
         return latent * latents_std / self.scale_factor + latents_mean
 
+class Wan22(Wan21):  # Wan21 variant: overrides channel count, RGB factor table/bias and per-channel mean/std
+    latent_channels = 48  # must equal the row count of latent_rgb_factors and the length of latents_mean/latents_std
+    latent_dimensions = 3  # NOTE(review): presumably the number of non-batch, non-channel latent axes (stats below are viewed as 5-D) - confirm
+
+    latent_rgb_factors = [  # 48 rows x 3 columns: one row per latent channel (presumably R, G, B weights - confirm against consumer)
+            [ 0.0119,  0.0103,  0.0046],
+            [-0.1062, -0.0504,  0.0165],
+            [ 0.0140,  0.0409,  0.0491],
+            [-0.0813, -0.0677,  0.0607],
+            [ 0.0656,  0.0851,  0.0808],
+            [ 0.0264,  0.0463,  0.0912],
+            [ 0.0295,  0.0326,  0.0590],
+            [-0.0244, -0.0270,  0.0025],
+            [ 0.0443, -0.0102,  0.0288],
+            [-0.0465, -0.0090, -0.0205],
+            [ 0.0359,  0.0236,  0.0082],
+            [-0.0776,  0.0854,  0.1048],
+            [ 0.0564,  0.0264,  0.0561],
+            [ 0.0006,  0.0594,  0.0418],
+            [-0.0319, -0.0542, -0.0637],
+            [-0.0268,  0.0024,  0.0260],
+            [ 0.0539,  0.0265,  0.0358],
+            [-0.0359, -0.0312, -0.0287],
+            [-0.0285, -0.1032, -0.1237],
+            [ 0.1041,  0.0537,  0.0622],
+            [-0.0086, -0.0374, -0.0051],
+            [ 0.0390,  0.0670,  0.2863],
+            [ 0.0069,  0.0144,  0.0082],
+            [ 0.0006, -0.0167,  0.0079],
+            [ 0.0313, -0.0574, -0.0232],
+            [-0.1454, -0.0902, -0.0481],
+            [ 0.0714,  0.0827,  0.0447],
+            [-0.0304, -0.0574, -0.0196],
+            [ 0.0401,  0.0384,  0.0204],
+            [-0.0758, -0.0297, -0.0014],
+            [ 0.0568,  0.1307,  0.1372],
+            [-0.0055, -0.0310, -0.0380],
+            [ 0.0239, -0.0305,  0.0325],
+            [-0.0663, -0.0673, -0.0140],
+            [-0.0416, -0.0047, -0.0023],
+            [ 0.0166,  0.0112, -0.0093],
+            [-0.0211,  0.0011,  0.0331],
+            [ 0.1833,  0.1466,  0.2250],
+            [-0.0368,  0.0370,  0.0295],
+            [-0.3441, -0.3543, -0.2008],
+            [-0.0479, -0.0489, -0.0420],
+            [-0.0660, -0.0153,  0.0800],
+            [-0.0101,  0.0068,  0.0156],
+            [-0.0690, -0.0452, -0.0927],
+            [-0.0145,  0.0041,  0.0015],
+            [ 0.0421,  0.0451,  0.0373],
+            [ 0.0504, -0.0483, -0.0356],
+            [-0.0837,  0.0168,  0.0055]
+        ]
+
+    latent_rgb_factors_bias = [0.0317, -0.0878, -0.1388]  # 3 values, one per column of latent_rgb_factors
+
+    def __init__(self):  # sets scale_factor, latents_mean and latents_std directly; does not call the parent __init__
+        self.scale_factor = 1.0  # NOTE(review): 1.0 presumably means no extra scaling beyond mean/std - confirm against inherited process_in/process_out
+        self.latents_mean = torch.tensor([  # 48 per-channel means, in channel order
+                -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
+                -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
+                -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
+                -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
+                -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
+                0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667,
+            ]).view(1, self.latent_channels, 1, 1, 1)  # shape (1, 48, 1, 1, 1): values vary only along dim 1
+        self.latents_std = torch.tensor([  # 48 per-channel standard deviations, same order as latents_mean
+                0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
+                0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
+                0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
+                0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
+                0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
+                0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744
+            ]).view(1, self.latent_channels, 1, 1, 1)  # same (1, 48, 1, 1, 1) layout as latents_mean
+
 class Hunyuan3Dv2(LatentFormat):
     latent_channels = 64
     latent_dimensions = 1

diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
@@ -201,8 +201,10 @@ def forward(
             freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
         """
         # assert e.dtype == torch.float32
-
-        e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
+        if e.ndim < 4:
+            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
+        else:
+            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2)
         # assert e[0].dtype == torch.float32
 
         # self-attention
@@ -325,7 +327,10 @@ def forward(self, x, e):
             e(Tensor): Shape [B, C]
         """
         # assert e.dtype == torch.float32
-        e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1)
+        if e.ndim < 3:
+            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1)
+        else:
+            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2)
         x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
         return x
 
@@ -506,8 +511,9 @@ def forward_orig(
 
         # time embeddings
         e = self.time_embedding(
-            sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
-        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
+            sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype))
+        e = e.reshape(t.shape[0], -1, e.shape[-1])
+        e0 = self.time_projection(e).unflatten(2, (6, self.dim))
 
         # context
         context = self.text_embedding(context)