code · pull · Sep 19, 2025 · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025
diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
@@ -1551,6 +1551,9 @@ def forward_orig(
         context_img_len = None
 
         if audio_embed is not None:
+            if reference_latent is not None:
+                zero_audio_pad = torch.zeros(audio_embed.shape[0], reference_latent.shape[-3], *audio_embed.shape[2:], device=audio_embed.device, dtype=audio_embed.dtype)
+                audio_embed = torch.cat([audio_embed, zero_audio_pad], dim=1)
             audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2)
         else:
             audio = None

diff --git a/comfy/model_management.py b/comfy/model_management.py
@@ -348,7 +348,7 @@ def amd_min_version(device=None, min_rdna_version=0):
 #                    if any((a in arch) for a in ["gfx1201"]):
 #                        ENABLE_PYTORCH_ATTENTION = True
         if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
-            if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]):  # TODO: more arches
+            if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]):  # TODO: more arches
                 SUPPORT_FP8_OPS = True
 
 except:

diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py
@@ -233,6 +233,7 @@ def sharpen(self, image: torch.Tensor, sharpen_radius: int, sigma:float, alpha:
 
         kernel_size = sharpen_radius * 2 + 1
         kernel = gaussian_kernel(kernel_size, sigma, device=image.device) * -(alpha*10)
+        kernel = kernel.to(dtype=image.dtype)
         center = kernel_size // 2
         kernel[center, center] = kernel[center, center] - kernel.sum() + 1.0
         kernel = kernel.repeat(channels, 1, 1).unsqueeze(1)

diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py
@@ -1095,10 +1095,6 @@ def execute(cls, positive, negative, vae, width, height, length, batch_size, ref
             audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0]  # [T, 5, 1280]
             audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0)
 
-            # pad for ref latent
-            zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype)
-            audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0)
-
             audio_emb = audio_emb.unsqueeze(0)
             audio_emb_neg = torch.zeros_like(audio_emb)
             positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb})