modelscope · continue-revolution · Sep 9, 2025 · Sep 2, 2025 · Sep 3, 2025 · Sep 3, 2025
diff --git a/.gitattributes b/.gitattributes
@@ -24,3 +24,4 @@ tests/data/expect/algorithm/ddim_20steps.safetensors filter=lfs diff=lfs merge=l
 tests/data/expect/algorithm/flow_match_euler_i10.safetensors filter=lfs diff=lfs merge=lfs -text
 tests/data/expect/algorithm/output.safetensors filter=lfs diff=lfs merge=lfs -text
 tests/data/expect/flux/flux_text_encoder_1.safetensors filter=lfs diff=lfs merge=lfs -text
+tests/data/input/wan_s2v/* filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ and offloading strategies, enabling loading of larger diffusion models (e.g., Fl
 
 ## News
 
+- **[v0.6.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.6.0)** | **September 9, 2025**: ![Image](assets/tongyi.svg) Supports [Wan2.2-S2V](https://modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B), an audio-driven cinematic video generation model
 - **[v0.5.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.5.0)** | **August 27, 2025**: ![Image](assets/tongyi.svg) Supports [Qwen-Image-Edit](https://modelscope.cn/models/Qwen/Qwen-Image-Edit), the image editing version of Qwen-Image, enabling semantic/appearance visual editing, and precise text editing
 - **[v0.4.1](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.1)** | **August 4, 2025**: ![Image](assets/tongyi.svg) Supports [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), an image generation model excels at complex text rendering and creating images in a wide range of artistic styles
 - **[v0.4.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.0)** | **August 1, 2025**:

diff --git a/diffsynth_engine/__init__.py b/diffsynth_engine/__init__.py
@@ -3,6 +3,7 @@
     SDXLPipelineConfig,
     FluxPipelineConfig,
     WanPipelineConfig,
+    WanSpeech2VideoPipelineConfig,
     QwenImagePipelineConfig,
     HunyuanPipelineConfig,
     SDStateDicts,
@@ -45,6 +46,7 @@
     "SDXLPipelineConfig",
     "FluxPipelineConfig",
     "WanPipelineConfig",
+    "WanSpeech2VideoPipelineConfig",
     "QwenImagePipelineConfig",
     "HunyuanPipelineConfig",
     "SDStateDicts",

diff --git a/diffsynth_engine/conf/models/wan/dit/wan2.2-s2v-14b.json b/diffsynth_engine/conf/models/wan/dit/wan2.2-s2v-14b.json
@@ -0,0 +1,13 @@
+{
+    "patch_size": [1, 2, 2],
+    "in_dim": 16,
+    "dim": 5120,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 16,
+    "num_heads": 40,
+    "num_layers": 40,
+    "eps": 1e-6,
+    "audio_inject_layers": [0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39]
+}
diff --git a/diffsynth_engine/configs/__init__.py b/diffsynth_engine/configs/__init__.py
@@ -7,13 +7,15 @@
     SDXLPipelineConfig,
     FluxPipelineConfig,
     WanPipelineConfig,
+    WanSpeech2VideoPipelineConfig,
     QwenImagePipelineConfig,
     HunyuanPipelineConfig,
     BaseStateDicts,
     SDStateDicts,
     SDXLStateDicts,
     FluxStateDicts,
     WanStateDicts,
+    WanS2VStateDicts,
     QwenImageStateDicts,
 )
 from .controlnet import ControlType, ControlNetParams
@@ -27,13 +29,15 @@
     "SDXLPipelineConfig",
     "FluxPipelineConfig",
     "WanPipelineConfig",
+    "WanSpeech2VideoPipelineConfig",
     "QwenImagePipelineConfig",
     "HunyuanPipelineConfig",
     "BaseStateDicts",
     "SDStateDicts",
     "SDXLStateDicts",
     "FluxStateDicts",
     "WanStateDicts",
+    "WanS2VStateDicts",
     "QwenImageStateDicts",
     "ControlType",
     "ControlNetParams",

diff --git a/diffsynth_engine/configs/pipeline.py b/diffsynth_engine/configs/pipeline.py
@@ -184,6 +184,34 @@ def __post_init__(self):
         init_parallel_config(self)
 
 
+@dataclass
+class WanSpeech2VideoPipelineConfig(WanPipelineConfig):  # WanPipelineConfig extended with two audio-encoder fields
+    audio_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None  # single path or list of paths; None when not supplied
+    audio_encoder_dtype: torch.dtype = torch.float32  # dtype setting for the audio encoder; float32 unless overridden
+
+    @classmethod
+    def basic_config(  # alternate constructor: forwards its arguments and derives both parallel flags from `parallelism`
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        audio_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        device: str = "cuda",
+        parallelism: int = 1,  # NOTE(review): presumably the number of devices to shard across - confirm
+        offload_mode: Optional[str] = None,
+    ) -> "WanSpeech2VideoPipelineConfig":
+        return cls(
+            model_path=model_path,
+            audio_encoder_path=audio_encoder_path,
+            device=device,
+            parallelism=parallelism,
+            use_cfg_parallel=True if parallelism > 1 else False,  # on only when parallelism > 1
+            use_fsdp=True if parallelism > 1 else False,  # on only when parallelism > 1
+            offload_mode=offload_mode,
+        )
+
+    def __post_init__(self):
+        init_parallel_config(self)  # helper defined outside this hunk; NOTE(review): presumably validates/derives parallel settings - confirm
+
+
 @dataclass
 class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
     model_path: str | os.PathLike | List[str | os.PathLike]
@@ -274,6 +302,14 @@ class WanStateDicts:
     image_encoder: Optional[Dict[str, torch.Tensor]] = None
 
 
+@dataclass
+class WanS2VStateDicts:  # bundle of per-component state dicts; all four fields are required (no defaults)
+    model: Dict[str, torch.Tensor] | Dict[str, Dict[str, torch.Tensor]]  # flat name->tensor mapping, or a string-keyed mapping of such mappings
+    t5: Dict[str, torch.Tensor]  # NOTE(review): presumably the T5 text-encoder weights - confirm
+    vae: Dict[str, torch.Tensor]  # name->tensor mapping for the VAE
+    audio_encoder: Dict[str, torch.Tensor]  # name->tensor mapping for the audio encoder
+
+
 @dataclass
 class QwenImageStateDicts:
     model: Dict[str, torch.Tensor]