Commit ca8a9a5

Wan Speech2Video (#162)
* wan sound to video
* fix some problems
* more consistent style and update moviepy
* get rid of decord
* style improvement
* finish todo
* it seems they don't know what binary search is, and it's not appropriate to use binary search here
* maybe fix attn problem
* move data to lfs
* test cases
* improve example
* basic doc; leave detailed doc for the future
* fix an error in example
* fix some naming issues
* fix bbox transform issue
* it seems that regex matching is notoriously slow
* it seems that regex matching is notoriously slow
* no need to split context again
* add rope explanation
* fix pose image permute issue
* amend according to PR 157
* remove some comments and unnecessary imports
* remove legacy weight_norm
* remove unused librosa
* fix bufferreader not-picklable error
* remove unused import
* revert parallel.py, but we do need to set PARALLEL_FWD_TIMEOUT_SEC to be much longer to avoid not receiving the output after multi-GPU inference completes
* fix test cases
* edit according to comments
1 parent b74ba8d commit ca8a9a5

27 files changed (+1993, -2 lines)

.gitattributes

Lines changed: 1 addition & 0 deletions
```diff
@@ -24,3 +24,4 @@ tests/data/expect/algorithm/ddim_20steps.safetensors filter=lfs diff=lfs merge=l
 tests/data/expect/algorithm/flow_match_euler_i10.safetensors filter=lfs diff=lfs merge=lfs -text
 tests/data/expect/algorithm/output.safetensors filter=lfs diff=lfs merge=lfs -text
 tests/data/expect/flux/flux_text_encoder_1.safetensors filter=lfs diff=lfs merge=lfs -text
+tests/data/input/wan_s2v/* filter=lfs diff=lfs merge=lfs -text
```

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -23,6 +23,7 @@ and offloading strategies, enabling loading of larger diffusion models (e.g., Fl
 
 ## News
 
+- **[v0.6.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.6.0)** | **September 9, 2025**: ![Image](assets/tongyi.svg) Supports [Wan2.2-S2V](https://modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B), a video generation model designed for audio-driven cinematic video generation
 - **[v0.5.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.5.0)** | **August 27, 2025**: ![Image](assets/tongyi.svg) Supports [Qwen-Image-Edit](https://modelscope.cn/models/Qwen/Qwen-Image-Edit), the image editing version of Qwen-Image, enabling semantic/appearance visual editing, and precise text editing
 - **[v0.4.1](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.1)** | **August 4, 2025**: ![Image](assets/tongyi.svg) Supports [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), an image generation model excels at complex text rendering and creating images in a wide range of artistic styles
 - **[v0.4.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.0)** | **August 1, 2025**:
```

diffsynth_engine/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -3,6 +3,7 @@
     SDXLPipelineConfig,
     FluxPipelineConfig,
     WanPipelineConfig,
+    WanSpeech2VideoPipelineConfig,
     QwenImagePipelineConfig,
     HunyuanPipelineConfig,
     SDStateDicts,
@@ -45,6 +46,7 @@
     "SDXLPipelineConfig",
     "FluxPipelineConfig",
     "WanPipelineConfig",
+    "WanSpeech2VideoPipelineConfig",
     "QwenImagePipelineConfig",
     "HunyuanPipelineConfig",
     "SDStateDicts",
```
Lines changed: 13 additions & 0 deletions
```diff
@@ -0,0 +1,13 @@
+{
+    "patch_size": [1, 2, 2],
+    "in_dim": 16,
+    "dim": 5120,
+    "ffn_dim": 13824,
+    "freq_dim": 256,
+    "text_dim": 4096,
+    "out_dim": 16,
+    "num_heads": 40,
+    "num_layers": 40,
+    "eps": 1e-6,
+    "audio_inject_layers": [0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39]
+}
```
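The new model config above wires audio injection into 12 of the 40 transformer blocks. A minimal sketch of reading such a config with plain `json` (the actual file name and location are not shown in this view, so the path below is a placeholder):

```python
import json

# Placeholder path; the real config file name/location is not visible in this diff view.
with open("wan_s2v_14b.json") as f:
    cfg = json.load(f)

# Audio features are injected into 12 of the model's 40 transformer blocks.
assert cfg["num_layers"] == 40
print(len(cfg["audio_inject_layers"]))  # 12
print(cfg["audio_inject_layers"])       # [0, 4, 8, ..., 39]
```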

diffsynth_engine/configs/__init__.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -7,13 +7,15 @@
     SDXLPipelineConfig,
     FluxPipelineConfig,
     WanPipelineConfig,
+    WanSpeech2VideoPipelineConfig,
     QwenImagePipelineConfig,
     HunyuanPipelineConfig,
     BaseStateDicts,
     SDStateDicts,
     SDXLStateDicts,
     FluxStateDicts,
     WanStateDicts,
+    WanS2VStateDicts,
     QwenImageStateDicts,
 )
 from .controlnet import ControlType, ControlNetParams
@@ -27,13 +29,15 @@
     "SDXLPipelineConfig",
     "FluxPipelineConfig",
     "WanPipelineConfig",
+    "WanSpeech2VideoPipelineConfig",
     "QwenImagePipelineConfig",
     "HunyuanPipelineConfig",
     "BaseStateDicts",
     "SDStateDicts",
     "SDXLStateDicts",
     "FluxStateDicts",
     "WanStateDicts",
+    "WanS2VStateDicts",
     "QwenImageStateDicts",
     "ControlType",
     "ControlNetParams",
```

diffsynth_engine/configs/pipeline.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -184,6 +184,34 @@ def __post_init__(self):
         init_parallel_config(self)
 
 
+@dataclass
+class WanSpeech2VideoPipelineConfig(WanPipelineConfig):
+    audio_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
+    audio_encoder_dtype: torch.dtype = torch.float32
+
+    @classmethod
+    def basic_config(
+        cls,
+        model_path: str | os.PathLike | List[str | os.PathLike],
+        audio_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
+        device: str = "cuda",
+        parallelism: int = 1,
+        offload_mode: Optional[str] = None,
+    ) -> "WanSpeech2VideoPipelineConfig":
+        return cls(
+            model_path=model_path,
+            audio_encoder_path=audio_encoder_path,
+            device=device,
+            parallelism=parallelism,
+            use_cfg_parallel=True if parallelism > 1 else False,
+            use_fsdp=True if parallelism > 1 else False,
+            offload_mode=offload_mode,
+        )
+
+    def __post_init__(self):
+        init_parallel_config(self)
+
+
 @dataclass
 class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
     model_path: str | os.PathLike | List[str | os.PathLike]
@@ -274,6 +302,14 @@ class WanStateDicts:
     image_encoder: Optional[Dict[str, torch.Tensor]] = None
 
 
+@dataclass
+class WanS2VStateDicts:
+    model: Dict[str, torch.Tensor] | Dict[str, Dict[str, torch.Tensor]]
+    t5: Dict[str, torch.Tensor]
+    vae: Dict[str, torch.Tensor]
+    audio_encoder: Dict[str, torch.Tensor]
+
+
 @dataclass
 class QwenImageStateDicts:
     model: Dict[str, torch.Tensor]
```
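As a quick illustration of the new config class, a minimal sketch of building it via the `basic_config` helper (the weight paths are placeholders; model download and pipeline construction are handled elsewhere and are not shown in this commit excerpt):

```python
from diffsynth_engine import WanSpeech2VideoPipelineConfig

# Placeholder local paths; in practice these point to downloaded Wan2.2-S2V weights
# and an audio encoder checkpoint.
config = WanSpeech2VideoPipelineConfig.basic_config(
    model_path="models/wan2.2-s2v-14b.safetensors",
    audio_encoder_path="models/audio_encoder.safetensors",
    device="cuda",
    parallelism=2,      # >1 switches on use_cfg_parallel and use_fsdp automatically
    offload_mode=None,
)
print(config.use_cfg_parallel, config.use_fsdp)  # True True when parallelism > 1
```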
