Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
458ab45
wan sound to video
Sep 2, 2025
947f071
fix some problem
continue-revolution Sep 3, 2025
474c0db
more consistent style and update moviepy
continue-revolution Sep 3, 2025
9778671
get rid of decord
continue-revolution Sep 3, 2025
4be44bd
style improvement
continue-revolution Sep 3, 2025
e1c1734
Merge branch 'main' into conrevo/wan-s2v
continue-revolution Sep 5, 2025
328ece9
finish todo
continue-revolution Sep 5, 2025
002a43c
seems that they don't know what is binary search and it's not appropr…
continue-revolution Sep 5, 2025
99c82d3
maybe fix attn problem
continue-revolution Sep 5, 2025
5275f3a
move data to lfs
continue-revolution Sep 5, 2025
2b9db14
test cases
continue-revolution Sep 5, 2025
1399b05
improve example
continue-revolution Sep 5, 2025
d3640b8
basic doc, leave detailed doc to future
continue-revolution Sep 5, 2025
c05fec3
fix an error in example
tenderness-git Sep 5, 2025
0ea539c
fix some naming issue
Sep 5, 2025
feddb40
fix bbox transform issue
continue-revolution Sep 5, 2025
a1bd233
seems that regex match is notoriously slow
Sep 5, 2025
7485a3f
seems that regex match is notoriously slow
Sep 5, 2025
9398756
no need to split context again
tenderness-git Sep 5, 2025
57b5ba7
Merge branch 'conrevo/wan-s2v' of github.com:modelscope/DiffSynth-Eng…
tenderness-git Sep 5, 2025
64faedd
add rope explanation
continue-revolution Sep 5, 2025
d7d1e4f
fix pose image permute issue
continue-revolution Sep 5, 2025
f1a7376
amend accord. to pr 157
continue-revolution Sep 8, 2025
51ebd63
remove some comments and unnecessary imports
continue-revolution Sep 8, 2025
4aa48cf
remove legacy weight_norm
continue-revolution Sep 8, 2025
e1a23ea
remove unused librosa
continue-revolution Sep 8, 2025
c9f45e9
fix bufferreader not pickle-able error
continue-revolution Sep 8, 2025
e31f125
remove non-used import
continue-revolution Sep 8, 2025
27f1760
revert parallel.py, but we do need to set PARALLEL_FWD_TIMEOUT_SEC to…
continue-revolution Sep 8, 2025
3c05ddb
fix testcases
continue-revolution Sep 8, 2025
931c39c
edit accord. to comments
continue-revolution Sep 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ tests/data/expect/algorithm/ddim_20steps.safetensors filter=lfs diff=lfs merge=l
tests/data/expect/algorithm/flow_match_euler_i10.safetensors filter=lfs diff=lfs merge=lfs -text
tests/data/expect/algorithm/output.safetensors filter=lfs diff=lfs merge=lfs -text
tests/data/expect/flux/flux_text_encoder_1.safetensors filter=lfs diff=lfs merge=lfs -text
tests/data/input/wan_s2v/* filter=lfs diff=lfs merge=lfs -text
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ and offloading strategies, enabling loading of larger diffusion models (e.g., Fl

## News

- **[v0.6.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.6.0)** | **September 9, 2025**: ![Image](assets/tongyi.svg) Supports [Wan2.2-S2V](https://modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B), a video generation model designed for audio-driven cinematic video generation
- **[v0.5.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.5.0)** | **August 27, 2025**: ![Image](assets/tongyi.svg) Supports [Qwen-Image-Edit](https://modelscope.cn/models/Qwen/Qwen-Image-Edit), the image editing version of Qwen-Image, enabling semantic/appearance visual editing, and precise text editing
- **[v0.4.1](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.1)** | **August 4, 2025**: ![Image](assets/tongyi.svg) Supports [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), an image generation model that excels at complex text rendering and creating images in a wide range of artistic styles
- **[v0.4.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.0)** | **August 1, 2025**:
Expand Down
2 changes: 2 additions & 0 deletions diffsynth_engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
SDXLPipelineConfig,
FluxPipelineConfig,
WanPipelineConfig,
WanSpeech2VideoPipelineConfig,
QwenImagePipelineConfig,
HunyuanPipelineConfig,
SDStateDicts,
Expand Down Expand Up @@ -45,6 +46,7 @@
"SDXLPipelineConfig",
"FluxPipelineConfig",
"WanPipelineConfig",
"WanSpeech2VideoPipelineConfig",
"QwenImagePipelineConfig",
"HunyuanPipelineConfig",
"SDStateDicts",
Expand Down
13 changes: 13 additions & 0 deletions diffsynth_engine/conf/models/wan/dit/wan2.2-s2v-14b.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"patch_size": [1, 2, 2],
"in_dim": 16,
"dim": 5120,
"ffn_dim": 13824,
"freq_dim": 256,
"text_dim": 4096,
"out_dim": 16,
"num_heads": 40,
"num_layers": 40,
"eps": 1e-6,
"audio_inject_layers": [0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39]
}
4 changes: 4 additions & 0 deletions diffsynth_engine/configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
SDXLPipelineConfig,
FluxPipelineConfig,
WanPipelineConfig,
WanSpeech2VideoPipelineConfig,
QwenImagePipelineConfig,
HunyuanPipelineConfig,
BaseStateDicts,
SDStateDicts,
SDXLStateDicts,
FluxStateDicts,
WanStateDicts,
WanS2VStateDicts,
QwenImageStateDicts,
)
from .controlnet import ControlType, ControlNetParams
Expand All @@ -27,13 +29,15 @@
"SDXLPipelineConfig",
"FluxPipelineConfig",
"WanPipelineConfig",
"WanSpeech2VideoPipelineConfig",
"QwenImagePipelineConfig",
"HunyuanPipelineConfig",
"BaseStateDicts",
"SDStateDicts",
"SDXLStateDicts",
"FluxStateDicts",
"WanStateDicts",
"WanS2VStateDicts",
"QwenImageStateDicts",
"ControlType",
"ControlNetParams",
Expand Down
36 changes: 36 additions & 0 deletions diffsynth_engine/configs/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,34 @@ def __post_init__(self):
init_parallel_config(self)


@dataclass
class WanSpeech2VideoPipelineConfig(WanPipelineConfig):
    """Pipeline configuration for the Wan speech-to-video (S2V) pipeline.

    Inherits every option of ``WanPipelineConfig`` and adds two settings
    for the audio encoder.
    """

    # Path (or list of paths) for the audio encoder; defaults to None.
    # NOTE(review): how a None value is resolved is not visible in this
    # block -- confirm against the pipeline loader.
    audio_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None
    # torch dtype associated with the audio encoder; defaults to float32.
    audio_encoder_dtype: torch.dtype = torch.float32

    @classmethod
    def basic_config(
        cls,
        model_path: str | os.PathLike | List[str | os.PathLike],
        audio_encoder_path: Optional[str | os.PathLike | List[str | os.PathLike]] = None,
        device: str = "cuda",
        parallelism: int = 1,
        offload_mode: Optional[str] = None,
    ) -> "WanSpeech2VideoPipelineConfig":
        """Create a config from a handful of commonly used options.

        Args:
            model_path: Path (or list of paths) forwarded as ``model_path``.
            audio_encoder_path: Path (or list of paths) forwarded as
                ``audio_encoder_path``; may be None.
            device: Device string forwarded as ``device``.
            parallelism: Forwarded as ``parallelism``. Any value greater
                than 1 also enables ``use_cfg_parallel`` and ``use_fsdp``.
            offload_mode: Forwarded as ``offload_mode``; may be None.

        Returns:
            A new instance of ``cls`` built from the arguments above; all
            other fields keep their declared defaults.
        """
        # Both parallel features are driven by the same condition.
        parallel_enabled = parallelism > 1
        return cls(
            model_path=model_path,
            audio_encoder_path=audio_encoder_path,
            device=device,
            parallelism=parallelism,
            use_cfg_parallel=parallel_enabled,
            use_fsdp=parallel_enabled,
            offload_mode=offload_mode,
        )

    def __post_init__(self):
        """Apply ``init_parallel_config`` to this instance after field init."""
        init_parallel_config(self)


@dataclass
class QwenImagePipelineConfig(AttentionConfig, OptimizationConfig, ParallelConfig, BaseConfig):
model_path: str | os.PathLike | List[str | os.PathLike]
Expand Down Expand Up @@ -274,6 +302,14 @@ class WanStateDicts:
image_encoder: Optional[Dict[str, torch.Tensor]] = None


@dataclass
class WanS2VStateDicts:
model: Dict[str, torch.Tensor] | Dict[str, Dict[str, torch.Tensor]]
t5: Dict[str, torch.Tensor]
vae: Dict[str, torch.Tensor]
audio_encoder: Dict[str, torch.Tensor]


@dataclass
class QwenImageStateDicts:
model: Dict[str, torch.Tensor]
Expand Down
Loading