Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lightx2v/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def main():
parser.add_argument("--save_result_path", type=str, default=None, help="The path to save video path/file")
parser.add_argument("--return_result_tensor", action="store_true", help="Whether to return result tensor. (Useful for comfyui)")
parser.add_argument("--target_shape", type=int, nargs="+", default=[], help="Set return video or image shape")
parser.add_argument("--target_video_length", type=int, default=81, help="The target video length for each generated clip")
parser.add_argument("--aspect_ratio", type=str, default="")
parser.add_argument("--video_path", type=str, default=None, help="input video path(for sr/v2v task)")
parser.add_argument("--sr_ratio", type=float, default=2.0, help="super resolution ratio for sr task")
Expand Down
11 changes: 9 additions & 2 deletions lightx2v/models/runners/wan/wan_audio_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from lightx2v.models.video_encoders.hf.wan.vae_2_2 import Wan2_2_VAE
from lightx2v.server.metrics import monitor_cli
from lightx2v.utils.envs import *
from lightx2v.utils.input_info import UNSET
from lightx2v.utils.profiler import *
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.utils.utils import find_torch_model_path, fixed_shape_resize, get_optimal_patched_size_with_sp, isotropic_crop_resize, load_weights, wan_vae_to_comfy
Expand Down Expand Up @@ -315,8 +316,14 @@ def read_audio_input(self, audio_path):
if expected_frames < int(self.video_duration * target_fps):
logger.warning(f"Input video duration is greater than actual audio duration, using audio duration instead: audio_duration={audio_len / target_fps}, video_duration={self.video_duration}")

# Segment audio
audio_segments = self._audio_processor.segment_audio(audio_array, expected_frames, self.config.get("target_video_length", 81), self.prev_frame_length)
# Segment audio (CLI / input_info wins over config_json; target_video_length is not merged into config)
target_video_length = self.config.get("target_video_length", 81)
ii = getattr(self, "input_info", None)
if ii is not None and hasattr(ii, "target_video_length"):
tvl = ii.target_video_length
if tvl is not None and tvl is not UNSET and tvl > 0:
target_video_length = tvl
Comment on lines +320 to +325
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To improve conciseness and readability, this logic can be refactored. The current nested if statements are somewhat verbose and can be simplified into a more compact, more Pythonic form.

Suggested change
target_video_length = self.config.get("target_video_length", 81)
ii = getattr(self, "input_info", None)
if ii is not None and hasattr(ii, "target_video_length"):
tvl = ii.target_video_length
if tvl is not None and tvl is not UNSET and tvl > 0:
target_video_length = tvl
target_video_length = self.config.get("target_video_length", 81)
ii = getattr(self, "input_info", None)
if ii is not None:
tvl = getattr(ii, "target_video_length", None)
if tvl not in (None, UNSET) and tvl > 0:
target_video_length = tvl

audio_segments = self._audio_processor.segment_audio(audio_array, expected_frames, target_video_length, self.prev_frame_length)

# Mask latent for multi-person s2v
if mask_files is not None:
Expand Down
2 changes: 2 additions & 0 deletions lightx2v/utils/input_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ class S2VInputInfo:
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: list = field(default_factory=list)
target_video_length: int = field(default_factory=int)

# prev info
overlap_frame: torch.Tensor = field(default_factory=lambda: None)
Expand Down Expand Up @@ -148,6 +149,7 @@ class RS2VInputInfo:
resized_shape: list = field(default_factory=list)
latent_shape: list = field(default_factory=list)
target_shape: list = field(default_factory=list)
target_video_length: int = field(default_factory=int)

# prev info
overlap_frame: torch.Tensor = field(default_factory=lambda: None)
Expand Down
Loading