From 0e26f54cbe1036daf0b4e5287571ad387d6d5666 Mon Sep 17 00:00:00 2001 From: OleehyO Date: Mon, 20 Jan 2025 09:43:45 +0000 Subject: [PATCH 1/2] docs: clarify frame number requirements for CogVideoX models Specify that frame numbers must be: - 16N + 1 (N <= 10) for CogVideoX1.5-5B models - 8N + 1 (N <= 6) for CogVideoX-2B/5B models --- README.md | 5 +++++ README_ja.md | 5 +++++ README_zh.md | 7 ++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 48e3b64f..e5ca8bb6 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,11 @@ models we currently offer, along with their foundational information. Min(W, H) = 768
768 ≤ Max(W, H) ≤ 1360
Max(W, H) % 16 = 0 720 * 480 + + Number of Frames + Should be 16N + 1 where N <= 10 (default 81) + Should be 8N + 1 where N <= 6 (default 49) + Inference Precision BF16 (Recommended), FP16, FP32, FP8*, INT8, Not supported: INT4 diff --git a/README_ja.md b/README_ja.md index 074cc93f..3927d17a 100644 --- a/README_ja.md +++ b/README_ja.md @@ -191,6 +191,11 @@ CogVideoXは、[清影](https://chatglm.cn/video?fr=osm_cogvideox) と同源の Min(W, H) = 768
768 ≤ Max(W, H) ≤ 1360
Max(W, H) % 16 = 0 720 * 480 + + フレーム数 + 16N + 1 (N <= 10) である必要があります (デフォルト 81) + 8N + 1 (N <= 6) である必要があります (デフォルト 49) + 推論精度 BF16(推奨), FP16, FP32,FP8*,INT8,INT4非対応 diff --git a/README_zh.md b/README_zh.md index c6c81f87..1444a927 100644 --- a/README_zh.md +++ b/README_zh.md @@ -180,7 +180,12 @@ CogVideoX是 [清影](https://chatglm.cn/video?fr=osm_cogvideox) 同源的开源 1360 * 768 Min(W, H) = 768
768 ≤ Max(W, H) ≤ 1360
Max(W, H) % 16 = 0 720 * 480 - + + + 帧数 + 必须为 16N + 1 其中 N <= 10 (默认 81) + 必须为 8N + 1 其中 N <= 6 (默认 49) + 推理精度 BF16(推荐), FP16, FP32,FP8*,INT8,不支持INT4 From d9e2a415e841a2e4004746314f1a9ff5b8bc2035 Mon Sep 17 00:00:00 2001 From: OleehyO Date: Mon, 20 Jan 2025 09:48:17 +0000 Subject: [PATCH 2/2] fix: fix resolution handling for different model types --- inference/cli_demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/cli_demo.py b/inference/cli_demo.py index 41f4267c..37dfcfc7 100644 --- a/inference/cli_demo.py +++ b/inference/cli_demo.py @@ -100,7 +100,7 @@ def generate_video( if width is None or height is None: height, width = desired_resolution logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m") - elif (width, height) != desired_resolution: + elif (height, width) != desired_resolution: if generate_type == "i2v": # For i2v models, use user-defined width and height logging.warning( @@ -111,7 +111,7 @@ def generate_video( logging.warning( f"\033[1;31m{model_name} is not supported for custom resolution. Setting back to default resolution {desired_resolution}.\033[0m" ) - width, height = desired_resolution + height, width = desired_resolution if generate_type == "i2v": pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype)