Enable the i2vgen pipeline (#1670)
Signed-off-by: yuanwu <[email protected]>
Co-authored-by: Ilyas Moutawwakil <[email protected]>
Co-authored-by: regisss <[email protected]>
3 people authored Feb 7, 2025
1 parent 4abb0e6 commit 18449ba
Showing 7 changed files with 958 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -299,6 +299,7 @@ The following model architectures, tasks and device distributions have been validated
| FLUX.1 | <li>LoRA</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#flux1)</li><li>[image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#flux1-image-to-image)</li> |
| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)</li> |
| Image to Video | | <li>Single card</li> | <li>[image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#image-to-video-generation)</li> |
| I2VGen-XL | | <li>Single card</li> | <li>[image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#i2vgen-xl)</li> |

### PyTorch Image Models/TIMM:

1 change: 1 addition & 0 deletions docs/source/index.mdx
@@ -122,6 +122,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all been validated
| LDM3D | | <div style="text-align:left"><li>Single card</li></div> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| FLUX.1 | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)</li> |
| I2VGen-XL | | <li>Single card</li> | <li>[image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#i2vgen-xl)</li> |

- PyTorch Image Models/TIMM:

25 changes: 25 additions & 0 deletions examples/stable-diffusion/README.md
@@ -849,6 +849,31 @@ python image_to_video_generation.py \
--height=512
```

# I2VGen-XL
I2VGen-XL is a high-quality image-to-video synthesis model built on cascaded diffusion models. For more details, see the [Hugging Face i2vgen-xl model card](https://huggingface.co/ali-vilab/i2vgen-xl).

Here is how to generate a video from a single image and a text prompt (a minimal Python sketch of the equivalent API call follows the command):

```bash
PT_HPU_MAX_COMPOUND_OP_SIZE=1 \
python image_to_video_generation.py \
--model_name_or_path "ali-vilab/i2vgen-xl" \
--image_path "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" \
--num_videos_per_prompt 1 \
--video_save_dir ./i2vgen_xl \
--num_inference_steps 50 \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
--gif \
--num_frames 16 \
--prompts "Papers were floating in the air on a table in the library" \
--negative_prompts "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" \
--seed 8888 \
--sdp_on_bf16 \
--bf16
```
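
The command above drives the Python API added by this commit. For orientation, here is a minimal Python sketch of the equivalent call. It assumes `GaudiI2VGenXLPipeline.from_pretrained` accepts the same Gaudi-specific keyword arguments (`use_habana`, `use_hpu_graphs`, `gaudi_config`) as the other Gaudi diffusers pipelines, and that `--bf16` corresponds to `torch_dtype=torch.bfloat16`; the call arguments mirror the example script in this diff.

```python
import torch
from diffusers.utils import export_to_gif, load_image

from optimum.habana.diffusers import GaudiI2VGenXLPipeline

# Assumption: the Gaudi-specific kwargs below follow the pattern used by the
# other Gaudi pipelines; --bf16 is taken to mean torch_dtype=torch.bfloat16.
pipeline = GaudiI2VGenXLPipeline.from_pretrained(
    "ali-vilab/i2vgen-xl",
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",
    torch_dtype=torch.bfloat16,
)

# I2VGen-XL only needs a 3-channel RGB image; no manual resizing is done here.
image = load_image(
    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
).convert("RGB")

outputs = pipeline(
    prompt="Papers were floating in the air on a table in the library",
    negative_prompt="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static",
    image=image,
    num_frames=16,
    num_inference_steps=50,
    guidance_scale=9.0,
    generator=torch.manual_seed(8888),
)

# outputs.frames holds one list of PIL frames per generated video.
export_to_gif(outputs.frames[0], "i2vgen_xl/gen_video_00.gif")
```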

# Important Notes for Gaudi3 Users

- **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced.
55 changes: 51 additions & 4 deletions examples/stable-diffusion/image_to_video_generation.py
@@ -19,9 +19,13 @@
from pathlib import Path

import torch
from diffusers.utils import export_to_video, load_image
from diffusers.utils import export_to_gif, export_to_video, load_image

from optimum.habana.diffusers import GaudiEulerDiscreteScheduler, GaudiStableVideoDiffusionPipeline
from optimum.habana.diffusers import (
GaudiEulerDiscreteScheduler,
GaudiI2VGenXLPipeline,
GaudiStableVideoDiffusionPipeline,
)
from optimum.habana.utils import set_seed


@@ -57,6 +61,20 @@ def main():
)

# Pipeline arguments
parser.add_argument(
"--prompts",
type=str,
nargs="*",
default="Papers were floating in the air on a table in the library",
help="The prompt or prompts to guide the image generation.",
)
parser.add_argument(
"--negative_prompts",
type=str,
nargs="*",
default="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms",
help="The prompt or prompts not to guide the image generation.",
)
parser.add_argument(
"--image_path",
type=str,
@@ -177,6 +195,7 @@ def main():
),
)
parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
parser.add_argument("--gif", action="store_true", help="Whether to generate the video in gif format.")
parser.add_argument(
"--sdp_on_bf16",
action="store_true",
@@ -212,14 +231,20 @@
)
logger.setLevel(logging.INFO)

i2v_models = ["i2vgen-xl"]
is_i2v_model = any(model in args.model_name_or_path for model in i2v_models)

# Load input image(s)
input = []
logger.info("Input image(s):")
if isinstance(args.image_path, str):
args.image_path = [args.image_path]
for image_path in args.image_path:
image = load_image(image_path)
image = image.resize((args.height, args.width))
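# For I2VGen-XL, only convert the input to RGB; other pipelines get a frame resized to the requested height/width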
if is_i2v_model:
image = image.convert("RGB")
else:
image = image.resize((args.height, args.width))
input.append(image)
logger.info(image_path)

@@ -281,6 +306,24 @@ def main():
output_type=args.output_type,
num_frames=args.num_frames,
)
elif is_i2v_model:
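# Drop the scheduler configured for Stable Video Diffusion so I2VGen-XL falls back to its default scheduler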
del kwargs["scheduler"]
pipeline = GaudiI2VGenXLPipeline.from_pretrained(
args.model_name_or_path,
**kwargs,
)
generator = torch.manual_seed(args.seed)
outputs = pipeline(
prompt=args.prompts,
image=input,
num_videos_per_prompt=args.num_videos_per_prompt,
batch_size=args.batch_size,
num_frames=args.num_frames,
num_inference_steps=args.num_inference_steps,
negative_prompt=args.negative_prompts,
guidance_scale=9.0,
generator=generator,
)
else:
pipeline = GaudiStableVideoDiffusionPipeline.from_pretrained(
args.model_name_or_path,
@@ -321,7 +364,11 @@ def main():
video_save_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Saving video frames in {video_save_dir.resolve()}...")
for i, frames in enumerate(outputs.frames):
export_to_video(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".mp4", fps=7)
if args.gif:
export_to_gif(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".gif")
else:
export_to_video(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".mp4", fps=7)

if args.save_frames_as_images:
for j, frame in enumerate(frames):
frame.save(
1 change: 1 addition & 0 deletions optimum/habana/diffusers/__init__.py
@@ -6,6 +6,7 @@
from .pipelines.ddpm.pipeline_ddpm import GaudiDDPMPipeline
from .pipelines.flux.pipeline_flux import GaudiFluxPipeline
from .pipelines.flux.pipeline_flux_img2img import GaudiFluxImg2ImgPipeline
from .pipelines.i2vgen_xl.pipeline_i2vgen_xl import GaudiI2VGenXLPipeline
from .pipelines.pipeline_utils import GaudiDiffusionPipeline
from .pipelines.stable_diffusion.pipeline_stable_diffusion import GaudiStableDiffusionPipeline
from .pipelines.stable_diffusion.pipeline_stable_diffusion_depth2img import GaudiStableDiffusionDepth2ImgPipeline