Multi-image inference using LLava #1707

DavidAbrahamyan · 2025-01-21T19:30:56Z

I was trying to run multi-image inference with llava-v1.6-vicuna-7b. Here is my code:

import os
import time

import requests
from PIL import Image
import habana_frameworks.torch as ht
import habana_frameworks.torch.core as htcore
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration, AutoConfig, AutoModelForVision2Seq, pipeline
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

# Apply the optimum-habana adaptations to transformers.
adapt_transformers_to_gaudi()

# Run configuration: the local checkpoint directory and the dtype that is
# later handed to the pipeline.
args_model_name_or_path = "/workspace/models/model_llava_v1_6_vicuna_7b"
model_dtype = torch.bfloat16

for banner_line in (
    "----------------------------=====================-------------------------------",
    "Trying LLava-Next(1.6)-Vicuna-7B",
    "Setting the device to hpu",
):
    print(banner_line)
device = torch.device("hpu")

# Only the config is read at this point, to report which model type the
# checkpoint declares.
print("Loading the model")
model_config = AutoConfig.from_pretrained(args_model_name_or_path)
model_type = model_config.model_type

print("Model type: ", model_type)

print("Loading the processor")
args_processor = AutoProcessor.from_pretrained(args_model_name_or_path)

def _load_image(url, timeout=30):
    """Download *url* and open the response body as a PIL image.

    Args:
        url: HTTP(S) address of the image file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled download cannot hang the script indefinitely.

    Returns:
        The ``PIL.Image.Image`` opened from the streamed response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # NOTE: relies on the top-of-file ``import requests``; the original
    # script used ``requests`` here without importing it.
    response = requests.get(url, stream=True, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of passing an error page to Image.open.
    response.raise_for_status()
    return Image.open(response.raw)


image_stop = _load_image("https://www.ilankelman.org/stopsigns/australia.jpg")
image_cats = _load_image("http://images.cocodataset.org/val2017/000000039769.jpg")
image_snowman = _load_image("https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg")

# Two chat transcripts are batched together: the first is a multi-turn
# conversation (two user turns, each carrying an image placeholder), the
# second is a single user turn with one image placeholder.
def _turn(role, text, with_image=False):
    """Build one chat message; an image placeholder precedes the text if requested."""
    parts = [{"type": "image"}] if with_image else []
    parts.append({"type": "text", "text": text})
    return {"role": role, "content": parts}


conversation_1 = [
    _turn("user", "What is shown in this image?", with_image=True),
    _turn("assistant", "There is a red stop sign in the image."),
    _turn("user", "What about this image? How many cats do you see?", with_image=True),
]

conversation_2 = [
    _turn("user", "What is shown in this image?", with_image=True),
]

# Render each conversation to a prompt string, ending with the generation
# prompt so the model answers as the assistant.
prompts = [
    args_processor.apply_chat_template(chat, add_generation_prompt=True)
    for chat in (conversation_1, conversation_2)
]
prompt_1, prompt_2 = prompts

print("Creating the pipeline")
# The same checkpoint directory supplies the model weights, the tokenizer
# and the image processor.
pipeline_options = {
    "model": args_model_name_or_path,
    "tokenizer": args_model_name_or_path,
    "image_processor": args_model_name_or_path,
    "torch_dtype": model_dtype,
    "device": "hpu",
}
generator = pipeline("image-to-text", **pipeline_options)

print("Initializing a couple of params (kwargs, batch size, nuber of iterations)")
generate_kwargs = dict(max_new_tokens=200, do_sample=False)  # Customize as needed
batch_size = 4  # Adjust the batch size as needed (not passed to the call below)
args_n_iterations = 1  # (not read by the call below)

# NOTE(review): three images are sent together with two prompts, and the
# first prompt holds two image placeholders. The reported ValueError
# ("number of image tokens is 2 while the number of image given to the
# model is 1") suggests this pipeline pairs one image with each prompt, so
# a multi-image prompt may not be expressible through it -- confirm against
# the pipeline implementation before relying on this call.
input_images = [image_stop, image_cats, image_snowman]

# Time only the generation call itself.
start = time.perf_counter()
results = generator(images=input_images, prompt=prompts, generate_kwargs=generate_kwargs)
end = time.perf_counter()

duration = end - start
print("Total duration:", duration)
print(results)
print("----------------------------=====================-------------------------------")

However, I get the following error:

ValueError: The input provided to the model are wrong. The number of image tokens is 2 while the number of image given to the model is 1. This prevents correct indexing and breaks batch generation.

Can anyone help me identify what the issue is? I would very much appreciate any help.

The text was updated successfully, but these errors were encountered:

regisss · 2025-01-30T14:51:21Z

@DavidAbrahamyan Is this more or less the same as #1708 ?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Multi-image inference using LLava #1707

Multi-image inference using LLava #1707

DavidAbrahamyan commented Jan 21, 2025

regisss commented Jan 30, 2025

Multi-image inference using LLava #1707

Multi-image inference using LLava #1707

Comments

DavidAbrahamyan commented Jan 21, 2025

regisss commented Jan 30, 2025