You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I was trying to make a multi-image inference using llava-v1.6-vicuna-7b. Here is my code:
from PIL import Image
import habana_frameworks.torch as ht
import habana_frameworks.torch.core as htcore
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration, AutoConfig, AutoModelForVision2Seq, pipeline
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
import time
import os
adapt_transformers_to_gaudi()
print("----------------------------=====================-------------------------------")
print("Trying LLava-Next(1.6)-Vicuna-7B")
print("Setting the device to hpu")
device = torch.device("hpu")
print("Loading the model")
args_model_name_or_path = "/workspace/models/model_llava_v1_6_vicuna_7b"
model_type = AutoConfig.from_pretrained(args_model_name_or_path).model_type
print("Model type: ", model_type)
print("Loading the processor")
args_processor = AutoProcessor.from_pretrained(args_model_name_or_path)
model_dtype = torch.bfloat16
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_stop = Image.open(requests.get(url, stream=True).raw)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image_cats = Image.open(requests.get(url, stream=True).raw)
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
image_snowman = Image.open(requests.get(url, stream=True).raw)
# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
conversation_1 = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What is shown in this image?"},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There is a red stop sign in the image."},
],
},
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What about this image? How many cats do you see?"},
],
},
]
conversation_2 = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What is shown in this image?"},
],
},
]
prompt_1 = args_processor.apply_chat_template(conversation_1, add_generation_prompt=True)
prompt_2 = args_processor.apply_chat_template(conversation_2, add_generation_prompt=True)
prompts = [prompt_1, prompt_2]
print("Creating the pipeline")
generator = pipeline(
"image-to-text",
model=args_model_name_or_path,
tokenizer=args_model_name_or_path,
image_processor=args_model_name_or_path,
torch_dtype=model_dtype,
device="hpu",
)
print("Initializing a couple of params (kwargs, batch size, nuber of iterations)")
generate_kwargs = {"max_new_tokens": 200, "do_sample": False} # Customize as needed
batch_size = 4 # Adjust the batch size as needed
args_n_iterations = 1
start = time.perf_counter()
results = generator(images=[image_stop, image_cats, image_snowman], prompt=prompts, generate_kwargs=generate_kwargs)
end = time.perf_counter()
duration = end - start
print("Total duration:", duration)
print(results)
print("----------------------------=====================-------------------------------")
however, I get the following error:
ValueError: The input provided to the model are wrong. The number of image tokens is 2 while the number of image given to the model is 1. This prevents correct indexing and breaks batch generation.
Can anyone help me identify what the issue is? I would very much appreciate any help.
The text was updated successfully, but these errors were encountered:
I was trying to make a multi-image inference using llava-v1.6-vicuna-7b. Here is my code:
however, I get the following error:
Can anyone help me identify what the issue is? I would very much appreciate any help.
The text was updated successfully, but these errors were encountered: