import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor
from transformers import BitsAndBytesConfig
import time
from PIL import Image
import requests

# Checkpoint to load. The value is a placeholder as written; presumably it is
# replaced with a real local path or Hub id before running -- TODO confirm.
model_id = "my model path"

# bitsandbytes settings: 4-bit loading, "nf4" quantization type, and bfloat16
# as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the model exactly once. Loading both variants back to back would leave
# only the last one bound to `model` while still paying the load time (and
# transient memory) of the first, and the benchmark below would never run
# against the quantized weights.
#
# False -> bfloat16 weights placed on CUDA.
# True  -> bitsandbytes quantized load described by `bnb_config`.
USE_4BIT_QUANTIZATION = False

if USE_4BIT_QUANTIZATION:
    load_kwargs = {"quantization_config": bnb_config}
else:
    load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "cuda"}

model = MllamaForConditionalGeneration.from_pretrained(model_id, **load_kwargs)

# The processor is loaded from the same checkpoint as the model; it builds the
# chat prompt, prepares the model inputs, and decodes the generated ids below.
processor = AutoProcessor.from_pretrained(model_id)

# Benchmark: time five identical generation runs on one example image.
#
# The image download and the prompt construction are the same for every run,
# so they happen once up front. Only input preprocessing and generation are
# inside the timed region, keeping network latency out of the measurement.
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
with requests.get(url, stream=True, timeout=30) as response:
    # Fail loudly on an HTTP error instead of handing PIL an error page.
    response.raise_for_status()
    image = Image.open(response.raw)
    # PIL reads pixel data lazily; force the read while the connection is
    # still open so the image is fully in memory before timing starts.
    image.load()

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Can you please describe this image in just one sentence?"}
    ]}
]

input_text = processor.apply_chat_template(
    messages, add_generation_prompt=True,
)

# NOTE(review): the first run may include one-time warm-up cost; compare the
# later runs when judging steady-state speed -- confirm on the target setup.
for _ in range(5):
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start = time.perf_counter()
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=70)

    print("Took time: ", time.perf_counter() - start)

    # Skip the prompt tokens so only the newly generated text is decoded.
    print(processor.decode(output[0][inputs["input_ids"].shape[-1]:]))

Inference with the bitsandbytes 4-bit quantized Llama 3.2 Vision model, loaded as described in the blog post https://huggingface.co/blog/llama32, takes more time than inference with the default (non-quantized) model load. #2510

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions