Enabled VLMs via CLI on v1.19.3 #297

Status: Closed · wants to merge 14 commits
9 changes: 5 additions & 4 deletions QEfficient/base/common.py
@@ -15,10 +15,9 @@
from typing import Any

from transformers import AutoConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

from QEfficient.base.modeling_qeff import QEFFBaseModel
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.transformers.modeling_utils import model_class_mapping


class QEFFCommonLoader:
@@ -42,8 +41,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
architecture = config.architectures[0] if config.architectures else None

if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
model_class = QEFFAutoModelForCausalLM
class_name = model_class_mapping.get(architecture)
if class_name:
module = __import__("QEfficient.transformers.models.modeling_auto")
model_class = getattr(module, class_name)
else:
raise NotImplementedError(
f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
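Note on the loader change above: `__import__("QEfficient.transformers.models.modeling_auto")` returns the top-level `QEfficient` package (that is how `__import__` behaves without a `fromlist`), so the `getattr` lookup relies on the auto classes also being re-exported at the package root. Below is a minimal, self-contained sketch of the same architecture-to-class resolution, using `importlib` and an illustrative subset of the mapping rather than the real table.

import importlib

# Illustrative subset of the architecture -> QEFF class-name table; the real
# mapping is built in QEfficient/transformers/modeling_utils.py (see the diff
# further down) from the transformers auto-model mappings.
MODEL_CLASS_MAPPING = {
    "LlamaForCausalLM": "QEFFAutoModelForCausalLM",
    "MllamaForConditionalGeneration": "QEFFAutoModelForImageTextToText",
}

def resolve_qeff_class(architecture: str):
    # Return the QEFF auto class registered for a given HF architecture name.
    class_name = MODEL_CLASS_MAPPING.get(architecture)
    if class_name is None:
        raise NotImplementedError(f"Unknown architecture={architecture}")
    # importlib.import_module returns the named submodule itself, so this does
    # not depend on a package-root re-export.
    module = importlib.import_module("QEfficient.transformers.models.modeling_auto")
    return getattr(module, class_name)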
6 changes: 5 additions & 1 deletion QEfficient/base/modeling_qeff.py
@@ -23,7 +23,7 @@
from QEfficient.base.pytorch_transforms import PytorchTransform
from QEfficient.compile.qnn_compiler import compile as qnn_compile
from QEfficient.generation.cloud_infer import QAICInferenceSession
from QEfficient.utils import constants
from QEfficient.utils import constants, dump_qconfig
from QEfficient.utils._utils import load_json
from QEfficient.utils.cache import QEFF_HOME, to_hashable

@@ -211,6 +211,7 @@ def _export(
self.onnx_path = onnx_path
return onnx_path

@dump_qconfig
def _compile(
self,
onnx_path: Optional[str] = None,
@@ -336,8 +337,10 @@ def _compile(
)

self.qpc_path = qpc_path

return qpc_path

@dump_qconfig
def _qnn_compile(
self,
onnx_path: Optional[str] = None,
@@ -435,4 +438,5 @@ def _qnn_compile(
)

self.qpc_path = qpc_path

return qpc_path
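The `@dump_qconfig` decorator applied to `_compile` and `_qnn_compile` lives in `QEfficient.utils`; its implementation is not shown in this diff. The sketch below is only an assumption of the general pattern: run the wrapped compile, then write the call's parameters (plus the model config exposed through the new `get_model_config` properties) to a JSON file next to the returned QPC, without ever failing the compilation itself.

import functools
import json
import os

def dump_qconfig_sketch(func):
    # Hypothetical stand-in for QEfficient.utils.dump_qconfig; the real decorator
    # may record different fields and use a different file name.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        qpc_path = func(self, *args, **kwargs)
        try:
            qconfig = {
                "compile_kwargs": {k: str(v) for k, v in kwargs.items()},
                "model_config": getattr(self, "get_model_config", {}),
            }
            with open(os.path.join(os.path.dirname(str(qpc_path)), "qconfig.json"), "w") as fp:
                json.dump(qconfig, fp, indent=4)
        except Exception as exc:
            # Bookkeeping must never break compilation.
            print(f"Skipping qconfig dump: {exc}")
        return qpc_path
    return wrapper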
87 changes: 68 additions & 19 deletions QEfficient/cloud/infer.py
@@ -10,8 +10,13 @@
import sys
from typing import List, Optional

import requests
from PIL import Image
from transformers import AutoProcessor, TextStreamer
from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES

from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer
from QEfficient.utils.logging_utils import logger


@@ -65,18 +70,16 @@ def main(
:allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
:kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
-allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
-qpc_crc=True -> -qpc-crc

.. code-block:: bash

python -m QEfficient.cloud.infer OPTIONS

"""
cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
tokenizer = load_hf_tokenizer(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)

if "--mxfp6" in sys.argv:
if args.mxfp6:
@@ -93,6 +96,16 @@
local_model_dir=local_model_dir,
)

image_path = kwargs.pop("image_path", None)
image_url = kwargs.pop("image_url", None)

config = qeff_model.model.config
architecture = config.architectures[0] if config.architectures else None
if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
img_size = kwargs.pop("img_size", None)
if img_size or image_path or image_url:
logger.warning(f"Skipping image arguments as they are not valid for {architecture}")

#########
# Compile
#########
@@ -116,14 +129,47 @@
#########
# Execute
#########
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
)
if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
processor = AutoProcessor.from_pretrained(model_name, use_fast=False)

if not (image_url or image_path):
raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)

conversation = constants.Constants.conversation
conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt

# Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids.
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

split_inputs = processor(
text=input_text,
images=raw_image,
return_tensors="pt",
add_special_tokens=False,
)
streamer = TextStreamer(processor.tokenizer)
output = qeff_model.generate(
inputs=split_inputs,
streamer=streamer,
device_ids=device_group,
generation_len=generation_len,
)
print(output)
Review comment (Contributor):
We should not be printing it this way in infer. Can we set a verbose level and print it accordingly from the Auto classes themselves, @ochougul @quic-amitraj?

else:
tokenizer = load_hf_tokenizer(
pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
cache_dir=cache_dir,
hf_token=hf_token,
)
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
)


if __name__ == "__main__":
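For reference, the new image-text execute path can also be driven outside the CLI. The sketch below mirrors the branch above with standard `transformers` preprocessing; the checkpoint name, image URL, and the `QEFFAutoModelForImageTextToText.from_pretrained`/`compile` calls are illustrative assumptions, while the `generate(...)` keywords are taken verbatim from this diff.

import requests
from PIL import Image
from transformers import AutoProcessor, TextStreamer

from QEfficient import QEFFAutoModelForImageTextToText  # assumed package-root export

model_name = "llava-hf/llava-1.5-7b-hf"       # placeholder VLM checkpoint
image_url = "https://example.com/sample.png"  # placeholder image
prompt = "Describe the image."

processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
raw_image = Image.open(requests.get(image_url, stream=True).raw)

# Build a single-turn conversation and render it through the chat template.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }
]
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
inputs = processor(text=input_text, images=raw_image, return_tensors="pt", add_special_tokens=False)

qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(model_name)
qeff_model.compile(num_devices=1)  # compile arguments assumed; see the class docs for the full set
output = qeff_model.generate(
    inputs=inputs,
    streamer=TextStreamer(processor.tokenizer),
    device_ids=[0],
    generation_len=128,
)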
@@ -220,18 +266,21 @@ def main(
"--enable_qnn",
"--enable-qnn",
action="store_true",
nargs="?",
const=True,
type=str,
default=False,
help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
If not provided, the default configuration will be used.\
Sample Config: QEfficient/compile/qnn_config.json",
)
parser.add_argument(
"qnn_config",
nargs="?",
type=str,
)

args, compiler_options = parser.parse_known_args()

if isinstance(args.enable_qnn, str):
args.qnn_config = args.enable_qnn
args.enable_qnn = True

compiler_options_dict = {}
for i in range(0, len(compiler_options)):
if compiler_options[i].startswith("--"):
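The reworked `--enable_qnn` argument uses `nargs="?"` with `const=True` so the flag works both bare (enable QNN with the default config) and with an explicit config path, and the post-parse `isinstance(..., str)` check splits the two cases into `enable_qnn`/`qnn_config`. A standalone sketch of just that parsing pattern (the example arguments are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_qnn",
    nargs="?",      # the value after the flag is optional
    const=True,     # bare `--enable_qnn` -> True
    type=str,       # `--enable_qnn cfg.json` -> "cfg.json"
    default=False,  # flag absent -> False
)
args, extra = parser.parse_known_args(["--enable_qnn", "my_qnn_config.json"])

qnn_config = None
if isinstance(args.enable_qnn, str):  # a config path was supplied with the flag
    qnn_config = args.enable_qnn
    args.enable_qnn = True

print(args.enable_qnn, qnn_config)  # True my_qnn_config.json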
4 changes: 4 additions & 0 deletions QEfficient/peft/auto.py
@@ -107,6 +107,10 @@ def model_hash(self) -> str:
mhash = mhash.hexdigest()[:16]
return mhash

@property
def get_model_config(self) -> dict:
return self.model.get_base_model().config.__dict__

def load_adapter(self, model_id: str, adapter_name: str):
"""Loads a new adapter from huggingface hub or local path

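Both PEFT wrappers now expose the wrapped Hugging Face config as a plain dict through `get_model_config`, which lets a generic consumer (for example the qconfig dump above) serialize or hash the configuration without knowing the wrapper type. A small hedged example of such a consumer; `config_hash` is a hypothetical helper, not part of this PR:

import hashlib
import json

def config_hash(qeff_model) -> str:
    # Works with any wrapper exposing the `get_model_config` property added in this PR.
    cfg = qeff_model.get_model_config
    blob = json.dumps(cfg, sort_keys=True, default=str).encode()
    return hashlib.sha256(blob).hexdigest()[:16]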
4 changes: 4 additions & 0 deletions QEfficient/peft/lora/auto.py
@@ -90,6 +90,10 @@ def model_hash(self) -> str:
mhash = mhash.hexdigest()[:16]
return mhash

@property
def get_model_config(self) -> dict:
return self.model.model.config.__dict__

def download_adapter(
self,
adapter_model_id: str,
10 changes: 10 additions & 0 deletions QEfficient/transformers/modeling_utils.py
@@ -10,6 +10,7 @@

import torch
import torch.nn as nn
import transformers.models.auto.modeling_auto as mapping
from transformers.models.codegen.modeling_codegen import (
CodeGenAttention,
CodeGenBlock,
@@ -272,6 +273,15 @@
}


model_class_mapping = {
**{architecture: "QEFFAutoModelForCausalLM" for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()},
**{
architecture: "QEFFAutoModelForImageTextToText"
for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()
},
}


def _prepare_cross_attention_mask(
cross_attention_mask: torch.Tensor,
num_vision_tokens: int,
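`model_class_mapping` is built from the `transformers` auto-model name tables, so the set of architectures routed to each QEFF class tracks the installed `transformers` version. A quick way to inspect those tables (assumes a `transformers` release recent enough to define the image-text-to-text mapping):

from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)

causal_archs = set(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values())
vlm_archs = set(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values())

print(f"{len(causal_archs)} architectures -> QEFFAutoModelForCausalLM")
print(f"{len(vlm_archs)} architectures -> QEFFAutoModelForImageTextToText")
print("LlavaForConditionalGeneration" in vlm_archs)  # expected True on recent transformers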