Enabled Infer CLI for VLM #287
Conversation
LGTM
Removing the onnx_defer_loading flag, which was originally removed in _[Removed onnx_defer_loading from Immutable Convertor Args. PR: 230]_ but got added back later in _[Mllama(single + dual) + InternVL(single) + Llava (single) PR: 267]_, possibly because of rebasing. Signed-off-by: Shubham Agrawal <[email protected]> Signed-off-by: Asmita Goswami <[email protected]>
This will create a config JSON file, which contains all the details about compilation and SDK versions. Currently, this code is added in the code block of QEFFAutoModelForCausalLM.compile. The config would look like below: ``` { "huggingface_config": { "vocab_size": 50257, "n_positions": 1024, "n_embd": 768, "n_layer": 12, "n_head": 12, "n_inner": null, "activation_function": "gelu_new", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "chunk_size_feed_forward": 0, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": [ "GPT2LMHeadModel" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "problem_type": null, "_name_or_path": "gpt2", "_commit_hash": "607a30d783dfa663caf39e06633721c8d4cfcd7e", "_attn_implementation_internal": "eager", "transformers_version": null, "model_type": "gpt2", "n_ctx": 1024 }, "qpc_config": { "QEff_config": { "pytorch_transforms": [ "AwqToMatmulNbitsTransform", "GPTQToMatmulNbitsTransform", "CustomOpsTransform", "KVCacheTransform" ], "onnx_transforms": [ "FP16ClipTransform", "SplitTensorsTransform" ], "onnx_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/GPT2LMHeadModel.onnx" }, "aic_compiler_config": { "apps_sdk_version": "1.20.0", "compile_dir": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47", "specializtions_file_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/specializations.json", "prefill_seq_len": 32, "ctx_len": 128, "batch_size": 1, "full_batch_size": null, "num_devices": 1, "num_cores": 16, "mxfp6_matmul": false, "mxint8_kv_cache": false, "num_speculative_tokens": null }, "qnn_config": { "enable_qnn": true, "qnn_config_path": "QEfficient/compile/qnn_config.json", "product": "QAIRT", "os": { "Ubuntu": 22.04, "Windows": 11 }, "sdk_flavor": [ "aic" ], "version": "2.31.0", "build_id": "250109072054_3882", "qnn_backend_api_version": "2.18.0", "tensorflow": "2.10.1", "tflite": "2.3.0", "torch": "1.13.1", "onnx": "1.16.1", "onnxruntime": "1.17.1", 
"onnxsimplifier": "0.4.36", "android-ndk": "r26c", "platform": "AIC.1.20.0.14" } } } ``` Note: The code structure may change. --------- Signed-off-by: Abukhoyer Shaik <[email protected]> Signed-off-by: Asmita Goswami <[email protected]>
… validation page (quic#303) Signed-off-by: Abukhoyer Shaik <[email protected]> Signed-off-by: Asmita Goswami <[email protected]>
These are just small fixes for printing the `QEFFAutoModelForCausalLM` instance, done by changing the `__repr__(self)` method. Signed-off-by: Abukhoyer Shaik <[email protected]> Signed-off-by: Asmita Goswami <[email protected]>
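A minimal sketch of what such a `__repr__` change could look like; the exact fields printed are an assumption, since the PR only states that `__repr__(self)` was updated:
```python
class QEFFAutoModelForCausalLM:
    # ... existing implementation ...

    def __repr__(self) -> str:
        # Assumption: show the wrapper class name followed by the wrapped
        # HuggingFace model's own repr, so `print(qeff_model)` stays readable.
        return self.__class__.__name__ + "\n" + self.model.__repr__()
```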
Signed-off-by: Asmita Goswami <[email protected]>
…o image_text_support
…ansformers into image_text_support
I think we should add one small multimodal model under CLI API testing.
"model_name" : ["gpt2"],
Signed-off-by: Asmita Goswami <[email protected]>
Added test_infer_vlm.py for testing.
Signed-off-by: Asmita Goswami <[email protected]>
TODO:
Signed-off-by: Asmita Goswami <[email protected]>
if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
    exec_info = execute_vlm_model(
        qeff_model=qeff_model,
        model_name=model_name,
TODO: Here, use load_hf_processor and load_streamer to load the processor and streamer. Create a list of conversations in _utils that is mapped to the model architecture. At the end, use qeff_model.generate as the else condition. This way the code will be more scalable and well formatted, and there will be no need for the execute_vlm_model function.
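A rough sketch of the refactor this comment describes, reusing the surrounding infer variables (architecture, model_name, qeff_model, tokenizer, prompt, image_url); the helper names come from the comment itself, while the conversation mapping, image handling, and call signatures are assumptions rather than the final implementation:
```python
import requests
from PIL import Image

# Hypothetical table kept in _utils: model architecture -> default conversation.
ARCHITECTURE_TO_CONVERSATION = {
    "MllamaForConditionalGeneration": [
        {"role": "user",
         "content": [{"type": "image"},
                     {"type": "text", "text": "Describe the image."}]},
    ],
    # ... one entry per supported VLM architecture ...
}

if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
    processor = load_hf_processor(model_name)        # assumed helper from the comment
    streamer = load_streamer(processor.tokenizer)    # assumed helper from the comment
    conversation = ARCHITECTURE_TO_CONVERSATION[architecture]
    prompt_text = processor.apply_chat_template(conversation, add_generation_prompt=True)
    image = Image.open(requests.get(image_url, stream=True).raw)
    inputs = processor(text=prompt_text, images=image, return_tensors="pt")
    exec_info = qeff_model.generate(inputs=inputs, streamer=streamer)  # assumed kwargs
else:
    # Existing text-only path kept as the else branch, per the comment.
    exec_info = qeff_model.generate(tokenizer=tokenizer, prompts=prompt)
```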
Signed-off-by: Asmita Goswami <[email protected]>
LGTM
Added support for enabling VLMs via CLI.
Sample command:
```
python -m QEfficient.cloud.infer --model_name meta-llama/Llama-3.2-11B-Vision-Instruct --batch_size 1 --prompt_len 32 --ctx_len 512 --num_cores 16 --device_group [0] --prompt "Describe the image?" --mos 1 --allocator_dealloc_delay 1 --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg
```