
Commit 54a9b6f

[Bug-fix:] QEFFAutoModelForImageTextToText class docstring fixing (#372)
This pull request addresses the issue with the code block in the class docstring of `QEFFAutoModelForImageTextToText`. Previously, the docstring was not displaying correctly on `gh-pages` due to an error in the Python code block.

---------

Signed-off-by: Abukhoyer Shaik <[email protected]>
1 parent bdcd7e5 commit 54a9b6f
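For context: reStructuredText requires a blank line between a `.. code-block:: python` directive and its indented body; without it, Sphinx typically reports an error for the directive and the snippet never renders, which is the likely cause of the `gh-pages` issue mentioned above. A minimal sketch of the corrected docstring shape; the summary line and the exact indentation are illustrative, only the added blank line is taken from the diff:

    class QEFFAutoModelForImageTextToText:
        """
        QEfficient wrapper for image-text-to-text models (illustrative summary line).

        .. code-block:: python

            # The blank line right above is the fix: without it, docutils treats the
            # indented lines as extra directive arguments and rendering fails.
            import requests
            from PIL import Image
            from transformers import AutoProcessor, TextStreamer
        """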

3 files changed (+14, -13 lines)

QEfficient/transformers/models/modeling_auto.py

+5 -4

@@ -1176,6 +1176,7 @@ class QEFFAutoModelForImageTextToText:
 :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True.
 
 .. code-block:: python
+
 import requests
 from PIL import Image
 from transformers import AutoProcessor, TextStreamer

@@ -1189,8 +1190,8 @@ class QEFFAutoModelForImageTextToText:
 image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
 
 ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc
-processor = AutoProcessor.from_pretrained(model_name, token=token)
-model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, attn_implementation="eager", kv_offload=False)
+processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
+model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False)
 
 ## STEP - 2 Export & Compile the Model
 model.compile(

@@ -1220,12 +1221,12 @@ class QEFFAutoModelForImageTextToText:
 return_tensors="pt",
 add_special_tokens=False,
 padding="max_length",
-max_length=prefill_seq_len,
+max_length=32,
 )
 
 ## STEP - 4 Run Inference on the compiled model
 streamer = TextStreamer(processor.tokenizer)
-model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
+model.generate(inputs=inputs, streamer=streamer, generation_len=512)
 
 """
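Beyond the added blank line, the diff replaces names the example never defined (`token`, `prefill_seq_len`, `generation_len`) with an explicit `HF_TOKEN` variable and literal values (`32`, `512`), so the snippet can be copied as-is. A rough sketch of how the corrected step 1 now reads; the model id, the token value, and the top-level import path are assumptions, not taken from the diff:

    import requests  # used later in the docstring example to fetch image_url
    from PIL import Image
    from transformers import AutoProcessor, TextStreamer

    from QEfficient import QEFFAutoModelForImageTextToText  # assumed import path

    HF_TOKEN = ""  # assumed: a Hugging Face access token, needed only for gated models
    model_name = "llava-hf/llava-1.5-7b-hf"  # assumed model id, not taken from the diff

    ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc
    processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
    model = QEFFAutoModelForImageTextToText.from_pretrained(
        model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False
    )
    # STEP - 2 onwards (compile, preprocess inputs, generate) continues as in the docstring above.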

tests/transformers/spd/test_pld_inference.py

+3 -3

@@ -145,9 +145,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
     """
     num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
     input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
-    assert (
-        input_len_padded <= ctx_len
-    ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
     return input_len_padded
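For reference, the helper this hunk touches reads as follows after the change (the docstring body is not shown in the diff and is paraphrased here), together with a small usage example:

    def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
        """Round input_len up to the next multiple of prefill_seq_len (paraphrased docstring)."""
        num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
        input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
        assert input_len_padded <= ctx_len, (
            "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
        )
        return input_len_padded

    # An 80-token prompt with a 32-token prefill chunk pads to 96, which must still fit in ctx_len:
    print(get_padded_input_len(80, 32, 128))  # 96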

tests/transformers/spd/test_spd_inference.py

+6 -6

@@ -75,9 +75,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
     """
     num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
     input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
-    assert (
-        input_len_padded <= ctx_len
-    ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
     return input_len_padded

@@ -320,9 +320,9 @@ def test_spec_decode_inference(
     for prompt, generation in zip(prompts, batch_decode):
         print(f"{prompt=} {generation=}")
     # validation check
-    assert mean_num_accepted_tokens == float(
-        num_speculative_tokens + 1
-    ), f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}"
+    assert mean_num_accepted_tokens == float(num_speculative_tokens + 1), (
+        f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}"
+    )
     del target_model_session
     del draft_model_session
     generated_ids = np.asarray(generated_ids[0]).flatten()
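The second hunk applies the same assert style to the validation check of the speculative-decoding test, which expects every draft token plus the one bonus token to be accepted on average. A toy illustration of that invariant; the per-iteration bookkeeping is invented here, only the assert comes from the test:

    num_speculative_tokens = 4
    accepted_per_iteration = [5, 5, 5]  # hypothetical counts: all 4 draft tokens + 1 bonus token accepted each step

    mean_num_accepted_tokens = sum(accepted_per_iteration) / len(accepted_per_iteration)
    assert mean_num_accepted_tokens == float(num_speculative_tokens + 1), (
        f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}"
    )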
