
Added --iteration and --automation flags #512

Draft · wants to merge 2 commits into main
4 changes: 4 additions & 0 deletions QEfficient/cloud/infer.py
@@ -171,6 +171,8 @@ def main(

image_path = kwargs.pop("image_path", None)
image_url = kwargs.pop("image_url", None)
iteration = kwargs.pop("iteration", 1)
automation = kwargs.pop("automation", False)

config = qeff_model.model.config
architecture = config.architectures[0] if config.architectures else None
@@ -234,6 +236,8 @@ def main(
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
iteration=iteration,
automation=automation,
)


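In main(), the two new options are read from the catch-all keyword arguments with safe defaults (one pass, no automation stats) and then forwarded to the generation call. A minimal, standalone sketch of that kwargs.pop pattern (illustration only; pop_run_options is a hypothetical helper, not the repository's argument-parsing code):

```python
# Standalone sketch of the kwargs.pop pattern used in infer.py's main():
# unrecognized options arrive as keyword arguments and fall back to defaults.
def pop_run_options(**kwargs):
    iteration = kwargs.pop("iteration", 1)        # run the whole generation once by default
    automation = kwargs.pop("automation", False)  # plain output unless automation stats are requested
    return iteration, automation

assert pop_run_options() == (1, False)
assert pop_run_options(iteration=3, automation=True) == (3, True)
```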
71 changes: 41 additions & 30 deletions QEfficient/generation/text_generation_inference.py
@@ -320,6 +320,7 @@ def cloud_ai_100_exec_kv(
stream: bool = True,
write_io_dir: Optional[str] = None,
automation=False,
iteration: int = 1,
prompt_to_lora_id_mapping: Optional[List[int]] = None,
is_tlm: bool = False,
):
@@ -341,6 +342,7 @@ def cloud_ai_100_exec_kv(
:stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``.
:Write_io_dir (str): Path to write the input and output files. ``Defaults to None``.
:automation (bool): If true, it prints input, output, and performance stats. ``Defaults to False``.
:iteration (int): Number of iterations to run the inference. ``Defaults to 1``.
:prompt_to_lora_id_mapping (List[int]): Mapping to associate prompts with their respective LoRA adapter.

Returns:
@@ -373,30 +375,34 @@ def cloud_ai_100_exec_kv(
full_batch_size=full_batch_size,
is_tlm=is_tlm,
)
if full_batch_size is None:
exec_info = [
generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping)
for i in range(0, len(prompt), batch_size)
]
prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info])
decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info])
total_perf = np.average([info.perf_metrics.total_perf for info in exec_info])
total_time = np.average([info.perf_metrics.total_time for info in exec_info])
generated_texts = [info.generated_texts for info in exec_info]
generated_ids = [info.generated_ids for info in exec_info]

exec_info = CloudAI100ExecInfo(
batch_size=batch_size,
generated_texts=generated_texts,
generated_ids=generated_ids,
perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
)
else:
exec_info = generate_text.generate(
prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
)

print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
for _ in range(0, int(iteration)):
if full_batch_size is None:
exec_info = [
generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping)
for i in range(0, len(prompt), batch_size)
]
prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info])
decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info])
total_perf = np.average([info.perf_metrics.total_perf for info in exec_info])
total_time = np.average([info.perf_metrics.total_time for info in exec_info])
generated_texts = [info.generated_texts for info in exec_info]
generated_ids = [info.generated_ids for info in exec_info]

exec_info = CloudAI100ExecInfo(
batch_size=batch_size,
generated_texts=generated_texts,
generated_ids=generated_ids,
perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
)
else:
exec_info = generate_text.generate(
prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
)

print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)

# TODO: Need to handle the case where exec_info is given for n iterations
return exec_info


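The loop above reruns the whole prefill/decode path `iteration` times and, as the TODO notes, each pass overwrites `exec_info`, so only the last run is returned. A hypothetical sketch of one way that TODO could be handled, collecting every run instead (run_once and the aggregation are assumptions for illustration, not code from this PR):

```python
# Hypothetical follow-up to the TODO above: keep the result of every iteration
# rather than only the last one. run_once() stands in for one full prefill +
# decode pass returning a CloudAI100ExecInfo-like object.
def run_iterations(run_once, iteration: int = 1):
    exec_infos = []
    for _ in range(int(iteration)):
        exec_infos.append(run_once())
    return exec_infos  # caller can average perf metrics across runs or report each one
```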
@@ -822,7 +828,9 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):

return decode_pause_time

def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None):
def run_decode(
self, decode_inputs, generation_len, automation, streamer: Optional[transformers.TextStreamer] = None
):
"""
Default method for running decode. Executes the decoding process for a given set of inputs and a specified generation length.

@@ -857,11 +865,11 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform
self.generated_ids[:, num_token] = decode_inputs["input_ids"][:, -1]
finished_sequences |= decode_inputs["input_ids"] == self.tokenizer.eos_token_id

if finished_sequences.all():
if finished_sequences.all() and not automation:
break
return num_token

def generate_decode_stream(self, decode_inputs, generation_len):
def generate_decode_stream(self, decode_inputs, generation_len, automation):
"""
Generator method for yielding decode tokens. Executes the decoding process for a given set of inputs and a specified generation length.

@@ -889,7 +897,7 @@ def generate_decode_stream(self, decode_inputs, generation_len):
self.generated_ids[:, num_token] = decode_inputs["input_ids"].squeeze(1)
finished_sequences |= decode_inputs["input_ids"] == self.tokenizer.eos_token_id

if finished_sequences.all():
if finished_sequences.all() and not automation:
break
yield decode_inputs["input_ids"] # yield the last token

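In both decode paths above, the early exit on EOS is now skipped when automation is enabled, so the loop always runs out the full generation_len and the timing numbers stay comparable across runs. A standalone sketch of that stop condition (illustration only, not the repository's code):

```python
import numpy as np

# Stop early on EOS only when automation mode is off, mirroring the
# `finished_sequences.all() and not automation` checks in the diff above.
def should_stop(finished_sequences: np.ndarray, automation: bool) -> bool:
    return bool(finished_sequences.all()) and not automation

assert should_stop(np.array([True, True]), automation=False)
assert not should_stop(np.array([True, True]), automation=True)    # keep decoding for perf runs
assert not should_stop(np.array([True, False]), automation=False)  # some sequences still running
```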
@@ -953,6 +961,7 @@ def _regular_model_execution(
prompt: List[str],
generation_len: Optional[int] = None,
stream: Optional[bool] = True,
automation: Optional[bool] = False,
prompt_to_lora_id_mapping: Optional[List[int]] = None,
):
"""
@@ -980,7 +989,7 @@ def _regular_model_execution(
decode_inputs = self._qaic_model.prepare_decode_inputs()

loop_start = perf_counter() # Start decode loop timer
num_token = self._qaic_model.run_decode(decode_inputs, generation_len, self._text_streamer)
num_token = self._qaic_model.run_decode(decode_inputs, generation_len, automation, self._text_streamer)
end = perf_counter()
generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True)

@@ -1034,6 +1043,7 @@ def generate_stream_tokens(
self,
prompt: List[str],
generation_len: Optional[int] = None,
automation: Optional[bool] = False,
prompt_to_lora_id_mapping: Optional[List[int]] = None,
):
"""
@@ -1063,7 +1073,7 @@ def generate_stream_tokens(

loop_start = perf_counter() # Start decode loop timer
num_token = 0
for token_id in self._qaic_model.generate_decode_stream(decode_inputs, generation_len):
for token_id in self._qaic_model.generate_decode_stream(decode_inputs, generation_len, automation):
decoded_tokens = []
for idx in range(self._qaic_model.batch_size):
decoded_tokens.append(self._tokenizer.decode(token_id[idx], skip_special_tokens=True))
@@ -1082,6 +1092,7 @@ def generate(
prompt: List[str],
generation_len: Optional[int] = None,
stream: bool = True,
automation: Optional[bool] = False,
prompt_to_lora_id_mapping: Optional[List[int]] = None,
):
"""
@@ -1105,7 +1116,7 @@ def generate(
if stream:
print("\nPrompt : " + prompt[0] + "\nCompletion :", flush=True, end="")
perf_metrics, generated_texts = self._regular_model_execution(
prompt, generation_len, stream, prompt_to_lora_id_mapping
prompt, generation_len, stream, automation, prompt_to_lora_id_mapping
)

if stream:
2 changes: 2 additions & 0 deletions QEfficient/transformers/models/modeling_auto.py
@@ -1892,6 +1892,8 @@ def generate(
prompt=prompts,
device_id=device_id,
generation_len=generation_len,
automation=kwargs.pop("automation", False),
iteration=kwargs.pop("iteration", 1),
is_tlm=self.is_tlm,
)
else:
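A hedged usage sketch of the high-level API path touched in modeling_auto.py: iteration and automation ride along in **kwargs and are popped before the call into cloud_ai_100_exec_kv, as shown in the hunk above. The model name, tokenizer setup, and compile arguments below are assumptions for illustration and are not part of this diff:

```python
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM  # assumed import path for the auto class

model_name = "gpt2"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_name)

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
qeff_model.compile(num_cores=16)  # compile step and arguments assumed from the library's usual flow

# iteration/automation are forwarded through **kwargs to cloud_ai_100_exec_kv.
qeff_model.generate(tokenizer, prompts=["My name is"], iteration=3, automation=True)
```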