|
| 1 | +from copy import deepcopy |
| 2 | +from typing import TYPE_CHECKING, Any, Dict |
| 3 | + |
| 4 | +from datasets.formatting.formatting import LazyRow |
| 5 | +from loguru import logger |
| 6 | + |
| 7 | +from llmcompressor.transformers.finetune.data import TextGenerationDataset |
| 8 | +from llmcompressor.transformers.finetune.data.base import get_columns |
| 9 | +from llmcompressor.typing import DatasetType, Processor |
| 10 | + |
| 11 | +if TYPE_CHECKING: |
| 12 | + from llmcompressor.transformers import DataTrainingArguments as DataArgs |
| 13 | + |
| 14 | + |
| 15 | +@TextGenerationDataset.register(name="peoples_speech") |
| 16 | +class PeoplesSpeech(TextGenerationDataset): |
| 17 | + """ |
| 18 | + ML Commons People's Speech audio dataset |
| 19 | + |
| 20 | + Unfortunately, due to the specialized nature of audio model preprocessing, some |
| 21 | +    model-specific code must be defined here. This dataset has been tested with the |
| 22 | +    WhisperForConditionalGeneration and Qwen2AudioForConditionalGeneration model classes. |
| 23 | + |
| 24 | + :param data_args: configuration settings for dataset loading |
| 25 | + :param split: split from dataset to load, for instance `test` or `train[:5%]` |
| 26 | + :param processor: processor or tokenizer to use on dataset |
| 27 | + """ |
| 28 | + |
| 29 | + def __init__(self, data_args: "DataArgs", split: str, processor: Processor): |
| 30 | + data_args = deepcopy(data_args) |
| 31 | + data_args.dataset = "MLCommons/peoples_speech" |
| 32 | + data_args.dataset_config_name = "test" |
| 33 | + if not data_args.overwrite_cache: |
| 34 | + logger.warning( |
| 35 | + "Because audio processors are more complex, dataset mapping functions " |
| 36 | + "vary with model architecture and their results cannot be cached. " |
| 37 | + "Setting overwrite_cache=True" |
| 38 | + ) |
| 39 | + data_args.overwrite_cache = True |
| 40 | + self.processor_type = processor.__class__.__name__ |
| 41 | + |
| 42 | + super().__init__(data_args=data_args, split=split, processor=processor) |
| 43 | + |
| 44 | + def dataset_template(self, example): |
| 45 | + audio = example["audio"]["array"] |
| 46 | + sampling_rate = example["audio"]["sampling_rate"] |
| 47 | + |
| 48 | + if self.processor_type == "Qwen2AudioProcessor": |
| 49 | + messages = [ |
| 50 | + {"role": "user", "content": [{"audio": None}]}, |
| 51 | + {"role": "user", "content": [{"text": "What did the person say?"}]}, |
| 52 | + ] |
| 53 | + text = self.processor.apply_chat_template(messages) |
| 54 | + return {"audios": [audio], "sampling_rate": sampling_rate, "text": text} |
| 55 | + |
| 56 | + else: |
| 57 | + # chat template decoder ids are appended later by self.processor.__call__ |
| 58 | + text = " " + example["text"].capitalize() |
| 59 | + return {"audio": audio, "sampling_rate": sampling_rate, "text": text} |
| 60 | + |
| 61 | + def filter_tokenizer_args(self, dataset: DatasetType) -> DatasetType: |
| 62 | + if self.processor_type == "WhisperProcessor": |
| 63 | + tokenizer_args = ["audio", "sampling_rate", "text"] |
| 64 | + column_names = get_columns(dataset) |
| 65 | + |
| 66 | + return dataset.remove_columns(list(set(column_names) - set(tokenizer_args))) |
| 67 | + |
| 68 | + else: |
| 69 | + return super().filter_tokenizer_args(dataset) |
| 70 | + |
| 71 | + def tokenize(self, data: LazyRow) -> Dict[str, Any]: |
| 72 | + if self.processor_type == "WhisperProcessor": |
| 73 | + inputs = self.processor( |
| 74 | + audio=data["audio"], |
| 75 | + sampling_rate=data["sampling_rate"], |
| 76 | + text=data["text"], |
| 77 | + add_special_tokens=True, |
| 78 | + return_tensors="pt", |
| 79 | + ) |
| 80 | + |
| 81 | + # TODO: inputs["input_features"] is a float dtype, which may conflict with |
| 82 | +            # the dtype of the model. Add logic to the data pipeline to move inputs to |
| 83 | + # the matching model device and dtype |
| 84 | + inputs["decoder_input_ids"] = inputs["labels"] |
| 85 | + del inputs["labels"] |
| 86 | + |
| 87 | + return inputs |
| 88 | + |
| 89 | + else: |
| 90 | + return super().tokenize(data) |
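
For readers wiring this up: registering the class under `peoples_speech` should make it selectable by name through the dataset registry. A rough usage sketch follows; the model id, recipe path, and the exact `oneshot` keyword names are illustrative assumptions, not part of this commit.

```python
from transformers import WhisperForConditionalGeneration

from llmcompressor.transformers import oneshot

# Illustrative model choice; any Whisper checkpoint exercises the
# WhisperProcessor branch of the dataset defined above.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# "peoples_speech" is the registry name declared in this commit; the recipe
# path and calibration arguments are placeholders.
oneshot(
    model=model,
    dataset="peoples_speech",
    recipe="recipe.yaml",
    num_calibration_samples=64,
    splits={"calibration": "test[:64]"},
)
```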
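On the TODO in `tokenize`: the processor returns `input_features` as float32, which can clash with a model loaded in float16/bfloat16. A minimal sketch of the per-batch cast a data pipeline could apply, assuming plain PyTorch tensors (the helper name below is hypothetical):

```python
import torch


def align_batch_to_model(batch: dict, model: torch.nn.Module) -> dict:
    """Move tensors to the model's device; cast floating-point tensors
    (e.g. input_features) to the model's dtype, leaving integer tensors
    such as decoder_input_ids untouched."""
    param = next(model.parameters())
    aligned = {}
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            if value.is_floating_point():
                value = value.to(device=param.device, dtype=param.dtype)
            else:
                value = value.to(device=param.device)
        aligned[key] = value
    return aligned
```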