MuTox Scenario (#3343)

Co-authored-by: Haoqin Tu <[email protected]>
stanford-crfm · Feb 17, 2025 · 6552e67 · 6552e67
1 parent 432dcd3
commit 6552e67
Show file tree

Hide file tree

Showing 8 changed files with 335 additions and 314 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -282,6 +282,9 @@ audiolm =
     crfm-helm[openai]
     crfm-helm[google]
 
+    # For clipping audio
+    pydub~=0.25.1
+
     # For HuggingFace audio datasets
     soundfile~=0.12
     librosa~=0.10

diff --git a/src/helm/benchmark/presentation/run_entries_audio.conf b/src/helm/benchmark/presentation/run_entries_audio.conf
@@ -2,7 +2,6 @@ entries: [
     ####################################################################################################################
     # Auditory Perception
     ####################################################################################################################
-    {description: "audio_mnist:model=audiolm", priority: 1}
     {description: "meld_audio:model=audiolm", priority: 1}
     {description: "vocal_sound:model=audiolm", priority: 1}
     {description: "audiocaps:model=audiolm", priority: 1}
@@ -51,4 +50,39 @@ entries: [
     {description: "common_voice_15:language=German,model=audiolm", priority: 1}
     {description: "common_voice_15:language=French,model=audiolm", priority: 1}
 
+    ####################################################################################################################
+    # Toxicity
+    ####################################################################################################################
+
+    {description: "mutox:language=Arabic,model=audiolm", priority: 1}
+    {description: "mutox:language=Bengali,model=audiolm", priority: 1}
+    {description: "mutox:language=Bulgarian,model=audiolm", priority: 1}
+    {description: "mutox:language=Catalan,model=audiolm", priority: 1}
+    {description: "mutox:language=Czech,model=audiolm", priority: 1}
+    {description: "mutox:language=Mandarin_Chinese,model=audiolm", priority: 1}
+    {description: "mutox:language=Danish,model=audiolm", priority: 1}
+    {description: "mutox:language=German,model=audiolm", priority: 1}
+    {description: "mutox:language=Greek,model=audiolm", priority: 1}
+    {description: "mutox:language=English,model=audiolm", priority: 1}
+    {description: "mutox:language=Estonian,model=audiolm", priority: 1}
+    # {description: "mutox:language=Western_Persian,model=audiolm", priority: 1}
+    {description: "mutox:language=Finnish,model=audiolm", priority: 1}
+    {description: "mutox:language=French,model=audiolm", priority: 1}
+    {description: "mutox:language=Hebrew,model=audiolm", priority: 1}
+    {description: "mutox:language=Hindi,model=audiolm", priority: 1}
+    {description: "mutox:language=Hungarian,model=audiolm", priority: 1}
+    {description: "mutox:language=Indonesian,model=audiolm", priority: 1}
+    {description: "mutox:language=Italian,model=audiolm", priority: 1}
+    {description: "mutox:language=Dutch,model=audiolm", priority: 1}
+    {description: "mutox:language=Polish,model=audiolm", priority: 1}
+    {description: "mutox:language=Portuguese,model=audiolm", priority: 1}
+    {description: "mutox:language=Russian,model=audiolm", priority: 1}
+    {description: "mutox:language=Spanish,model=audiolm", priority: 1}
+    {description: "mutox:language=Slovak,model=audiolm", priority: 1}
+    {description: "mutox:language=Swahili,model=audiolm", priority: 1}
+    {description: "mutox:language=Tagalog,model=audiolm", priority: 1}
+    {description: "mutox:language=Turkish,model=audiolm", priority: 1}
+    {description: "mutox:language=Urdu,model=audiolm", priority: 1}
+    {description: "mutox:language=Vietnamese,model=audiolm", priority: 1}
+]
 ]
diff --git a/src/helm/benchmark/presentation/run_entries_audio_debug.conf b/src/helm/benchmark/presentation/run_entries_audio_debug.conf
@@ -0,0 +1,3 @@
+entries: [
+    {description: "mutox:language=Arabic,model=audiolm", priority: 1}
+]
diff --git a/src/helm/benchmark/presentation/run_entries_audio_v0.conf b/src/helm/benchmark/presentation/run_entries_audio_v0.conf
diff --git a/src/helm/benchmark/run_specs/audio_run_specs.py b/src/helm/benchmark/run_specs/audio_run_specs.py
@@ -163,6 +163,26 @@ def get_meld_audio_run_spec() -> RunSpec:
     )
 
 
+@run_spec_function("mutox")
+def get_mutox_audio_run_spec(language: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.mutox_scenario.MuToxScenario",
+        args={"language": language},
+    )
+    adapter_spec: AdapterSpec = _get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "mutox"
+    return RunSpec(
+        name=f"{run_spec_name}:language={language}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
 @run_spec_function("covost2")
 def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
     scenario_spec = ScenarioSpec(

diff --git a/src/helm/benchmark/scenarios/audio_language/mutox_scenario.py b/src/helm/benchmark/scenarios/audio_language/mutox_scenario.py
@@ -0,0 +1,254 @@
+from io import BytesIO
+from typing import List
+import os
+import requests
+
+from pydub import AudioSegment
+from tqdm import tqdm
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.audio_utils import is_invalid_audio_file
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog, htrack_block
+
+
+class MuToxScenario(Scenario):
+    """
+    MuTox: MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector
+
+    MuTox, the first highly multilingual audio-based dataset with toxicity labels. The dataset consists of 20k
+    audio utterances for English and Spanish, and 4k for the other languages. To showcase the quality of this
+    dataset, we train the MuTox audio-based toxicity classifier, which allows zero-shot toxicity detection across
+    a broad range of languages. This classifier outperforms existing text-based trainable classifiers by more than
+    1% AUC, while increasing the language coverage from 8 to 100+ languages. When compared to a wordlist-based
+    classifier that covers a similar number of languages, MuTox improves precision and recall by ∼2.5 times.
+
+    Languages:
+        "Arabic": "arb",
+        "Bengali": "ben",
+        "Bulgarian": "bul",
+        "Catalan": "cat",
+        "Czech": "ces",
+        "Mandarin Chinese": "cmn",
+        "Danish": "dan",
+        "German": "deu",
+        "Greek": "ell",
+        "English": "eng",
+        "Estonian": "est",
+        "Western Persian": "fas",
+        "Finnish": "fin",
+        "French": "fra",
+        "Hebrew": "heb",
+        "Hindi": "hin",
+        "Hungarian": "hun",
+        "Indonesian": "ind",
+        "Italian": "ita",
+        "Dutch": "nld",
+        "Polish": "pol",
+        "Portuguese": "por",
+        "Russian": "rus",
+        "Spanish": "spa",
+        "Slovak": "slk",
+        "Swahili": "swh",
+        "Tagalog": "tgl",
+        "Turkish": "tur",
+        "Urdu": "urd",
+        "Vietnamese": "vie",
+
+    The columns of the dataset are:
+
+    id: a string id of the segment;
+    lang: 3-letter language code;
+    partition: one of train, dev, or devtest
+    public_url_segment: a string formatted as url:start:end, where start and end are indicated in milliseconds;
+    audio_file_transcript: text transctiption of the segment;
+    contains_toxicity, toxicity_types, perlocutionary_effects: annotation results as strings
+    label: an integer label, equal to 1 if contains_toxicity equals Yes and 0 otherwise;
+    etox_result: toxic word (or multiple words, separated by |) detected by the Etox matcher;
+    detoxify_score: toxicity probabilities predicted by the Detoxify system (float numbers between 0 and 1);
+    mutox_speech_score, mutox_text_score, mutox_zero_shot_speech_score, mutox_zero_shot_text_score: MuTox predictions
+    as float numbers with any value (they can be interpreted as logits,
+    i.e. probabilities before a sigmoid transformation).
+
+    Citation:
+
+    @misc{costajussà2023mutox,
+      title={MuTox: Universal MUltilingual Audio-based TOXicity Dataset and Zero-shot Detector},
+      author={ Marta R. Costa-jussà, Mariano Coria Meglioli, Pierre Andrews, David Dale, Prangthip Hansanti,
+      Elahe Kalbassi, Alex Mourachko, Christophe Ropers, Carleigh Wood},
+      year={2023},
+      eprint={},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+    }
+    """
+
+    ANNOTATIONS_URL = "https://dl.fbaipublicfiles.com/seamless/datasets/mutox.tsv"
+
+    LANGAUGE_CODES = {
+        "Arabic": "arb",
+        "Bengali": "ben",
+        "Bulgarian": "bul",
+        "Catalan": "cat",
+        "Czech": "ces",
+        "Mandarin_Chinese": "cmn",
+        "Danish": "dan",
+        "German": "deu",
+        "Greek": "ell",
+        "English": "eng",
+        "Estonian": "est",
+        "Western_Persian": "fas",
+        "Finnish": "fin",
+        "French": "fra",
+        "Hebrew": "heb",
+        "Hindi": "hin",
+        "Hungarian": "hun",
+        "Indonesian": "ind",
+        "Italian": "ita",
+        "Dutch": "nld",
+        "Polish": "pol",
+        "Portuguese": "por",
+        "Russian": "rus",
+        "Spanish": "spa",
+        "Slovak": "slk",
+        "Swahili": "swh",
+        "Tagalog": "tgl",
+        "Turkish": "tur",
+        "Urdu": "urd",
+        "Vietnamese": "vie",
+    }
+
+    name = "mutox"
+    description = "Toxicity detection benchmark ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060))."
+    tags = ["audio", "classification", "toxicity "]
+
+    @staticmethod
+    def track_bad_audio_file(bad_audio_file: str, output_path: str) -> None:
+        """
+        Many of the links do not exist or point to broken so we keep track of them
+        and skip them in the future runs to significantly speed up gathering the instances.
+        """
+        with open(output_path, "a") as f:
+            f.write(bad_audio_file + "\n")
+
+    def __init__(self, language: str) -> None:
+        super().__init__()
+        self._language_code: str = self.LANGAUGE_CODES[language]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations
+        annotations_path: str = os.path.join(output_path, "mutox.tsv")
+        ensure_file_downloaded(self.ANNOTATIONS_URL, annotations_path)
+
+        # Read bad audio files
+        bad_audio_files: set[str] = set()
+        bad_audio_files_path: str = os.path.join(output_path, "bad_audio_files.txt")
+        if os.path.exists(bad_audio_files_path):
+            # Each line is the audio file name
+            with open(bad_audio_files_path, "r") as f:
+                for line in f:
+                    bad_audio_files.add(line.strip())
+            hlog(f"Found {len(bad_audio_files)} bad audio files.")
+
+        # Where the audio files will be downloaded to
+        audio_path: str = os.path.join(output_path, "audio")
+        ensure_directory_exists(audio_path)
+
+        instances: List[Instance] = []
+        df = pd.read_csv(annotations_path, delimiter="\t")
+        hlog(f"Found {len(df)} rows in the dataset")
+
+        valid_count: int = 0
+        total_count: int = 0
+        for row in tqdm(df.itertuples()):
+            # Only proces examples that are in devtest and the language we're interested in
+            if row.partition != "devtest":
+                continue
+
+            if row.lang != self._language_code:
+                continue
+
+            total_count += 1
+
+            # Discard known bad audio files
+            audio_filename: str = f"{row.id}.mp3"
+            with htrack_block(f"Processing audio file: {audio_filename}"):
+                if audio_filename in bad_audio_files:
+                    hlog(f"Skipping this example -- known bad audio file: {audio_filename}")
+                    continue
+
+                local_audio_path: str = os.path.join(audio_path, audio_filename)
+                if not os.path.exists(local_audio_path):
+                    # The provided URL has the complete audio, so we need to download it and clip it
+                    # public_url_segment: a string formatted as url:start:end,
+                    if not isinstance(row.public_url_segment, str):
+                        # Sometimes URL is just a float causing an error. Skip those.
+                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
+                        continue
+
+                    parts = row.public_url_segment.split()
+                    if len(parts) != 3:
+                        hlog(f"Skipping this example -- invalid URL: {row.public_url_segment}")
+                        continue
+
+                    audio_url, start_ms_str, end_ms_str = parts
+                    start_ms, end_ms = int(start_ms_str), int(end_ms_str)
+
+                    # Download the full audio file
+                    try:
+                        response = requests.get(audio_url, stream=True)
+                        response.raise_for_status()
+
+                        # Load audio from the downloaded file
+                        audio = AudioSegment.from_file(BytesIO(response.content))
+
+                        # Clip the audio
+                        clipped_audio = audio[start_ms:end_ms]
+
+                        # Save the clipped file
+                        clipped_audio.export(local_audio_path, format="mp3")
+                    except Exception as e:
+                        # Some files are no longer available or invalid
+                        hlog(
+                            f"Skipping this example -- error downloading / processing audio file from {audio_url}: {e}"
+                        )
+                        self.track_bad_audio_file(audio_filename, bad_audio_files_path)
+                        continue
+
+                if is_invalid_audio_file(local_audio_path):
+                    hlog(f"Skipping this example -- Invalid audio file even after downloading at {local_audio_path}")
+                    self.track_bad_audio_file(audio_filename, bad_audio_files_path)
+                    continue
+                else:
+                    valid_count += 1
+                    hlog(f"Valid audio file found at {local_audio_path} (valid/total: {valid_count}/{total_count})")
+
+            input = Input(
+                multimedia_content=MultimediaObject(
+                    media_objects=[
+                        # Following what was prompted annotators as outlined in the paper
+                        MediaObject(text="Does the utterance contain toxicity?", content_type="text/plain"),
+                        MediaObject(location=local_audio_path, content_type="audio/mpeg"),
+                    ]
+                )
+            )
+
+            is_toxic: bool = row.label == 1
+            references = [
+                Reference(Output(text="Yes"), tags=[CORRECT_TAG] if is_toxic else []),
+                Reference(Output(text="No"), tags=[CORRECT_TAG] if not is_toxic else []),
+            ]
+            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))
+
+        assert len(instances) > 0, f"No instances found for language: {self._language_code}"
+        return instances
diff --git a/src/helm/benchmark/static/schema_audio.yaml b/src/helm/benchmark/static/schema_audio.yaml
@@ -220,6 +220,7 @@ run_groups:
       - speech_robust_bench
       - meld_audio
       - air_bench_chat
+      - mutox
 
   - name: audio_mnist
     display_name: AudioMNIST
@@ -504,4 +505,22 @@ run_groups:
       what: audio, question, and answer of audio samples
       who: real speakers
       when: "2024"
-      language: English
+      language: English
+
+  - name: mutox
+    display_name: MuTox
+    description: >
+     The dataset consists of 20k audio utterances for English and Spanish, and 4k for the other languages.
+     ([Costa-jussà et al, 2024](https://arxiv.org/abs/2401.05060)).
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: toxicity detection
+      what: samples of utterances
+      who: real speakers
+      when: "2024"
+      language: 30 languages