Add FollowTheFormatInstructionsRunExpander (#2590)

stanford-crfm · Apr 25, 2024 · 460f243 · 460f243
1 parent 5f6be0b
commit 460f243
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 67 deletions.
diff --git a/docs/reproducing_leaderboards.md b/docs/reproducing_leaderboards.md
@@ -37,7 +37,7 @@ The following specifies the appropriate parameters and configuration files for a
 ### Lite
 
 ```bash
-export RUN_ENTRIES_CONF_PATH=run_entries_dec2023.conf
+export RUN_ENTRIES_CONF_PATH=run_entries_lite_20240424.conf
 export SCHEMA_PATH=schema_lite.yaml
 export NUM_TRAIN_TRIALS=1
 export MAX_EVAL_INSTANCES=1000

diff --git a/src/helm/benchmark/presentation/run_entries_lite_20240424.conf b/src/helm/benchmark/presentation/run_entries_lite_20240424.conf
@@ -0,0 +1,133 @@
+# HELM scenarios.
+
+entries: [
+  # NarrativeQA
+  {description: "narrative_qa:model=text,follow_format_instructions=instruct", priority: 1}
+
+  # NaturalQuestions
+  {description: "natural_qa:model=text,mode=openbook_longans,follow_format_instructions=instruct", priority: 1}
+  {description: "natural_qa:model=text,mode=closedbook,follow_format_instructions=instruct", priority: 1}
+
+  # OpenbookQA
+  {description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint", priority: 1}
+
+  # MMLU
+  {description: "mmlu:model=text,subject=abstract_algebra", priority: 2}
+  {description: "mmlu:model=text,subject=anatomy", priority: 3}
+  {description: "mmlu:model=text,subject=college_chemistry", priority: 2}
+  {description: "mmlu:model=text,subject=computer_security", priority: 2}
+  {description: "mmlu:model=text,subject=econometrics", priority: 2}
+  {description: "mmlu:model=text,subject=global_facts", priority: 3}
+  {description: "mmlu:model=text,subject=jurisprudence", priority: 3}
+  {description: "mmlu:model=text,subject=philosophy", priority: 3}
+  {description: "mmlu:model=text,subject=professional_medicine", priority: 3}
+  {description: "mmlu:model=text,subject=us_foreign_policy", priority: 2}
+  {description: "mmlu:model=text,subject=astronomy", priority: 4}
+  {description: "mmlu:model=text,subject=business_ethics", priority: 4}
+  {description: "mmlu:model=text,subject=clinical_knowledge", priority: 4}
+  {description: "mmlu:model=text,subject=college_biology", priority: 4}
+  {description: "mmlu:model=text,subject=college_computer_science", priority: 4}
+  {description: "mmlu:model=text,subject=college_mathematics", priority: 4}
+  {description: "mmlu:model=text,subject=college_medicine", priority: 4}
+  {description: "mmlu:model=text,subject=college_physics", priority: 4}
+  {description: "mmlu:model=text,subject=conceptual_physics", priority: 4}
+  {description: "mmlu:model=text,subject=electrical_engineering", priority: 4}
+  {description: "mmlu:model=text,subject=elementary_mathematics", priority: 4}
+  {description: "mmlu:model=text,subject=formal_logic", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_biology", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_chemistry", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_computer_science", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_european_history", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_geography", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_government_and_politics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_macroeconomics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_mathematics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_microeconomics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_physics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_psychology", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_statistics", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_us_history", priority: 4}
+  {description: "mmlu:model=text,subject=high_school_world_history", priority: 4}
+  {description: "mmlu:model=text,subject=human_aging", priority: 4}
+  {description: "mmlu:model=text,subject=human_sexuality", priority: 4}
+  {description: "mmlu:model=text,subject=international_law", priority: 4}
+  {description: "mmlu:model=text,subject=logical_fallacies", priority: 4}
+  {description: "mmlu:model=text,subject=machine_learning", priority: 4}
+  {description: "mmlu:model=text,subject=management", priority: 4}
+  {description: "mmlu:model=text,subject=marketing", priority: 4}
+  {description: "mmlu:model=text,subject=medical_genetics", priority: 4}
+  {description: "mmlu:model=text,subject=miscellaneous", priority: 4}
+  {description: "mmlu:model=text,subject=moral_disputes", priority: 4}
+  {description: "mmlu:model=text,subject=moral_scenarios", priority: 4}
+  {description: "mmlu:model=text,subject=nutrition", priority: 4}
+  {description: "mmlu:model=text,subject=prehistory", priority: 4}
+  {description: "mmlu:model=text,subject=professional_accounting", priority: 4}
+  {description: "mmlu:model=text,subject=professional_law", priority: 4}
+  {description: "mmlu:model=text,subject=professional_psychology", priority: 4}
+  {description: "mmlu:model=text,subject=public_relations", priority: 4}
+  {description: "mmlu:model=text,subject=security_studies", priority: 4}
+  {description: "mmlu:model=text,subject=sociology", priority: 4}
+  {description: "mmlu:model=text,subject=virology", priority: 4}
+  {description: "mmlu:model=text,subject=world_religions", priority: 4}
+
+  # MATH
+  {description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+  {description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+  {description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+  {description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+  {description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+  {description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
+
+  {description: "math:model=text_code,subject=number_theory,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=algebra,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=prealgebra,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=geometry,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=counting_and_probability,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=precalculus,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+
+  {description: "math:model=text_code,subject=number_theory,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=algebra,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=prealgebra,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=geometry,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=counting_and_probability,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=precalculus,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+
+  {description: "math:model=text_code,subject=number_theory,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=algebra,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=prealgebra,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=geometry,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=counting_and_probability,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+  {description: "math:model=text_code,subject=precalculus,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
+
+  {description: "math:model=text_code,subject=number_theory,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=intermediate_algebra,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=algebra,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=prealgebra,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=geometry,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=counting_and_probability,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+  {description: "math:model=text_code,subject=precalculus,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
+
+  # GSM
+  {description: "gsm:model=text_code,follow_format_instructions=instruct", priority: 2}
+
+  # LegalBench
+  {description: "legalbench:model=text_code,subset=abercrombie,follow_format_instructions=instruct", priority: 2}
+  {description: "legalbench:model=text_code,subset=corporate_lobbying,follow_format_instructions=instruct", priority: 2}
+  {description: "legalbench:model=text_code,subset=international_citizenship_questions,follow_format_instructions=instruct", priority: 2}
+  {description: "legalbench:model=text_code,subset=function_of_decision_section,follow_format_instructions=instruct", priority: 2}
+  {description: "legalbench:model=text_code,subset=proa,follow_format_instructions=instruct", priority: 2}
+
+  # MedQA
+  {description: "med_qa:model=text_code", priority: 2}
+
+  # WMT14
+  {description: "wmt_14:language_pair=cs-en,model=text,follow_format_instructions=instruct", priority: 2}
+  {description: "wmt_14:language_pair=de-en,model=text,follow_format_instructions=instruct", priority: 2}
+  {description: "wmt_14:language_pair=fr-en,model=text,follow_format_instructions=instruct", priority: 2}
+  {description: "wmt_14:language_pair=hi-en,model=text,follow_format_instructions=instruct", priority: 2}
+  {description: "wmt_14:language_pair=ru-en,model=text,follow_format_instructions=instruct", priority: 2}
+]
diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py
@@ -8,12 +8,14 @@
     get_all_code_models,
     get_all_models,
     get_all_text_models,
+    get_model_metadata,
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
     TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
@@ -345,78 +347,37 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         return [run_spec]
 
 
-class OpenAIRunExpander(RunExpander):
-    """
-    Custom prompt for OpenAI models.
-    These models need more explicit instructions about following the format.
-    """
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
+class FollowFormatInstructionsRunExpander(RunExpander):
+    """Adds more explicit instructions about following the format to prompts.
 
-    name = "openai"
-
-    def __init__(self):
-        pass
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
-            return [run_spec]
-
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
+    The argument controls which models will receive these instructions.
+    If "all", all models receive these instructions.
+    If "instruct", only instruction-following models receive these instructions.
 
+    Only supports the generation adaptation method. Raises an error if used on
+    a RunSpec that uses a different adaptation method.
 
-class GoogleRunExpander(RunExpander):
-    """
-    Custom prompt for Google models.
-    These models need more explicit instructions about following the format.
+    Note: For legacy backwards compatibility reasons, despite the use of the word
+    "instructions" in this run expander's name, this run expander actually
+    modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+    the instructions.
     """
 
-    # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
+    name = "follow_format_instructions"
 
-    name = "google"
+    def __init__(self, value: str):
+        if value != "all" and value != "instruct":
+            raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+        self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method != ADAPT_GENERATION:
-            return [run_spec]
+            raise Exception("follow_format_instructions run expander only supports the generation adaptation method")
 
-        return [
-            replace(
-                run_spec,
-                name=run_spec.name,
-                adapter_spec=replace(
-                    run_spec.adapter_spec,
-                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
-                ),
-            ),
-        ]
-
-
-class MistralRunExpander(RunExpander):
-    """Custom prompt for Mistral models."""
-
-    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
-
-    name = "output_format_instructions"
-
-    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+        if (
+            self.value == "instruct"
+            and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+        ):
             return [run_spec]
 
         return [
@@ -1425,6 +1386,7 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
     NewlineRunExpander,
     StopRunExpander,
     FormatPromptRunExpander,
+    FollowFormatInstructionsRunExpander,
     AddToStopRunExpander,
     GlobalPrefixRunExpander,
     NumTrainTrialsRunExpander,

diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml
@@ -534,7 +534,7 @@ models:
     access: open
     num_parameters: 132000000000
     release_date: 2024-03-27
-    tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
 
   # DeepMind
@@ -670,15 +670,15 @@ models:
     creator_organization_name: Google
     access: limited
     release_date: 2023-12-13
-    tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
   - name: google/gemini-1.0-pro-001
     display_name: Gemini 1.0 Pro
     description: Gemini 1.0 Pro is a multimodal model able to reason across text, images, video, audio and code. ([paper](https://arxiv.org/abs/2312.11805))
     creator_organization_name: Google
     access: limited
     release_date: 2023-12-13
-    tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
     # Note: This is aliased to a snapshot of gemini-pro-vision. When possible, please use a versioned snapshot instead.
   - name: google/gemini-pro-vision
@@ -703,7 +703,7 @@ models:
     creator_organization_name: Google
     access: limited
     release_date: 2024-04-10
-    tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
+    tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]
 
   - name: google/gemma-2b
     display_name: Gemma (2B)