Skip to content

Commit

Permalink
Add FollowTheFormatInstructionsRunExpander (#2590)
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai authored Apr 25, 2024
1 parent 5f6be0b commit 460f243
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 67 deletions.
2 changes: 1 addition & 1 deletion docs/reproducing_leaderboards.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ The following specifies the appropriate parameters and configuration files for a
### Lite

```bash
export RUN_ENTRIES_CONF_PATH=run_entries_dec2023.conf
export RUN_ENTRIES_CONF_PATH=run_entries_lite_20240424.conf
export SCHEMA_PATH=schema_lite.yaml
export NUM_TRAIN_TRIALS=1
export MAX_EVAL_INSTANCES=1000
Expand Down
133 changes: 133 additions & 0 deletions src/helm/benchmark/presentation/run_entries_lite_20240424.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Run entries for the HELM Lite leaderboard.

entries: [
# NarrativeQA
{description: "narrative_qa:model=text,follow_format_instructions=instruct", priority: 1}

# NaturalQuestions
{description: "natural_qa:model=text,mode=openbook_longans,follow_format_instructions=instruct", priority: 1}
{description: "natural_qa:model=text,mode=closedbook,follow_format_instructions=instruct", priority: 1}

# OpenbookQA
{description: "commonsense:model=text_code,dataset=openbookqa,method=multiple_choice_joint", priority: 1}

# MMLU
{description: "mmlu:model=text,subject=abstract_algebra", priority: 2}
{description: "mmlu:model=text,subject=anatomy", priority: 3}
{description: "mmlu:model=text,subject=college_chemistry", priority: 2}
{description: "mmlu:model=text,subject=computer_security", priority: 2}
{description: "mmlu:model=text,subject=econometrics", priority: 2}
{description: "mmlu:model=text,subject=global_facts", priority: 3}
{description: "mmlu:model=text,subject=jurisprudence", priority: 3}
{description: "mmlu:model=text,subject=philosophy", priority: 3}
{description: "mmlu:model=text,subject=professional_medicine", priority: 3}
{description: "mmlu:model=text,subject=us_foreign_policy", priority: 2}
{description: "mmlu:model=text,subject=astronomy", priority: 4}
{description: "mmlu:model=text,subject=business_ethics", priority: 4}
{description: "mmlu:model=text,subject=clinical_knowledge", priority: 4}
{description: "mmlu:model=text,subject=college_biology", priority: 4}
{description: "mmlu:model=text,subject=college_computer_science", priority: 4}
{description: "mmlu:model=text,subject=college_mathematics", priority: 4}
{description: "mmlu:model=text,subject=college_medicine", priority: 4}
{description: "mmlu:model=text,subject=college_physics", priority: 4}
{description: "mmlu:model=text,subject=conceptual_physics", priority: 4}
{description: "mmlu:model=text,subject=electrical_engineering", priority: 4}
{description: "mmlu:model=text,subject=elementary_mathematics", priority: 4}
{description: "mmlu:model=text,subject=formal_logic", priority: 4}
{description: "mmlu:model=text,subject=high_school_biology", priority: 4}
{description: "mmlu:model=text,subject=high_school_chemistry", priority: 4}
{description: "mmlu:model=text,subject=high_school_computer_science", priority: 4}
{description: "mmlu:model=text,subject=high_school_european_history", priority: 4}
{description: "mmlu:model=text,subject=high_school_geography", priority: 4}
{description: "mmlu:model=text,subject=high_school_government_and_politics", priority: 4}
{description: "mmlu:model=text,subject=high_school_macroeconomics", priority: 4}
{description: "mmlu:model=text,subject=high_school_mathematics", priority: 4}
{description: "mmlu:model=text,subject=high_school_microeconomics", priority: 4}
{description: "mmlu:model=text,subject=high_school_physics", priority: 4}
{description: "mmlu:model=text,subject=high_school_psychology", priority: 4}
{description: "mmlu:model=text,subject=high_school_statistics", priority: 4}
{description: "mmlu:model=text,subject=high_school_us_history", priority: 4}
{description: "mmlu:model=text,subject=high_school_world_history", priority: 4}
{description: "mmlu:model=text,subject=human_aging", priority: 4}
{description: "mmlu:model=text,subject=human_sexuality", priority: 4}
{description: "mmlu:model=text,subject=international_law", priority: 4}
{description: "mmlu:model=text,subject=logical_fallacies", priority: 4}
{description: "mmlu:model=text,subject=machine_learning", priority: 4}
{description: "mmlu:model=text,subject=management", priority: 4}
{description: "mmlu:model=text,subject=marketing", priority: 4}
{description: "mmlu:model=text,subject=medical_genetics", priority: 4}
{description: "mmlu:model=text,subject=miscellaneous", priority: 4}
{description: "mmlu:model=text,subject=moral_disputes", priority: 4}
{description: "mmlu:model=text,subject=moral_scenarios", priority: 4}
{description: "mmlu:model=text,subject=nutrition", priority: 4}
{description: "mmlu:model=text,subject=prehistory", priority: 4}
{description: "mmlu:model=text,subject=professional_accounting", priority: 4}
{description: "mmlu:model=text,subject=professional_law", priority: 4}
{description: "mmlu:model=text,subject=professional_psychology", priority: 4}
{description: "mmlu:model=text,subject=public_relations", priority: 4}
{description: "mmlu:model=text,subject=security_studies", priority: 4}
{description: "mmlu:model=text,subject=sociology", priority: 4}
{description: "mmlu:model=text,subject=virology", priority: 4}
{description: "mmlu:model=text,subject=world_religions", priority: 4}

# MATH
{description: "math:model=text_code,subject=number_theory,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
{description: "math:model=text_code,subject=intermediate_algebra,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
{description: "math:model=text_code,subject=algebra,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
{description: "math:model=text_code,subject=prealgebra,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
{description: "math:model=text_code,subject=geometry,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
{description: "math:model=text_code,subject=counting_and_probability,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}
{description: "math:model=text_code,subject=precalculus,level=1,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 2}

{description: "math:model=text_code,subject=number_theory,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=intermediate_algebra,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=algebra,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=prealgebra,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=geometry,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=counting_and_probability,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=precalculus,level=2,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}

{description: "math:model=text_code,subject=number_theory,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=intermediate_algebra,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=algebra,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=prealgebra,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=geometry,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=counting_and_probability,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=precalculus,level=3,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}

{description: "math:model=text_code,subject=number_theory,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=intermediate_algebra,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=algebra,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=prealgebra,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=geometry,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=counting_and_probability,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}
{description: "math:model=text_code,subject=precalculus,level=4,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 4}

{description: "math:model=text_code,subject=number_theory,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=intermediate_algebra,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=algebra,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=prealgebra,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=geometry,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=counting_and_probability,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}
{description: "math:model=text_code,subject=precalculus,level=5,use_chain_of_thought=True,follow_format_instructions=instruct", priority: 3}

# GSM
{description: "gsm:model=text_code,follow_format_instructions=instruct", priority: 2}

# LegalBench
{description: "legalbench:model=text_code,subset=abercrombie,follow_format_instructions=instruct", priority: 2}
{description: "legalbench:model=text_code,subset=corporate_lobbying,follow_format_instructions=instruct", priority: 2}
{description: "legalbench:model=text_code,subset=international_citizenship_questions,follow_format_instructions=instruct", priority: 2}
{description: "legalbench:model=text_code,subset=function_of_decision_section,follow_format_instructions=instruct", priority: 2}
{description: "legalbench:model=text_code,subset=proa,follow_format_instructions=instruct", priority: 2}

# MedQA
{description: "med_qa:model=text_code", priority: 2}

# WMT14
{description: "wmt_14:language_pair=cs-en,model=text,follow_format_instructions=instruct", priority: 2}
{description: "wmt_14:language_pair=de-en,model=text,follow_format_instructions=instruct", priority: 2}
{description: "wmt_14:language_pair=fr-en,model=text,follow_format_instructions=instruct", priority: 2}
{description: "wmt_14:language_pair=hi-en,model=text,follow_format_instructions=instruct", priority: 2}
{description: "wmt_14:language_pair=ru-en,model=text,follow_format_instructions=instruct", priority: 2}
]
86 changes: 24 additions & 62 deletions src/helm/benchmark/run_expander.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
get_all_code_models,
get_all_models,
get_all_text_models,
get_model_metadata,
get_model_names_with_tag,
FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
ABLATION_MODEL_TAG,
TEXT_TO_IMAGE_MODEL_TAG,
VISION_LANGUAGE_MODEL_TAG,
INSTRUCTION_FOLLOWING_MODEL_TAG,
)
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
Expand Down Expand Up @@ -345,78 +347,37 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
return [run_spec]


class OpenAIRunExpander(RunExpander):
"""
Custom prompt for OpenAI models.
These models need more explicit instructions about following the format.
"""

# TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
class FollowFormatInstructionsRunExpander(RunExpander):
"""Adds more explicit instructions about following the format to prompts.
name = "openai"

def __init__(self):
pass

def expand(self, run_spec: RunSpec) -> List[RunSpec]:
if run_spec.adapter_spec.method != ADAPT_GENERATION:
return [run_spec]

return [
replace(
run_spec,
name=run_spec.name,
adapter_spec=replace(
run_spec.adapter_spec,
global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
global_suffix="\n\n"
+ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+ "\n"
+ run_spec.adapter_spec.output_prefix.strip(),
),
),
]
The argument controls which models will receive these instructions.
If "all", all models receive these instructions.
If "instruct", only instruction-following models receive these instructions.
Only supports the generation adaptation method. Raises an error if used on
a RunSpec that uses a different adaptation method.
class GoogleRunExpander(RunExpander):
"""
Custom prompt for Google models.
These models need more explicit instructions about following the format.
Note: For legacy backwards compatibility reasons, despite the use of the word
"instructions" in this run expander's name, this run expander actually
modifies the global_prefix and the global_suffix of the AdapterSpec rather than
the instructions.
"""

# TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
name = "follow_format_instructions"

name = "google"
def __init__(self, value: str):
if value != "all" and value != "instruct":
raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
self.value = value

def expand(self, run_spec: RunSpec) -> List[RunSpec]:
if run_spec.adapter_spec.method != ADAPT_GENERATION:
return [run_spec]
raise Exception("follow_format_instructions run expander only supports the generation adaptation method")

return [
replace(
run_spec,
name=run_spec.name,
adapter_spec=replace(
run_spec.adapter_spec,
global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
global_suffix="\n\n"
+ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+ "\n"
+ run_spec.adapter_spec.output_prefix.strip(),
),
),
]


class MistralRunExpander(RunExpander):
"""Custom prompt for Mistral models."""

# TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.

name = "output_format_instructions"

def expand(self, run_spec: RunSpec) -> List[RunSpec]:
if run_spec.adapter_spec.method != ADAPT_GENERATION:
if (
self.value == "instruct"
and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
):
return [run_spec]

return [
Expand Down Expand Up @@ -1425,6 +1386,7 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
NewlineRunExpander,
StopRunExpander,
FormatPromptRunExpander,
FollowFormatInstructionsRunExpander,
AddToStopRunExpander,
GlobalPrefixRunExpander,
NumTrainTrialsRunExpander,
Expand Down
8 changes: 4 additions & 4 deletions src/helm/config/model_metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ models:
access: open
num_parameters: 132000000000
release_date: 2024-03-27
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG]
tags: [TEXT_MODEL_TAG, PARTIAL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]


# DeepMind
Expand Down Expand Up @@ -670,15 +670,15 @@ models:
creator_organization_name: Google
access: limited
release_date: 2023-12-13
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

- name: google/gemini-1.0-pro-001
display_name: Gemini 1.0 Pro
description: Gemini 1.0 Pro is a multimodal model able to reason across text, images, video, audio and code. ([paper](https://arxiv.org/abs/2312.11805))
creator_organization_name: Google
access: limited
release_date: 2023-12-13
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

# Note: This is aliased to a snapshot of gemini-pro-vision. When possible, please use a versioned snapshot instead.
- name: google/gemini-pro-vision
Expand All @@ -703,7 +703,7 @@ models:
creator_organization_name: Google
access: limited
release_date: 2024-04-10
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG]
tags: [TEXT_MODEL_TAG, GOOGLE_GEMINI_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG]

- name: google/gemma-2b
display_name: Gemma (2B)
Expand Down

0 comments on commit 460f243

Please sign in to comment.