Skip to content

Commit

Permalink
Populate schema adapter fields from AdapterSpec docstrings (#2611)
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai authored May 7, 2024
1 parent 767a9e7 commit c63a1b4
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 512 deletions.
63 changes: 32 additions & 31 deletions src/helm/benchmark/adaptation/adapter_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,90 +39,91 @@ class AdapterSpec:
Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
"""

# Method of adaptation
method: str = ""
"""The high-level strategy for converting instances into a prompt for the language model."""

# Prepend all prompts with this string.
# For example, it is recommended to prefix all prompts with [NLG] for UL2.
global_prefix: str = ""
"""The string that is prepended to the entire prompt."""

# Append all prompts with this string.
global_suffix: str = ""
"""The string that is appended to the entire prompt."""

# Prompt starts with instructions
instructions: str = ""
"""The description of the task that is included at the very beginning of the prompt."""

# What goes before the input
input_prefix: str = "Input: "
"""The string that is included before each input (e.g., 'Question:')."""

# What goes after the input
input_suffix: str = "\n"
"""The string that is included after each input (e.g., '\\n')."""

# What goes before the input (for multiple choice)
reference_prefix: str = "A. "
"""The string that is included before each reference (for multiple-choice questions)."""

# What goes before the input (for multiple choice)
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""

# What goes before the output
output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

# What goes after the output
output_suffix: str = "\n"
"""The string that is included after the correct answer/predicted output (e.g., '\\n')."""

# What goes between instruction and in-context example blocks in the constructed prompt
instance_prefix: str = "\n"
"""The string that is included before each instance (e.g., '\\n\\n')."""

# List of regular expression substitutions that we perform
substitutions: List[Substitution] = field(default_factory=list, hash=False)
"""A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
to perform at the very end on the prompt."""

# Maximum number of (in-context) training instances to put into the prompt
max_train_instances: int = 5
"""Maximum number of training instances to include in the prompt (currently by randomly sampling)."""

# Maximum number of evaluation instances. For getting valid numbers, this
# should be the entire dataset; only reduce this for piloting.
max_eval_instances: Optional[int] = None
"""Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""

# Generate this many outputs (which could be realized by `num_completions`
# or `top_k_per_token`).
num_outputs: int = 5
"""Maximum number of possible outputs to generate by sampling multiple outputs."""

# Number of trials, where in each trial we choose an independent, random
# set of training instances. Used to compute error bars.
num_train_trials: int = 1
"""Number of trials, where in each trial we choose an independent, random set of training instances.
Used to compute variance."""

# Number of trials, where we query the model with the same requests, but different random seeds
num_trials: int = 1
"""Number of trials, where we query the model with the same requests, but different random seeds."""

# If true, randomly sample N training examples; if false, select N consecutive training examples
sample_train: bool = True
"""If true, randomly sample N training examples; if false, select N consecutive training examples"""

# Decoding parameters (inherited by `Request`)

# Model deployment to make the request to (need to fill in)
model_deployment: str = ""
"""Name of the language model deployment (<host_organization>/<model name>) to send requests to."""

# Model to make the request to
model: str = ""
"""Name of the language model (<creator_organization>/<model name>) to send requests to."""

# Temperature to use
temperature: float = 1
"""Temperature parameter used in generation."""

# Maximum number of tokens to generate
max_tokens: int = 100
"""Maximum number of tokens to generate."""

# When to stop (set hash=False to make `AdapterSpec` hashable)
# Set hash=False to make `AdapterSpec` hashable
stop_sequences: List[str] = field(default_factory=list, hash=False)
"""List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""

# Random string (used concretely to bypass cache / see diverse results)
random: Optional[str] = None
"""Random seed (string), which guarantees reproducibility."""

# If true, for instances with multiple correct reference, the gold answer should be considered
# to be all the correct references rather than any of the correct references.
multi_label: bool = False
"""If true, for instances with multiple correct reference, the gold answer should be considered to be all
of the correct references rather than any of the correct references."""

# Parameters for image generation
image_generation_parameters: Optional[ImageGenerationParameters] = None
"""Parameters for image generation."""

# The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
# Set hash=False to make `AdapterSpec` hashable
eval_splits: Optional[List[str]] = field(default=None, hash=False)
"""The splits from which evaluation instances will be drawn."""
58 changes: 54 additions & 4 deletions src/helm/benchmark/presentation/schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import ast
import dataclasses
from dataclasses import dataclass, field
from typing import List, Optional, Dict
import dacite
from inspect import cleandoc
import mako.template
import yaml
import importlib_resources as resources
Expand All @@ -17,6 +20,11 @@
SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"


_ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
_ADAPTER_SPEC_FILENAME = "adapter_spec.py"
_ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"


@dataclass(frozen=True)
class Field:
"""
Expand Down Expand Up @@ -198,9 +206,6 @@ class RunGroup(Field):
class Schema:
"""Specifies information about what to display on the frontend."""

# Adapter fields (e.g., temperature)
adapter: List[Field]

# Information about each field
metrics: List[Field]

Expand All @@ -213,13 +218,55 @@ class Schema:
# Group the scenarios
run_groups: List[RunGroup]

# Adapter fields (e.g., temperature)
# Automatically populated from the docstrings in the AdapterSpec class definition.
# Should not be specified in the user's YAML file.
adapter: Optional[List[Field]] = None

def __post_init__(self):
    """Build name-based lookup tables so entries can be fetched by name in O(1)."""

    def _index_by_name(entries):
        # Index a list of named entries by their `name` attribute.
        return {entry.name: entry for entry in entries}

    self.name_to_metric = _index_by_name(self.metrics)
    self.name_to_perturbation = _index_by_name(self.perturbations)
    self.name_to_metric_group = _index_by_name(self.metric_groups)
    self.name_to_run_group = _index_by_name(self.run_groups)


def get_adapter_fields() -> List[Field]:
    """Generate the adapter fields from the docstrings in the AdapterSpec class definition.

    Parses the source file containing `AdapterSpec` and pairs each dataclass field
    with the string literal ("attribute docstring") that immediately follows it.

    Returns:
        One `Field` per documented `AdapterSpec` field, in source order.

    Raises:
        ValueError: If the `AdapterSpec` class definition cannot be found.
    """
    # Unfortunately there is no standard library support for getting docstrings of class fields,
    # so we have to do the parsing ourselves. Fortunately, the parsing is quite straightforward.
    adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
    with open(adapter_spec_path, "r") as f:
        contents = f.read()
    module_node = ast.parse(contents)
    try:
        adapter_spec_node = next(
            node
            for node in ast.iter_child_nodes(module_node)
            if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
        )
    except StopIteration:
        raise ValueError(f"Could not find class {_ADAPTER_SPEC_CLASS_NAME} in {adapter_spec_path}")
    metadata_fields: List[Field] = []
    field_name: str = ""
    for node in ast.iter_child_nodes(adapter_spec_node):
        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
            # This node is a field definition.
            # Save the name of the field for later.
            field_name = node.target.id
        else:
            # If this is a docstring that immediately follows a field definition,
            # output an adapter field with the name set to the field definition and
            # the description set to the docstring.
            if (
                field_name
                and isinstance(node, ast.Expr)
                and isinstance(node.value, ast.Constant)
                and isinstance(node.value.value, str)
            ):
                description = cleandoc(node.value.value).replace("\n", " ")
                metadata_fields.append(Field(name=field_name, description=description))
            # Always clear the saved field name here, so that a string constant that does
            # NOT immediately follow a field definition (e.g. one appearing after a method)
            # is not misattributed to an earlier field.
            field_name = ""

    return metadata_fields


def get_default_schema_path() -> str:
    """Return the filesystem path to the default (classic) schema YAML file.

    `resources.files(...).joinpath(...)` returns a Traversable, not a `str`;
    convert it so the declared return type is accurate for callers.
    """
    return str(resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME))

Expand All @@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
hlog(f"Reading schema file {schema_path}...")
with open(schema_path, "r") as f:
raw = yaml.safe_load(f)
return dacite.from_dict(Schema, raw)
schema = dacite.from_dict(Schema, raw)
if schema.adapter:
hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
return dataclasses.replace(schema, adapter=get_adapter_fields())
11 changes: 11 additions & 0 deletions src/helm/benchmark/presentation/test_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from helm.benchmark.presentation.schema import get_adapter_fields


def test_get_adapter_fields() -> None:
    """The adapter fields are populated from AdapterSpec docstrings, starting with `method`."""
    fields = get_adapter_fields()
    assert len(fields) > 0
    first = fields[0]
    assert first.name == "method"
    expected_description = (
        "The high-level strategy for converting instances into a prompt for the language model."
    )
    assert first.description == expected_description
62 changes: 3 additions & 59 deletions src/helm/benchmark/static/schema_classic.yaml
Original file line number Diff line number Diff line change
@@ -1,64 +1,8 @@
---
############################################################
adapter:
- name: method
description: The high-level strategy for converting instances into a prompt for the language model.
values:
- name: generation
description: Given the input, the model generates the output free-form.
- name: multiple_choice_joint
description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- name: multiple_choice_separate_original
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- name: multiple_choice_separate_calibrated
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- name: language_modeling
description: Given the input, the model assigns the sequence a probability.
- name: instructions
description: The description of the task that is included at the very beginning of the prompt.
- name: global_prefix
description: The string that is prepended to the prompt.
- name: instance_prefix
description: The string that is included before each instance (e.g., '\n\n').
- name: input_prefix
description: The string that is included before each input (e.g., 'Question:').
- name: input_suffix
description: The string that is included after each input (e.g., '\n').
- name: reference_prefix
description: The string that is included before each reference (for multiple-choice questions).
- name: reference_suffix
description: The string that is included after each reference (for multiple-choice questions).
- name: output_prefix
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- name: output_suffix
description: The string that is included after the correct answer/predicted output (e.g., '\n').
- name: substitutions
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- name: max_train_instances
description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- name: max_eval_instances
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- name: num_outputs
description: Maximum number of possible outputs to generate by sampling multiple outputs.
- name: num_train_trials
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- name: sample_train
description: If true, randomly sample N training examples; if false, select N consecutive training examples
- name: model
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- name: model_deployment
description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- name: temperature
description: Temperature parameter used in generation.
- name: max_tokens
description: Maximum number of tokens to generate.
- name: stop_sequences
description: List of sequences, where we stop generation if we encounter any of them.
- name: random
description: Random seed (string), which guarantees reproducibility.
- name: multi_label
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.

# For backwards compatibility with older versions of HELM.
# TODO: Remove this after 2024-09-01.
adapter: []
############################################################
metrics:
# Infrastructure metrics:
Expand Down
66 changes: 3 additions & 63 deletions src/helm/benchmark/static/schema_image2structure.yaml
Original file line number Diff line number Diff line change
@@ -1,68 +1,8 @@
---
############################################################
adapter:
- name: method
description: The high-level strategy for converting instances into a prompt for the language model.
values:
- name: generation
description: Given the input, the model generates the output free-form.
- name: generation_multimodal
description: Given the multimodal input, the model generates the output free-form.
- name: multiple_choice_joint
description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- name: multiple_choice_separate_original
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- name: multiple_choice_separate_calibrated
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- name: language_modeling
description: Given the input, the model assigns the sequence a probability.
- name: instructions
description: The description of the task that is included at the very beginning of the prompt.
- name: global_prefix
description: The string that is prepended to the prompt.
- name: global_suffix
description: The string that is appended to the prompt.
- name: instance_prefix
description: The string that is included before each instance (e.g., '\n\n').
- name: input_prefix
description: The string that is included before each input (e.g., 'Question:').
- name: input_suffix
description: The string that is included after each input (e.g., '\n').
- name: reference_prefix
description: The string that is included before each reference (for multiple-choice questions).
- name: reference_suffix
description: The string that is included after each reference (for multiple-choice questions).
- name: output_prefix
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- name: output_suffix
description: The string that is included after the correct answer/predicted output (e.g., '\n').
- name: substitutions
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- name: max_train_instances
description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- name: max_eval_instances
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- name: num_outputs
description: Maximum number of possible outputs to generate by sampling multiple outputs.
- name: num_train_trials
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- name: sample_train
description: If true, randomly sample N training examples; if false, select N consecutive training examples
- name: model
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- name: model_deployment
description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- name: temperature
description: Temperature parameter used in generation.
- name: max_tokens
description: Maximum number of tokens to generate.
- name: stop_sequences
description: List of sequences, where we stop generation if we encounter any of them.
- name: random
description: Random seed (string), which guarantees reproducibility.
- name: multi_label
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.

# For backwards compatibility with older versions of HELM.
# TODO: Remove this after 2024-09-01.
adapter: []
############################################################
metrics:
# Infrastructure metrics:
Expand Down
Loading

0 comments on commit c63a1b4

Please sign in to comment.