Skip to content

Commit

Permalink
Populate schema adapter fields from AdapterSpec docstrings (#2611)
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai authored May 7, 2024
1 parent 767a9e7 commit c63a1b4
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 512 deletions.
63 changes: 32 additions & 31 deletions src/helm/benchmark/adaptation/adapter_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,90 +39,91 @@ class AdapterSpec:
Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
"""

# Method of adaptation
method: str = ""
"""The high-level strategy for converting instances into a prompt for the language model."""

# Prepend all prompts with this string.
# For example, it is recommended to prefix all prompts with [NLG] for UL2.
global_prefix: str = ""
"""The string that is prepended to the entire prompt."""

# Append all prompts with this string.
global_suffix: str = ""
"""The string that is appended to the entire prompt."""

# Prompt starts with instructions
instructions: str = ""
"""The description of the task that is included at the very beginning of the prompt."""

# What goes before the input
input_prefix: str = "Input: "
"""The string that is included before each input (e.g., 'Question:')."""

# What goes after the input
input_suffix: str = "\n"
"""The string that is included after each input (e.g., '\\n')."""

# What goes before the input (for multiple choice)
reference_prefix: str = "A. "
"""The string that is included before each reference (for multiple-choice questions)."""

# What goes before the input (for multiple choice)
reference_suffix: str = "\n"
"""The string that is included after each reference (for multiple-choice questions)."""

# What goes before the output
output_prefix: str = "Output: "
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""

# What goes after the output
output_suffix: str = "\n"
"""The string that is included after the correct answer/predicted output (e.g., '\\n')."""

# What goes between instruction and in-context example blocks in the constructed prompt
instance_prefix: str = "\n"
"""The string that is included before each instance (e.g., '\\n\\n')."""

# List of regular expression substitutions that we perform
substitutions: List[Substitution] = field(default_factory=list, hash=False)
"""A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
to perform at the very end on the prompt."""

# Maximum number of (in-context) training instances to put into the prompt
max_train_instances: int = 5
"""Maximum number of training instances to include in the prompt (currently by randomly sampling)."""

# Maximum number of evaluation instances. For getting valid numbers, this
# should be the entire dataset; only reduce this for piloting.
max_eval_instances: Optional[int] = None
"""Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""

# Generate this many outputs (which could be realized by `num_completions`
# or `top_k_per_token`).
num_outputs: int = 5
"""Maximum number of possible outputs to generate by sampling multiple outputs."""

# Number of trials, where in each trial we choose an independent, random
# set of training instances. Used to compute error bars.
num_train_trials: int = 1
"""Number of trials, where in each trial we choose an independent, random set of training instances.
Used to compute variance."""

# Number of trials, where we query the model with the same requests, but different random seeds
num_trials: int = 1
"""Number of trials, where we query the model with the same requests, but different random seeds."""

# If true, randomly sample N training examples; if false, select N consecutive training examples
sample_train: bool = True
"""If true, randomly sample N training examples; if false, select N consecutive training examples"""

# Decoding parameters (inherited by `Request`)

# Model deployment to make the request to (need to fill in)
model_deployment: str = ""
"""Name of the language model deployment (<host_organization>/<model name>) to send requests to."""

# Model to make the request to
model: str = ""
"""Name of the language model (<creator_organization>/<model name>) to send requests to."""

# Temperature to use
temperature: float = 1
"""Temperature parameter used in generation."""

# Maximum number of tokens to generate
max_tokens: int = 100
"""Maximum number of tokens to generate."""

# When to stop (set hash=False to make `AdapterSpec` hashable)
# Set hash=False to make `AdapterSpec` hashable
stop_sequences: List[str] = field(default_factory=list, hash=False)
"""List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""

# Random string (used concretely to bypass cache / see diverse results)
random: Optional[str] = None
"""Random seed (string), which guarantees reproducibility."""

# If true, for instances with multiple correct reference, the gold answer should be considered
# to be all the correct references rather than any of the correct references.
multi_label: bool = False
"""If true, for instances with multiple correct reference, the gold answer should be considered to be all
of the correct references rather than any of the correct references."""

# Parameters for image generation
image_generation_parameters: Optional[ImageGenerationParameters] = None
"""Parameters for image generation."""

# The splits from which evaluation instances will be drawn (set hash=False to make `AdapterSpec` hashable)
# Set hash=False to make `AdapterSpec` hashable
eval_splits: Optional[List[str]] = field(default=None, hash=False)
"""The splits from which evaluation instances will be drawn."""
58 changes: 54 additions & 4 deletions src/helm/benchmark/presentation/schema.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import ast
import dataclasses
from dataclasses import dataclass, field
from typing import List, Optional, Dict
import dacite
from inspect import cleandoc
import mako.template
import yaml
import importlib_resources as resources
Expand All @@ -17,6 +20,11 @@
SCHEMA_CLASSIC_YAML_FILENAME: str = "schema_classic.yaml"


_ADAPTER_SPEC_PACKAGE = "helm.benchmark.adaptation"
_ADAPTER_SPEC_FILENAME = "adapter_spec.py"
_ADAPTER_SPEC_CLASS_NAME = "AdapterSpec"


@dataclass(frozen=True)
class Field:
"""
Expand Down Expand Up @@ -198,9 +206,6 @@ class RunGroup(Field):
class Schema:
"""Specifies information about what to display on the frontend."""

# Adapter fields (e.g., temperature)
adapter: List[Field]

# Information about each field
metrics: List[Field]

Expand All @@ -213,13 +218,55 @@ class Schema:
# Group the scenarios
run_groups: List[RunGroup]

# Adapter fields (e.g., temperature)
# Automatically populated from the docstrings in the AdapterSpec class definition.
# Should not be specified in the user's YAML file.
adapter: Optional[List[Field]] = None

def __post_init__(self):
    """Build name-based lookup tables so entries can be fetched by name in O(1)."""

    def _index_by_name(entries):
        # Index a list of named entries by their `name` attribute.
        return {entry.name: entry for entry in entries}

    self.name_to_metric = _index_by_name(self.metrics)
    self.name_to_perturbation = _index_by_name(self.perturbations)
    self.name_to_metric_group = _index_by_name(self.metric_groups)
    self.name_to_run_group = _index_by_name(self.run_groups)


def get_adapter_fields() -> List[Field]:
    """Generate the adapter fields from the docstrings in the AdapterSpec class definition.

    Parses the source file containing `AdapterSpec` and pairs each dataclass field
    with the string literal ("attribute docstring") that immediately follows it.

    Returns:
        One `Field` per documented `AdapterSpec` field, in source order.

    Raises:
        ValueError: If the `AdapterSpec` class definition cannot be found.
    """
    # Unfortunately there is no standard library support for getting docstrings of class fields,
    # so we have to do the parsing ourselves. Fortunately, the parsing is quite straightforward.
    adapter_spec_path = resources.files(_ADAPTER_SPEC_PACKAGE).joinpath(_ADAPTER_SPEC_FILENAME)
    with open(adapter_spec_path, "r") as f:
        contents = f.read()
    module_node = ast.parse(contents)
    try:
        adapter_spec_node = next(
            node
            for node in ast.iter_child_nodes(module_node)
            if isinstance(node, ast.ClassDef) and node.name == _ADAPTER_SPEC_CLASS_NAME
        )
    except StopIteration:
        raise ValueError(f"Could not find class {_ADAPTER_SPEC_CLASS_NAME} in {adapter_spec_path}")
    metadata_fields: List[Field] = []
    field_name: str = ""
    for node in ast.iter_child_nodes(adapter_spec_node):
        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
            # This node is a field definition.
            # Save the name of the field for later.
            field_name = node.target.id
        else:
            # If this is a docstring that immediately follows a field definition,
            # output an adapter field with the name set to the field definition and
            # the description set to the docstring.
            if (
                field_name
                and isinstance(node, ast.Expr)
                and isinstance(node.value, ast.Constant)
                and isinstance(node.value.value, str)
            ):
                description = cleandoc(node.value.value).replace("\n", " ")
                metadata_fields.append(Field(name=field_name, description=description))
            # Always clear the saved field name here, so that a string constant that does
            # NOT immediately follow a field definition (e.g. one appearing after a method)
            # is not misattributed to an earlier field.
            field_name = ""

    return metadata_fields


def get_default_schema_path() -> str:
    """Return the filesystem path to the default (classic) schema YAML file.

    `resources.files(...).joinpath(...)` returns a Traversable, not a `str`;
    convert it so the declared return type is accurate for callers.
    """
    return str(resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME))

Expand All @@ -229,4 +276,7 @@ def read_schema(schema_path: str) -> Schema:
hlog(f"Reading schema file {schema_path}...")
with open(schema_path, "r") as f:
raw = yaml.safe_load(f)
return dacite.from_dict(Schema, raw)
schema = dacite.from_dict(Schema, raw)
if schema.adapter:
hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
return dataclasses.replace(schema, adapter=get_adapter_fields())
11 changes: 11 additions & 0 deletions src/helm/benchmark/presentation/test_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from helm.benchmark.presentation.schema import get_adapter_fields


def test_get_adapter_fields() -> None:
    """The adapter fields are populated from AdapterSpec docstrings, starting with `method`."""
    fields = get_adapter_fields()
    assert len(fields) > 0
    first = fields[0]
    assert first.name == "method"
    expected_description = (
        "The high-level strategy for converting instances into a prompt for the language model."
    )
    assert first.description == expected_description
62 changes: 3 additions & 59 deletions src/helm/benchmark/static/schema_classic.yaml
Original file line number Diff line number Diff line change
@@ -1,64 +1,8 @@
---
############################################################
adapter:
- name: method
description: The high-level strategy for converting instances into a prompt for the language model.
values:
- name: generation
description: Given the input, the model generates the output free-form.
- name: multiple_choice_joint
description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- name: multiple_choice_separate_original
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- name: multiple_choice_separate_calibrated
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- name: language_modeling
description: Given the input, the model assigns the sequence a probability.
- name: instructions
description: The description of the task that is included at the very beginning of the prompt.
- name: global_prefix
description: The string that is prepended to the prompt.
- name: instance_prefix
description: The string that is included before each instance (e.g., '\n\n').
- name: input_prefix
description: The string that is included before each input (e.g., 'Question:').
- name: input_suffix
description: The string that is included after each input (e.g., '\n').
- name: reference_prefix
description: The string that is included before each reference (for multiple-choice questions).
- name: reference_suffix
description: The string that is included after each reference (for multiple-choice questions).
- name: output_prefix
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- name: output_suffix
description: The string that is included after the correct answer/predicted output (e.g., '\n').
- name: substitutions
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- name: max_train_instances
description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- name: max_eval_instances
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- name: num_outputs
description: Maximum number of possible outputs to generate by sampling multiple outputs.
- name: num_train_trials
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- name: sample_train
description: If true, randomly sample N training examples; if false, select N consecutive training examples
- name: model
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- name: model_deployment
description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- name: temperature
description: Temperature parameter used in generation.
- name: max_tokens
description: Maximum number of tokens to generate.
- name: stop_sequences
description: List of sequences, where we stop generation if we encounter any of them.
- name: random
description: Random seed (string), which guarantees reproducibility.
- name: multi_label
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.

# For backwards compatibility with older versions of HELM.
# TODO: Remove this after 2024-09-01.
adapter: []
############################################################
metrics:
# Infrastructure metrics:
Expand Down
66 changes: 3 additions & 63 deletions src/helm/benchmark/static/schema_image2structure.yaml
Original file line number Diff line number Diff line change
@@ -1,68 +1,8 @@
---
############################################################
adapter:
- name: method
description: The high-level strategy for converting instances into a prompt for the language model.
values:
- name: generation
description: Given the input, the model generates the output free-form.
- name: generation_multimodal
description: Given the multimodal input, the model generates the output free-form.
- name: multiple_choice_joint
description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
- name: multiple_choice_separate_original
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
- name: multiple_choice_separate_calibrated
description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
- name: language_modeling
description: Given the input, the model assigns the sequence a probability.
- name: instructions
description: The description of the task that is included at the very beginning of the prompt.
- name: global_prefix
description: The string that is prepended to the prompt.
- name: global_suffix
description: The string that is appended to the prompt.
- name: instance_prefix
description: The string that is included before each instance (e.g., '\n\n').
- name: input_prefix
description: The string that is included before each input (e.g., 'Question:').
- name: input_suffix
description: The string that is included after each input (e.g., '\n').
- name: reference_prefix
description: The string that is included before each reference (for multiple-choice questions).
- name: reference_suffix
description: The string that is included after each reference (for multiple-choice questions).
- name: output_prefix
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- name: output_suffix
description: The string that is included after the correct answer/predicted output (e.g., '\n').
- name: substitutions
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
- name: max_train_instances
description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
- name: max_eval_instances
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- name: num_outputs
description: Maximum number of possible outputs to generate by sampling multiple outputs.
- name: num_train_trials
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- name: sample_train
description: If true, randomly sample N training examples; if false, select N consecutive training examples
- name: model
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- name: model_deployment
description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
- name: temperature
description: Temperature parameter used in generation.
- name: max_tokens
description: Maximum number of tokens to generate.
- name: stop_sequences
description: List of sequences, where we stop generation if we encounter any of them.
- name: random
description: Random seed (string), which guarantees reproducibility.
- name: multi_label
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references.

# For backwards compatibility with older versions of HELM.
# TODO: Remove this after 2024-09-01.
adapter: []
############################################################
metrics:
# Infrastructure metrics:
Expand Down
Loading

0 comments on commit c63a1b4

Please sign in to comment.