diff --git a/.flake8 b/.flake8
index 29658d00..742b4cd3 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,4 @@
 [flake8]
 exclude =
-    herm/models/openassistant.py
-    herm/models/starling.py
+    rewardbench/models/openassistant.py
 extend-ignore = E203
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index a287f216..5dadace7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-# TODO: Update this when releasing HERM publicly
+# TODO: Update this when releasing RewardBench publicly
 # This dockerfile is forked from ai2/cuda11.8-cudnn8-dev-ubuntu20.04
 # To get the latest id, run `beaker image pull ai2/cuda11.8-cudnn8-dev-ubuntu20.04`
 # and then `docker image list`, to verify docker image is pulled
@@ -19,7 +19,7 @@ RUN pip install torch torchvision torchaudio --index-url https://download.pytorc
 # RUN pip install flash-attn==2.2.2 --no-build-isolation

 # TODO: enable these when training code is complete
-COPY herm herm
+COPY rewardbench rewardbench
 COPY scripts scripts
 COPY setup.py setup.py
 COPY Makefile Makefile
diff --git a/Makefile b/Makefile
index bcfd0049..d8403692 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src

-check_dirs := herm scripts analysis tests
+check_dirs := rewardbench scripts analysis tests

 style:
 	python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py
diff --git a/README.md b/README.md
index 2cdf4055..e746295f 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,27 @@
-# Holistic Evaluation of Reward Models (HERM)
-
-This will hold scripts for generating scores and uploading results.
-Two primary to generate results (more in `scripts/`):
+RewardBench: Evaluating Reward Models
+
+Leaderboard 📐 |
+RewardBench Dataset |
+Existing Test Sets |
+Results 📊 |
+Paper (coming soon) 📝
+
+RewardBench Logo
+
+---
+
+**RewardBench** is a benchmark designed to evaluate the capabilities and safety of reward models (including those trained with Direct Preference Optimization, DPO).
+The repository includes the following:
+* Common inference code for a variety of reward models (Starling, PairRM, OpenAssistant, DPO, and more).
+* Common dataset formatting and tests for fair reward model inference.
+* Analysis and visualization tools.
+
+The two primary scripts to generate results (more in `scripts/`):
 1. `scripts/run_rm.py`: Run evaluations for reward models.
 2. `scripts/run_dpo.py`: Run evaluations for direct preference optimization (DPO) models.

-## Links
-Dataset, space, etc coming soon.
-For contributors, it can be found in this [HuggingFace org](https://huggingface.co/ai2-adapt-dev).
-
 ## Installation
 Please install `torch`` on your system, and then install the following requirements.
 ```
@@ -70,10 +83,10 @@ python scripts/run_bon.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2
 ```
 ├── README.md                   <- The top-level README for researchers using this project
-├── analysis/                   <- Directory of tools to analyze HERM results or other reward model properties
-├── herm/                       <- Core utils and modeling files
+├── analysis/                   <- Directory of tools to analyze RewardBench results or other reward model properties
+├── rewardbench/                <- Core utils and modeling files
 |   ├── models/                 ├── Standalone files for running existing reward models
-|   └── *.py                    └── HERM tools and utilities
+|   └── *.py                    └── RewardBench tools and utilities
 ├── scripts/                    <- Scripts and configs to train and evaluate reward models
 ├── tests                       <- Unit tests
 ├── Dockerfile                  <- Build file for reproducible and scaleable research at AI2
@@ -84,7 +97,7 @@ python scripts/run_bon.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2

 ## Maitenence

-### Updating the docker image (consider removing this section when we publicly release HERM)
+### Updating the docker image (consider removing this section when we publicly release RewardBench)
 When updating this repo, the docker image should be rebuilt to include those changes.
 For AI2 members, please update the list below with any images you use regularly.
 For example, if you update `scripts/run_rm.py` and include a new package (or change a package version), you should rebuild the image and verify it still works on known models.
diff --git a/analysis/README.md b/analysis/README.md
index bb76074e..119533b1 100644
--- a/analysis/README.md
+++ b/analysis/README.md
@@ -14,13 +14,13 @@ python analysis/plot_per_subset_dist.py --output_dir=plots/whisker
 ```

 ### Get benchmark results
-This prints out the HERM results in a Markdown or LaTeX table. Note that you need to pass an API token to the `HF_COLLAB_TOKEN` environment variable.
+This prints out the RewardBench results in a Markdown or LaTeX table. Note that you need to pass an API token to the `HF_COLLAB_TOKEN` environment variable.
 ```
 # Use --render_latex for LaTeX output
 python analysis/get_benchmark_results.py
 ```
-Below is a snippet of the output for the HERM - General results:
+Below is a snippet of the output for the RewardBench - General results:

 | model                                             | average   | alpacaeval   | mt-bench   | llmbar   | refusals   | hep    |
 |--------------------------------------------------|-----------|--------------|------------|----------|------------|--------|
diff --git a/analysis/bon_to_alpacaeval.py b/analysis/bon_to_alpacaeval.py
index c912589f..42f0eb06 100644
--- a/analysis/bon_to_alpacaeval.py
+++ b/analysis/bon_to_alpacaeval.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Script for converting HERM best of n (BoN) results into the AlpacaEval format
+# Script for converting RewardBench best of n (BoN) results into the AlpacaEval format

 import argparse
 import os
diff --git a/analysis/draw_model_histogram.py b/analysis/draw_model_histogram.py
index a83b6fd2..b8bf7e9b 100644
--- a/analysis/draw_model_histogram.py
+++ b/analysis/draw_model_histogram.py
@@ -17,7 +17,10 @@
 import argparse
 from pathlib import Path

-from herm.visualization import draw_model_source_histogram, print_model_statistics
+from rewardbench.visualization import (
+    draw_model_source_histogram,
+    print_model_statistics,
+)


 def get_args():
diff --git a/analysis/draw_per_token_reward.py b/analysis/draw_per_token_reward.py
index 6c88a3b2..6584fa92 100644
--- a/analysis/draw_per_token_reward.py
+++ b/analysis/draw_per_token_reward.py
@@ -22,7 +22,7 @@
 import numpy as np
 import spacy_alignments as tokenizations

-from herm.visualization import draw_per_token_reward
+from rewardbench.visualization import draw_per_token_reward

 DEFAULT_DIRNAME = "per-token-reward"

diff --git a/analysis/get_benchmark_results.py b/analysis/get_benchmark_results.py
index faaae5e9..9cbdc941 100644
--- a/analysis/get_benchmark_results.py
+++ b/analysis/get_benchmark_results.py
@@ -58,7 +58,7 @@ def get_args():
     return args


-def get_average_over_herm(
+def get_average_over_rewardbench(
     df: pd.DataFrame,
     subsets: List[str] = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"],
 ) -> pd.DataFrame:
@@ -96,7 +96,7 @@ def main():
     print(f"Downloading repository snapshots into '{LOCAL_DIR}' directory")
     # Load the remote repository using the HF API
     hf_evals_repo = snapshot_download(
-        local_dir=Path(LOCAL_DIR) / "herm",
+        local_dir=Path(LOCAL_DIR) / "rewardbench",
         repo_id=args.hf_evals_repo,
         use_auth_token=api_token,
         tqdm_class=None,
@@ -107,8 +107,8 @@ def main():
     hf_prefs_df = load_results(hf_evals_repo, subdir="pref-sets/", ignore_columns=args.ignore_columns)

     all_results = {
-        "HERM - Overview": get_average_over_herm(hf_evals_df),
-        "HERM - Detailed": hf_evals_df,
+        "RewardBench - Overview": get_average_over_rewardbench(hf_evals_df),
+        "RewardBench - Detailed": hf_evals_df,
         "Pref Sets - Overview": hf_prefs_df,
     }
diff --git a/analysis/get_per_token_reward.py b/analysis/get_per_token_reward.py
index 8a5b8b51..b0e83a9b 100644
--- a/analysis/get_per_token_reward.py
+++ b/analysis/get_per_token_reward.py
@@ -35,7 +35,7 @@
     pipeline,
 )

-from herm import models
+from rewardbench import models

 REWARD_MODEL_CONFIG = {
     "default": {
diff --git a/analysis/get_subtoken_statistics.py b/analysis/get_subtoken_statistics.py
index c99099cc..08b1c39f 100644
--- a/analysis/get_subtoken_statistics.py
+++ b/analysis/get_subtoken_statistics.py
@@ -1,3 +1,17 @@
+# Copyright 2023 AllenAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 from pathlib import Path
 from typing import Any, Dict
diff --git a/herm/__init__.py b/rewardbench/__init__.py
similarity index 100%
rename from herm/__init__.py
rename to rewardbench/__init__.py
diff --git a/herm/chattemplates.py b/rewardbench/chattemplates.py
similarity index 100%
rename from herm/chattemplates.py
rename to rewardbench/chattemplates.py
diff --git a/herm/dpo.py b/rewardbench/dpo.py
similarity index 100%
rename from herm/dpo.py
rename to rewardbench/dpo.py
diff --git a/herm/models/__init__.py b/rewardbench/models/__init__.py
similarity index 100%
rename from herm/models/__init__.py
rename to rewardbench/models/__init__.py
diff --git a/herm/models/beaver.py b/rewardbench/models/beaver.py
similarity index 100%
rename from herm/models/beaver.py
rename to rewardbench/models/beaver.py
diff --git a/herm/models/openassistant.py b/rewardbench/models/openassistant.py
similarity index 100%
rename from herm/models/openassistant.py
rename to rewardbench/models/openassistant.py
diff --git a/herm/models/openbmb.py b/rewardbench/models/openbmb.py
similarity index 100%
rename from herm/models/openbmb.py
rename to rewardbench/models/openbmb.py
diff --git a/herm/models/pairrm.py b/rewardbench/models/pairrm.py
similarity index 100%
rename from herm/models/pairrm.py
rename to rewardbench/models/pairrm.py
diff --git a/herm/models/shp.py b/rewardbench/models/shp.py
similarity index 100%
rename from herm/models/shp.py
rename to rewardbench/models/shp.py
diff --git a/herm/models/starling.py b/rewardbench/models/starling.py
similarity index 100%
rename from herm/models/starling.py
rename to rewardbench/models/starling.py
diff --git a/herm/models/ziya.py b/rewardbench/models/ziya.py
similarity index 100%
rename from herm/models/ziya.py
rename to rewardbench/models/ziya.py
diff --git a/herm/utils.py b/rewardbench/utils.py
similarity index 99%
rename from herm/utils.py
rename to rewardbench/utils.py
index d37c5a64..95aff30c 100644
--- a/herm/utils.py
+++ b/rewardbench/utils.py
@@ -23,7 +23,7 @@
 from huggingface_hub import HfApi
 from transformers import PreTrainedTokenizer

-from herm.models import REWARD_MODEL_CONFIG
+from rewardbench.models import REWARD_MODEL_CONFIG

 # HuggingFace Hub locations
 CORE_EVAL_SET = "ai2-adapt-dev/rm-benchmark-dev"
diff --git a/herm/visualization.py b/rewardbench/visualization.py
similarity index 100%
rename from herm/visualization.py
rename to rewardbench/visualization.py
diff --git a/scripts/configs/beaker_eval.yaml b/scripts/configs/beaker_eval.yaml
index 8deb1b39..45e5cfa4 100644
--- a/scripts/configs/beaker_eval.yaml
+++ b/scripts/configs/beaker_eval.yaml
@@ -1,8 +1,8 @@
 version: v2
-description: herm-eval-default
+description: rewardbench-eval-default
 budget: ai2/allennlp
 tasks:
-  - name: herm-eval-default
+  - name: rewardbench-eval-default
     image:
       beaker:
     command: [
@@ -24,7 +24,7 @@ tasks:
       - name: TRANSFORMERS_CACHE
         value: ./cache/
      - name: WANDB_PROJECT
-       value: herm
+       value: rewardbench
      - name: WANDB_WATCH
        value: false
      - name: WANDB_LOG_MODEL
diff --git a/scripts/run_bon.py b/scripts/run_bon.py
index ffb5a8ba..d49c3803 100644
--- a/scripts/run_bon.py
+++ b/scripts/run_bon.py
@@ -28,7 +28,7 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, pipeline

-from herm import REWARD_MODEL_CONFIG, load_bon_dataset, save_to_hub
+from rewardbench import REWARD_MODEL_CONFIG, load_bon_dataset, save_to_hub

 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index a1dd0dba..866f9a4a 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -27,7 +27,7 @@
 from tqdm import tqdm
 from trl.trainer.utils import DPODataCollatorWithPadding

-from herm import DPO_MODEL_CONFIG, DPOInference, load_eval_dataset, save_to_hub
+from rewardbench import DPO_MODEL_CONFIG, DPOInference, load_eval_dataset, save_to_hub

 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
diff --git a/scripts/run_rm.py b/scripts/run_rm.py
index efe6a529..d8f38922 100644
--- a/scripts/run_rm.py
+++ b/scripts/run_rm.py
@@ -26,7 +26,7 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, pipeline

-from herm import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub
+from rewardbench import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub

 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py
index 66d1fbb9..e48ff196 100644
--- a/scripts/submit_eval_jobs.py
+++ b/scripts/submit_eval_jobs.py
@@ -95,12 +95,12 @@
         experiment_group = "dpo-eval"
         script = "run_dpo.py"
     else:
-        experiment_group = "herm-preference-sets"
+        experiment_group = "rewardbench-preference-sets"
         script = "run_rm.py"

     print(f"Submitting evaluation for model: {model} on {experiment_group}")
     d = copy.deepcopy(d1)

-    name = f"herm_eval_for_{model}_on_{experiment_group}".replace("/", "-")
+    name = f"rewardbench_eval_for_{model}_on_{experiment_group}".replace("/", "-")
     d["description"] = name
     d["tasks"][0]["name"] = name
@@ -133,5 +133,5 @@
         yaml.dump(d, file, default_flow_style=True)
     file.close()

-    cmd = "beaker experiment create {} --workspace ai2/herm".format(fn)
+    cmd = "beaker experiment create {} --workspace ai2/rewardbench".format(fn)
     subprocess.Popen(cmd, shell=True)
diff --git a/setup.py b/setup.py
index 42f74dfa..3cb3a421 100644
--- a/setup.py
+++ b/setup.py
@@ -15,14 +15,14 @@
 from setuptools import find_packages, setup

 setup(
-    name="herm",
+    name="rewardbench",
     version="0.1.0.dev",
     author="Nathan Lambert",
     author_email="nathanl@allenai.org",
     description="Tools for evaluating reward models",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
-    url="https://github.com/allenai/herm",
+    url="https://github.com/allenai/rewardbench",
     packages=find_packages(),
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/tests/test_data.py b/tests/test_data.py
index 491664eb..dad3b1fa 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -17,7 +17,11 @@
 from fastchat.conversation import get_conv_template
 from transformers import AutoTokenizer

-from herm import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer
+from rewardbench import (
+    load_eval_dataset,
+    prepare_dialogue,
+    prepare_dialogue_from_tokenizer,
+)


 class PrepareDialoguesTest(unittest.TestCase):
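For downstream code, the user-facing effect of the `herm` → `rewardbench` rename is a one-line import change. Below is a minimal sketch, assuming the renamed package continues to export the same top-level names that `scripts/run_rm.py` imports above; the model key used in the lookup is purely illustrative.

```python
# Old import path (pre-rename), kept only for comparison:
# from herm import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub

# New import path after this change; the exported names themselves are unchanged.
from rewardbench import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub

# REWARD_MODEL_CONFIG is a dict of per-model settings with a "default" entry
# (see analysis/get_per_token_reward.py above). The key below is hypothetical.
config = REWARD_MODEL_CONFIG.get("org/some-reward-model", REWARD_MODEL_CONFIG["default"])
print(config)
```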