diff --git a/.flake8 b/.flake8
index 29658d00..742b4cd3 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,4 @@
 [flake8]
 exclude =
-    herm/models/openassistant.py
-    herm/models/starling.py
+    rewardbench/models/openassistant.py
 extend-ignore = E203
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index a287f216..5dadace7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-# TODO: Update this when releasing HERM publicly
+# TODO: Update this when releasing RewardBench publicly
 # This dockerfile is forked from ai2/cuda11.8-cudnn8-dev-ubuntu20.04
 # To get the latest id, run `beaker image pull ai2/cuda11.8-cudnn8-dev-ubuntu20.04`
 # and then `docker image list`, to verify docker image is pulled
@@ -19,7 +19,7 @@ RUN pip install torch torchvision torchaudio --index-url https://download.pytorc
 # RUN pip install flash-attn==2.2.2 --no-build-isolation

 # TODO: enable these when training code is complete
-COPY herm herm
+COPY rewardbench rewardbench
 COPY scripts scripts
 COPY setup.py setup.py
 COPY Makefile Makefile
diff --git a/Makefile b/Makefile
index bcfd0049..d8403692 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src

-check_dirs := herm scripts analysis tests
+check_dirs := rewardbench scripts analysis tests

 style:
 	python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py
diff --git a/README.md b/README.md
index 2cdf4055..e746295f 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,27 @@
-# Holistic Evaluation of Reward Models (HERM)
-
-This will hold scripts for generating scores and uploading results.
-Two primary to generate results (more in `scripts/`):
+RewardBench: Evaluating Reward Models
+
+Leaderboard 📐 |
+RewardBench Dataset |
+Existing Test Sets |
+Results 📊 |
+Paper (coming soon) 📝
+
+RewardBench Logo
+
+---
+
+**RewardBench** is a benchmark designed to evaluate the capabilities and safety of reward models (including those trained with Direct Preference Optimization, DPO).
+The repository includes the following:
+* Common inference code for a variety of reward models (Starling, PairRM, OpenAssistant, DPO, and more).
+* Common dataset formatting and tests for fair reward model inference.
+* Analysis and visualization tools.
+
+The two primary scripts to generate results (more in `scripts/`):
 1. `scripts/run_rm.py`: Run evaluations for reward models.
 2. `scripts/run_dpo.py`: Run evaluations for direct preference optimization (DPO) models.

-## Links
-Dataset, space, etc coming soon.
-For contributors, it can be found in this [HuggingFace org](https://huggingface.co/ai2-adapt-dev).
-
 ## Installation
 Please install `torch`` on your system, and then install the following requirements.
 ```
@@ -70,10 +83,10 @@ python scripts/run_bon.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2
 ```
 ├── README.md                   <- The top-level README for researchers using this project
-├── analysis/                   <- Directory of tools to analyze HERM results or other reward model properties
-├── herm/                       <- Core utils and modeling files
+├── analysis/                   <- Directory of tools to analyze RewardBench results or other reward model properties
+├── rewardbench/                <- Core utils and modeling files
 |   ├── models/                 ├── Standalone files for running existing reward models
-|   └── *.py                    └── HERM tools and utilities
+|   └── *.py                    └── RewardBench tools and utilities
 ├── scripts/                    <- Scripts and configs to train and evaluate reward models
 ├── tests                       <- Unit tests
 ├── Dockerfile                  <- Build file for reproducible and scaleable research at AI2
@@ -84,7 +97,7 @@ python scripts/run_bon.py --model=OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2

 ## Maitenence

-### Updating the docker image (consider removing this section when we publicly release HERM)
+### Updating the docker image (consider removing this section when we publicly release RewardBench)
 When updating this repo, the docker image should be rebuilt to include those changes.
 For AI2 members, please update the list below with any images you use regularly.
 For example, if you update `scripts/run_rm.py` and include a new package (or change a package version), you should rebuild the image and verify it still works on known models.
diff --git a/analysis/README.md b/analysis/README.md
index bb76074e..119533b1 100644
--- a/analysis/README.md
+++ b/analysis/README.md
@@ -14,13 +14,13 @@ python analysis/plot_per_subset_dist.py --output_dir=plots/whisker
 ```

 ### Get benchmark results
-This prints out the HERM results in a Markdown or LaTeX table. Note that you need to pass an API token to the `HF_COLLAB_TOKEN` environment variable.
+This prints out the RewardBench results in a Markdown or LaTeX table. Note that you need to pass an API token to the `HF_COLLAB_TOKEN` environment variable.
 ```
 # Use --render_latex for LaTeX output
 python analysis/get_benchmark_results.py
 ```
-Below is a snippet of the output for the HERM - General results:
+Below is a snippet of the output for the RewardBench - General results:

 | model                                             | average   | alpacaeval   | mt-bench   | llmbar   | refusals   | hep    |
 |--------------------------------------------------|-----------|--------------|------------|----------|------------|--------|
diff --git a/analysis/bon_to_alpacaeval.py b/analysis/bon_to_alpacaeval.py
index c912589f..42f0eb06 100644
--- a/analysis/bon_to_alpacaeval.py
+++ b/analysis/bon_to_alpacaeval.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# Script for converting HERM best of n (BoN) results into the AlpacaEval format
+# Script for converting RewardBench best of n (BoN) results into the AlpacaEval format

 import argparse
 import os
diff --git a/analysis/draw_model_histogram.py b/analysis/draw_model_histogram.py
index a83b6fd2..b8bf7e9b 100644
--- a/analysis/draw_model_histogram.py
+++ b/analysis/draw_model_histogram.py
@@ -17,7 +17,10 @@
 import argparse
 from pathlib import Path

-from herm.visualization import draw_model_source_histogram, print_model_statistics
+from rewardbench.visualization import (
+    draw_model_source_histogram,
+    print_model_statistics,
+)


 def get_args():
diff --git a/analysis/draw_per_token_reward.py b/analysis/draw_per_token_reward.py
index 6c88a3b2..6584fa92 100644
--- a/analysis/draw_per_token_reward.py
+++ b/analysis/draw_per_token_reward.py
@@ -22,7 +22,7 @@
 import numpy as np
 import spacy_alignments as tokenizations

-from herm.visualization import draw_per_token_reward
+from rewardbench.visualization import draw_per_token_reward

 DEFAULT_DIRNAME = "per-token-reward"

diff --git a/analysis/get_benchmark_results.py b/analysis/get_benchmark_results.py
index faaae5e9..9cbdc941 100644
--- a/analysis/get_benchmark_results.py
+++ b/analysis/get_benchmark_results.py
@@ -58,7 +58,7 @@ def get_args():
     return args


-def get_average_over_herm(
+def get_average_over_rewardbench(
     df: pd.DataFrame,
     subsets: List[str] = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"],
 ) -> pd.DataFrame:
@@ -96,7 +96,7 @@ def main():
     print(f"Downloading repository snapshots into '{LOCAL_DIR}' directory")
     # Load the remote repository using the HF API
     hf_evals_repo = snapshot_download(
-        local_dir=Path(LOCAL_DIR) / "herm",
+        local_dir=Path(LOCAL_DIR) / "rewardbench",
         repo_id=args.hf_evals_repo,
         use_auth_token=api_token,
         tqdm_class=None,
@@ -107,8 +107,8 @@ def main():
     hf_prefs_df = load_results(hf_evals_repo, subdir="pref-sets/", ignore_columns=args.ignore_columns)

     all_results = {
-        "HERM - Overview": get_average_over_herm(hf_evals_df),
-        "HERM - Detailed": hf_evals_df,
+        "RewardBench - Overview": get_average_over_rewardbench(hf_evals_df),
+        "RewardBench - Detailed": hf_evals_df,
         "Pref Sets - Overview": hf_prefs_df,
     }
diff --git a/analysis/get_per_token_reward.py b/analysis/get_per_token_reward.py
index 8a5b8b51..b0e83a9b 100644
--- a/analysis/get_per_token_reward.py
+++ b/analysis/get_per_token_reward.py
@@ -35,7 +35,7 @@
     pipeline,
 )

-from herm import models
+from rewardbench import models

 REWARD_MODEL_CONFIG = {
     "default": {
diff --git a/analysis/get_subtoken_statistics.py b/analysis/get_subtoken_statistics.py
index c99099cc..08b1c39f 100644
--- a/analysis/get_subtoken_statistics.py
+++ b/analysis/get_subtoken_statistics.py
@@ -1,3 +1,17 @@
+# Copyright 2023 AllenAI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 from pathlib import Path
 from typing import Any, Dict
diff --git a/herm/__init__.py b/rewardbench/__init__.py
similarity index 100%
rename from herm/__init__.py
rename to rewardbench/__init__.py
diff --git a/herm/chattemplates.py b/rewardbench/chattemplates.py
similarity index 100%
rename from herm/chattemplates.py
rename to rewardbench/chattemplates.py
diff --git a/herm/dpo.py b/rewardbench/dpo.py
similarity index 100%
rename from herm/dpo.py
rename to rewardbench/dpo.py
diff --git a/herm/models/__init__.py b/rewardbench/models/__init__.py
similarity index 100%
rename from herm/models/__init__.py
rename to rewardbench/models/__init__.py
diff --git a/herm/models/beaver.py b/rewardbench/models/beaver.py
similarity index 100%
rename from herm/models/beaver.py
rename to rewardbench/models/beaver.py
diff --git a/herm/models/openassistant.py b/rewardbench/models/openassistant.py
similarity index 100%
rename from herm/models/openassistant.py
rename to rewardbench/models/openassistant.py
diff --git a/herm/models/openbmb.py b/rewardbench/models/openbmb.py
similarity index 100%
rename from herm/models/openbmb.py
rename to rewardbench/models/openbmb.py
diff --git a/herm/models/pairrm.py b/rewardbench/models/pairrm.py
similarity index 100%
rename from herm/models/pairrm.py
rename to rewardbench/models/pairrm.py
diff --git a/herm/models/shp.py b/rewardbench/models/shp.py
similarity index 100%
rename from herm/models/shp.py
rename to rewardbench/models/shp.py
diff --git a/herm/models/starling.py b/rewardbench/models/starling.py
similarity index 100%
rename from herm/models/starling.py
rename to rewardbench/models/starling.py
diff --git a/herm/models/ziya.py b/rewardbench/models/ziya.py
similarity index 100%
rename from herm/models/ziya.py
rename to rewardbench/models/ziya.py
diff --git a/herm/utils.py b/rewardbench/utils.py
similarity index 99%
rename from herm/utils.py
rename to rewardbench/utils.py
index d37c5a64..95aff30c 100644
--- a/herm/utils.py
+++ b/rewardbench/utils.py
@@ -23,7 +23,7 @@
 from huggingface_hub import HfApi
 from transformers import PreTrainedTokenizer

-from herm.models import REWARD_MODEL_CONFIG
+from rewardbench.models import REWARD_MODEL_CONFIG

 # HuggingFace Hub locations
 CORE_EVAL_SET = "ai2-adapt-dev/rm-benchmark-dev"
diff --git a/herm/visualization.py b/rewardbench/visualization.py
similarity index 100%
rename from herm/visualization.py
rename to rewardbench/visualization.py
diff --git a/scripts/configs/beaker_eval.yaml b/scripts/configs/beaker_eval.yaml
index 8deb1b39..45e5cfa4 100644
--- a/scripts/configs/beaker_eval.yaml
+++ b/scripts/configs/beaker_eval.yaml
@@ -1,8 +1,8 @@
 version: v2
-description: herm-eval-default
+description: rewardbench-eval-default
 budget: ai2/allennlp
 tasks:
-  - name: herm-eval-default
+  - name: rewardbench-eval-default
     image:
       beaker:
     command: [
@@ -24,7 +24,7 @@ tasks:
       - name: TRANSFORMERS_CACHE
         value: ./cache/
      - name: WANDB_PROJECT
-       value: herm
+       value: rewardbench
      - name: WANDB_WATCH
        value: false
      - name: WANDB_LOG_MODEL
diff --git a/scripts/run_bon.py b/scripts/run_bon.py
index ffb5a8ba..d49c3803 100644
--- a/scripts/run_bon.py
+++ b/scripts/run_bon.py
@@ -28,7 +28,7 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, pipeline

-from herm import REWARD_MODEL_CONFIG, load_bon_dataset, save_to_hub
+from rewardbench import REWARD_MODEL_CONFIG, load_bon_dataset, save_to_hub

 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
diff --git a/scripts/run_dpo.py b/scripts/run_dpo.py
index a1dd0dba..866f9a4a 100644
--- a/scripts/run_dpo.py
+++ b/scripts/run_dpo.py
@@ -27,7 +27,7 @@
 from tqdm import tqdm
 from trl.trainer.utils import DPODataCollatorWithPadding

-from herm import DPO_MODEL_CONFIG, DPOInference, load_eval_dataset, save_to_hub
+from rewardbench import DPO_MODEL_CONFIG, DPOInference, load_eval_dataset, save_to_hub

 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
diff --git a/scripts/run_rm.py b/scripts/run_rm.py
index efe6a529..d8f38922 100644
--- a/scripts/run_rm.py
+++ b/scripts/run_rm.py
@@ -26,7 +26,7 @@
 from tqdm import tqdm
 from transformers import AutoTokenizer, pipeline

-from herm import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub
+from rewardbench import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub

 # get token from HF_TOKEN env variable, but if it doesn't exist pass none
 HF_TOKEN = os.getenv("HF_TOKEN", None)
diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py
index 66d1fbb9..e48ff196 100644
--- a/scripts/submit_eval_jobs.py
+++ b/scripts/submit_eval_jobs.py
@@ -95,12 +95,12 @@
         experiment_group = "dpo-eval"
         script = "run_dpo.py"
     else:
-        experiment_group = "herm-preference-sets"
+        experiment_group = "rewardbench-preference-sets"
         script = "run_rm.py"

     print(f"Submitting evaluation for model: {model} on {experiment_group}")
     d = copy.deepcopy(d1)

-    name = f"herm_eval_for_{model}_on_{experiment_group}".replace("/", "-")
+    name = f"rewardbench_eval_for_{model}_on_{experiment_group}".replace("/", "-")
     d["description"] = name
     d["tasks"][0]["name"] = name
@@ -133,5 +133,5 @@
         yaml.dump(d, file, default_flow_style=True)
     file.close()

-    cmd = "beaker experiment create {} --workspace ai2/herm".format(fn)
+    cmd = "beaker experiment create {} --workspace ai2/rewardbench".format(fn)
     subprocess.Popen(cmd, shell=True)
diff --git a/setup.py b/setup.py
index 42f74dfa..3cb3a421 100644
--- a/setup.py
+++ b/setup.py
@@ -15,14 +15,14 @@
 from setuptools import find_packages, setup

 setup(
-    name="herm",
+    name="rewardbench",
     version="0.1.0.dev",
     author="Nathan Lambert",
     author_email="nathanl@allenai.org",
     description="Tools for evaluating reward models",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
-    url="https://github.com/allenai/herm",
+    url="https://github.com/allenai/rewardbench",
     packages=find_packages(),
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/tests/test_data.py b/tests/test_data.py
index 491664eb..dad3b1fa 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -17,7 +17,11 @@
 from fastchat.conversation import get_conv_template
 from transformers import AutoTokenizer

-from herm import load_eval_dataset, prepare_dialogue, prepare_dialogue_from_tokenizer
+from rewardbench import (
+    load_eval_dataset,
+    prepare_dialogue,
+    prepare_dialogue_from_tokenizer,
+)


 class PrepareDialoguesTest(unittest.TestCase):
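For downstream code, the user-facing effect of the `herm` → `rewardbench` rename is a one-line import change. Below is a minimal sketch, assuming the renamed package continues to export the same top-level names that `scripts/run_rm.py` imports above; the model key used in the lookup is purely illustrative.

```python
# Old import path (pre-rename), kept only for comparison:
# from herm import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub

# New import path after this change; the exported names themselves are unchanged.
from rewardbench import REWARD_MODEL_CONFIG, load_eval_dataset, save_to_hub

# REWARD_MODEL_CONFIG is a dict of per-model settings with a "default" entry
# (see analysis/get_per_token_reward.py above). The key below is hypothetical.
config = REWARD_MODEL_CONFIG.get("org/some-reward-model", REWARD_MODEL_CONFIG["default"])
print(config)
```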