diff --git a/Dockerfile b/Dockerfile
index a465d9b4..4c3c98c4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@
# To get the latest id, run `beaker image pull ai2/cuda11.8-cudnn8-dev-ubuntu20.04`
# and then `docker image list` to verify the docker image is pulled
# e.g. `Image is up to date for gcr.io/ai2-beaker-core/public/cncl3kcetc4q9nvqumrg:latest`
-FROM gcr.io/ai2-beaker-core/public/cp3ript9a0gcrm4lmha0:latest
+FROM gcr.io/ai2-beaker-core/public/cph14t4n343pipine0i0:latest
RUN apt update && apt install -y openjdk-8-jre-headless
diff --git a/README.md b/README.md
index 65602fc0..7ecf1fbc 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,14 @@
+
+
+
+
+
+
+
+
---
@@ -228,6 +236,7 @@ When updating the `Dockerfile`, make sure to see the instructions at the top to
In development, we have the following docker images (most recent first, as it's likely the one you need).
TODO: Update this so one image has VLLM (for the generative RM only) and one without; the one without will load much faster.
+- `nathanl/rb_v18`: Improvements to RewardBench CLI
- `nathanl/rb_v17` (with VLLM): add support for vllm + llm as a judge, `rb_v16` is similar without prometheus and some OpenAI models
- `nathanl/rb_v12`: add support for llama3
- `nathanl/rewardbench_v10`: add support for `mightbe/Better-PairRM` via jinja2
diff --git a/rewardbench/__init__.py b/rewardbench/__init__.py
index df863577..b5de9881 100644
--- a/rewardbench/__init__.py
+++ b/rewardbench/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "0.1.2"
+__version__ = "0.1.3"
from .chattemplates import * # noqa
from .dpo import DPOInference
from .models import DPO_MODEL_CONFIG, REWARD_MODEL_CONFIG
diff --git a/rewardbench/models/__init__.py b/rewardbench/models/__init__.py
index 61c7e4c8..634b5024 100644
--- a/rewardbench/models/__init__.py
+++ b/rewardbench/models/__init__.py
@@ -125,6 +125,20 @@
"custom_dialogue": False,
"model_type": "Seq. Classifier",
},
+ "PKU-Alignment/beaver-7b-v2.0-reward": {
+ "model_builder": LlamaForScore.from_pretrained,
+ "pipeline_builder": BeaverPipeline,
+ "quantized": True,
+ "custom_dialogue": False,
+ "model_type": "Seq. Classifier",
+ },
+ "PKU-Alignment/beaver-7b-v2.0-cost": {
+ "model_builder": LlamaForScore.from_pretrained,
+ "pipeline_builder": BeaverCostPipeline,
+ "quantized": True,
+ "custom_dialogue": False,
+ "model_type": "Seq. Classifier",
+ },
"RLHFlow/pair-preference-model-LLaMA3-8B": {
"model_builder": AutoModelForCausalLM.from_pretrained,
"pipeline_builder": SlicPairPMPipeline,
diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py
index 4eeb72de..09ca417d 100644
--- a/scripts/submit_eval_jobs.py
+++ b/scripts/submit_eval_jobs.py
@@ -28,7 +28,7 @@
"--eval_on_pref_sets", action="store_true", default=False, help="Evaluate on preference sets rather than core set"
)
argparser.add_argument("--eval_on_bon", action="store_true", default=False, help="Evaluate on BON preference sets")
-argparser.add_argument("--image", type=str, default="nathanl/rb_v16", help="Beaker image to use")
+argparser.add_argument("--image", type=str, default="nathanl/rb_v18", help="Beaker image to use")
argparser.add_argument("--cluster", type=str, default="ai2/allennlp-cirrascale", help="Beaker cluster to use")
argparser.add_argument("--priority", type=str, default="normal", help="Priority of the job")
argparser.add_argument("--upload_to_hub", action="store_false", default=True, help="Upload to results to HF hub")
diff --git a/setup.py b/setup.py
index e2c182be..1002d642 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
# this has not yet been pushed to pypi-test
setup(
name="rewardbench",
- version="0.1.2",
+ version="0.1.3", # do not import from init, or we get a weird build error
author="Nathan Lambert",
author_email="nathanl@allenai.org",
description="Tools for evaluating reward models",