diff --git a/Dockerfile b/Dockerfile
index a465d9b4..4c3c98c4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@
 # To get the latest id, run `beaker image pull ai2/cuda11.8-cudnn8-dev-ubuntu20.04`
 # and then `docker image list`, to verify docker image is pulled
 # e.g. `Image is up to date for gcr.io/ai2-beaker-core/public/cncl3kcetc4q9nvqumrg:latest`
-FROM gcr.io/ai2-beaker-core/public/cp3ript9a0gcrm4lmha0:latest
+FROM gcr.io/ai2-beaker-core/public/cph14t4n343pipine0i0:latest
 
 RUN apt update && apt install -y openjdk-8-jre-headless
 
diff --git a/README.md b/README.md
index 65602fc0..7ecf1fbc 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,14 @@
 
     Github RewardBench Logo
 
+
+
+        GitHub License
+
+
+        PyPI
+
+
 
 ---
@@ -228,6 +236,7 @@ When updating the `Dockerfile`, make sure to see the instructions at the top to
 
 In development, we have the following docker images (most recent first as it's likely what you need).
 TODO: Update it so one image has VLLM (for generative RM only) and one without. Without will load much faster.
+- `nathanl/rb_v18`: Improvements to RewardBench CLI
 - `nathanl/rb_v17` (with VLLM): add support for vllm + llm as a judge, `rb_v16` is similar without prometheus and some OpenAI models
 - `nathanl/rb_v12`: add support for llama3
 - `nathanl/rewardbench_v10`: add support for `mightbe/Better-PairRM` via jinja2
diff --git a/rewardbench/__init__.py b/rewardbench/__init__.py
index df863577..b5de9881 100644
--- a/rewardbench/__init__.py
+++ b/rewardbench/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.1.2"
+__version__ = "0.1.3"
 from .chattemplates import *  # noqa
 from .dpo import DPOInference
 from .models import DPO_MODEL_CONFIG, REWARD_MODEL_CONFIG
diff --git a/rewardbench/models/__init__.py b/rewardbench/models/__init__.py
index 61c7e4c8..634b5024 100644
--- a/rewardbench/models/__init__.py
+++ b/rewardbench/models/__init__.py
@@ -125,6 +125,20 @@
         "custom_dialogue": False,
         "model_type": "Seq. Classifier",
     },
+    "PKU-Alignment/beaver-7b-v2.0-reward": {
+        "model_builder": LlamaForScore.from_pretrained,
+        "pipeline_builder": BeaverPipeline,
+        "quantized": True,
+        "custom_dialogue": False,
+        "model_type": "Seq. Classifier",
+    },
+    "PKU-Alignment/beaver-7b-v2.0-cost": {
+        "model_builder": LlamaForScore.from_pretrained,
+        "pipeline_builder": BeaverCostPipeline,
+        "quantized": True,
+        "custom_dialogue": False,
+        "model_type": "Seq. Classifier",
+    },
     "RLHFlow/pair-preference-model-LLaMA3-8B": {
         "model_builder": AutoModelForCausalLM.from_pretrained,
         "pipeline_builder": SlicPairPMPipeline,
diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py
index 4eeb72de..09ca417d 100644
--- a/scripts/submit_eval_jobs.py
+++ b/scripts/submit_eval_jobs.py
@@ -28,7 +28,7 @@
     "--eval_on_pref_sets", action="store_true", default=False, help="Evaluate on preference sets rather than core set"
 )
 argparser.add_argument("--eval_on_bon", action="store_true", default=False, help="Evaluate on BON preference sets")
-argparser.add_argument("--image", type=str, default="nathanl/rb_v16", help="Beaker image to use")
+argparser.add_argument("--image", type=str, default="nathanl/rb_v18", help="Beaker image to use")
 argparser.add_argument("--cluster", type=str, default="ai2/allennlp-cirrascale", help="Beaker cluster to use")
 argparser.add_argument("--priority", type=str, default="normal", help="Priority of the job")
 argparser.add_argument("--upload_to_hub", action="store_false", default=True, help="Upload to results to HF hub")
diff --git a/setup.py b/setup.py
index e2c182be..1002d642 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
 # this has not yet been pushed to pypyi-test
 setup(
     name="rewardbench",
-    version="0.1.2",
+    version="0.1.3",  # do not import from init, or we get a weird build error
     author="Nathan Lambert",
     author_email="nathanl@allenai.org",
     description="Tools for evaluating reward models",
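For context on the `rewardbench/models/__init__.py` hunk: each entry in `REWARD_MODEL_CONFIG` tells the evaluator which loader and scoring pipeline to use for a given Hugging Face model name. Below is a minimal sketch of how the new `PKU-Alignment/beaver-7b-v2.0-reward` entry could be inspected and loaded; only the dictionary keys come from the diff itself, and the loading call is an assumption about typical usage rather than the exact code path inside rewardbench.

```python
# Sketch only (not part of this diff): inspecting and loading one of the
# newly registered PKU-Alignment entries via REWARD_MODEL_CONFIG.
# The dict keys below are taken from the diff; how the builders are invoked
# here is illustrative and may differ from rewardbench's evaluation loop.
from rewardbench import REWARD_MODEL_CONFIG

model_name = "PKU-Alignment/beaver-7b-v2.0-reward"
config = REWARD_MODEL_CONFIG[model_name]

print(config["model_type"])  # "Seq. Classifier"
print(config["quantized"])   # True

# model_builder is LlamaForScore.from_pretrained for this entry, so loading
# the checkpoint is a plain from_pretrained call (downloads ~7B weights):
model = config["model_builder"](model_name)

# pipeline_builder (BeaverPipeline for the reward model, BeaverCostPipeline
# for the cost model) wraps the model and tokenizer so chosen/rejected
# completions can be scored uniformly by the benchmark.
```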