allenai · natolambert · Sep 6, 2024 · Sep 6, 2024 · Sep 6, 2024 · Sep 6, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -60,7 +60,7 @@ RUN pip install -e .
 RUN chmod +x scripts/*
 
 # this is just very slow
-# RUN pip install flash-attn==2.5.0 --no-build-isolation
+RUN pip install flash-attn==2.6.3 --no-build-isolation
 
 # for olmo-instruct v1, weird install requirements
 # RUN pip install ai2-olmo 

diff --git a/scripts/configs/eval_configs.yaml b/scripts/configs/eval_configs.yaml
@@ -717,4 +717,21 @@ NCSOFT/Llama-3-OffsetBias-RM-8B:
   batch_size: 4
   torch_dtype: bfloat16
   dpo: False
+  trust_remote_code: False
+Skywork/Skywork-Reward-Gemma-2-27B:
+  model: Skywork/Skywork-Reward-Gemma-2-27B
+  tokenizer: Skywork/Skywork-Reward-Gemma-2-27B
+  chat_template: # none for tokenizer
+  batch_size: 2
+  dpo: False
+  torch_dtype: bfloat16
+  trust_remote_code: False
+  attention_implementation: flash_attention_2
+Skywork/Skywork-Reward-Llama-3.1-8B:
+  model: Skywork/Skywork-Reward-Llama-3.1-8B
+  tokenizer: Skywork/Skywork-Reward-Llama-3.1-8B
+  chat_template: # none for tokenizer
+  batch_size: 8
+  dpo: False
+  torch_dtype: bfloat16
   trust_remote_code: False
diff --git a/scripts/run_generative.py b/scripts/run_generative.py
@@ -127,16 +127,16 @@ def main():
         # load model
         model = LLM(args.model, trust_remote_code=args.trust_remote_code, tensor_parallel_size=args.num_gpus)
         tokenizer = AutoTokenizer.from_pretrained(args.model)
-        if "Llama-3" in args.model or "llama3-8b" in args.model:
+        if ("Llama-3" in args.model or "llama3-8b" in args.model) and "3.1" not in args.model:
             stop_token_ids = [128009]
         else:
-            stop_token_ids = []
+            stop_token_ids = None
 
         sampling_params = SamplingParams(
             n=1,
             temperature=0,
             top_p=1,
-            max_tokens=1024,
+            max_tokens=2048,
             stop_token_ids=stop_token_ids,
         )
 
@@ -273,7 +273,7 @@ def format_judgements(batch, optional_chat_template=None):
                 optional_chat_template.append_message(optional_chat_template.roles[0], user_prompt)
                 optional_chat_template.append_message(optional_chat_template.roles[1], None)
                 prompt = optional_chat_template.get_prompt()
-            elif model_modifier:
+            else:
                 messages = [
                     {
                         "role": "system",

diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py
@@ -28,7 +28,7 @@
     "--eval_on_pref_sets", action="store_true", default=False, help="Evaluate on preference sets rather than core set"
 )
 argparser.add_argument("--eval_on_bon", action="store_true", default=False, help="Evaluate on BON preference sets")
-argparser.add_argument("--image", type=str, default="nathanl/rb_v23", help="Beaker image to use")
+argparser.add_argument("--image", type=str, default="nathanl/rewardbench_auto", help="Beaker image to use")
 argparser.add_argument("--cluster", type=str, default="ai2/allennlp-cirrascale", help="Beaker cluster to use")
 argparser.add_argument("--priority", type=str, default="normal", help="Priority of the job")
 argparser.add_argument("--upload_to_hub", action="store_false", default=True, help="Upload to results to HF hub")
@@ -97,10 +97,13 @@
 
     # check if bfloat16
     if "torch_dtype" in model_config:
-        if model_config["torch_dtype"] == "torch.bfloat16":
+        if model_config["torch_dtype"] in ("torch.bfloat16", "bfloat16"):
             eval_bfloat16 = True
         else:
             eval_bfloat16 = False
+    else:
+        # no dtype configured: do not append --torch_dtype=bfloat16 below
+        eval_bfloat16 = False
 
     # ignore models depending on eval_dpo_only and eval_rm_only
     if args.eval_dpo_only:
@@ -159,6 +162,10 @@
     if eval_bfloat16:
         d["tasks"][0]["arguments"][0] += " --torch_dtype=bfloat16"
 
+    # only supported by run_rm for now; used by the gemma-2-27b RMs
+    if "attention_implementation" in model_config:
+        d["tasks"][0]["arguments"][0] += f" --attn_implementation {model_config['attention_implementation']}"
+
     if "ref_model" in model_config:
         if not args.ref_free:  # if passed, ignore logic in eval configs
             d["tasks"][0]["arguments"][0] += f" --ref_model {model_config['ref_model']}"