Ensemble pre-computed reward outputs (#120)
natolambert authored May 2, 2024
1 parent 12cf23d commit 79e5943
Showing 2 changed files with 179 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -80,6 +80,12 @@ python scripts/run_dpo.py --model=stabilityai/stablelm-zephyr-3b --ref_model=sta
python scripts/run_dpo.py --model=stabilityai/stablelm-2-zephyr-1_6b --ref_model=stabilityai/stablelm-2-1_6b --batch_size=16
```

## Ensembling RMs
For reward models whose outputs are already stored in RewardBench, you can run an offline ensemble test to approximate using multiple reward models together in your system. To try this, run:
```
python analysis/run_ensemble_offline.py --models sfairXC/FsfairX-LLaMA3-RM-v0.1 openbmb/Eurus-RM-7b Nexusflow/Starling-RM-34B
```
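
The script also exposes `--mode` (Mean, Worst, or Uncertainty) to choose how per-model scores are aggregated, and `--sweep` to iterate over all modes and model combinations. For example, to aggregate with the worst-case (minimum) score over the same models:
```
python analysis/run_ensemble_offline.py --models sfairXC/FsfairX-LLaMA3-RM-v0.1 openbmb/Eurus-RM-7b Nexusflow/Starling-RM-34B --mode Worst
```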

## Running Generative RMs (LLM-as-a-judge)
Local and API models are supported. For example, run OpenAI's models like:
```
173 changes: 173 additions & 0 deletions analysis/run_ensemble_offline.py
@@ -0,0 +1,173 @@
# Copyright 2023 AllenAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Script for aggregating previous scores via ensemble to explore RM ensemble performance


import argparse
from itertools import combinations

import numpy as np
import pandas as pd
from datasets import Dataset
from huggingface_hub import hf_hub_download

from rewardbench.constants import EXAMPLE_COUNTS, SUBSET_MAPPING
from rewardbench.utils import calculate_scores_per_section


def get_args():
"""
Argument parser. Primarily collects the models you wish to analyze.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--hf_evals_repo",
type=str,
default="allenai/reward-bench-results",
help="HuggingFace repository containing the evaluation results.",
)
parser.add_argument("--models", type=str, nargs="+", help="Models to analyze.")
parser.add_argument("--do_not_normalize", action="store_true", default=False, help="Do not normalize the values.")
# mode is either Mean, Worst, or Uncertainty
parser.add_argument("--mode", type=str, default="Mean", help="Mode of aggregation.")
parser.add_argument("--pref_sets", action="store_true", help="Use preference sets.")
parser.add_argument("--sweep", action="store_true", default=False, help="Sweep over all model options from >3.")
return parser.parse_args()


if __name__ == "__main__":
args = get_args()
all_models = args.models

#########################
# Setup and Load
#########################
assert isinstance(all_models, list), "Models must be a list."
assert len(all_models) > 1, "More than one model must be provided."

# Assert that modes are valid
assert args.mode in ["Mean", "Worst", "Uncertainty"], "Invalid mode."

# Load the results for the models
subdir = "eval-set-scores/" if not args.pref_sets else "pref-sets-scores/"
baseline_values = {}
data = {}

def flatten(data):
# if rewards are a list of lists, unnest them
if isinstance(data[0], list):
data = [item for sublist in data for item in sublist]
return data

for m in all_models:
hub_file = subdir + f"{m}.json"
f = hf_hub_download(args.hf_evals_repo, hub_file, repo_type="dataset")
eval_data = pd.read_json(f, orient="records")

# add baseline values for each model
all_rewards = np.concatenate((eval_data["scores_rejected"].values, eval_data["scores_chosen"]))
all_rewards = flatten(all_rewards)
mean_reward = np.mean(all_rewards)
std_dev_reward = np.std(all_rewards)
baseline_values[m] = {"mean": mean_reward, "std_dev": std_dev_reward}

data[m] = eval_data

#########################
# Normalize
#########################
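# z-score each model's rewards (subtract its mean, divide by its std) so scores from models with different reward scales are comparable before ensembling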
if not args.do_not_normalize:
for m in all_models:
data[m]["scores_rejected"] = (
flatten(data[m]["scores_rejected"]) - baseline_values[m]["mean"]
) / baseline_values[m]["std_dev"]
data[m]["scores_chosen"] = (
flatten(data[m]["scores_chosen"]) - baseline_values[m]["mean"]
) / baseline_values[m]["std_dev"]

print(f"All models: {all_models}")
all_results = []

# check if sweep
if args.sweep:
modes = ["Mean", "Worst", "Uncertainty"]
model_index = 2
else:
modes = [args.mode]
model_index = len(all_models)

# iterate over all model combinations, from size model_index up to all models

for mode in modes:
args.mode = mode
for i in range(model_index, len(all_models) + 1):
for models in combinations(all_models, i):
models = list(models)

print(f"Analyzing models: {models}")

#########################
# Calculate ensembles
#########################
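# Mean: average the per-model scores; Worst: take the minimum (most pessimistic model);
# Uncertainty: mean minus one standard deviation, penalizing disagreement between models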
def compute_reward(scores, mode):
if mode == "Mean":
return np.mean(scores)
elif mode == "Worst":
return np.min(scores)
elif mode == "Uncertainty":
return np.mean(scores) - np.std(scores)

# iterate over ids in the dataframe
ids = data[models[0]]["id"].unique()
out_dataset = {"subsets": [], "results": []}
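# results holds 1 if the ensemble scores the chosen response above the rejected one, else 0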
for id in ids:
scores_chosen = []
scores_rejected = []
for m in models:
scores_chosen.append(data[m].loc[data[m]["id"] == id]["scores_chosen"].values[0])
scores_rejected.append(data[m].loc[data[m]["id"] == id]["scores_rejected"].values[0])

ensemble_score_chosen = compute_reward(np.array(scores_chosen), args.mode)
ensemble_score_rejected = compute_reward(np.array(scores_rejected), args.mode)
subset = data[models[0]].loc[data[models[0]]["id"] == id]["subset"].values[0]
out_dataset["subsets"].append(subset)
value = 1 if ensemble_score_chosen > ensemble_score_rejected else 0
out_dataset["results"].append(value)

out_dataset = Dataset.from_dict(out_dataset).to_pandas() # I know this is meh

#########################
# Save / Share
#########################

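# compute accuracy per subset, then aggregate into leaderboard section scores with calculate_scores_per_section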
results_grouped = {}
present_subsets = np.unique(out_dataset["subsets"])
for subset in present_subsets:
# subset_dataset = out_dataset.filter(lambda example: example["subsets"] == subset)
subset_dataset = out_dataset[out_dataset["subsets"] == subset]
num_correct = sum(subset_dataset["results"])
num_total = len(subset_dataset["results"])
# print(f"{subset}: {num_correct}/{num_total} ({num_correct/num_total})")
results_grouped[subset] = num_correct / num_total

if not args.pref_sets:
results_leaderboard = calculate_scores_per_section(EXAMPLE_COUNTS, SUBSET_MAPPING, results_grouped)
print(results_leaderboard)
results_leaderboard["models"] = "|".join(models)
results_leaderboard["mode"] = args.mode
all_results.append(results_leaderboard)

all_results = Dataset.from_list(all_results)
all_results.to_csv("ensemble_results.csv")
