
Commit 294cb1f

Author: openai
Committed: update readme and evals for gpt-4o
1 parent 267835b commit 294cb1f

12 files changed, +215 -161 lines

README.md

+28 -18

@@ -1,6 +1,6 @@
 # Overview
 This repository contains a lightweight library for evaluating language models.
-We are open sourcing it so we can be transparent about the accuracy numbers we're publishing alongside our latest models (starting with `gpt-4-turbo-2024-04-09`).
+We are open sourcing it so we can be transparent about the accuracy numbers we're publishing alongside our latest models (starting with `gpt-4-turbo-2024-04-09` and `gpt-4o`).
 
 Evals are sensitive to prompting, and there's significant variation in the formulations used in recent publications and libraries.
 Some use few-shot prompts or role playing prompts ("You are an expert software programmer...").
@@ -64,28 +64,38 @@ This will launch evaluations through the OpenAI API.
 
 
 ## Benchmark Results
-| Model | Prompt |DROP(f1)| GPQA% | MATH% | MGSM% | MMLU% |HumanEval% |
-|:-----------------------------:|:-------------:|:------:|:-------:|:-------:|:-------:|:-------:|:---------:|
-| GPT4s | | | | | | | |
-| gpt-4-turbo-2024-04-09 | chatgpt[^1] | 85.4 | 49.1 | 72.2 | 88.6 | 86.5 | 87.6 |
-| gpt-4-turbo-2024-04-09 | assistant[^2] | 86.0 | 49.3 | 73.4 | 89.6 | 86.7 | 88.2 |
-| gpt-4-1106(-vision)-preview | chatgpt | 81.3 | 42.1 | 64.1 | 86.5 | 84.6 | 82.2 |
-| gpt-4-1106(-vision)-preview | assistant | 83.2 | 42.5 | 64.3 | 87.1 | 84.7 | 83.7 |
-| gpt-4-0125-preview | chatgpt | 83.4 | 39.7 | 64.2 | 83.7 | 84.8 | 88.2 |
-| gpt-4-0125-preview | assistant | 81.5 | 41.4 | 64.5 | 85.1 | 85.4 | 86.6 |
-| REFERENCE | | | | | | |
-| Claude-3-Opus (rerun w/ api) | empty[^3] | 79.0 | 49.7 | 63.2 | 89.7 | 84.1 | 84.8 |
-| Claude-3-Opus (rerun w/ api) | lmsys[^4] | 77.1 | 50.7 | 63.8 | 89.2 | 84.2 | 82.9 |
-| Claude-3-Opus (report[^5]) | unknown | 83.1 | 50.4 | 60.1 | 90.7 | 86.8 | 84.9 |
-| Gemini-Ultra-1.0 (report[^6]) | unknown | 82.4 | n/a | 53.2 | 79.0 | 83.7 | 74.4 |
-| Gemini-Pro-1.5 (report[^6]) | unknown | 78.9 | n/a | 58.5 | 88.7 | 81.9 | 71.9 |
+| Model | Prompt | MMLU | GPQA | MATH | HumanEval| MGSM | DROP<br>(F1,3-shot) |
+|:----------------------------|:-------------:|:------:|:-------:|:------:|:--------:|:------:|:------:|
+| OPENAI GPT4s | | | | | | | |
+| gpt-4o | chatgpt[^1] |**`88.7`**|**`53.6`**|**`76.6`**| 90.2| 90.5 | 83.4 |
+| gpt-4o | assistant[^2] | 87.2 | 49.9 |**`76.6`**|**`91.0`**| 89.9 | 83.7 |
+| gpt-4-turbo-2024-04-09 | chatgpt | 86.5 | 49.1 | 72.2 | 87.6 | 88.6 | 85.4 |
+| gpt-4-turbo-2024-04-09 | assistant | 86.7 | 49.3 | 73.4 | 88.2 | 89.6 |**`86.0`**|
+| gpt-4-1106(-vision)-preview | chatgpt | 84.6 | 42.1 | 64.1 | 82.2 | 86.5 | 81.3 |
+| gpt-4-1106(-vision)-preview | assistant | 84.7 | 42.5 | 64.3 | 83.7 | 87.1 | 83.2 |
+| gpt-4-0125-preview | chatgpt | 84.8 | 39.7 | 64.2 | 88.2 | 83.7 | 83.4 |
+| gpt-4-0125-preview | assistant | 85.4 | 41.4 | 64.5 | 86.6 | 85.1 | 81.5 |
+| REFERENCE-RERUN | | | | | | | |
+| Claude-3-Opus (rerun w/ api) | empty[^3] | 84.1 | 49.7 | 63.2 | 84.8 | 89.7 | 79.0 |
+| Claude-3-Opus (rerun w/ api) | lmsys[^4] | 84.2 | 50.7 | 63.8 | 82.9 | 89.2 | 77.1 |
+| Llama3 70b (rerun w/ api) | empty | 80.2 | 41.3 | 52.8 | 70.1 | 82.6 | 81.4 |
+| REFERENCE-REPORT | |(5-shot)| | | | | |
+| Claude-3-Opus (report[^5]) | unknown | 86.8 | 50.4 | 60.1 | 84.9 |**`90.7`**| 83.1 |
+| Gemini-Ultra-1.0 (report[^6])| unknown | 83.7 | n/a | 53.2 | 74.4 | 79.0 | 82.4 |
+| Gemini-Pro-1.5 (report[^6]) | unknown | 81.9 | n/a | 58.5 | 71.9 | 88.7 | 78.9 |
+| Llama3 8b (report[^7]) | unknown | 68.4 | 34.2 | 30.0 | 62.2 | n/a | 58.4 |
+| Llama3 70b (report[^7]) | unknown | 82.0 | 39.5 | 50.4 | 81.7 | n/a | 79.7 |
+| Llama3 400b (still training, report[^7])| unknown | 86.1 | 48.0 | 57.8 | 84.1 | n/a | 83.5 |
+
 
 [^1]:chatgpt system message: "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\nKnowledge cutoff: 2023-12\nCurrent date: 2024-04-01"
 [^2]:assistant system message in [OpenAI API doc](https://platform.openai.com/docs/api-reference/introduction): "You are a helpful assistant." .
 [^3]:claude-3 empty system message: suggested by Anthropic API doc, and we have done limited experiments due to [rate limit](https://docs.anthropic.com/claude/reference/rate-limits) issues, but we welcome PRs with alternative choices.
 [^4]:claude-3 lmsys system message: system message in LMSYS [Fast-chat open source code](https://github.com/lm-sys/FastChat/blob/7899355ebe32117fdae83985cf8ee476d2f4243f/fastchat/conversation.py#L894): "The assistant is Claude, created by Anthropic. The current date is {{currentDateTime}}. Claude's knowledge base was last updated ... ". We have done limited experiments due to [rate limit](https://docs.anthropic.com/claude/reference/rate-limits) issues, but we welcome PRs with alternative choices.
 [^5]:claude-3 reports: [https://www.anthropic.com/news/claude-3-family](https://www.anthropic.com/news/claude-3-family).
-[^6]:gemini-1.5 reports: [https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/](https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/), we dont have rerun results due to [rate_limit](https://ai.google.dev/pricing) issues and paid-as-you-go version are still "coming soon" by the time of this study on 04/02.
+[^6]:gemini-1.5 reports: [https://goo.gle/GeminiV1-5](https://goo.gle/GeminiV1-5), we dont have rerun results due to [rate_limit](https://ai.google.dev/pricing) issues and paid-as-you-go version are still "coming at May 14" by the time of this study on 05/11.
+[^7]:Llama 3 tech report: [https://ai.meta.com/blog/meta-llama-3/](https://ai.meta.com/blog/meta-llama-3/). Note Llama 400b is still training and these numbers are based on the best of their pretrain/instruct Llama 400b numbers.
+
 
 ## Legal Stuff
-By contributing to evals, you are agreeing to make your evaluation logic and data under the same MIT license as this repository. You must have adequate rights to upload any data used in an eval. OpenAI reserves the right to use this data in future service improvements to our product. Contributions to OpenAI evals will be subject to our usual Usage Policies: https://platform.openai.com/docs/usage-policies.
+By contributing to evals, you are agreeing to make your evaluation logic and data under the same MIT license as this repository. You must have adequate rights to upload any data used in an eval. OpenAI reserves the right to use this data in future service improvements to our product. Contributions to OpenAI evals will be subject to our usual Usage Policies: https://platform.openai.com/docs/usage-policies.
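Note: the Prompt column in the new table corresponds one-to-one with the footnoted system messages. As a rough illustration of how they map onto a run, here is a minimal sketch assuming the `ChatCompletionSampler` interface used in this commit's demo.py hunk below; the import path is a guess and may differ:

```python
# Sketch only: wiring footnotes [^1]/[^2] into sampler configuration.
# The import path is an assumption; see demo.py in this commit for the real wiring.
from sampler.chat_completion_sampler import ChatCompletionSampler  # hypothetical path

CHATGPT_SYSTEM_MESSAGE = (
    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n"
    "Knowledge cutoff: 2023-12\nCurrent date: 2024-04-01"
)
ASSISTANT_SYSTEM_MESSAGE = "You are a helpful assistant."

samplers = {
    "gpt-4o_chatgpt": ChatCompletionSampler(model="gpt-4o", system_message=CHATGPT_SYSTEM_MESSAGE),
    "gpt-4o_assistant": ChatCompletionSampler(model="gpt-4o", system_message=ASSISTANT_SYSTEM_MESSAGE),
}
```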

common.py

+101 -2

@@ -7,7 +7,106 @@
 import numpy as np
 from tqdm import tqdm
 
-from .types import EvalResult, Message, SingleEvalResult
+from .types import EvalResult, Message, SamplerBase, SingleEvalResult
+
+QUERY_TEMPLATE_MULTICHOICE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([A-D])"
+ANSWER_PATTERN = r"(?i)Answer\s*:\s*([^\n]+)"
+
+
+EQUALITY_TEMPLATE = r"""
+Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
+
+Examples:
+
+Expression 1: $2x+3$
+Expression 2: $3+2x$
+
+Yes
+
+Expression 1: 3/2
+Expression 2: 1.5
+
+Yes
+
+Expression 1: $x^2+2x+1$
+Expression 2: $y^2+2y+1$
+
+No
+
+Expression 1: $x^2+2x+1$
+Expression 2: $(x+1)^2$
+
+Yes
+
+Expression 1: 3245/5
+Expression 2: 649
+
+No
+(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
+
+Expression 1: 2/(-3)
+Expression 2: -2/3
+
+Yes
+(trivial simplifications are allowed)
+
+Expression 1: 72 degrees
+Expression 2: 72
+
+Yes
+(give benefit of the doubt to units)
+
+Expression 1: 64
+Expression 2: 64 square feet
+
+Yes
+(give benefit of the doubt to units)
+
+---
+
+YOUR TASK
+
+
+Respond with only "Yes" or "No" (without quotes). Do not include a rationale.
+
+Expression 1: %(expression1)s
+Expression 2: %(expression2)s
+""".strip()
+
+
+HTML_JINJA = """
+<h3>Prompt conversation</h3>
+{% for message in prompt_messages %}
+{{ message_to_html(message) | safe }}
+{% endfor %}
+<h3>Sampled message</h3>
+{{ message_to_html(next_message) | safe }}
+<h3>Results</h3>
+<p>Correct Answer: {{ correct_answer }}</p>
+<p>Extracted Answer: {{ extracted_answer }}</p>
+<p>Score: {{ score }}</p>
+"""
+
+
+def format_multichoice_question(row):
+    return QUERY_TEMPLATE_MULTICHOICE.format(**row)
+
+
+def check_equality(sampler: SamplerBase, expr1: str, expr2: str):
+    prompt = EQUALITY_TEMPLATE % {"expression1": expr1, "expression2": expr2}
+    response = sampler([dict(content=prompt, role="user")])
+    return response.lower().strip() == "yes"
 
 
 def _compute_stat(values: list, stat: str):
@@ -152,7 +251,7 @@ def message_to_html(message: Message) -> str:
 {% endif %}
 <h1>Examples</h1>
 {% for html in htmls %}
-{{ html }}
+{{ html | safe }}
 <hr>
 {% endfor %}
 </body>
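The new helpers above are what the evals below switch to: `format_multichoice_question` builds the shared multiple-choice prompt and `check_equality` turns the grader template into a single yes/no call against an equality-checker sampler. A minimal sketch of a MATH-style grading step built from these helpers; the import path and the instruction appended to the question are assumptions, not code from this commit:

```python
import re

# Assumed import path; ANSWER_PATTERN and check_equality are the helpers added above.
from common import ANSWER_PATTERN, check_equality


def grade_math_example(sampler, equality_checker, question: str, reference_answer: str) -> float:
    # Ask the model, nudging it to end with an "Answer: ..." line (wording is illustrative).
    prompt = question + '\n\nThink step by step, then write a line of the form "Answer: $ANSWER".'
    response_text = sampler([dict(content=prompt, role="user")])
    # Pull out the final answer and let the equality-checker model judge equivalence.
    match = re.search(ANSWER_PATTERN, response_text)
    extracted_answer = match.group(1) if match else None
    if extracted_answer is None:
        return 0.0
    return 1.0 if check_equality(equality_checker, reference_answer, extracted_answer) else 0.0
```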

demo.py

+11 -1

@@ -31,15 +31,25 @@ def main():
             model="gpt-4-turbo-2024-04-09",
             system_message=OPENAI_SYSTEM_MESSAGE_CHATGPT,
         ),
+        "gpt-4o_assistant": ChatCompletionSampler(
+            model="gpt-4o",
+            system_message=OPENAI_SYSTEM_MESSAGE_API,
+            max_tokens=2048,
+        ),
+        "gpt-4o_chatgpt": ChatCompletionSampler(
+            model="gpt-4o",
+            system_message=OPENAI_SYSTEM_MESSAGE_CHATGPT,
+            max_tokens=2048,
+        ),
         # claude models:
         # "claude-3-opus-20240229_empty": ClaudeCompletionSampler(
         #     model="claude-3-opus-20240229", system_message=None,
         # ),
     }
 
     equality_checker = ChatCompletionSampler(model="gpt-4-turbo-preview")
-
     # ^^^ used for fuzzy matching, just for math
+
     def get_evals(eval_name):
         match eval_name:
             case "mmlu":

drop_eval.py

+2 -4

@@ -16,7 +16,7 @@
 from scipy.optimize import linear_sum_assignment
 
 from . import common
-from .mmlu_eval import HTML_JINJA
+from .common import ANSWER_PATTERN, HTML_JINJA
 from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 """
@@ -28,8 +28,6 @@
 /eval/drop_eval.py
 """
 
-ANSWER_PATTERN = r"(?i)Answer\s*:\s*([^\n]+)"
-
 
 def _remove_articles(text: str) -> str:
     regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
@@ -282,7 +280,7 @@ def fn(example: dict[str, str]):
             prompt += """\n
 Think step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.
 """
-            prompt_messages = [dict(content=prompt, role="user")]
+            prompt_messages = [sampler._pack_message(content=prompt, role="user")]
             response_text = sampler(prompt_messages)
             match = re.search(ANSWER_PATTERN, response_text)
             extracted_answer = match.group(1) if match else response_text
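The substantive change here is that raw message dicts are replaced by `sampler._pack_message(...)`, so each sampler controls its own message schema. This commit does not show `_pack_message` itself; the following is only a guess at its likely shape, assuming the default packing is an OpenAI-style chat message:

```python
# Hypothetical sketch of SamplerBase._pack_message; the real implementation is
# not part of this commit and may differ per sampler backend.
class SamplerBase:
    def _pack_message(self, content: str, role: str) -> dict:
        # Default packing: an OpenAI chat-completions style message.
        return {"role": role, "content": content}
```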

gpqa_eval.py

+7 -7

@@ -11,14 +11,10 @@
 import pandas
 
 from . import common
-from .mmlu_eval import ANSWER_PATTERN, HTML_JINJA, QUERY_TEMPLATE
+from .common import ANSWER_PATTERN_MULTICHOICE, HTML_JINJA, format_multichoice_question
 from .types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
 
 
-def format_question(row):
-    return QUERY_TEMPLATE.format(**row)
-
-
 class GPQAEval(Eval):
     def __init__(
         self,
@@ -55,9 +51,13 @@ def fn(row: dict):
             choices_dict = dict(
                 A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=row["Question"]
             )
-            prompt_messages = [dict(content=format_question(choices_dict), role="user")]
+            prompt_messages = [
+                sampler._pack_message(
+                    content=format_multichoice_question(choices_dict), role="user"
+                )
+            ]
             response_text = sampler(prompt_messages)
-            match = re.search(ANSWER_PATTERN, response_text)
+            match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
             extracted_answer = match.group(1) if match else None
             score = 1.0 if extracted_answer == correct_answer else 0.0
             html = common.jinja_env.from_string(HTML_JINJA).render(
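GPQA now reuses the shared multiple-choice prompt and extracts the chosen letter with `ANSWER_PATTERN_MULTICHOICE` rather than the generic `ANSWER_PATTERN`. A small self-contained sketch of that extraction and scoring step; the response text and gold label are made up for illustration:

```python
import re

# The pattern added to common.py in this commit.
ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([A-D])"

# Hypothetical sampled response and gold label, for illustration only.
response_text = "Options B and C violate conservation of energy.\nAnswer: D"
correct_answer = "D"

match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
extracted_answer = match.group(1) if match else None
score = 1.0 if extracted_answer == correct_answer else 0.0
print(extracted_answer, score)  # D 1.0
```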

humaneval_eval.py

+4 -2

@@ -21,7 +21,7 @@
 from human_eval.execution import check_correctness  # , unsafe_execute
 
 from . import common
-from .mmlu_eval import HTML_JINJA
+from .common import HTML_JINJA
 from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
 
 
@@ -84,7 +84,9 @@ def find_code(completion):
             return extracted_answer
 
         def fn(sample: dict[str, str]):
-            prompt_messages = [{"role": "user", "content": instruction + sample["prompt"]}]
+            prompt_messages = [
+                sampler._pack_message(role="user", content=instruction + sample["prompt"])
+            ]
             completions = [
                 find_code(sampler(prompt_messages)) for _ in range(self._num_samples_per_task)
             ]
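`find_code` (defined earlier in this file and unchanged by this commit) reduces each sampled completion to runnable code before `check_correctness` executes it. A rough sketch of the kind of extraction it presumably performs, assuming completions wrap their code in a fenced ```python block; this is an illustration, not the file's actual implementation:

```python
import re


def find_code_sketch(completion: str) -> str:
    # Hypothetical stand-in for find_code: take the first fenced Python block,
    # otherwise fall back to the raw completion text.
    matches = re.findall(r"```python\n(.*?)```", completion, re.DOTALL)
    return matches[0] if matches else completion


sample_completion = "Sure, here it is:\n```python\ndef add(a, b):\n    return a + b\n```"
print(find_code_sketch(sample_completion))
```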
