
Commit 782afe8

Humanity's last exam (#520)
Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
1 parent 086cf90 commit 782afe8

6 files changed (+297, -15 lines)

New file: model config (YAML)

@@ -0,0 +1,5 @@
+model:
+  model_name: "deepseek-ai/DeepSeek-R1" #meta-llama/Llama-3.1-8B-Instruct" #Qwen/Qwen2.5-14B" #Qwen/Qwen2.5-7B"
+api:
+  base_url: "https://huggingface.co/api/inference-proxy/together"
+  api_key: "hf_"
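For context, the new "api" block is what OpenAIModelConfig.from_path (changed later in this commit) reads and forwards to the OpenAI-compatible client. A minimal sketch of loading such a config, assuming a hypothetical file path:

from lighteval.models.endpoints.openai_model import OpenAIModelConfig

# Hypothetical path; any YAML with the same "model"/"api" layout works.
config = OpenAIModelConfig.from_path("examples/model_configs/deepseek_r1.yaml")

print(config.model)     # "deepseek-ai/DeepSeek-R1"
print(config.base_url)  # "https://huggingface.co/api/inference-proxy/together"
print(config.api_key)   # "hf_..." (placeholder token from the YAML)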

src/lighteval/metrics/llm_as_judge.py

+33-7
@@ -26,16 +26,21 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Literal
 
+from pydantic import BaseModel
 from tqdm import tqdm
 
 from lighteval.utils.imports import is_litellm_available, is_openai_available, is_vllm_available
+from lighteval.utils.utils import as_list
 
 
 logging.getLogger("openai").setLevel(logging.ERROR)
 logging.getLogger("httpx").setLevel(logging.ERROR)
 logger = logging.getLogger(__name__)
 
 
+DEFAULT_FORMAT = {"type": "text"}
+
+
 class JudgeLM:
     """
     A class representing a judge for evaluating answers using either the OpenAI or Transformers library.

@@ -76,6 +81,7 @@ def __init__(
         judge_backend: Literal["litellm", "openai", "transformers", "tgi", "vllm"],
         url: str | None = None,
         api_key: str | None = None,
+        response_format: BaseModel = None,
     ):
         self.model = model
         self.template = templates

@@ -91,6 +97,8 @@ def __init__(
         self.api_key = api_key
         self.backend = judge_backend
 
+        self.response_format = response_format if not None else DEFAULT_FORMAT
+
     def __lazy_load_client(self):
         match self.backend:
             # Wether we use openai or TGI models, we go through the openai API

@@ -232,7 +240,7 @@ def __call_api(prompt):
 
     def __call_api_parallel(self, prompts):
         results = []
-        with ThreadPoolExecutor(100) as executor:
+        with ThreadPoolExecutor(10) as executor:
             for entry in tqdm(executor.map(self.__call_api, prompts), total=len(prompts)):
                 results.append(entry)
 

@@ -244,16 +252,34 @@ def __call_api_parallel(self, prompts):
     def __call_api(self, prompt):
         for _ in range(self.API_MAX_RETRY):
             try:
-                response = self.client.chat.completions.create(
+                # Base model
+                response = self.client.beta.chat.completions.parse(
                     model=self.model,
-                    messages=prompt,
-                    response_format={"type": "text"},
-                    max_tokens=512,
+                    messages=as_list(prompt),
+                    response_format=self.response_format,
+                    max_tokens=4096,
+                    temperature=0.0,
                     n=1,
                 )
-                text = response.choices[0].message.content
-                return text
+                answer = response.choices[0].message.parsed
+                return answer
+            except TypeError:
+                try:
+                    # Finetune
+                    response = self.client.chat.completions.create(
+                        model=self.model,
+                        messages=as_list(prompt),
+                        response_format=self.response_format,
+                        max_tokens=512,
+                        n=1,
+                    )
+                    text = response.choices[0].message.content
+                    return text
+                except Exception as e:
+                    logger.warning(f"{type(e), e}")
+                    time.sleep(self.API_RETRY_SLEEP)
             except Exception as e:
                 logger.warning(f"{type(e), e}")
                 time.sleep(self.API_RETRY_SLEEP)
+
         raise Exception("Failed to get response from the API")
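The rewritten __call_api first attempts structured output through client.beta.chat.completions.parse with the pydantic response_format, then falls back to a plain chat.completions.create call when that raises TypeError. The sketch below shows the same pattern against the OpenAI SDK directly; the Verdict schema and the prompt are placeholders, not the HLE task's actual judge schema, and whether a given non-OpenAI backend raises TypeError depends on the server.

from openai import OpenAI
from pydantic import BaseModel


class Verdict(BaseModel):
    # Hypothetical judge schema; any pydantic model passed as response_format works the same way.
    extracted_final_answer: str
    correct: str


client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
messages = [{"role": "user", "content": "Judge whether the answer matches the gold solution ..."}]

try:
    # Structured path ("base model" branch above): the SDK parses the reply into the schema.
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=messages,
        response_format=Verdict,
        max_tokens=4096,
        temperature=0.0,
        n=1,
    )
    answer = response.choices[0].message.parsed  # a Verdict instance
except TypeError:
    # Fallback ("finetune" branch above) for backends that reject a structured response_format.
    response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages,
        max_tokens=512,
        n=1,
    )
    answer = response.choices[0].message.content  # raw string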

src/lighteval/metrics/metrics_sample.py

+4-1
@@ -35,6 +35,7 @@
 from nltk.tokenize import word_tokenize
 from nltk.tokenize.treebank import TreebankWordTokenizer
 from nltk.translate.bleu_score import sentence_bleu
+from pydantic import BaseModel
 from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
 from lighteval.metrics.imports.bert_scorer import BERTScorer

@@ -864,7 +865,7 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
+    available_models_openai = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4", "gpt-4o-2024-08-06"]
 
     def __init__(
         self,

@@ -873,6 +874,7 @@ def __init__(
         process_judge_response: Callable,
         judge_backend: Literal["litellm", "openai", "transformers", "vllm", "tgi"],
         short_judge_name: str | None = None,
+        response_format: BaseModel = None,
     ) -> None:
         match judge_backend:
             case "openai":

@@ -905,6 +907,7 @@ def __init__(
             api_key=api_key,
             url=url,
             judge_backend=judge_backend,
+            response_format=response_format,
         )
 
     def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
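JudgeLLM now forwards an optional response_format to JudgeLM, so an extended task (such as the HLE module registered below) can ask the judge for structured verdicts. A hedged sketch of wiring this up; the schema fields, the prompt builder, the scoring logic, and the judge_model_name/template keyword names are assumptions rather than the task's actual code.

from pydantic import BaseModel

from lighteval.metrics.metrics_sample import JudgeLLM


class ExtractedAnswer(BaseModel):
    # Hypothetical schema for the judge's verdict.
    extracted_final_answer: str
    correct: str


def judge_template(question, answer, gold, **kwargs):
    # Hypothetical prompt builder returning chat messages for the judge.
    return [{"role": "user", "content": f"Question: {question}\nGold: {gold}\nAnswer: {answer}"}]


def process_judge_response(verdict):
    # Hypothetical scorer: handle both the parsed pydantic object and the raw-string fallback.
    if isinstance(verdict, ExtractedAnswer):
        return 1.0 if verdict.correct == "yes" else 0.0
    return 1.0 if isinstance(verdict, str) and "yes" in verdict.lower() else 0.0


judge_metric = JudgeLLM(
    judge_model_name="gpt-4o-2024-08-06",  # newly whitelisted in available_models_openai
    template=judge_template,
    process_judge_response=process_judge_response,
    judge_backend="openai",
    response_format=ExtractedAnswer,       # forwarded to JudgeLM and then to the parse() call
)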

src/lighteval/models/endpoints/openai_model.py

+16-6
@@ -28,6 +28,7 @@
 from typing import Optional
 
 from tqdm import tqdm
+from transformers import AutoTokenizer
 
 from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset
 from lighteval.models.abstract_model import LightevalModel

@@ -64,6 +65,8 @@
 class OpenAIModelConfig:
     model: str
     generation_parameters: GenerationParameters = None
+    base_url: str = "https://api.openai.com/v1"
+    api_key: str = os.environ.get("OPENAI_API_KEY", None)
 
     def __post_init__(self):
         if not self.generation_parameters:

@@ -74,17 +77,19 @@ def from_path(cls, path: str) -> "OpenAIModelConfig":
         import yaml
 
         with open(path, "r") as f:
-            config = yaml.safe_load(f)["model"]
+            loaded_file = yaml.safe_load(f)
+            config = loaded_file["model"]
+            api = loaded_file.get("api", {})
         generation_parameters = GenerationParameters.from_dict(config)
-        return cls(model=config["model_name"], generation_parameters=generation_parameters)
+        return cls(model=config["model_name"], generation_parameters=generation_parameters, **api)
 
 
 class OpenAIClient(LightevalModel):
     _DEFAULT_MAX_LENGTH: int = 4096
 
     def __init__(self, config: OpenAIModelConfig, env_config) -> None:
-        api_key = os.environ["OPENAI_API_KEY"]
-        self.client = OpenAI(api_key=api_key)
+        self.client = OpenAI(api_key=config.api_key, base_url=config.base_url)
+        self.config = config
         self.generation_parameters = config.generation_parameters
         self.sampling_params = self.generation_parameters.to_vllm_openai_dict()

@@ -99,22 +104,27 @@ def __init__(self, config: OpenAIModelConfig, env_config) -> None:
         self.API_RETRY_MULTIPLIER = 2
         self.CONCURENT_CALLS = 100
         self.model = config.model
-        self._tokenizer = tiktoken.encoding_for_model(self.model)
+        try:
+            self._tokenizer = tiktoken.encoding_for_model(self.model)
+        except KeyError:
+            self._tokenizer = AutoTokenizer.from_pretrained(self.model)
         self.pairwise_tokenization = False
 
     def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_bias):
         for _ in range(self.API_MAX_RETRY):
             try:
+                response_format = {"response_format": {"type": "text"}} if "openai" in self.config.base_url else {}
                 response = self.client.chat.completions.create(
                     model=self.model,
                     messages=[{"role": "user", "content": prompt}],
-                    response_format={"type": "text"},
                     max_tokens=max_new_tokens if max_new_tokens > 0 else None,
                     logprobs=return_logits,
                     logit_bias=logit_bias,
                     n=num_samples,
                     **self.sampling_params,
+                    **response_format,
                 )
+                self.API_RETRY_SLEEP = 3
                 return response
             except Exception as e:
                 logger.warning(f"{type(e), e}")
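With base_url and api_key now carried on the config, the same client can target any OpenAI-compatible endpoint (here the Hugging Face inference proxy from the YAML above), and tokenization falls back to AutoTokenizer whenever tiktoken has no encoding for the model name. A rough sketch, assuming env_config can be passed as None and using a placeholder api_key:

from lighteval.models.endpoints.openai_model import OpenAIClient, OpenAIModelConfig

# Point the client at an OpenAI-compatible proxy instead of api.openai.com.
config = OpenAIModelConfig(
    model="deepseek-ai/DeepSeek-R1",
    base_url="https://huggingface.co/api/inference-proxy/together",
    api_key="hf_...",  # placeholder token
)
model = OpenAIClient(config, env_config=None)

# tiktoken raises KeyError for "deepseek-ai/DeepSeek-R1", so __init__ falls back to
# AutoTokenizer.from_pretrained(config.model) for tokenization.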

src/lighteval/tasks/extended/__init__.py

+2-1
@@ -24,14 +24,15 @@
 
 
 if can_load_extended_tasks():
+    import lighteval.tasks.extended.hle.main as hle
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.lcb.main as lcb
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, lcb]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
 
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
