Skip to content

Adding a range of multilingual evals #832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
180 changes: 180 additions & 0 deletions examples/tasks/instruct_multilingual.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
extended|belebele_native_instruct_deu_Latn|0|0
extended|belebele_native_instruct_fra_Latn|0|0
extended|belebele_native_instruct_ita_Latn|0|0
extended|belebele_native_instruct_por_Latn|0|0
extended|belebele_native_instruct_spa_Latn|0|0
extended|belebele_en_instruct_acm_Arab|0|0
extended|belebele_en_instruct_arz_Arab|0|0
extended|belebele_en_instruct_ceb_Latn|0|0
extended|belebele_en_instruct_fin_Latn|0|0
extended|belebele_en_instruct_hin_Deva|0|0
extended|belebele_en_instruct_ita_Latn|0|0
extended|belebele_en_instruct_khm_Khmr|0|0
extended|belebele_en_instruct_lvs_Latn|0|0
extended|belebele_en_instruct_npi_Deva|0|0
extended|belebele_en_instruct_pol_Latn|0|0
extended|belebele_en_instruct_slv_Latn|0|0
extended|belebele_en_instruct_swe_Latn|0|0
extended|belebele_en_instruct_afr_Latn|0|0
extended|belebele_en_instruct_asm_Beng|0|0
extended|belebele_en_instruct_ces_Latn|0|0
extended|belebele_en_instruct_fra_Latn|0|0
extended|belebele_en_instruct_hin_Latn|0|0
extended|belebele_en_instruct_jav_Latn|0|0
extended|belebele_en_instruct_mal_Mlym|0|0
extended|belebele_en_instruct_npi_Latn|0|0
extended|belebele_en_instruct_por_Latn|0|0
extended|belebele_en_instruct_swh_Latn|0|0
extended|belebele_en_instruct_tur_Latn|0|0
extended|belebele_en_instruct_yor_Latn|0|0
extended|belebele_en_instruct_als_Latn|0|0
extended|belebele_en_instruct_azj_Latn|0|0
extended|belebele_en_instruct_ckb_Arab|0|0
extended|belebele_en_instruct_hrv_Latn|0|0
extended|belebele_en_instruct_jpn_Jpan|0|0
extended|belebele_en_instruct_kir_Cyrl|0|0
extended|belebele_en_instruct_mar_Deva|0|0
extended|belebele_en_instruct_snd_Arab|0|0
extended|belebele_en_instruct_tam_Taml|0|0
extended|belebele_en_instruct_ukr_Cyrl|0|0
extended|belebele_en_instruct_zho_Hans|0|0
extended|belebele_en_instruct_amh_Ethi|0|0
extended|belebele_en_instruct_dan_Latn|0|0
extended|belebele_en_instruct_hun_Latn|0|0
extended|belebele_en_instruct_kor_Hang|0|0
extended|belebele_en_instruct_mkd_Cyrl|0|0
extended|belebele_en_instruct_ron_Latn|0|0
extended|belebele_en_instruct_som_Latn|0|0
extended|belebele_en_instruct_tel_Telu|0|0
extended|belebele_en_instruct_urd_Arab|0|0
extended|belebele_en_instruct_zho_Hant|0|0
extended|belebele_en_instruct_apc_Arab|0|0
extended|belebele_en_instruct_ben_Beng|0|0
extended|belebele_en_instruct_deu_Latn|0|0
extended|belebele_en_instruct_hye_Armn|0|0
extended|belebele_en_instruct_kan_Knda|0|0
extended|belebele_en_instruct_lao_Laoo|0|0
extended|belebele_en_instruct_mlt_Latn|0|0
extended|belebele_en_instruct_ory_Orya|0|0
extended|belebele_en_instruct_rus_Cyrl|0|0
extended|belebele_en_instruct_tgk_Cyrl|0|0
extended|belebele_en_instruct_urd_Latn|0|0
extended|belebele_en_instruct_zsm_Latn|0|0
extended|belebele_en_instruct_arb_Arab|0|0
extended|belebele_en_instruct_ben_Latn|0|0
extended|belebele_en_instruct_ell_Grek|0|0
extended|belebele_en_instruct_guj_Gujr|0|0
extended|belebele_en_instruct_kat_Geor|0|0
extended|belebele_en_instruct_pan_Guru|0|0
extended|belebele_en_instruct_spa_Latn|0|0
extended|belebele_en_instruct_tgl_Latn|0|0
extended|belebele_en_instruct_uzn_Latn|0|0
extended|belebele_en_instruct_arb_Latn|0|0
extended|belebele_en_instruct_eng_Latn|0|0
extended|belebele_en_instruct_kaz_Cyrl|0|0
extended|belebele_en_instruct_lit_Latn|0|0
extended|belebele_en_instruct_mya_Mymr|0|0
extended|belebele_en_instruct_pbt_Arab|0|0
extended|belebele_en_instruct_sin_Latn|0|0
extended|belebele_en_instruct_srp_Cyrl|0|0
extended|belebele_en_instruct_tha_Thai|0|0
extended|belebele_en_instruct_vie_Latn|0|0
extended|belebele_en_instruct_ars_Arab|0|0
extended|belebele_en_instruct_bul_Cyrl|0|0
extended|belebele_en_instruct_est_Latn|0|0
extended|belebele_en_instruct_ind_Latn|0|0
extended|belebele_en_instruct_nld_Latn|0|0
extended|belebele_en_instruct_pes_Arab|0|0
extended|belebele_en_instruct_sin_Sinh|0|0
extended|belebele_en_instruct_war_Latn|0|0
extended|belebele_en_instruct_ary_Arab|0|0
extended|belebele_en_instruct_cat_Latn|0|0
extended|belebele_en_instruct_eus_Latn|0|0
extended|belebele_en_instruct_heb_Hebr|0|0
extended|belebele_en_instruct_isl_Latn|0|0
extended|belebele_en_instruct_nob_Latn|0|0
extended|belebele_en_instruct_plt_Latn|0|0
extended|belebele_en_instruct_slk_Latn|0|0
extended|global_mmlu_instruct_amh|0|0
extended|global_mmlu_instruct_ara|0|0
extended|global_mmlu_instruct_ben|0|0
extended|global_mmlu_instruct_ces|0|0
extended|global_mmlu_instruct_deu|0|0
extended|global_mmlu_instruct_ell|0|0
extended|global_mmlu_instruct_eng|0|0
extended|global_mmlu_instruct_spa|0|0
extended|global_mmlu_instruct_fas|0|0
extended|global_mmlu_instruct_fra|0|0
extended|global_mmlu_instruct_hau|0|0
extended|global_mmlu_instruct_heb|0|0
extended|global_mmlu_instruct_hin|0|0
extended|global_mmlu_instruct_ind|0|0
extended|global_mmlu_instruct_ibo|0|0
extended|global_mmlu_instruct_ita|0|0
extended|global_mmlu_instruct_jpn|0|0
extended|global_mmlu_instruct_kor|0|0
extended|global_mmlu_instruct_kir|0|0
extended|global_mmlu_instruct_lit|0|0
extended|global_mmlu_instruct_mlg|0|0
extended|global_mmlu_instruct_msa|0|0
extended|global_mmlu_instruct_nep|0|0
extended|global_mmlu_instruct_nld|0|0
extended|global_mmlu_instruct_nor|0|0
extended|global_mmlu_instruct_pol|0|0
extended|global_mmlu_instruct_por|0|0
extended|global_mmlu_instruct_ron|0|0
extended|global_mmlu_instruct_rus|0|0
extended|global_mmlu_instruct_sin|0|0
extended|global_mmlu_instruct_sna|0|0
extended|global_mmlu_instruct_som|0|0
extended|global_mmlu_instruct_srp|0|0
extended|global_mmlu_instruct_swe|0|0
extended|global_mmlu_instruct_swa|0|0
extended|global_mmlu_instruct_tel|0|0
extended|global_mmlu_instruct_tur|0|0
extended|global_mmlu_instruct_ukr|0|0
extended|global_mmlu_instruct_vie|0|0
extended|global_mmlu_instruct_yor|0|0
extended|global_mmlu_instruct_zho|0|0
extended|global_mmlu_lite_instruct_amh|0|0
extended|global_mmlu_lite_instruct_ara|0|0
extended|global_mmlu_lite_instruct_ben|0|0
extended|global_mmlu_lite_instruct_ces|0|0
extended|global_mmlu_lite_instruct_deu|0|0
extended|global_mmlu_lite_instruct_ell|0|0
extended|global_mmlu_lite_instruct_eng|0|0
extended|global_mmlu_lite_instruct_spa|0|0
extended|global_mmlu_lite_instruct_fas|0|0
extended|global_mmlu_lite_instruct_fra|0|0
extended|global_mmlu_lite_instruct_hau|0|0
extended|global_mmlu_lite_instruct_heb|0|0
extended|global_mmlu_lite_instruct_hin|0|0
extended|global_mmlu_lite_instruct_ind|0|0
extended|global_mmlu_lite_instruct_ibo|0|0
extended|global_mmlu_lite_instruct_ita|0|0
extended|global_mmlu_lite_instruct_jpn|0|0
extended|global_mmlu_lite_instruct_kor|0|0
extended|global_mmlu_lite_instruct_kir|0|0
extended|global_mmlu_lite_instruct_lit|0|0
extended|global_mmlu_lite_instruct_mlg|0|0
extended|global_mmlu_lite_instruct_msa|0|0
extended|global_mmlu_lite_instruct_nep|0|0
extended|global_mmlu_lite_instruct_nld|0|0
extended|global_mmlu_lite_instruct_nor|0|0
extended|global_mmlu_lite_instruct_pol|0|0
extended|global_mmlu_lite_instruct_por|0|0
extended|global_mmlu_lite_instruct_ron|0|0
extended|global_mmlu_lite_instruct_rus|0|0
extended|global_mmlu_lite_instruct_sin|0|0
extended|global_mmlu_lite_instruct_sna|0|0
extended|global_mmlu_lite_instruct_som|0|0
extended|global_mmlu_lite_instruct_srp|0|0
extended|global_mmlu_lite_instruct_swe|0|0
extended|global_mmlu_lite_instruct_swa|0|0
extended|global_mmlu_lite_instruct_tel|0|0
extended|global_mmlu_lite_instruct_tur|0|0
extended|global_mmlu_lite_instruct_ukr|0|0
extended|global_mmlu_lite_instruct_vie|0|0
extended|global_mmlu_lite_instruct_yor|0|0
extended|global_mmlu_lite_instruct_zho|0|0
extended|mmlu_pro|0|0
6 changes: 6 additions & 0 deletions examples/tasks/instruct_multilingual_test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
extended|global_mmlu_lite_instruct_deu|0|0
extended|global_mmlu_lite_instruct_eng|0|0
extended|global_mmlu_lite_instruct_spa|0|0
extended|global_mmlu_lite_instruct_fra|0|0
extended|global_mmlu_lite_instruct_ita|0|0
extended|global_mmlu_lite_instruct_por|0|0
5 changes: 3 additions & 2 deletions src/lighteval/metrics/utils/extractive_match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,12 @@ class IndicesExtractionConfig:

Attributes:
prefix_for_extraction (ChoicePrefix): The style to use for extracting choice indices (e.g. A,B,C or 1,2,3)
try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is"
try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is".
It is recommended to set this to False for indices extraction, because some indices (for example `A`, which is also a word) can lead to many false positives.
"""

prefix_for_extraction: ChoicePrefix
try_extract_without_anchor: bool = True
try_extract_without_anchor: bool = False


ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig
Expand Down
2 changes: 2 additions & 0 deletions src/lighteval/models/vllm/vllm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class VLLMModelConfig(ModelConfig):
max_num_batched_tokens: PositiveInt = 2048 # maximum number of tokens per batch
subfolder: str | None = None
is_async: bool = False # Whether to use the async version or sync version of the model
enforce_eager: bool = False


class VLLMModel(LightevalModel):
Expand Down Expand Up @@ -187,6 +188,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
"seed": int(config.seed),
"max_num_seqs": int(config.max_num_seqs),
"max_num_batched_tokens": int(config.max_num_batched_tokens),
"enforce_eager": bool(config.enforce_eager),
}

if config.quantization is not None:
Expand Down
15 changes: 12 additions & 3 deletions src/lighteval/tasks/default_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,15 +899,24 @@ def gpqa_instruct(line, task_name: str = None):
gold_index = random.randint(0, 3)
choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
choices.insert(gold_index, line["Correct Answer"])
query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
query = query_template.format(
# Strip to avoid the accidental extra whitespace present in GPQA
A=choices[0].strip(),
B=choices[1].strip(),
C=choices[2].strip(),
D=choices[3].strip(),
Question=line["Question"].strip(),
Instruction=instruction,
)

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[: len(choices)],
gold_index=gold_index,
instruction=query,
instruction=instruction,
)


Expand Down
11 changes: 7 additions & 4 deletions src/lighteval/tasks/extended/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import lighteval.tasks.extended.instruct.belebele as belebele
import lighteval.tasks.extended.instruct.global_mmlu as global_mmlu
import lighteval.tasks.extended.instruct.mgsm as mgsm
import lighteval.tasks.extended.instruct.mmlu_pro as mmlu_pro
from lighteval.utils.imports import can_load_extended_tasks


AVAILABLE_EXTENDED_TASKS_MODULES = [belebele, mmlu_pro, mgsm, global_mmlu]

if can_load_extended_tasks():
import lighteval.tasks.extended.hle.main as hle
import lighteval.tasks.extended.ifeval.main as ifeval
Expand All @@ -32,7 +38,4 @@
import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]

else:
AVAILABLE_EXTENDED_TASKS_MODULES = []
AVAILABLE_EXTENDED_TASKS_MODULES.extend([ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb])
Loading
Loading