Skip to content

Commit

Permalink
fit newest opencompass
Browse files Browse the repository at this point in the history
  • Loading branch information
xingyuanbu committed May 29, 2024
1 parent 228aa0d commit 6976666
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 26 deletions.
19 changes: 7 additions & 12 deletions configs/eval_subjective_mtbench101.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
from .datasets.subjective.multiround.mtbench101_judge import subjective_datasets


from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAIAllesAPIN
# from opencompass.models.idealab_api import IdeaLabAllesAPIN
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
Expand Down Expand Up @@ -74,7 +72,6 @@
partitioner=dict(type=SizePartitioner, max_task_size=10000),
runner=dict(
type=SlurmSequentialRunner,
# type=LocalRunner,
partition='llm_dev2',
quotatype='auto',
max_num_workers=32,
Expand All @@ -85,28 +82,26 @@
# -------------Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_model = dict(
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAIAllesAPIN,
# type=IdeaLabAllesAPIN,
type=OpenAI,
path='',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
url='https://api.openai.com/v1',
meta_template=api_meta_template,
query_per_second=16,
max_out_len=4096,
max_seq_len=4096,
batch_size=8,
temperature=0,
)
temperature=0.8,
)]

## ------------- Evaluation Configuration



eval = dict(
partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100000, mode='singlescore', models=models, judge_models=judge_models),
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=MTBench101Summarizer, judge_type='single')
Expand Down
8 changes: 0 additions & 8 deletions opencompass/datasets/subjective/mtbench101.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,22 +272,14 @@ def load(self, path: str, name: str):
# filename = osp.join(path, 'mtbench101.jsonl')
dataset = DatasetDict()
raw_data = []
print('load...........')


lines = open(filename, 'r', encoding='utf-8').readlines()
conversations = []
for line in lines:
line = json.loads(line)
conversations.append(line)


step = 0
print('=========dataset========================')

for dialogue in conversations:


multi_id = dialogue['id']
task = dialogue['task']
if task in skip_first_tasks:
Expand Down
8 changes: 3 additions & 5 deletions opencompass/summarizers/subjective/mtbench101.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ def post_process_mtbench101(judgement: str):

else:
return None
print('=========judgement=========')
print(judgement)


return {'score': score,'judgement':judgement}


Expand Down Expand Up @@ -79,7 +77,7 @@ def get_final_results(judged_answers, references,output_dir,fout_flag,model):
fout = osp.join(
output_dir,
'task_score.csv')

columns = list(final_task_scores.keys())

print('================task_score=====================')
Expand Down Expand Up @@ -113,7 +111,7 @@ def __init__(self, config: ConfigDict, judge_type='single') -> None:
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]

self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])

self.judge_function =post_process_mtbench101

Expand Down
2 changes: 1 addition & 1 deletion readme_mtbench101.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
## 💥What's New

- **[2024.02.22]** Our paper is now accessible at https://arxiv.org/abs/2402.14762.
- **[2024.05.15]** MT-Bench-101 is accepted to the ACL 2024 main conference.
- **[2024.05.15]** MT-Bench-101 has been accepted to the ACL 2024 main conference.
- **[2024.05.28]** Code and dataset are now available.

## About MT-Bench-101
Expand Down

0 comments on commit 6976666

Please sign in to comment.