# Standard
from pathlib import Path
from typing import Optional
+from typing_extensions import Annotated
import json

# Third Party
+from rich import print
import typer

app = typer.Typer()
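+# Editor's note (sketch, not part of the original change): Typer derives CLI
+# command names from the function names below, converting underscores to
+# hyphens by default, so `best_checkpoint` is invoked as `best-checkpoint`.
+# List-typed options declared with Annotated + typer.Option(), like `--tasks`
+# below, are repeatable on the command line and collect into a list[str]:
+#   mycli best-checkpoint ./ckpts --tasks leaderboard_bbh --tasks leaderboard_musr
+# (`mycli` is a placeholder for however this module is actually invoked.)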


@app.command()
-def main(
+def best_checkpoint(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
+    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
):
    """
    Process files in the input directory and optionally save results to an output file.
@@ -54,6 +57,8 @@ def main(
        evaluator = LeaderboardV2Evaluator(
            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
        )
+        if tasks:
+            evaluator.tasks = tasks
        result = evaluator.run()
        checkpoint_results[checkpoint.name] = result
        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +68,37 @@ def main(
        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
    )
    typer.echo("Sorted checkpoints by score:")
-    for checkpoint_name, result in sorted_checkpoints:
+    for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
        typer.echo(f"{'=' * 100}")
-        typer.echo(json.dumps(result, indent=2))
+        # Add [BEST CHECKPOINT] label for the first checkpoint.
+        # Use rich's print (imported above) rather than typer.echo so the
+        # [bold]/[bold green] markup renders instead of printing literally.
+        if i == 0:
+            print(
+                f"[bold]Leaderboard results[/bold]: {checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
+            )
+        else:
+            print(f"[bold]Leaderboard results[/bold]: {checkpoint_name}")
+        typer.echo(f"Overall: {result['overall_score'] * 100:.2f}%")
+        if "leaderboard_bbh" in result:
+            typer.echo(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+        if "leaderboard_gpqa" in result:
+            typer.echo(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+        if "leaderboard_ifeval" in result:
+            typer.echo(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+        if "leaderboard_math_hard" in result:
+            typer.echo(
+                f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_mmlu_pro" in result:
+            typer.echo(
+                f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_musr" in result:
+            typer.echo(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")

    typer.echo(f"{'=' * 100}")
-    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
+    print(
+        f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
+    )

    if output_file:
        typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +110,152 @@ def main(
    typer.echo("Processing complete!")


+@app.command()
+def evaluate(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
+):
+    """
+    Run the LeaderboardV2 evaluation on a single model directory and write the
+    results to leaderboard_results.json inside that directory.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
+    from instructlab.eval.leaderboard import LeaderboardV2Evaluator
+
+    typer.echo("done")
+
+    evaluator = LeaderboardV2Evaluator(
+        model_path=str(input_dir), num_gpus=8, eval_config={"batch_size": "auto"}
+    )
+    if tasks:
+        evaluator.tasks = tasks
+    result = evaluator.run()
+
+    # now just print out the checkpoint results
+    print(f"[bold]Leaderboard results[/bold]: {input_dir}")
+    print(f"Overall: {result['overall_score'] * 100:.2f}%")
+    if "leaderboard_bbh" in result:
+        print(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+    if "leaderboard_gpqa" in result:
+        print(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+    if "leaderboard_ifeval" in result:
+        print(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+    if "leaderboard_math_hard" in result:
+        print(f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
+    if "leaderboard_mmlu_pro" in result:
+        print(f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
+    if "leaderboard_musr" in result:
+        print(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+
+    output_file = input_dir / "leaderboard_results.json"
+    output_file.write_text(json.dumps(result, indent=2))
+
+
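+# Editor's note (sketch): example invocation of the `evaluate` command above,
+# assuming this module is saved as e.g. leaderboard_cli.py (hypothetical name):
+#   python leaderboard_cli.py evaluate /path/to/model \
+#       --tasks leaderboard_bbh --tasks leaderboard_musr
+# eval_config={"batch_size": "auto"} is passed through to the evaluator, and
+# num_gpus=8 is hard-coded; both may need adjusting for your hardware.
+
+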
+@app.command()
+def find_best(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    show_all: bool = typer.Option(
+        False, "--show-all", help="Show scores for all checkpoints"
+    ),
+):
+    """
+    Find the best checkpoint by looking through leaderboard_results.json files.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    # Find all leaderboard_results.json files
+    result_files = list(input_dir.glob("**/leaderboard_results.json"))
+
+    if not result_files:
+        typer.echo("No leaderboard results found in any subdirectories")
+        raise typer.Exit(1)
+
+    # Load and compare results
+    best_score = -1
+    best_checkpoint = None
+    best_results = None
+    all_results = []
+
+    for result_file in result_files:
+        try:
+            results = json.loads(result_file.read_text())
+            score = results.get("overall_score", -1)
+            all_results.append((result_file.parent, score, results))
+
+            if score > best_score:
+                best_score = score
+                best_checkpoint = result_file.parent
+                best_results = results
+        except Exception as e:
+            typer.echo(f"Error reading {result_file}: {e}")
+            continue
+
+    if best_checkpoint is None:
+        typer.echo("No valid results found")
+        raise typer.Exit(1)
+
+    # Sort all results by score
+    all_results.sort(key=lambda x: x[1], reverse=True)
+
+    # Print all results if requested
+    if show_all:
+        print("\n[bold]All checkpoint results:[/bold]")
+        for checkpoint, score, results in all_results:
+            is_best = checkpoint == best_checkpoint
+            prefix = "→ " if is_best else "  "
+            print(f"\n{prefix}Checkpoint: {checkpoint}")
+            print(f"  Overall score: {score * 100:.2f}%")
+            if "leaderboard_bbh" in results:
+                print(f"  BBH: {results['leaderboard_bbh']['score'] * 100:.2f}%")
+            if "leaderboard_gpqa" in results:
+                print(f"  GPQA: {results['leaderboard_gpqa']['score'] * 100:.2f}%")
+            if "leaderboard_ifeval" in results:
+                print(f"  IFEval: {results['leaderboard_ifeval']['score'] * 100:.2f}%")
+            if "leaderboard_math_hard" in results:
+                print(
+                    f"  MATH-Hard: {results['leaderboard_math_hard']['score'] * 100:.2f}%"
+                )
+            if "leaderboard_mmlu_pro" in results:
+                print(
+                    f"  MMLU-Pro: {results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+                )
+            if "leaderboard_musr" in results:
+                print(f"  MUSR: {results['leaderboard_musr']['score'] * 100:.2f}%")
+    else:
+        # Print only best results
+        print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
+        print(f"Overall score: {best_score * 100:.2f}%")
+        if "leaderboard_bbh" in best_results:
+            print(f"BBH: {best_results['leaderboard_bbh']['score'] * 100:.2f}%")
+        if "leaderboard_gpqa" in best_results:
+            print(f"GPQA: {best_results['leaderboard_gpqa']['score'] * 100:.2f}%")
+        if "leaderboard_ifeval" in best_results:
+            print(f"IFEval: {best_results['leaderboard_ifeval']['score'] * 100:.2f}%")
+        if "leaderboard_math_hard" in best_results:
+            print(
+                f"MATH-Hard: {best_results['leaderboard_math_hard']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_mmlu_pro" in best_results:
+            print(
+                f"MMLU-Pro: {best_results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_musr" in best_results:
+            print(f"MUSR: {best_results['leaderboard_musr']['score'] * 100:.2f}%")
+
+
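+# Editor's note (sketch): shape of leaderboard_results.json as inferred from
+# the lookups above — scores are fractions in [0, 1], only "overall_score" is
+# required, and the per-task entries are optional. Illustrative values only:
+#   {
+#     "overall_score": 0.4123,
+#     "leaderboard_bbh": {"score": 0.55},
+#     "leaderboard_gpqa": {"score": 0.31}
+#   }
+# Example: rank previously evaluated checkpoints and show every score table:
+#   python leaderboard_cli.py find-best /path/to/checkpoints --show-all
+
+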
if __name__ == "__main__":
    app()