Commit d96e395

push-fix
Signed-off-by: Oleg Silkin <[email protected]>
1 parent bc3e43d commit d96e395

File tree

2 files changed: +183 −6 lines changed


scripts/evaluate_best_checkpoint.py

Lines changed: 181 additions & 4 deletions
@@ -10,18 +10,21 @@
 # Standard
 from pathlib import Path
 from typing import Optional
+from typing_extensions import Annotated
 import json
 
 # Third Party
+from rich import print
 import typer
 
 app = typer.Typer()
 
 
 @app.command()
-def main(
+def best_checkpoint(
     input_dir: Path = typer.Argument(..., help="Input directory to process"),
     output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
+    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
 ):
     """
     Process files in the input directory and optionally save results to an output file.
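
For context on the hunk above: declaring `tasks` as `Annotated[Optional[list[str]], typer.Option()]` gives the command a repeatable `--tasks` flag, and renaming `main` to `best_checkpoint` makes it one named subcommand among those added later in this diff. A minimal standalone sketch of that option pattern (illustrative only, not part of the commit):

```python
# Standalone sketch of the repeatable-option pattern used above.
from typing import Optional

import typer
from typing_extensions import Annotated

app = typer.Typer()


@app.command()
def best_checkpoint(
    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
):
    # `--tasks leaderboard_bbh --tasks leaderboard_musr` accumulates the values
    # into a list; omitting the flag leaves `tasks` falsy, so a plain
    # `if tasks:` guard (as in the next hunk) keeps the evaluator's defaults.
    typer.echo(f"tasks={tasks}")


if __name__ == "__main__":
    app()
```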
@@ -54,6 +57,8 @@ def main(
         evaluator = LeaderboardV2Evaluator(
             model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
         )
+        if tasks:
+            evaluator.tasks = tasks
         result = evaluator.run()
         checkpoint_results[checkpoint.name] = result
         typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
@@ -63,12 +68,37 @@ def main(
         checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
     )
     typer.echo("Sorted checkpoints by score:")
-    for checkpoint_name, result in sorted_checkpoints:
+    for i, (checkpoint_name, result) in enumerate(sorted_checkpoints):
         typer.echo(f"{'=' * 100}")
-        typer.echo(json.dumps(result, indent=2))
+        # Add [BEST CHECKPOINT] label for the first checkpoint
+        if i == 0:
+            typer.echo(
+                f"[bold]Leaderboard results[/bold]: {checkpoint_name} [bold green][BEST CHECKPOINT][/bold green]"
+            )
+        else:
+            typer.echo(f"[bold]Leaderboard results[/bold]: {checkpoint_name}")
+        typer.echo(f"Overall: {result['overall_score'] * 100:.2f}%")
+        if "leaderboard_bbh" in result:
+            typer.echo(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+        if "leaderboard_gpqa" in result:
+            typer.echo(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+        if "leaderboard_ifeval" in result:
+            typer.echo(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+        if "leaderboard_math_hard" in result:
+            typer.echo(
+                f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_mmlu_pro" in result:
+            typer.echo(
+                f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_musr" in result:
+            typer.echo(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
 
     typer.echo(f"{'=' * 100}")
-    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")
+    typer.echo(
+        f"Best checkpoint: {sorted_checkpoints[0][0]} [bold green][BEST CHECKPOINT][/bold green]"
+    )
 
     if output_file:
         typer.echo(f"Output will be saved to: {output_file}")
@@ -80,5 +110,152 @@ def main(
     typer.echo("Processing complete!")
 
 
+@app.command()
+def evaluate(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    tasks: Annotated[Optional[list[str]], typer.Option()] = None,
+):
+    """
+    Process files in the input directory and optionally save results to an output file.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
+    from instructlab.eval.leaderboard import LeaderboardV2Evaluator
+
+    typer.echo("done")
+
+    evaluator = LeaderboardV2Evaluator(
+        model_path=str(input_dir), num_gpus=8, eval_config={"batch_size": "auto"}
+    )
+    if tasks:
+        evaluator.tasks = tasks
+    result = evaluator.run()
+
+    # now just print out the checkpoint results
+    print(f"[bold]Leaderboard results[/bold]: {input_dir}")
+    print(f"Overall: {result['overall_score'] * 100:.2f}%")
+    if "leaderboard_bbh" in result:
+        print(f"BBH: {result['leaderboard_bbh']['score'] * 100:.2f}%")
+    if "leaderboard_gpqa" in result:
+        print(f"GPQA: {result['leaderboard_gpqa']['score'] * 100:.2f}%")
+    if "leaderboard_ifeval" in result:
+        print(f"IFEval: {result['leaderboard_ifeval']['score'] * 100:.2f}%")
+    if "leaderboard_math_hard" in result:
+        print(f"MATH-Hard: {result['leaderboard_math_hard']['score'] * 100:.2f}%")
+    if "leaderboard_mmlu_pro" in result:
+        print(f"MMLU-Pro: {result['leaderboard_mmlu_pro']['score'] * 100:.2f}%")
+    if "leaderboard_musr" in result:
+        print(f"MUSR: {result['leaderboard_musr']['score'] * 100:.2f}%")
+
+    output_file = input_dir / "leaderboard_results.json"
+    output_file.write_text(json.dumps(result, indent=2))
+
+
+@app.command()
+def find_best(
+    input_dir: Path = typer.Argument(..., help="Input directory to process"),
+    show_all: bool = typer.Option(
+        False, "--show-all", help="Show scores for all checkpoints"
+    ),
+):
+    """
+    Find the best checkpoint by looking through leaderboard_results.json files.
+    """
+    if not input_dir.exists():
+        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
+        raise typer.Exit(1)
+
+    if not input_dir.is_dir():
+        typer.echo(f"Error: '{input_dir}' is not a directory")
+        raise typer.Exit(1)
+
+    # Find all leaderboard_results.json files
+    result_files = list(input_dir.glob("**/leaderboard_results.json"))
+
+    if not result_files:
+        typer.echo("No leaderboard results found in any subdirectories")
+        raise typer.Exit(1)
+
+    # Load and compare results
+    best_score = -1
+    best_checkpoint = None
+    best_results = None
+    all_results = []
+
+    for result_file in result_files:
+        try:
+            results = json.loads(result_file.read_text())
+            score = results.get("overall_score", -1)
+            all_results.append((result_file.parent, score, results))
+
+            if score > best_score:
+                best_score = score
+                best_checkpoint = result_file.parent
+                best_results = results
+        except Exception as e:
+            typer.echo(f"Error reading {result_file}: {e}")
+            continue
+
+    if best_checkpoint is None:
+        typer.echo("No valid results found")
+        raise typer.Exit(1)
+
+    # Sort all results by score
+    all_results.sort(key=lambda x: x[1], reverse=True)
+
+    # Print all results if requested
+    if show_all:
+        print("\n[bold]All checkpoint results:[/bold]")
+        for checkpoint, score, results in all_results:
+            is_best = checkpoint == best_checkpoint
+            prefix = "→ " if is_best else " "
+            print(f"\n{prefix}Checkpoint: {checkpoint}")
+            print(f" Overall score: {score * 100:.2f}%")
+            if "leaderboard_bbh" in results:
+                print(f" BBH: {results['leaderboard_bbh']['score'] * 100:.2f}%")
+            if "leaderboard_gpqa" in results:
+                print(f" GPQA: {results['leaderboard_gpqa']['score'] * 100:.2f}%")
+            if "leaderboard_ifeval" in results:
+                print(f" IFEval: {results['leaderboard_ifeval']['score'] * 100:.2f}%")
+            if "leaderboard_math_hard" in results:
+                print(
+                    f" MATH-Hard: {results['leaderboard_math_hard']['score'] * 100:.2f}%"
+                )
+            if "leaderboard_mmlu_pro" in results:
+                print(
+                    f" MMLU-Pro: {results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+                )
+            if "leaderboard_musr" in results:
+                print(f" MUSR: {results['leaderboard_musr']['score'] * 100:.2f}%")
+    else:
+        # Print only best results
+        print(f"\n[bold]Best checkpoint found[/bold]: {best_checkpoint}")
+        print(f"Overall score: {best_score * 100:.2f}%")
+        if "leaderboard_bbh" in best_results:
+            print(f"BBH: {best_results['leaderboard_bbh']['score'] * 100:.2f}%")
+        if "leaderboard_gpqa" in best_results:
+            print(f"GPQA: {best_results['leaderboard_gpqa']['score'] * 100:.2f}%")
+        if "leaderboard_ifeval" in best_results:
+            print(f"IFEval: {best_results['leaderboard_ifeval']['score'] * 100:.2f}%")
+        if "leaderboard_math_hard" in best_results:
+            print(
+                f"MATH-Hard: {best_results['leaderboard_math_hard']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_mmlu_pro" in best_results:
+            print(
+                f"MMLU-Pro: {best_results['leaderboard_mmlu_pro']['score'] * 100:.2f}%"
+            )
+        if "leaderboard_musr" in best_results:
+            print(f"MUSR: {best_results['leaderboard_musr']['score'] * 100:.2f}%")
+
+
 if __name__ == "__main__":
     app()
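
The new `evaluate` and `find_best` commands hand results to each other through `leaderboard_results.json` files. A minimal sketch of the result layout they appear to assume, with the task keys taken from the code above; the scores are made-up placeholders and `samples_12345` is a hypothetical checkpoint directory:

```python
# Sketch of the result dict these commands read and write (placeholder values).
import json
from pathlib import Path

example_result = {
    "overall_score": 0.4321,  # fraction of 1.0, printed as a percentage
    "leaderboard_bbh": {"score": 0.51},
    "leaderboard_gpqa": {"score": 0.29},
    "leaderboard_ifeval": {"score": 0.62},
    "leaderboard_math_hard": {"score": 0.18},
    "leaderboard_mmlu_pro": {"score": 0.37},
    "leaderboard_musr": {"score": 0.44},
}

# `evaluate` writes this file next to the evaluated model; `find_best` later
# globs for **/leaderboard_results.json and ranks parents by "overall_score".
ckpt_dir = Path("samples_12345")
ckpt_dir.mkdir(exist_ok=True)
(ckpt_dir / "leaderboard_results.json").write_text(json.dumps(example_result, indent=2))
```

Because `find_best` ranks whatever matching files it finds under the input directory, any checkpoint subdirectory that already contains a `leaderboard_results.json` takes part in the comparison without being re-evaluated.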

src/instructlab/eval/leaderboard.py

Lines changed: 2 additions & 2 deletions
@@ -251,8 +251,8 @@ def get_score_by_metric(score_dict: t.Dict[str, t.Any], metric: str) -> t.Any:
             extracted_value = value
             break
 
-    if not extracted_value:
-        if alias := score_dict.get("alias", None):
+    if extracted_value is None:
+        if alias := score_dict.get("alias", "[no-alias]"):
             error_msg = (
                 f"Failed to find a metric matching '{metric}' for task '{alias}'."
             )
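
The switch from a truthiness test to an explicit `is None` check matters because a valid score of `0.0` is falsy, so the old condition would misreport it as a missing metric; the new `"[no-alias]"` default for `alias` likewise keeps the error branch from being skipped when a task result carries no alias. A small standalone illustration of the first point (not taken from the library):

```python
# Why `is None` instead of truthiness: a legitimate 0.0 score is falsy.
extracted_value = 0.0  # hypothetical: the metric was found but the model scored zero

if not extracted_value:
    print("old check: 0.0 is misread as 'no metric found'")

if extracted_value is None:
    print("new check: would only fire if nothing was matched")
else:
    print("new check: the legitimate 0.0 score is kept")
```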
