Skip to content

Commit

Permalink
deepseek-r1
Browse files Browse the repository at this point in the history
  • Loading branch information
femto committed Feb 17, 2025
1 parent 245d06b commit 306e1b3
Show file tree
Hide file tree
Showing 7 changed files with 292 additions and 393 deletions.
14 changes: 14 additions & 0 deletions examples/smart_minion/aime/aime_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"type": "ensemble",
"pre_processing": ["problem_reflect","example_reasoning"],
"workers": [
{
"name": "raw",
"count": 1,
"check": 1
}
],
"result_strategy": {
"name": "majority_voting"
}
}
246 changes: 246 additions & 0 deletions examples/smart_minion/aime/evalute_aime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
import asyncio
import json
import os
import re
import sys
import threading
import time
from typing import List, Dict, Tuple, Optional, Any
from contextlib import redirect_stdout
from io import StringIO

import aiofiles
import numpy as np
from tqdm.asyncio import tqdm

from minion.configs.config import config
from minion.main.brain import Brain
from minion.main.rpyc_python_env import RpycPythonEnv
from minion.utils.syncheck import run_with_timeout
from minion.utils.utils import extract_number_from_string
from minion.providers import create_llm_provider
from minion.providers.cost import CostManager
from minion.utils.process import run_code_in_separate_process

# Load JSONL file
def load_json(file_path):
with open(file_path, "r") as f:
data = json.load(f)
return data


# Load JSONL file
def load_jsonl(file_path):
data = []
with open(file_path, "r") as f:
for line in f:
data.append(json.loads(line.strip()))
return data


def extract_answer(answer_str):
# Regular expression to find the answer after '####'
match = re.search(r"####\s*(.*)", answer_str)
if match:
return match.group(1).strip() # Extract and remove any surrounding whitespace
else:
return answer_str # Return None if no match is found

async def evaluate_dataset(
data,
last_processed_id=None,
start_id=None,
to_processed_id=None,
route="cot",
run_filename=None,
continue_process=False,
concurrency_count=1,
):
correct = 0
count = 0
total_count = len(data)
matched_ids = []
mismatch = []
tasks = []

async def process_batch(tasks, correct):
results = await asyncio.gather(*tasks)
for result in results:
correct += result["result"]
if result["result"] == 1:
matched_ids.append(result["item_id"])
else:
mismatch.append(result)
last_processed_item = results[-1] # Get the last processed item
return correct, last_processed_item

async def read_json_file(filename):
async with aiofiles.open(filename, "r") as f:
contents = await f.read()
data = json.loads(contents)
return data

async def save_run_info(filename, last_processed_id):
run_info = {
"last_processed_id": last_processed_id,
"matched_ids": matched_ids,
"mismatched_ids": mismatch,
"correct": correct,
"count": count,
"correct_percentage": correct / count if count > 0 else 0,
"total_prompt_tokens": cost_manager.total_prompt_tokens,
"total_completion_tokens": cost_manager.total_completion_tokens,
"total_cost": cost_manager.total_cost,
}
async with aiofiles.open(filename, "w") as f:
await f.write(json.dumps(run_info, indent=4))

if continue_process and os.path.exists(run_filename):
async with aiofiles.open(run_filename, "r") as f:
run_info = json.loads(await f.read())
last_processed_id = run_info["last_processed_id"]
matched_ids = run_info["matched_ids"]
mismatch = run_info["mismatched_ids"]
correct = run_info["correct"]
count = run_info["count"]
cost_manager.total_prompt_tokens = run_info.get("total_prompt_tokens", 0)
cost_manager.total_completion_tokens = run_info.get("total_completion_tokens", 0)
cost_manager.total_cost = run_info.get("total_cost", 0)

with tqdm(total=total_count, desc="Evaluating") as pbar:
for i, item in enumerate(data):
item_id = i
item["idx"] = i
if last_processed_id is not None and item_id <= last_processed_id:
continue
if start_id and item_id < start_id:
continue
if to_processed_id and item_id > to_processed_id:
break

count += 1
tasks.append(solve_single_question(item, route=route))

if len(tasks) == concurrency_count:
correct, last_processed_item = await process_batch(tasks, correct)
last_processed_id = last_processed_item["item_id"]
tasks = [] # Reset tasks after processing
pbar.set_postfix({"Correct": correct, "count": count})
pbar.update(concurrency_count)

# Save running information after each batch
await save_run_info(filename=run_filename, last_processed_id=last_processed_id)

# Process remaining tasks
if tasks:
correct, last_processed_item = await process_batch(tasks, correct)
last_processed_id = last_processed_item["item_id"]
pbar.set_postfix({"Correct": correct})
pbar.update(len(tasks))

# Save running information after each batch
await save_run_info(filename=run_filename, last_processed_id=last_processed_id)

return correct, count, matched_ids, mismatch

PASS = "PASS"
FAIL = "FAIL"

def check_solution(solution, test):
print(f"solution: {solution}")

try:
# Get test cases from the dictionary
inputs = test.get('input', [])
outputs = test.get('output', [])

# Run each test case
for input_data, expected_output in zip(inputs, outputs):
try:
# Run the code in a separate process
result = run_code_in_separate_process(solution, input_data)

if result.stderr:
print(f"Test produced stderr: {result.stderr}")

# Compare outputs (strip both to handle trailing newlines)
if result.stdout.strip() != expected_output.strip():
return (FAIL, f"Test failed:\nInput: {input_data}\nExpected: {expected_output}\nGot: {result.stdout}\nStderr: {result.stderr if result.stderr else 'None'}")
except Exception as e:
return (FAIL, f"Test execution failed: {str(e)}")

return (PASS, "Solution passed all test cases.")

except TimeoutError:
return (FAIL, "Execution timeout. Please check if your solution contains infinite loops or time-consuming operations.")
except Exception as e:
# Record detailed error information
error_message = f"Error: {str(e)}.\n Solution: {solution}.\n Test: {test}"

# Write error information to error.log file
with open('error.log', 'a', encoding='utf-8') as log_file:
log_file.write(f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {error_message}\n")

return (FAIL, error_message)

async def solve_single_question(item, route="cot"):
item_id = item.get("id", -1) # Extract the ID or use a default value

#correct_answer = extract_answer(ground_truth_raw)
question = item["problem"]
answer = await solve_question(item)
if answer == item["answer"]:
return {"result": 1, "item_id": item_id, "question": question, "answer": answer, "idx": item_id}

else:
# Append the mismatched item to the JSONL file
return {
"result": 0,
"item_id": item_id,
"item": item,
"question": question,
"answer": answer,
"idx": item_id,
}


# Load ensemble logic from JSON files
def load_execution_config(file_path):
with open(file_path, "r") as file:
ensemble_logic = json.load(file)
return ensemble_logic

async def solve_question(item):
brain = Brain(stats_storer=None, python_env=RpycPythonEnv(ports=3007), llm=llm)
current_dir = os.path.dirname(os.path.abspath(__file__))
ensemble_logic_path = os.path.join(current_dir, "aime_config.json")
# 加载测试用例

answer, score, *_ = await brain.step(
query=item["problem"],
execution_config=load_execution_config(ensemble_logic_path),
)
return answer

#model = "gpt-4o"
#model = "claude"
model = "deepseek-r1"

llm = create_llm_provider(config.models.get(model))
cost_manager = CostManager()
llm.cost_manager = cost_manager
async def main():
from datasets import load_dataset
ds = load_dataset("HuggingFaceH4/aime_2024", split='train')
correct, count, matched_ids, mismatched_ids = await evaluate_dataset(
ds, run_filename=f"run_aime_{model}.json", continue_process=True, concurrency_count=1
)

print(f"Accuracy: {correct/count:.2%}")
print(f"Mismatched IDs: {mismatched_ids}")


# Run the async main function
if __name__ == "__main__":
asyncio.run(main())
# Example usage
25 changes: 13 additions & 12 deletions examples/smart_minion/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ async def smart_brain():
# 使用从 minion/__init__.py 导入的 config 对象
model = "gpt-4o"
model = "deepseek-r1"
model = "phi-4"
#model = "llama3.2"
llm_config = config.models.get(model)

Expand All @@ -34,18 +35,18 @@ async def smart_brain():
# obs, score, *_ = await brain.step(query="what's the solution for game of 24 for 1 3 4 6", route="python")
# print(obs)

obs, score, *_ = await brain.step(query="what's the solution for game of 24 for 2 3 5 12", route="python")
print(obs)

current_file_dir = os.path.dirname(os.path.abspath(__file__))
cache_plan = os.path.join(current_file_dir, "aime", "plan_gpt4o.1.json")
obs, score, *_ = await brain.step(
query="Every morning Aya goes for a $9$-kilometer-long walk and stops at a coffee shop afterwards. When she walks at a constant speed of $s$ kilometers per hour, the walk takes her 4 hours, including $t$ minutes spent in the coffee shop. When she walks $s+2$ kilometers per hour, the walk takes her 2 hours and 24 minutes, including $t$ minutes spent in the coffee shop. Suppose Aya walks at $s+\frac{1}{2}$ kilometers per hour. Find the number of minutes the walk takes her, including the $t$ minutes spent in the coffee shop.",
route="plan",
dataset="aime 2024",
cache_plan=cache_plan,
)
print(obs)
# obs, score, *_ = await brain.step(query="what's the solution for game of 24 for 2 3 5 12", route="python")
# print(obs)
#
# current_file_dir = os.path.dirname(os.path.abspath(__file__))
# cache_plan = os.path.join(current_file_dir, "aime", "plan_gpt4o.1.json")
# obs, score, *_ = await brain.step(
# query="Every morning Aya goes for a $9$-kilometer-long walk and stops at a coffee shop afterwards. When she walks at a constant speed of $s$ kilometers per hour, the walk takes her 4 hours, including $t$ minutes spent in the coffee shop. When she walks $s+2$ kilometers per hour, the walk takes her 2 hours and 24 minutes, including $t$ minutes spent in the coffee shop. Suppose Aya walks at $s+\frac{1}{2}$ kilometers per hour. Find the number of minutes the walk takes her, including the $t$ minutes spent in the coffee shop.",
# route="plan",
# dataset="aime 2024",
# cache_plan=cache_plan,
# )
# print(obs)

# 从 HumanEval/88 提取的测试用例
test_data = {
Expand Down
Loading

0 comments on commit 306e1b3

Please sign in to comment.