-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: run-connotation.py
More file actions
103 lines (77 loc) · 3.24 KB
/
run-connotation.py
File metadata and controls
103 lines (77 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
from tqdm import tqdm
import re
import time
import random
import os
from llm import LLM
# Candidate model identifiers kept for quick switching between runs.
# model_name = "claude-3-7-sonnet@20250219"
# model_name = "gemini-2.0-flash-lite"
# model_name = "gemini-2.5-pro-exp-03-25"
# model_name = "gpt-4o-240513"
model_name = "gpt-4.1"
# model_name = "Qwen/Qwen2.5-72B-Instruct-Turbo"
# model_name = "deepseek-ai/DeepSeek-V3"
# model_name = "deepseek-ai/DeepSeek-R1"

# Derive a filesystem-safe results file name from the model identifier.
# str.split("/")[-1] returns the whole string when "/" is absent, so the
# previous if/else branch was redundant.
file_name = model_name.split("/")[-1]

# Shared LLM client used for every inference call below.
model = LLM(model_name)
def extract_answer(text):
    """Pull the model's verdict out of a raw response string.

    The response is expected to contain a tag of the form
    ``<positive>`` or ``<negative>`` (any letter case).

    Args:
        text (str): Raw model output to scan.

    Returns:
        str or None: 'positive' or 'negative' (lowercased) if a tag
        was found, otherwise None.
    """
    found = re.search(r"<(positive|negative)>", text.strip(), re.IGNORECASE)
    if found is None:
        return None
    return found.group(1).lower()
# Load the verified idiom-connotation dataset.
with open("dataset/dataset-connotation-verified.json", mode='r', encoding='utf-8') as f:
    data = json.load(f)

# Resume from an existing results file if a previous run was interrupted.
results_path = f"results-connotation/{file_name}.json"
if os.path.exists(results_path):
    with open(results_path, mode='r', encoding='utf-8') as f:
        results = json.load(f)
else:
    results = {}

# Iterate the dataset, skipping already-answered items, and persist
# progress after every successful call so a crash loses at most one item.
for line in tqdm(data):
    index = str(line["index"])
    if index in results:
        continue  # Skip already processed

    prompt = f"""Please determine the evaluative connotation of the following Chinese idiom. Classify the idiom as either positive (with a favorable meaning) or negative (with an unfavorable meaning). Do not choose neutral.
The idiom is as follows:
{line["idiom"]}
Please provide your final answer in the format:
<positive> or <negative>"""

    max_retries = 5
    retry_count = 0
    success = False
    while retry_count < max_retries and not success:
        try:
            response = model.call_llm(prompt, max_tokens=1024)
            results[index] = {
                "response_text": response,
                "answer": extract_answer(response),
                "label": line["sentiment"],
            }
            # Save after each successful inference so progress survives crashes.
            with open(results_path, mode='w', encoding='utf-8') as f:
                json.dump(results, f, indent=4, ensure_ascii=False)
            success = True
        except Exception as e:
            retry_count += 1
            print(f"Error at index {index} (attempt {retry_count}/{max_retries}): {e}")
            # Rate-limit / overload errors back off exponentially with jitter.
            # NOTE: '429' is all digits, so no .lower() is needed for it.
            if "overloaded" in str(e).lower() or "rate limit" in str(e).lower() or '429' in str(e):
                longer_delay = 10 * (2 ** retry_count) + random.uniform(0, 5)
                print(f"Rate limit hit. Waiting {longer_delay:.2f} seconds...")
                time.sleep(longer_delay)
            else:
                # Brief jittered pause before retrying other transient errors,
                # instead of hammering the API in a tight loop.
                time.sleep(1 + random.uniform(0, 2))

    # All retries exhausted: nothing is recorded for this index, so a
    # later run will pick it up again (it stays absent from `results`).
    if not success:
        print(f"Max retries reached for index {index}. Skipping and continuing.")