-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathprocess_data.py
125 lines (93 loc) · 2.85 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import tqdm
from utils.func import generate_and_tokenize_prompt
from datasets import load_dataset, Dataset
from utils.generate_prompt import generate_prompt_test, generate_few_shot_prompt
import ast
def process_data_train(file_path, tokenizer):
    """Build a tokenized training ``Dataset`` from a JSON question file.

    The file is read as UTF-8 JSON and must contain a top-level ``"data"``
    list. Every sample is expected to provide ``question``, ``answer``,
    ``explanation`` and ``choices``; ``choices`` is decoded with
    ``ast.literal_eval`` (i.e. it is stored as the text of a Python literal)
    and its items are joined with newlines before prompting.

    Args:
        file_path: Path to the JSON file.
        tokenizer: Forwarded unchanged to ``generate_and_tokenize_prompt``.

    Returns:
        A ``Dataset`` created with ``Dataset.from_list`` from the per-sample
        results of ``generate_and_tokenize_prompt``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    training_samples = []
    for sample in data["data"]:
        try:
            choices = ast.literal_eval(sample['choices'])
        except (KeyError, TypeError, ValueError, SyntaxError):
            # Only the failures a missing/unparseable ``choices`` field can
            # produce are handled here, so KeyboardInterrupt, SystemExit and
            # unrelated errors are no longer silently swallowed.
            # NOTE(review): processing stops at the first bad sample, which
            # also drops every sample after it. Confirm whether skipping the
            # single sample (``continue``) is what was intended.
            break

        explanation = sample['explanation'].strip()
        question = sample['question']
        answer = sample['answer']
        choices = '\n'.join(choices)

        training_sample = generate_and_tokenize_prompt(
            tokenizer, question, choices, explanation, answer
        )
        training_samples.append(training_sample)

    choices_data = Dataset.from_list(training_samples)
    return choices_data
def process_data_cot(file_path):
    """Build one test prompt per sample of a JSON question file.

    The file is read as UTF-8 JSON and must contain a top-level ``"data"``
    list. For each sample, the items of ``choices`` are joined with newlines
    and passed together with ``question`` to ``generate_prompt_test``.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A list with the value returned by ``generate_prompt_test`` for each
        processed sample, in file order.
    """
    # Load the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    test_samples = []
    for sample in data["data"]:
        try:
            choices = sample['choices']
        except (KeyError, TypeError):
            # Handle only a missing ``choices`` key (or a sample that is not
            # subscriptable by key) instead of swallowing every exception.
            # NOTE(review): processing stops at the first such sample, so the
            # result holds only the samples before it. Confirm this
            # truncation is intended.
            break

        question = sample['question']
        choices = '\n'.join(choices)
        test_sample = generate_prompt_test(
            question, choices
        )
        test_samples.append(test_sample)

    return test_samples
def process_data_few_shot_cot(file_path):
    """Build one few-shot prompt per sample of a JSON question file.

    The file is read as UTF-8 JSON and must contain a top-level ``"data"``
    list. For each sample, the items of ``choices`` are joined with newlines
    and passed together with ``question`` to ``generate_few_shot_prompt``.

    Args:
        file_path: Path to the JSON file.

    Returns:
        A list with the value returned by ``generate_few_shot_prompt`` for
        each processed sample, in file order.
    """
    # Load the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    test_samples = []
    for sample in data["data"]:
        try:
            choices = sample['choices']
        except (KeyError, TypeError):
            # Handle only a missing ``choices`` key (or a sample that is not
            # subscriptable by key) instead of swallowing every exception.
            # NOTE(review): processing stops at the first such sample, so the
            # result holds only the samples before it. Confirm this
            # truncation is intended.
            break

        question = sample['question']
        choices = '\n'.join(choices)
        test_sample = generate_few_shot_prompt(
            question, choices
        )
        test_samples.append(test_sample)

    return test_samples
def parse_json_test_to_lists(file_name):
    """Split a JSON question file into parallel, index-aligned lists.

    The file is read as UTF-8 JSON and must contain a top-level ``"data"``
    list whose records provide ``id``, ``question``, ``choices`` and
    ``answer``.

    Args:
        file_name: Path to the JSON file.

    Returns:
        A dict with the keys ``list_id``, ``list_question``, ``list_A``,
        ``list_B``, ``list_C``, ``list_D`` and ``list_answer``. Every list
        has one entry per record. ``list_A``..``list_D`` hold the first four
        items of ``choices``; a position the record does not have is filled
        with the string ``"None"``. ``list_answer`` holds element ``[0]`` of
        each record's ``answer``.

    Raises:
        KeyError: If a record lacks one of the required fields.
        IndexError: If a record's ``answer`` is empty.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_name, encoding='utf-8') as json_file:
        json_test = json.load(json_file)

    list_id = []
    list_question = []
    list_A = []
    list_B = []
    list_C = []
    list_D = []
    list_answer = []
    option_lists = (list_A, list_B, list_C, list_D)

    for record in json_test['data']:
        # Read every field before appending so a malformed record fails
        # before any list is extended.
        record_id = record['id']
        question = record['question']
        choices = record['choices']
        answer = record['answer'][0]

        # Pad absent options with "None" so all lists stay the same length,
        # whether the record has four choices or fewer.
        for position, option_list in enumerate(option_lists):
            if position < len(choices):
                option_list.append(choices[position])
            else:
                option_list.append("None")

        list_id.append(record_id)
        list_question.append(question)
        list_answer.append(answer)

    return {
        "list_id": list_id,
        "list_question": list_question,
        "list_A": list_A,
        "list_B": list_B,
        "list_C": list_C,
        "list_D": list_D,
        "list_answer": list_answer,
    }