-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
34 lines (27 loc) · 1021 Bytes
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from tqdm import tqdm
from utils import translate
import jsonlines
# Translate one chunk of the databricks-dolly-15k dataset and write it to
# "dolly_chinese_<idx>.jsonl".
#
# The dataset is processed in chunks of 100 records; `idx` selects the chunk.
# For every selected record the 'instruction', 'context' and 'response' fields
# are passed through `translate` (presumably into Chinese, judging by the
# output file name -- confirm against utils.translate), while 'category' is
# copied through unchanged.

# Index of the chunk to process; edit this value to translate another chunk.
idx = 0

# Start offsets of the chunks: 0, 100, ..., 14900.
split = list(range(0, 15000, 100))

if not 0 <= idx < len(split):
    raise IndexError(
        "idx must be in [0, {}), got {}".format(len(split), idx)
    )

start = split[idx]
# The last chunk is open-ended so that trailing records are not dropped.
# The previous hard-coded upper bound of 14999 was exclusive, which meant the
# record at index 14999 (and any record after it) was never selected.
stop = split[idx + 1] if idx + 1 < len(split) else None

# `data` is a flat list holding three consecutive entries per record
# (instruction, context, response); `category` holds one entry per record.
data = []
category = []
# The dataset is only read, so open it read-only ("r" rather than "r+").
with open("databricks-dolly-15k.jsonl", "r", encoding="utf-8") as f:
    for cnt, item in enumerate(jsonlines.Reader(f)):
        if stop is not None and cnt >= stop:
            # Past the end of the chunk: no need to read the rest of the file.
            break
        if cnt < start:
            continue
        data.extend((item['instruction'], item['context'], item['response']))
        category.append(item['category'])

# TODO: translation runs sequentially, one text at a time; parallelize it.
ret = [translate(d) for d in tqdm(data)]

# Undo the flat layout: every third entry belongs to the same field.
instruction_tr = ret[0::3]
context_tr = ret[1::3]
response_tr = ret[2::3]

with open("dolly_chinese_{}.jsonl".format(idx), "w", encoding="utf-8") as f:
    writer = jsonlines.Writer(f)
    for instruction, context, response, cat in zip(
        instruction_tr, context_tr, response_tr, category
    ):
        writer.write({
            'instruction': instruction,
            'context': context,
            'response': response,
            'category': cat,
        })