# s2a.py
import os

import pandas as pd
from datasets import Dataset, load_dataset

# API tokens are expected to be filled in before running; left blank here.
os.environ['HF_TOKEN'] = ''
os.environ["REPLICATE_API_TOKEN"] = ""
# ------ Load Data ---------
def load_sttn(model, ent_type, sem=True, return_dataset=False):
    """Load a Sorting-Through-The-Noise split for `model`.

    ent_type is "Multi" or "Single"; sem selects the semantic-distractor
    variant (True) or the neutral-distractor variant (False).
    """
    if ent_type == "Multi" and sem:
        file = pd.read_csv(f'../Sorting-Through-The-Noise/data/combined_data/multiple_entity_distractor/{model}/complete_data_For_MultipleEntityObjectDistractorAccuracy{model}.csv', sep="\t")
    elif ent_type == "Multi" and not sem:
        file = pd.read_csv(f'../Sorting-Through-The-Noise/data/combined_data/multiple_entity_with_neutral_distractor/{model}/complete_data_For_MultipleEntityObjectDistractorAccuracy{model}.csv', sep="\t")
    elif ent_type == "Single" and sem:
        file = pd.read_csv(f'../Sorting-Through-The-Noise/data/combined_data/single_entity_distractor/{model}/complete_data_For_MultipleEntityObjectDistractorAccuracy{model}.csv', sep="\t")
    elif ent_type == "Single" and not sem:
        file = pd.read_csv(f'../Sorting-Through-The-Noise/data/combined_data/single_entity_with_neutral_distractor/{model}/complete_data_For_MultipleEntityObjectDistractorAccuracy{model}.csv', sep="\t")
    else:
        raise ValueError(f"ent_type must be 'Multi' or 'Single', got {ent_type!r}")
    file = file.rename(columns={'target_occupation': 'target'})
    if return_dataset:
        return Dataset.from_dict({"sentence": file['sentence'].to_list(), "target": file['target'].to_list()})
    else:
        return pd.DataFrame({"sentence": file['sentence'], "target": file['target']})
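
# Usage sketch (illustrative, not part of the pipeline): "gpt2" is an assumed
# model-directory name under the Sorting-Through-The-Noise data tree; swap in
# whichever model folders exist in your checkout.
def _demo_load_sttn():
    df = load_sttn("gpt2", "Multi", sem=True)
    print(df.head())
    print(f"{len(df)} sentence/target pairs")
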
def get_zero_attractors(data1, data2):
    """Return the sentences (and their targets) shared by both splits, i.e.
    the items that are identical regardless of distractor type."""
    if isinstance(data1, Dataset):
        data1 = pd.DataFrame({'sentence': data1['sentence'], 'target': data1['target']})
    if isinstance(data2, Dataset):
        data2 = pd.DataFrame({'sentence': data2['sentence'], 'target': data2['target']})
    concat = pd.concat([data1, data2])
    # Rows whose sentence appears in both frames, reduced to one copy each.
    duplicates = concat[concat.duplicated(subset='sentence', keep=False)]
    unique = duplicates.drop_duplicates(subset='sentence')
    # Subsample every fourth shared item.
    return unique[::4]['sentence'].to_list(), unique[::4]['target'].to_list()
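
# Usage sketch (illustrative): intersect the semantic- and neutral-distractor
# variants of the same split to recover the shared zero-attractor items.
# "gpt2" is again an assumed model-directory name.
def _demo_zero_attractors():
    sem = load_sttn("gpt2", "Multi", sem=True)
    neutral = load_sttn("gpt2", "Multi", sem=False)
    sentences, targets = get_zero_attractors(sem, neutral)
    print(len(sentences), "zero-attractor sentences")
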
def load_comps(in_between=False, before=False, base=False, return_dataset=False):
    """Load a COMPS split; exactly one of in_between/before/base should be set."""
    if in_between:
        file = pd.read_json('../comps/data/comps/comps_wugs_dist-in-between.jsonl', lines=True)
    elif before:
        file = pd.read_json('../comps/data/comps/comps_wugs_dist-before.jsonl', lines=True)
    elif base:
        file = pd.read_json('../comps/data/comps/comps_wugs.jsonl', lines=True)
    else:
        raise ValueError("One of in_between, before, or base must be True.")
    # Full sentences are the (un)acceptable prefix plus the property phrase.
    sents_acc = [sent + " " + prop for sent, prop in zip(file['prefix_acceptable'], file['property_phrase'])]
    sents_inacc = [sent + " " + prop for sent, prop in zip(file['prefix_unacceptable'], file['property_phrase'])]
    data = {'sentence_acc': sents_acc,
            'sentence_unacc': sents_inacc,
            'acceptable_concept': file['acceptable_concept'].to_list(),
            'unacceptable_concept': file['unacceptable_concept'].to_list(),
            'negative_sample_type': file['negative_sample_type'].to_list()}
    if return_dataset:
        return Dataset.from_dict(data)
    else:
        return pd.DataFrame(data)
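
# Usage sketch (illustrative): load the base COMPS-WUGS split and inspect one
# acceptable/unacceptable sentence pair.
def _demo_load_comps():
    comps = load_comps(base=True)
    print(comps['sentence_acc'].iloc[0])
    print(comps['sentence_unacc'].iloc[0])
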
def load_tqa():
    """Load the TruthfulQA generation config from the Hugging Face Hub."""
    return load_dataset('truthful_qa', 'generation')
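
# Usage sketch (illustrative): TruthfulQA ships its items under a single
# "validation" split.
def _demo_load_tqa():
    tqa = load_tqa()
    print(tqa['validation'][0]['question'])
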
def load_ic_qa(ic=False, return_dataset=False):
    """Load the factual-QA set; with ic=True, prepend the irrelevant context
    to each question."""
    file = pd.read_csv('/local/js/lmcm_project/factual_qa/qa_ic.csv')
    if ic:
        questions = [ctx + " " + q for ctx, q in zip(file['irrelevant_context'], file['question'])]
    else:
        questions = file['question'].to_list()
    data = {'question': questions,
            'true_answer': file['true_answer'].to_list(),
            'false_answer': file['false_answer'].to_list()}
    if return_dataset:
        return Dataset.from_dict(data)
    else:
        return pd.DataFrame(data)
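
# Usage sketch (illustrative): compare a plain question against its
# irrelevant-context variant.
def _demo_load_ic_qa():
    plain = load_ic_qa(ic=False)
    with_ic = load_ic_qa(ic=True)
    print(plain['question'].iloc[0])
    print(with_ic['question'].iloc[0])
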
def write_out(sents, predictions, targets, file_spec, comps=False):
    """Write predictions to eval/<file_spec>.csv, padding short prediction
    lists with 'na' so all columns have equal length."""
    # Pad without mutating the caller's list.
    if len(predictions) < len(targets):
        predictions = predictions + ['na'] * (len(targets) - len(predictions))
    if comps:
        sents_acc, sents_unacc = sents[0], sents[1]
        data = {"sentence_acceptable": sents_acc, "sentence_unacceptable": sents_unacc,
                "prediction": predictions, "target": targets}
    else:
        data = {"sentence": sents, "prediction": predictions, "target": targets}
    pd.DataFrame(data).to_csv(f"/local/js/LMCM_project/eval/{file_spec}.csv", index=False)
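
# Usage sketch (illustrative): "demo_run" is an assumed file_spec; the output
# lands in the hard-coded eval directory above, which must already exist.
def _demo_write_out():
    sents = ["The nurse examined the farmer.", "The chef greeted the pilot."]
    predictions = ["nurse"]  # deliberately short: write_out pads with 'na'
    targets = ["nurse", "chef"]
    write_out(sents, predictions, targets, "demo_run")
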