API.py
#!/usr/bin/env python
# coding: utf-8
import os
import re
import shutil
import sys

import pandas as pd
from omegaconf import OmegaConf
from sklearn.feature_extraction import text as sk_text
from sklearn.feature_extraction.text import CountVectorizer

# Wildcard imports provide sensitivity_on_wl, rank_reference and read_corpus, among others.
from looper import *
from looper_data import *


class Experiment:
    def __init__(self, wl, input_dir, model_type, model_path, pretrain_path,
                 mid_dir, output_dir, m, mul, cal):
        self.model_type = model_type
        self.input_dir = input_dir
        # 'default' places the intermediate and output directories next to the input data.
        self.mid_dir = mid_dir
        if mid_dir == 'default':
            self.mid_dir = '/'.join(input_dir.split('/')[:-2]) + '/mid_dir/'
        if not os.path.exists(self.mid_dir):
            os.mkdir(self.mid_dir)
        self.out_dir = output_dir
        if output_dir == 'default':
            self.out_dir = '/'.join(input_dir.split('/')[:-2]) + '/out_dir/'
        if not os.path.exists(self.out_dir):
            os.mkdir(self.out_dir)
        # 'default' falls back to the built-in defaults below.
        self.calibrate = True if cal == 'default' else cal
        self.multiple_swap = False if mul == 'default' else mul
        self.M = 5 if m == 'default' else m
        self.model_path = model_path
        self.wl = self.make_sure_list(wl)  # list, len >= 1
        # Working sub-directories for filtered, perturbed and reference files.
        self.filter_dir = self.mid_dir + "filtered/"
        if not os.path.exists(self.filter_dir):
            os.mkdir(self.filter_dir)
        self.perturbed_dir = self.mid_dir + "perturbed/"
        if not os.path.exists(self.perturbed_dir):
            os.mkdir(self.perturbed_dir)
        if not os.path.exists(self.perturbed_dir + 'multiple/'):
            os.mkdir(self.perturbed_dir + 'multiple/')
        if not os.path.exists(self.perturbed_dir + 'onetime/'):
            os.mkdir(self.perturbed_dir + 'onetime/')
        self.reference_dir = self.mid_dir + "reference/"
        if not os.path.exists(self.reference_dir):
            os.mkdir(self.reference_dir)
        self.pretrain_path = pretrain_path
        self.fre_ref = None
        self.output = None
        # ...
    # Wrap a single word into a list so that self.wl is always a list.
    def make_sure_list(self, wl):
        if not isinstance(wl, list):
            wl = [wl]
        return wl
    # Main method: compute the sensitivity scores and relative rankings of the word list.
    # Returns a DataFrame with all information, plus the ranked reference of past experiments.
    def check_sensitivity_for_wl(self):
        self.set_fre_ref()
        # sensitivity_on_wl is from looper.py
        output = sensitivity_on_wl(self)
        output_path = self.out_dir + 'out.csv'
        output.to_csv(output_path)
        ranked_reference = self.read_reference()
        print(f'Result for this experiment:\n{output}')
        return output, ranked_reference
    # Gain insight from the reference saved by past experiments.
    # If there is no past experiment, return None.
    def read_reference(self):
        if self.multiple_swap:
            swap = 'multiple'
        else:
            swap = 'onetime'
        # Reference file name, e.g. tfidf_onetime_reference.csv
        reference_path = f'{self.reference_dir}{self.model_type}_{swap}_reference.csv'
        try:
            # rank_reference is from looper.py; TODO: delete sd in later versions
            ranked_reference = rank_reference(reference_path)
            print(f'Experiment setting: {self.model_type}+{swap}')
            print(f'Results of all previous experiments:\n{ranked_reference}')
            return ranked_reference
        except (FileNotFoundError, IOError):
            print('No previous experiment, or the reference file could not be found.')
            return None
    # Given the corpus, build a reference DataFrame (self.fre_ref) that contains the
    # occurrence counts of each word in the training and test corpora.
    def set_fre_ref(self):
        classes = ["negative", "positive"]
        SAVE_FRE_PATH = self.mid_dir + 'corpus_fre.csv'
        if os.path.exists(SAVE_FRE_PATH):
            combined = pd.read_csv(SAVE_FRE_PATH, index_col=[0])
        else:
            # Extra domain stop words can be listed here, e.g. ["patient", "pt", "date",
            # "discharge", "admission", "history", "hospital", "md", "right", "left",
            # "mg", "normal", "cm"]
            custom_words = []
            my_stop_words = sk_text.ENGLISH_STOP_WORDS.union(custom_words)
            vec = CountVectorizer(stop_words=my_stop_words,
                                  preprocessor=lambda x: re.sub(r'(\d[\d\.])+', '', x.lower()),
                                  ngram_range=(1, 1))
            train_path = self.input_dir + 'train.csv'
            test_path = self.input_dir + 'test.csv'
            # read_corpus is provided by the wildcard imports above.
            train_corpus = read_corpus(train_path)
            test_corpus = read_corpus(test_path)
            # Bag-of-words counts per class document; summing the two rows gives the
            # corpus-level frequency of each word.
            X_train = vec.fit_transform(train_corpus).toarray()
            bow_train = pd.DataFrame(X_train, columns=vec.get_feature_names_out())
            bow_train.index = classes
            train_fre = X_train[0] + X_train[1]
            train_df = pd.DataFrame(list(train_fre), index=vec.get_feature_names_out())
            train_df.reset_index(inplace=True)
            train_df.rename(columns={'index': 'word', 0: 'train_fre'}, inplace=True)
            X_test = vec.fit_transform(test_corpus).toarray()
            bow_test = pd.DataFrame(X_test, columns=vec.get_feature_names_out())
            bow_test.index = classes
            test_fre = X_test[0] + X_test[1]
            test_df = pd.DataFrame(list(test_fre), index=vec.get_feature_names_out())
            test_df.reset_index(inplace=True)
            test_df.rename(columns={'index': 'word', 0: 'test_fre'}, inplace=True)
            combined = pd.merge(train_df, test_df, on='word', how='outer')
            combined.to_csv(SAVE_FRE_PATH)
        self.fre_ref = combined
        return combined
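
    # Illustration with hypothetical values: the combined frequency reference built above
    # (self.fre_ref, cached as corpus_fre.csv) looks like
    #        word  train_fre  test_fre
    #   0   fever         12         3
    #   1    pain         40         9
    # i.e. one row per vocabulary word, with counts summed over the two class documents.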

    # Optional: delete all files used during the calculation once the experiment is done.
    def clear_cache(self):
        mid_dir = self.mid_dir[:-1]
        out_dir = self.out_dir[:-1]
        shutil.rmtree(mid_dir)
        shutil.rmtree(out_dir)
        print('Cache is cleared from the server.')


# Validate the configuration before the experiment is run; exit on any unmet requirement.
def check_input_error(conf):
    if conf.model_type not in ['tfidf', 'finetune']:
        print('Unsupported model type!')
        sys.exit()
    if len(conf.wl.split(',')) == 1:
        print('Warning: at least two words are recommended to obtain relative rankings.')
    try:
        files_in_input_dir = os.listdir(conf.input_dir[:-1])
    except OSError:
        print('Input directory cannot be located.')
        sys.exit()
    if ('train.csv' not in files_in_input_dir or 'val.csv' not in files_in_input_dir
            or 'test.csv' not in files_in_input_dir):
        print('The input directory should contain three files: train.csv, val.csv and test.csv.')
        sys.exit()
    if conf.mid_dir != 'default' and not os.path.exists(conf.mid_dir):
        print('Mid directory cannot be located.')
        sys.exit()
    if conf.output_dir != 'default' and not os.path.exists(conf.output_dir):
        print('Output directory cannot be located.')
        sys.exit()
    if not os.path.exists(conf.model_path):
        print('The model cannot be located.')
        sys.exit()
    if conf.model_type == 'finetune' and not os.path.exists(conf.pretrain_path):
        print('The pretrained model cannot be located.')
        sys.exit()
    print('Parameter conditions satisfied!')


# Highest-level entry point.
# @param config_path: path to the OmegaConf YAML configuration file.
# @returns: the sensitivity scores of the words in the provided word list, and the
#           sensitivity scores of all words computed so far.
def Sensitivity(config_path):
    try:
        conf = OmegaConf.load(config_path)
    except FileNotFoundError:
        print("Config file cannot be located.")
        sys.exit()
    check_input_error(conf)
    e = Experiment(model_type=conf.model_type,
                   wl=conf.wl.split(','),
                   input_dir=conf.input_dir,
                   mid_dir=conf.mid_dir,
                   output_dir=conf.output_dir,
                   model_path=conf.model_path,
                   pretrain_path=conf.pretrain_path,
                   m=conf.M,
                   mul=conf.multiple_swap,
                   cal=conf.calibrate)
    return e.check_sensitivity_for_wl()
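

# Illustrative sketch, not part of the original pipeline: the config file loaded by
# Sensitivity() is expected to expose the keys read above. The function name
# example_config and every value below are placeholders shown only to document the schema.
def example_config():
    """Return a hypothetical OmegaConf config with the keys Sensitivity() reads."""
    return OmegaConf.create({
        'model_type': 'tfidf',                   # 'tfidf' or 'finetune'
        'wl': 'word1,word2',                     # comma-separated word list (>= 2 words recommended)
        'input_dir': '/path/to/input_dir/',      # must contain train.csv, val.csv, test.csv
        'mid_dir': 'default',                    # 'default' derives a mid_dir/ next to the input data
        'output_dir': 'default',                 # 'default' derives an out_dir/ next to the input data
        'model_path': '/path/to/model',
        'pretrain_path': '/path/to/pretrained',  # only checked when model_type == 'finetune'
        'M': 'default',                          # 'default' -> 5
        'multiple_swap': 'default',              # 'default' -> False
        'calibrate': 'default',                  # 'default' -> True
    })
# Usage sketch: OmegaConf.save(example_config(), 'api_config.yaml'); Sensitivity('api_config.yaml')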


if __name__ == '__main__':
    CONFIG_PATH = '/home/gy654/Documents/text_classification/experiments/sensitivity_analysis/api_config.yaml'
    Sensitivity(CONFIG_PATH)