@@ -55,7 +55,7 @@ def get_aligns(rf, cf, alignments):
55
55
res .append ( ( int (x [0 ]), int (x [1 ]) ) )
56
56
else :
57
57
return None
58
-
58
+
59
59
return res
60
60
61
61
def add_aligns (aligns , aligns_dict , token_counts , re , ce , existing_items ):
@@ -69,7 +69,7 @@ def add_aligns(aligns, aligns_dict, token_counts, re, ce, existing_items):
69
69
token_counts [re ] = align [0 ]
70
70
if align [1 ] > token_counts [ce ]:
71
71
token_counts [ce ] = align [1 ]
72
-
72
+
73
73
existing_items [re ][ce ].append (f"{ align [0 ]} ,{ align [1 ]} " )
74
74
75
75
def add_negative_samples (aligns_dict , existing_items , token_counts , verse_id ):
@@ -89,7 +89,7 @@ def add_negative_samples(aligns_dict, existing_items, token_counts, verse_id):
89
89
aligns_dict ['userID' ].append (re + str (i ))
90
90
aligns_dict ['itemID' ].append (ce + str (jp ))
91
91
aligns_dict ['rating' ].append (1 )
92
-
92
+
93
93
ip %= (token_counts [re ] + 1 )
94
94
aligns_dict ['userID' ].append (re + str (ip ))
95
95
aligns_dict ['itemID' ].append (ce + str (j ))
@@ -112,11 +112,11 @@ def get_alignments_df(row_editions, col_editions, verse_alignments,
112
112
113
113
if not aligns is None :
114
114
add_aligns (aligns , aligns_dict , token_counts , re , ce , existing_items )
115
-
115
+
116
116
add_negative_samples (aligns_dict , existing_items , token_counts , verse_id )
117
117
118
118
return pd .DataFrame (aligns_dict ), token_counts [source_edition ], token_counts [target_edition ]
119
-
119
+
120
120
def iter_max (sim_matrix : np .ndarray , max_count : int = 2 , alpha_ratio = 0.7 ) -> np .ndarray :
121
121
m , n = sim_matrix .shape
122
122
forward = np .eye (n )[sim_matrix .argmax (axis = 1 )] # m x n
@@ -156,14 +156,14 @@ def get_itermax_predictions(raw_s_predictions, max_count=2, alpha_ratio=0.9):
156
156
for i in raw_s_predictions :
157
157
for j , s in raw_s_predictions [i ]:
158
158
matrix [i ,j ] = s
159
-
159
+
160
160
itermax_res = iter_max (matrix , max_count , alpha_ratio )
161
161
res = []
162
162
for i in range (rows ):
163
163
for j in range (cols ):
164
164
if itermax_res [i ,j ] != 0 :
165
165
res .append ((i ,j ))
166
-
166
+
167
167
return res
168
168
169
169
def predict_alignments (algo , source_edition , target_edition ):
@@ -197,13 +197,13 @@ def train_model(df, s_tok_count, t_tok_count, row_editions, col_editions):
197
197
algo .row_editions = row_editions
198
198
algo .col_editions = col_editions
199
199
algo .df = df
200
-
200
+
201
201
return algo
202
202
203
203
def get_induced_alignments (source_edition , target_edition , verse_alignments_path , verse_id , all_editions ):
204
204
205
205
verse_alignments = get_verse_alignments (verse_alignments_path , verse_id , editions = all_editions )
206
-
206
+
207
207
# this is only for saving the gdfa alignments from source to target for the evaluation
208
208
verse_alignments_gdfa = get_verse_alignments (verse_alignments_path , verse_id , editions = [source_edition , target_edition ], gdfa = True )
209
209
@@ -213,17 +213,17 @@ def get_induced_alignments(source_edition, target_edition, verse_alignments_path
213
213
df , s_tok_count , t_tok_count = get_alignments_df (row_editions , col_editions , verse_alignments , source_edition , target_edition , verse_id )
214
214
215
215
algo = train_model (df , s_tok_count , t_tok_count , row_editions , col_editions )
216
-
216
+
217
217
predicted_alignments = predict_alignments (algo , source_edition , target_edition )
218
218
base_inter_alignments = verse_alignments [source_edition ][target_edition ]
219
219
base_gdfa_alignments = verse_alignments_gdfa [source_edition ][target_edition ]
220
-
220
+
221
221
with cnt .get_lock ():
222
222
cnt .value += 1
223
223
if cnt .value % 20 == 0 :
224
224
LOG .info (f"Done inferring alignments for { cnt .value } verses" )
225
225
226
- return predicted_alignments , base_inter_alignments , base_gdfa_alignments , len (algo .col_editions )+ 1
226
+ return predicted_alignments , base_inter_alignments , base_gdfa_alignments , len (algo .col_editions ) + 1
227
227
228
228
229
229
def init_globals (counter ):
@@ -255,7 +255,7 @@ def main(args):
255
255
256
256
# get predicted alignments using parallel processing
257
257
cnt = Value ('i' , 0 )
258
- with Pool (processes = args .core_count , initializer = init_globals , initargs = (cnt ,)) as p :
258
+ with Pool (processes = args .core_count , initializer = init_globals , initargs = (cnt ,)) as p :
259
259
all_alignments = p .starmap (get_induced_alignments , starmap_args )
260
260
261
261
out_NMF_f_name = f"predicted_alignments_from_{ args .source_edition } _to_{ args .target_edition } _with_max_{ len (all_editions )} _editions_for_{ len (all_verses )} _verses_NMF.txt"
@@ -283,14 +283,14 @@ def main(args):
283
283
parser = argparse .ArgumentParser ()
284
284
285
285
parser .add_argument ('--save_path' , default = os .path .join (current_path , "predicted_alignments" ), type = str )
286
- parser .add_argument ('--gold_file' , default = os .path .join (current_path , "data/gold-standards/blinker/eng-fra.gold" ), type = str )
286
+ parser .add_argument ('--gold_file' , default = os .path .join (current_path , "data/gold-standards/blinker/eng-fra.gold" ), type = str )
287
287
parser .add_argument ('--verse_alignments_path' , default = "/mounts/data/proj/ayyoob/align_induction/verse_alignments/" , type = str )
288
- parser .add_argument ('--source_edition' , default = "eng-x-bible-mixed" , type = str )
289
- parser .add_argument ('--target_edition' , default = "fra-x-bible-louissegond" , type = str )
288
+ parser .add_argument ('--source_edition' , default = "eng-x-bible-mixed" , type = str )
289
+ parser .add_argument ('--target_edition' , default = "fra-x-bible-louissegond" , type = str )
290
290
parser .add_argument ('--editions_file' , default = os .path .join (current_path , "data/edition_lists/blinker_edition_list.txt" ), type = str )
291
291
parser .add_argument ('--core_count' , default = 80 , type = int )
292
292
parser .add_argument ('--seed' , default = 42 , type = int )
293
293
294
294
args = parser .parse_args ()
295
295
main (args )
296
-
296
+
0 commit comments