1
- # Index CSV
2
- # Die CSV enthält alle Infos für BM25
3
- # Und die Vektor-Embeddings aller Argumente
4
-
5
- import argparse
6
1
import os
7
- import rootpath
8
- import sys
9
- import json
10
2
import csv
11
- import warnings
3
+ import json
4
+ import os
5
+ import sys
12
6
import traceback
13
- import math
7
+ import warnings
8
+
14
9
import numpy as np
10
+ import rootpath
11
+ from sklearn .preprocessing import normalize
15
12
from tqdm import tqdm
16
- from sklearn import preprocessing
17
- from sklearn .preprocessing import MinMaxScaler , normalize
18
13
19
14
warnings .filterwarnings ("error" )
20
15
@@ -79,10 +74,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
79
74
zweiter array 2 dim. Dualsim scores
80
75
"""
81
76
82
- # love titties 4 eva
83
- # booty squad <333333333
84
- # ariane staudte <333333
85
-
86
77
dual_embedding = DualEmbedding (cbow_model )
87
78
processed_queries = dual_embedding .get_processed_queries (queries )
88
79
@@ -112,7 +103,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
112
103
bm25_queries_scores = []
113
104
114
105
for query_terms , query_matrix in processed_queries :
115
-
116
106
desim_query_score = dual_embedding .desim (
117
107
query_matrix , arg_emb
118
108
)
@@ -135,8 +125,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
135
125
if i + 1 == max_args :
136
126
break
137
127
138
- # bm25_scores = np.transpose(np.array(bm25_scores))
139
- # desim_scores = np.transpose(np.array(desim_scores))
140
128
bm25_scores = np .array (bm25_scores )
141
129
desim_scores = np .array (desim_scores )
142
130
arg_ids = np .asarray (arg_ids )
@@ -152,14 +140,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
152
140
bm25_norm = normalize (bm25_scores , norm = 'max' , axis = 0 )
153
141
desim_norm = normalize (desim_scores , norm = 'max' , axis = 0 )
154
142
155
- # bm25_norm = np.linalg.norm(bm25_scores, axis=0)
156
- # bm25_norm[bm25_norm == 0] = 0.0001
157
- # desim_norm = np.linalg.norm(desim_scores, axis=0)
158
- # desim_norm[desim_norm == 0] = 0.0001
159
-
160
- # bm25_scores = np.transpose(bm25_scores / bm25_norm)
161
- # desim_scores = np.transpose(desim_scores / desim_norm)
162
-
163
143
print (f'norm Min BM25: { bm25_norm .min ()} ' )
164
144
print (f'norm Max BM25: { bm25_norm .max ()} ' )
165
145
print (f'norm Min Desim: { desim_norm .min ()} ' )
@@ -183,7 +163,6 @@ def combine_scores(bm25_scores, desim_scores, alpha):
183
163
184
164
185
165
def get_top_args (arg_ids , bm25_scores , desim_scores , alpha = 0.5 , top_n = 10 ):
186
-
187
166
top_args_list = []
188
167
final_scores , influences = combine_scores (bm25_scores , desim_scores , alpha )
189
168
@@ -192,8 +171,6 @@ def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
192
171
f"--------------------\n \n "
193
172
f"BM25 > Desim: { influences .count (True )} Mal.\n "
194
173
f"Desim > BM25: { influences .count (False )} Mal.\n \n "
195
- # f"Desim und BM25 sollten standardmäßig ähnlich sein (Verhältnis = 1.0)\n"
196
- # f"Verhältnis: {abs(influences.count(True) / influences.count(False))}\n\n"
197
174
))
198
175
199
176
for bs , ds , fs in zip (bm25_scores , desim_scores , final_scores ):
@@ -212,7 +189,7 @@ def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
212
189
213
190
def get_sentiments (top_args ):
214
191
with open (
215
- setup .SENTIMENTS_PATH , 'r' , newline = '' , encoding = 'utf-8'
192
+ setup .SENTIMENTS_PATH , 'r' , newline = '' , encoding = 'utf-8'
216
193
) as f_in :
217
194
reader = csv .reader (f_in , ** setup .SENTIMENTS_CONFIG )
218
195
header = next (reader )
@@ -235,95 +212,3 @@ def get_sentiments(top_args):
235
212
)
236
213
237
214
return query_sentiments
238
-
239
- # results = []
240
- # for arg_ids, query_sents in zip(top_args, query_sentiments):
241
- # abs_sent = [abs(i) for i in query_sents[0]]
242
- # pairings = [(i, p[0], p[1])
243
- # for i, p in enumerate(zip(arg_ids[1], abs_sent))]
244
- # pairings = sorted(pairings, key=lambda x: (x[2], x[1]), reverse=True)
245
- # indices_order = [i[0] for i in pairings]
246
- # new_arg_ids = [arg_ids[0][i] for i in indices_order]
247
- # new_arg_scores = [(p[1], p[2]) for p in pairings]
248
- # results.append(
249
- # (new_arg_ids, new_arg_scores)
250
- # )
251
- # return results
252
-
253
- # if args.mode != 'read':
254
- # cbow = CBOW.load(CBOW_MODEL_PATH)
255
- # bm25_manager = BM25Manager.load(BM25_PATH)
256
-
257
- # if args.mode == 'train':
258
- # create_index(INDEX_PATH, TRAIN_PATH, cbow.model, bm25_manager.index)
259
-
260
- # elif args.mode == 'load':
261
-
262
- # queries = [
263
- # 'Donald Trump is bad',
264
- # 'pregnant women abortion stop',
265
- # ]
266
-
267
- # bm25_scores, desim_scores, arg_ids = analyze_query(
268
- # queries,
269
- # INDEX_PATH,
270
- # cbow.model,
271
- # bm25_manager.index,
272
- # max_args=-1
273
- # )
274
-
275
- # combined_scores = []
276
- # alpha = 0.5
277
-
278
- # for bm25_query_scores, desim_query_scores in zip(bm25_scores, desim_scores):
279
- # combined_query_scores = []
280
-
281
- # for b, d in zip(bm25_query_scores, desim_query_scores):
282
- # combined_query_scores.append(alpha * b + (1 - alpha) * d)
283
-
284
- # combined_scores.append(combined_query_scores)
285
-
286
- # combined_scores = np.asarray(combined_scores)
287
- # top_n = 10
288
- # results = []
289
-
290
- # for cs in combined_scores:
291
- # best_n = np.argsort(cs)[::-1][:top_n]
292
- # top_args = arg_ids[best_n]
293
- # top_scores = cs[best_n]
294
-
295
- # results.append((top_args, top_scores))
296
-
297
- # if not os.path.isfile(RESULT_LOG_PATH):
298
- # result_log_header = ['query', 'top_args', 'scores', 'alpha']
299
- # with open(RESULT_LOG_PATH, 'w', newline='', encoding='utf-8') as f_out:
300
- # writer = csv.writer(f_out, delimiter='|', quotechar='"',
301
- # quoting=csv.QUOTE_MINIMAL)
302
- # writer.writerow(result_log_header)
303
-
304
- # with open(RESULT_LOG_PATH, 'a', newline='', encoding='utf-8') as f_out:
305
- # writer = csv.writer(f_out, delimiter='|',
306
- # quotechar='"', quoting=csv.QUOTE_MINIMAL)
307
-
308
- # for q, res in zip(queries, results):
309
-
310
- # row = [
311
- # q,
312
- # res[0].tolist(),
313
- # res[1].tolist(),
314
- # alpha,
315
- # ]
316
- # writer.writerow(row)
317
-
318
- # elif args.mode == 'read':
319
- # with open(RESULT_LOG_PATH, 'r', newline='', encoding='utf-8') as f_in:
320
- # reader = csv.reader(f_in, delimiter='|', quotechar='"',
321
- # quoting=csv.QUOTE_MINIMAL)
322
-
323
- # header = next(reader)
324
- # for line in reader:
325
- # query, best_args = line[0], line[1]
326
- # print(query)
327
- # print(best_args)
328
- # print_argument_texts(best_args, CSV_PATH, print_full_texts=True)
329
- # print('*' * 60)
0 commit comments