
Commit fb29c4d

remove unused code (commented out code / vars)

1 parent 88155be commit fb29c4d

8 files changed: +37 -285 lines changed

argU/__main__.py  (+4 -45)

@@ -1,8 +1,7 @@
-
 import argparse
 import os
 import sys
-import csv
+
 import rootpath
 
 try:
@@ -56,7 +55,6 @@
 argparsed = parser.parse_args()
 print(f"Args: {argparsed}")
 
-
 setup.assert_file_exists(setup.CBOW_PATH)
 
 db = load_db()
@@ -114,14 +112,13 @@
     )
 
     merged_args.sort(key=lambda x: x[1], reverse=True)
-    # merged_args = merged_args[:20]
 
     if argparsed.sentiments != 'no':
         merged_args_with_sents = []
         for ma in merged_args:
             dph, sent = ma[1], ma[2]
             if argparsed.sentiments == 'emotional':
-                dph = dph + dph * (abs(sent) / 2)
+                dph = dph + dph * (abs(sent))
             elif argparsed.sentiments == 'neutral':
                 dph = dph - dph * (abs(sent) / 2)
             merged_args_with_sents.append(
@@ -131,16 +128,7 @@
     merged_args.sort(key=lambda x: x[1], reverse=True)
 
     print(f'### {query_id} {desm_scores["query_text"]}')
-    # print('---')
-    # arguments.fancy_print(
-    #     coll_args,
-    #     merged_args_list[:20],
-    #     trans_dict=trans_dict,
-    #     arg_len=2000,
-    # )
-
-    # Sentiment Analysis
-
+
     if len(merged_args) != 0:
         output_dict[query_id] = merged_args
     else:
@@ -163,36 +151,7 @@
             trans_id = coll_trans.find_one({'_id': arg_id})['arg_id']
         except:
             trans_id = arg_id
-
+
         f_out.write(' '.join([
             str(id), 'Q0', trans_id, str(i + 1), str(score), method, '\n'
         ]))
-
-
-# for i, res in enumerate(coll_res.find()):
-#     if i == 3:
-#         break
-#     args = arguments.find(coll_args, res['args'])
-#     print(res['query_text'])
-#     print('=' * 40)
-#     for a in args:
-#         print('> ', Argument.get_text(a)[:200])
-#     print()

-# if args.mode == 'retrieve' or args.mode == 'collect':
-
-#     # Save arguments in the matching output format
-#     queries_args = scores.evaluate(threshold=0.5)
-
-#     with open(setup.RUN_PATH, 'w') as f_out:
-#         for (query_id, query_text, args) in queries_args:
-#             for i, arg in enumerate(args):
-#                 f_out.write(' '.join([
-#                     query_id,
-#                     'Q0',
-#                     arg[0],
-#                     str(i + 1),
-#                     str(arg[1]),
-#                     method,
-#                     '\n',
-#                 ]))
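
Note on the one functional change above: the 'emotional' branch now scales a score by the full absolute sentiment instead of half of it. A minimal standalone sketch of that re-weighting, assuming scores dph and sentiment values sent in [-1, 1] as in the diff (the reweight() wrapper is hypothetical, not part of the repo):

def reweight(dph, sent, mode):
    # 'emotional' boosts by the full |sent| (previously |sent| / 2);
    # 'neutral' still damps by half of |sent|.
    if mode == 'emotional':
        return dph + dph * abs(sent)
    if mode == 'neutral':
        return dph - dph * (abs(sent) / 2)
    return dph

assert reweight(1.0, 0.5, 'emotional') == 1.5
assert reweight(1.0, 0.5, 'neutral') == 0.75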

argU/indexing/a2v.py  (+3 -24)

@@ -1,9 +1,8 @@
 import argparse
 import os
-import rootpath
 import sys
-from tqdm import tqdm
-import numpy as np
+
+import rootpath
 
 try:
     sys.path.append(os.path.join(rootpath.detect()))
@@ -16,7 +15,6 @@
     print(e)
     sys.exit(0)
 
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -45,32 +43,13 @@
     print(desm.model_in.wv.most_similar('Trump'))
     print(desm.model_out.most_similar('Trump'))
 
-    # diff_words = set([])
-    # for i in range(10):
-    #     v = np.random.random_sample((100,))
-    #     for (w, s) in desm.model_out.most_similar(positive=[v], topn=10):
-    #         diff_words.add(w)
-    # print(diff_words)
-    # print(len(diff_words))
-    # sys.exit(0)
-
     embeddings = []
     last_emb = None
 
-    from scipy.spatial.distance import cosine
     for i, arg in enumerate(
-        TrainArgsIterator(coll, full_data=True, max_args=-1)
+            TrainArgsIterator(coll, full_data=True, max_args=-1)
     ):
         arg_emb = desm.arg_to_emb(arg, model_type='out')
-
-        # if last_emb is not None:
-        #     print()
-        #     print('\t', 1 - cosine(last_emb, arg_emb))
-        #     print()
-        # last_emb = arg_emb
-
-        # print(arg['text'][:200])
-        # print()
         embeddings.append({
             '_id': arg['_id'],
             'emb': arg_emb.tolist(),
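
Note: the deleted debug block here compared each argument embedding to the previous one via 1 - cosine(last_emb, arg_emb). A self-contained sketch of that check (scipy's cosine() returns a distance, so similarity is one minus it; the toy vectors are made up):

import numpy as np
from scipy.spatial.distance import cosine

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
similarity = 1 - cosine(a, b)  # dot / (|a| * |b|) = 1 / 2 for these vectors
print(f'cosine similarity: {similarity:.2f}')  # 0.50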

argU/indexing/index.py  (+8 -123)

@@ -1,20 +1,15 @@
-# Index CSV
-# The CSV contains all the info for BM25
-# and the vector embeddings of all arguments
-
-import argparse
 import os
-import rootpath
-import sys
-import json
 import csv
-import warnings
+import json
+import os
+import sys
 import traceback
-import math
+import warnings
+
 import numpy as np
+import rootpath
+from sklearn.preprocessing import normalize
 from tqdm import tqdm
-from sklearn import preprocessing
-from sklearn.preprocessing import MinMaxScaler, normalize
 
 warnings.filterwarnings("error")
 
@@ -79,10 +74,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
     second array: 2-dim. Dualsim scores
     """
 
-    # love titties 4 eva
-    # booty squad <333333333
-    # ariane staudte <333333
-
     dual_embedding = DualEmbedding(cbow_model)
     processed_queries = dual_embedding.get_processed_queries(queries)
 
@@ -112,7 +103,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
     bm25_queries_scores = []
 
     for query_terms, query_matrix in processed_queries:
-
         desim_query_score = dual_embedding.desim(
             query_matrix, arg_emb
         )
@@ -135,8 +125,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
         if i + 1 == max_args:
             break
 
-    # bm25_scores = np.transpose(np.array(bm25_scores))
-    # desim_scores = np.transpose(np.array(desim_scores))
 bm25_scores = np.array(bm25_scores)
 desim_scores = np.array(desim_scores)
 arg_ids = np.asarray(arg_ids)
@@ -152,14 +140,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
 bm25_norm = normalize(bm25_scores, norm='max', axis=0)
 desim_norm = normalize(desim_scores, norm='max', axis=0)
 
-    # bm25_norm = np.linalg.norm(bm25_scores, axis=0)
-    # bm25_norm[bm25_norm == 0] = 0.0001
-    # desim_norm = np.linalg.norm(desim_scores, axis=0)
-    # desim_norm[desim_norm == 0] = 0.0001
-
-    # bm25_scores = np.transpose(bm25_scores / bm25_norm)
-    # desim_scores = np.transpose(desim_scores / desim_norm)
-
 print(f'norm Min BM25: {bm25_norm.min()}')
 print(f'norm Max BM25: {bm25_norm.max()}')
 print(f'norm Min Desim: {desim_norm.min()}')
@@ -183,7 +163,6 @@ def combine_scores(bm25_scores, desim_scores, alpha):
 
 
 def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
-
     top_args_list = []
     final_scores, influences = combine_scores(bm25_scores, desim_scores, alpha)
 
@@ -192,8 +171,6 @@ def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
         f"--------------------\n\n"
         f"BM25 > Desim: {influences.count(True)} times.\n"
         f"Desim > BM25: {influences.count(False)} times.\n\n"
-        # f"Desim and BM25 should be similar by default (ratio = 1.0)\n"
-        # f"Ratio: {abs(influences.count(True) / influences.count(False))}\n\n"
     ))
 
     for bs, ds, fs in zip(bm25_scores, desim_scores, final_scores):
@@ -212,7 +189,7 @@ def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
 def get_sentiments(top_args):
     with open(
-        setup.SENTIMENTS_PATH, 'r', newline='', encoding='utf-8'
+            setup.SENTIMENTS_PATH, 'r', newline='', encoding='utf-8'
     ) as f_in:
         reader = csv.reader(f_in, **setup.SENTIMENTS_CONFIG)
         header = next(reader)
@@ -235,95 +212,3 @@ def get_sentiments(top_args):
             )
 
     return query_sentiments
-
-    # results = []
-    # for arg_ids, query_sents in zip(top_args, query_sentiments):
-    #     abs_sent = [abs(i) for i in query_sents[0]]
-    #     pairings = [(i, p[0], p[1])
-    #                 for i, p in enumerate(zip(arg_ids[1], abs_sent))]
-    #     pairings = sorted(pairings, key=lambda x: (x[2], x[1]), reverse=True)
-    #     indices_order = [i[0] for i in pairings]
-    #     new_arg_ids = [arg_ids[0][i] for i in indices_order]
-    #     new_arg_scores = [(p[1], p[2]) for p in pairings]
-    #     results.append(
-    #         (new_arg_ids, new_arg_scores)
-    #     )
-    # return results
-
-# if args.mode != 'read':
-#     cbow = CBOW.load(CBOW_MODEL_PATH)
-#     bm25_manager = BM25Manager.load(BM25_PATH)
-
-# if args.mode == 'train':
-#     create_index(INDEX_PATH, TRAIN_PATH, cbow.model, bm25_manager.index)
-
-# elif args.mode == 'load':
-
-#     queries = [
-#         'Donald Trump is bad',
-#         'pregnant women abortion stop',
-#     ]
-
-#     bm25_scores, desim_scores, arg_ids = analyze_query(
-#         queries,
-#         INDEX_PATH,
-#         cbow.model,
-#         bm25_manager.index,
-#         max_args=-1
-#     )
-
-#     combined_scores = []
-#     alpha = 0.5
-
-#     for bm25_query_scores, desim_query_scores in zip(bm25_scores, desim_scores):
-#         combined_query_scores = []
-
-#         for b, d in zip(bm25_query_scores, desim_query_scores):
-#             combined_query_scores.append(alpha * b + (1 - alpha) * d)
-
-#         combined_scores.append(combined_query_scores)
-
-#     combined_scores = np.asarray(combined_scores)
-#     top_n = 10
-#     results = []
-
-#     for cs in combined_scores:
-#         best_n = np.argsort(cs)[::-1][:top_n]
-#         top_args = arg_ids[best_n]
-#         top_scores = cs[best_n]
-
-#         results.append((top_args, top_scores))
-
-#     if not os.path.isfile(RESULT_LOG_PATH):
-#         result_log_header = ['query', 'top_args', 'scores', 'alpha']
-#         with open(RESULT_LOG_PATH, 'w', newline='', encoding='utf-8') as f_out:
-#             writer = csv.writer(f_out, delimiter='|', quotechar='"',
-#                                 quoting=csv.QUOTE_MINIMAL)
-#             writer.writerow(result_log_header)
-
-#     with open(RESULT_LOG_PATH, 'a', newline='', encoding='utf-8') as f_out:
-#         writer = csv.writer(f_out, delimiter='|',
-#                             quotechar='"', quoting=csv.QUOTE_MINIMAL)
-
-#         for q, res in zip(queries, results):
-
-#             row = [
-#                 q,
-#                 res[0].tolist(),
-#                 res[1].tolist(),
-#                 alpha,
-#             ]
-#             writer.writerow(row)
-
-# elif args.mode == 'read':
-#     with open(RESULT_LOG_PATH, 'r', newline='', encoding='utf-8') as f_in:
-#         reader = csv.reader(f_in, delimiter='|', quotechar='"',
-#                             quoting=csv.QUOTE_MINIMAL)
-
-#         header = next(reader)
-#         for line in reader:
-#             query, best_args = line[0], line[1]
-#             print(query)
-#             print(best_args)
-#             print_argument_texts(best_args, CSV_PATH, print_full_texts=True)
-#             print('*' * 60)
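
Note: the scoring pipeline this commit leaves in place normalizes each score column to its maximum (sklearn's normalize with norm='max', axis=0) and then mixes the two signals linearly, as combine_scores/get_top_args do with alpha. A small sketch under assumed shapes (rows = arguments, columns = queries) and the default alpha=0.5; the toy numbers are made up:

import numpy as np
from sklearn.preprocessing import normalize

bm25_scores = np.array([[4.0, 10.0],
                        [2.0,  5.0]])
desim_scores = np.array([[0.8, 0.2],
                         [0.4, 0.6]])

# norm='max' divides every column by its column maximum, so the best
# argument per query gets 1.0 on each signal before mixing.
bm25_norm = normalize(bm25_scores, norm='max', axis=0)
desim_norm = normalize(desim_scores, norm='max', axis=0)

alpha = 0.5
final = alpha * bm25_norm + (1 - alpha) * desim_norm

# rank the arguments for the first query, best first
ranking = np.argsort(final[:, 0])[::-1]
print(final[:, 0], ranking)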

argU/indexing/models.py  (+3 -18)

@@ -1,17 +1,14 @@
-
-import csv
-import json
 import os
 import sys
-import rootpath
 import time
-from tqdm import tqdm
 
 import numpy as np
+import rootpath
+from gensim.models import KeyedVectors
 from gensim.models import Word2Vec
 from gensim.models.callbacks import CallbackAny2Vec
-from gensim.models import KeyedVectors
 from scipy.spatial import distance
+from tqdm import tqdm
 
 try:
     sys.path.append(os.path.join(rootpath.detect()))
@@ -75,7 +72,6 @@ def store(self):
 
     @staticmethod
     def load():
-
         print("Load CBOW...")
         tick = time.time()
         cbow = CBOW()
@@ -136,14 +132,6 @@ def arg_to_emb(self, arg_train, model_type='in'):
                 unk += 1
 
         vec = np.sum(emb_matrix, axis=0) / (emb_matrix.shape[0])
-
-        # print(wv.most_similar(positive=[vec], topn=10))
-        # print(f'Unk. count: {unk}')
-
-        # for (w, s) in wv.most_similar(positive=[vec], topn=10):
-        #     diff_words.add(w)
-        # print(diff_words)
-        # print(len(diff_words))
         return vec
 
     def queries_to_emb(self, queries, model_type='in'):
@@ -174,9 +162,6 @@ def queries_to_emb(self, queries, model_type='in'):
             f'[{query.id}] {query.text} {emb_matrix.shape} -> '
             f'{unk} of {len(query.text.split())} words unknown'
         ))
-        # for i, emb in enumerate(emb_matrix):
-        #     most_sim = model.wv.most_similar(positive=[emb], topn=4)
-        #     print(f'\t{terms[i]} -> {most_sim}')
         print()
         unk_all += unk
         query_embs.append(emb_matrix)
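
Note: the line kept in arg_to_emb above builds one argument embedding as the mean of its word vectors, vec = np.sum(emb_matrix, axis=0) / emb_matrix.shape[0]. A standalone sketch with a made-up 3-word by 4-dimension matrix (real rows come from the Word2Vec models imported in this file):

import numpy as np

emb_matrix = np.array([[1.0, 0.0, 2.0, 0.0],
                       [0.0, 2.0, 2.0, 0.0],
                       [2.0, 1.0, 2.0, 3.0]])

vec = np.sum(emb_matrix, axis=0) / emb_matrix.shape[0]
print(vec)  # [1. 1. 2. 1.], identical to emb_matrix.mean(axis=0)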
