luckyos-code
diff --git a/argU/__main__.py
+4-45 b/argU/__main__.py
+4-45
diff --git a/argU/indexing/a2v.py
+3-24 b/argU/indexing/a2v.py
+3-24
diff --git a/argU/indexing/index.py
+8-123 b/argU/indexing/index.py
+8-123
diff --git a/argU/indexing/models.py
+3-18 b/argU/indexing/models.py
+3-18
@@ -1,8 +1,7 @@
-
 import argparse
 import os
 import sys
-import csv
+
 import rootpath
 
 try:
@@ -56,7 +55,6 @@
 argparsed = parser.parse_args()
 print(f"Args: {argparsed}")
 
-
 setup.assert_file_exists(setup.CBOW_PATH)
 
 db = load_db()
@@ -114,14 +112,13 @@
                 )
 
         merged_args.sort(key=lambda x: x[1], reverse=True)
-        # merged_args = merged_args[:20]
 
         if argparsed.sentiments != 'no':
             merged_args_with_sents = []
             for ma in merged_args:
                 dph, sent = ma[1], ma[2]
                 if argparsed.sentiments == 'emotional':
-                    dph = dph + dph * (abs(sent) / 2)
+                    dph = dph + dph * (abs(sent))
                 elif argparsed.sentiments == 'neutral':
                     dph = dph - dph * (abs(sent) / 2)
                 merged_args_with_sents.append(
@@ -131,16 +128,7 @@
             merged_args.sort(key=lambda x: x[1], reverse=True)
 
         print(f'### {query_id} {desm_scores["query_text"]}')
-        # print('---')
-        # arguments.fancy_print(
-        #     coll_args,
-        #     merged_args_list[:20],
-        #     trans_dict=trans_dict,
-        #     arg_len=2000,
-        # )
-
-        # Sentiment Analysis
-        
+
         if len(merged_args) != 0:
             output_dict[query_id] = merged_args
         else:
@@ -163,36 +151,7 @@
                     trans_id = coll_trans.find_one({'_id': arg_id})['arg_id']
                 except:
                     trans_id = arg_id
-              
+
                 f_out.write(' '.join([
                     str(id), 'Q0', trans_id, str(i + 1), str(score), method, '\n'
                 ]))
-
-
-# for i, res in enumerate(coll_res.find()):
-#     if i == 3:
-#         break
-#     args = arguments.find(coll_args, res['args'])
-#     print(res['query_text'])
-#     print('=' * 40)
-#     for a in args:
-#         print('> ', Argument.get_text(a)[:200])
-#     print()
-
-    # if args.mode == 'retrieve' or args.mode == 'collect':
-
-    #     # Speichere Argumente in dem passenden Output Format
-    #     queries_args = scores.evaluate(threshold=0.5)
-
-    #     with open(setup.RUN_PATH, 'w') as f_out:
-    #         for (query_id, query_text, args) in queries_args:
-    #             for i, arg in enumerate(args):
-    #                 f_out.write(' '.join([
-    #                     query_id,
-    #                     'Q0',
-    #                     arg[0],
-    #                     str(i + 1),
-    #                     str(arg[1]),
-    #                     method,
-    #                     '\n',
-    #                 ]))
@@ -1,9 +1,8 @@
 import argparse
 import os
-import rootpath
 import sys
-from tqdm import tqdm
-import numpy as np
+
+import rootpath
 
 try:
     sys.path.append(os.path.join(rootpath.detect()))
@@ -16,7 +15,6 @@
     print(e)
     sys.exit(0)
 
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -45,32 +43,13 @@
     print(desm.model_in.wv.most_similar('Trump'))
     print(desm.model_out.most_similar('Trump'))
 
-    # diff_words = set([])
-    # for i in range(10):
-    #     v = np.random.random_sample((100,))
-    #     for (w, s) in desm.model_out.most_similar(positive=[v], topn=10):
-    #         diff_words.add(w)
-    # print(diff_words)
-    # print(len(diff_words))
-    # sys.exit(0)
-
     embeddings = []
     last_emb = None
 
-    from scipy.spatial.distance import cosine
     for i, arg in enumerate(
-        TrainArgsIterator(coll, full_data=True, max_args=-1)
+            TrainArgsIterator(coll, full_data=True, max_args=-1)
     ):
         arg_emb = desm.arg_to_emb(arg, model_type='out')
-
-        # if last_emb is not None:
-        # print()
-        # print('\t', 1 - cosine(last_emb, arg_emb))
-        # print()
-        # last_emb = arg_emb
-
-        # print(arg['text'][:200])
-        # print()
         embeddings.append({
             '_id': arg['_id'],
             'emb': arg_emb.tolist(),
 
@@ -1,20 +1,15 @@
-# Index CSV
-# Die CSV enthält alle Infos für BM25
-# Und die Vektor-Embeddings aller Argumente
-
-import argparse
 import os
-import rootpath
-import sys
-import json
 import csv
-import warnings
+import json
+import os
+import sys
 import traceback
-import math
+import warnings
+
 import numpy as np
+import rootpath
+from sklearn.preprocessing import normalize
 from tqdm import tqdm
-from sklearn import preprocessing
-from sklearn.preprocessing import MinMaxScaler, normalize
 
 warnings.filterwarnings("error")
 
@@ -79,10 +74,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
             zweiter array 2 dim. Dualsim scores 
     """
 
-    # love titties 4 eva
-    # booty squad <333333333
-    # ariane staudte <333333
-
     dual_embedding = DualEmbedding(cbow_model)
     processed_queries = dual_embedding.get_processed_queries(queries)
 
@@ -112,7 +103,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
                 bm25_queries_scores = []
 
                 for query_terms, query_matrix in processed_queries:
-
                     desim_query_score = dual_embedding.desim(
                         query_matrix, arg_emb
                     )
@@ -135,8 +125,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
             if i + 1 == max_args:
                 break
 
-        # bm25_scores = np.transpose(np.array(bm25_scores))
-        # desim_scores = np.transpose(np.array(desim_scores))
         bm25_scores = np.array(bm25_scores)
         desim_scores = np.array(desim_scores)
         arg_ids = np.asarray(arg_ids)
@@ -152,14 +140,6 @@ def collect_arguments(queries, cbow_model, bm25_model, max_args=-1):
         bm25_norm = normalize(bm25_scores, norm='max', axis=0)
         desim_norm = normalize(desim_scores, norm='max', axis=0)
 
-        # bm25_norm = np.linalg.norm(bm25_scores, axis=0)
-        # bm25_norm[bm25_norm == 0] = 0.0001
-        # desim_norm = np.linalg.norm(desim_scores, axis=0)
-        # desim_norm[desim_norm == 0] = 0.0001
-
-        # bm25_scores = np.transpose(bm25_scores / bm25_norm)
-        # desim_scores = np.transpose(desim_scores / desim_norm)
-
         print(f'norm Min BM25: {bm25_norm.min()}')
         print(f'norm Max BM25: {bm25_norm.max()}')
         print(f'norm Min Desim: {desim_norm.min()}')
@@ -183,7 +163,6 @@ def combine_scores(bm25_scores, desim_scores, alpha):
 
 
 def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
-
     top_args_list = []
     final_scores, influences = combine_scores(bm25_scores, desim_scores, alpha)
 
@@ -192,8 +171,6 @@ def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
         f"--------------------\n\n"
         f"BM25 > Desim: {influences.count(True)} Mal.\n"
         f"Desim > BM25: {influences.count(False)} Mal.\n\n"
-        # f"Desim und BM25 sollten standardmäßig ähnlich sein (Verhältnis = 1.0)\n"
-        # f"Verhältnis: {abs(influences.count(True) / influences.count(False))}\n\n"
     ))
 
     for bs, ds, fs in zip(bm25_scores, desim_scores, final_scores):
@@ -212,7 +189,7 @@ def get_top_args(arg_ids, bm25_scores, desim_scores, alpha=0.5, top_n=10):
 
 def get_sentiments(top_args):
     with open(
-        setup.SENTIMENTS_PATH, 'r', newline='', encoding='utf-8'
+            setup.SENTIMENTS_PATH, 'r', newline='', encoding='utf-8'
     ) as f_in:
         reader = csv.reader(f_in, **setup.SENTIMENTS_CONFIG)
         header = next(reader)
@@ -235,95 +212,3 @@ def get_sentiments(top_args):
         )
 
     return query_sentiments
-
-    # results = []
-    # for arg_ids, query_sents in zip(top_args, query_sentiments):
-    #     abs_sent = [abs(i) for i in query_sents[0]]
-    #     pairings = [(i, p[0], p[1])
-    #                 for i, p in enumerate(zip(arg_ids[1], abs_sent))]
-    #     pairings = sorted(pairings, key=lambda x: (x[2], x[1]), reverse=True)
-    #     indices_order = [i[0] for i in pairings]
-    #     new_arg_ids = [arg_ids[0][i] for i in indices_order]
-    #     new_arg_scores = [(p[1], p[2]) for p in pairings]
-    #     results.append(
-    #         (new_arg_ids, new_arg_scores)
-    #     )
-    # return results
-
-# if args.mode != 'read':
-#     cbow = CBOW.load(CBOW_MODEL_PATH)
-#     bm25_manager = BM25Manager.load(BM25_PATH)
-
-# if args.mode == 'train':
-#     create_index(INDEX_PATH, TRAIN_PATH, cbow.model, bm25_manager.index)
-
-# elif args.mode == 'load':
-
-#     queries = [
-#         'Donald Trump is bad',
-#         'pregnant women abortion stop',
-#     ]
-
-#     bm25_scores, desim_scores, arg_ids = analyze_query(
-#         queries,
-#         INDEX_PATH,
-#         cbow.model,
-#         bm25_manager.index,
-#         max_args=-1
-#     )
-
-#     combined_scores = []
-#     alpha = 0.5
-
-#     for bm25_query_scores, desim_query_scores in zip(bm25_scores, desim_scores):
-#         combined_query_scores = []
-
-#         for b, d in zip(bm25_query_scores, desim_query_scores):
-#             combined_query_scores.append(alpha * b + (1 - alpha) * d)
-
-#         combined_scores.append(combined_query_scores)
-
-#     combined_scores = np.asarray(combined_scores)
-#     top_n = 10
-#     results = []
-
-#     for cs in combined_scores:
-#         best_n = np.argsort(cs)[::-1][:top_n]
-#         top_args = arg_ids[best_n]
-#         top_scores = cs[best_n]
-
-#         results.append((top_args, top_scores))
-
-#     if not os.path.isfile(RESULT_LOG_PATH):
-#         result_log_header = ['query', 'top_args', 'scores', 'alpha']
-#         with open(RESULT_LOG_PATH, 'w', newline='', encoding='utf-8') as f_out:
-#             writer = csv.writer(f_out, delimiter='|', quotechar='"',
-#                                 quoting=csv.QUOTE_MINIMAL)
-#             writer.writerow(result_log_header)
-
-#     with open(RESULT_LOG_PATH, 'a', newline='', encoding='utf-8') as f_out:
-#         writer = csv.writer(f_out, delimiter='|',
-#                             quotechar='"', quoting=csv.QUOTE_MINIMAL)
-
-#         for q, res in zip(queries, results):
-
-#             row = [
-#                 q,
-#                 res[0].tolist(),
-#                 res[1].tolist(),
-#                 alpha,
-#             ]
-#             writer.writerow(row)
-
-# elif args.mode == 'read':
-#     with open(RESULT_LOG_PATH, 'r', newline='', encoding='utf-8') as f_in:
-#         reader = csv.reader(f_in, delimiter='|', quotechar='"',
-#                                 quoting=csv.QUOTE_MINIMAL)
-
-#         header = next(reader)
-#         for line in reader:
-#             query, best_args = line[0], line[1]
-#             print(query)
-#             print(best_args)
-#             print_argument_texts(best_args, CSV_PATH, print_full_texts=True)
-#             print('*' * 60)
@@ -1,17 +1,14 @@
-
-import csv
-import json
 import os
 import sys
-import rootpath
 import time
-from tqdm import tqdm
 
 import numpy as np
+import rootpath
+from gensim.models import KeyedVectors
 from gensim.models import Word2Vec
 from gensim.models.callbacks import CallbackAny2Vec
-from gensim.models import KeyedVectors
 from scipy.spatial import distance
+from tqdm import tqdm
 
 try:
     sys.path.append(os.path.join(rootpath.detect()))
@@ -75,7 +72,6 @@ def store(self):
 
     @staticmethod
     def load():
-
         print("Load CBOW...")
         tick = time.time()
         cbow = CBOW()
@@ -136,14 +132,6 @@ def arg_to_emb(self, arg_train, model_type='in'):
                     unk += 1
 
         vec = np.sum(emb_matrix, axis=0) / (emb_matrix.shape[0])
-
-        # print(wv.most_similar(positive=[vec], topn=10))
-        # print(f'Unk. count: {unk}')
-
-        # for (w, s) in wv.most_similar(positive=[vec], topn=10):
-        # diff_words.add(w)
-        # print(diff_words)
-        # print(len(diff_words))
         return vec
 
     def queries_to_emb(self, queries, model_type='in'):
@@ -174,9 +162,6 @@ def queries_to_emb(self, queries, model_type='in'):
                 f'[{query.id}] {query.text}  {emb_matrix.shape} -> '
                 f'{unk} von {len(query.text.split())} Wörtern unbekannt'
             ))
-            # for i, emb in enumerate(emb_matrix):
-            # most_sim = model.wv.most_similar(positive=[emb], topn=4)
-            # print(f'\t{terms[i]} -> {most_sim}')
             print()
             unk_all += unk
             query_embs.append(emb_matrix)