From 64df1246713c3198bae0214fddf6ca428d4205f3 Mon Sep 17 00:00:00 2001 From: LucasPages Date: Tue, 10 May 2022 20:51:12 -0400 Subject: [PATCH] Corrections + README.md config_files --- augment_mentions.py | 3 +++ config_files/README.md | 5 +++++ extract_collection.py | 3 +-- 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 config_files/README.md diff --git a/augment_mentions.py b/augment_mentions.py index fc79acc..d8fd712 100644 --- a/augment_mentions.py +++ b/augment_mentions.py @@ -142,6 +142,9 @@ def write_mentions(writing_q, process_q, num_processes, number_documents): for index in range(num_processes): process_q.put("exit") + collection_write.drop_index("article_id_1_sent_index_1") + collection_write.create_index([("article_id", 1), ("sent_index", 1)]) + if __name__ == "__main__": client = pymongo.MongoClient("localhost", 27017) diff --git a/config_files/README.md b/config_files/README.md new file mode 100644 index 0000000..30d79ea --- /dev/null +++ b/config_files/README.md @@ -0,0 +1,5 @@ +Those two files are configuration files for the extraction process that creates DataNER. +They serve as parameters for the `extract_collection.py` script. + +* anchors.yaml : only extracts anchors from the MongoDB collections +* dataner_config.yaml : extracts all entities that have been created through the augmentation process. You need to have run `augment_mentions.py` to use this one. \ No newline at end of file diff --git a/extract_collection.py b/extract_collection.py index 840f0d3..ba0e8b4 100644 --- a/extract_collection.py +++ b/extract_collection.py @@ -44,7 +44,7 @@ def process_article(process_q, writing_q, config_p): db_w = client_w.wikipedia collection_tokens = db_w["dump_tokens"] - coll_w = db_w[args.collection] + coll_w = db_w["dump_mentions"] while True: article_id_p = process_q.get() @@ -159,7 +159,6 @@ def write_article(process_q, writing_q, number_processes, number_documents): pool = multiprocessing.Pool(processes=num_workers - 1, initializer=process_article, initargs=(processing_queue, writing_queue, config)) - writing_process = multiprocessing.Process(target=write_article, args=(processing_queue, writing_queue, num_workers, doc_count))