Skip to content

Commit

Permalink
Corrections + README.md config_files
Browse files Browse the repository at this point in the history
  • Loading branch information
LucasPages committed May 11, 2022
1 parent 4f192ac commit 64df124
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 2 deletions.
3 changes: 3 additions & 0 deletions augment_mentions.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ def write_mentions(writing_q, process_q, num_processes, number_documents):
for index in range(num_processes):
process_q.put("exit")

collection_write.drop_index("article_id_1_sent_index_1")
collection_write.create_index([("article_id", 1), ("sent_index", 1)])


if __name__ == "__main__":
client = pymongo.MongoClient("localhost", 27017)
Expand Down
5 changes: 5 additions & 0 deletions config_files/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
These two files are configuration files for the extraction process that creates DataNER.
They serve as parameters for the `extract_collection.py` script.

* `anchors.yaml`: extracts only the anchors from the MongoDB collections.
* `dataner_config.yaml`: extracts all entities that have been created through the augmentation process. You must run `augment_mentions.py` before using this one.
3 changes: 1 addition & 2 deletions extract_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def process_article(process_q, writing_q, config_p):
db_w = client_w.wikipedia

collection_tokens = db_w["dump_tokens"]
coll_w = db_w[args.collection]
coll_w = db_w["dump_mentions"]

while True:
article_id_p = process_q.get()
Expand Down Expand Up @@ -159,7 +159,6 @@ def write_article(process_q, writing_q, number_processes, number_documents):
pool = multiprocessing.Pool(processes=num_workers - 1, initializer=process_article, initargs=(processing_queue,
writing_queue,
config))

writing_process = multiprocessing.Process(target=write_article,
args=(processing_queue, writing_queue, num_workers,
doc_count))
Expand Down

0 comments on commit 64df124

Please sign in to comment.