
Commit

Added comments to describe the actions of every script.
LucasPages committed May 11, 2022
1 parent 64df124 commit 819d06f
Showing 9 changed files with 49 additions and 13 deletions.
2 changes: 2 additions & 0 deletions augment_mentions.py
@@ -8,6 +8,8 @@

import entity_utils

"""This script is used to create new named entity mentions in the mentions collections."""

parser = argparse.ArgumentParser()

parser.add_argument("workers", type=int, help="Number of workers to use for processing.", default=8)
29 changes: 16 additions & 13 deletions entity_utils.py
@@ -1,6 +1,14 @@
import random
from nltk.corpus import stopwords

"""
This module contains utility functions meant to handle mentions of named entities in this project.
It contains three classes :
* EntityPair : objects made to compare two mentions to one another
* EntitySet : objects made to deal with a set of mentions
* Mention : objects representing one single named entity mention
"""


class EntityPair:

@@ -37,11 +45,6 @@ def get_anchor(self):
return self.entity2

def choose_from_same_span(self):
# technique_ranking = {"anchor": 0, "title-expansion": 1, "outlinks-sentence": 2, "outlinks-article": 3,
# "alias-sentence": 4, "alias-article": 5, "fusion": 6}
# technique_ranking = ["anchor", "title-expansion", "alias-title", "outlinks-sentence", "alias-sentence",
# "outlinks-article", "alias-article", "consecutive-fusion", "overlap-fusion"]

technique_ranking = ["anchor", "title-expansion", "alias-title", "outlinks", "alias",
"consecutive-fusion", "overlap-fusion"]

@@ -100,7 +103,7 @@ def same_class_and_link(self):
return self.same_link() and self.same_class()

def same_link(self):
return self.entity1["link"] == self.entity2["link"]
return self.entity1["mention_title"] == self.entity2["mention_title"]

def same_class(self):
return self.entity1["ne_class"] == self.entity2["ne_class"]
@@ -186,7 +189,7 @@ def filter_multiclass(self, neckar_coll):
counter = 0

for index, mention_iter in enumerate(self.list):
neckar_info = set([info["neClass"] for info in neckar_coll.find({"en_sitelink": mention_iter["link"]})])
neckar_info = set([info["neClass"] for info in neckar_coll.find({"en_sitelink": mention_iter["mention_title"]})])
if len(neckar_info) <= 1:
new_entities.append(mention_iter)
else:
@@ -326,9 +329,9 @@ def filter_not_winer(self, articles_winer, collection_articles, collection_mentions):

for entity in self.list:
if entity["technique"] == "anchor":
link_to_recover = entity["link"]
link_to_recover = entity["mention_title"]
elif entity["technique"] in ["outlinks", "alias"]:
link_to_recover = collection_mentions.find_one({"_id": entity["origin"]})["link"]
link_to_recover = collection_mentions.find_one({"_id": entity["origin"]})["mention_title"]
else:
filtered_list.append(entity)
continue
@@ -343,21 +346,21 @@ class Mention:

def __init__(self, entity):
self.entity = entity
self.link = entity["link"]
self.mention_title = entity["mention_title"]

def get_outlinks(self, coll_mentions):
new_mentions = []
for entity in coll_mentions.find({"article_title": self.link, "technique": "anchor"}):
for entity in coll_mentions.find({"article_title": self.mention_title, "technique": "anchor"}):
entity["origin"] = self.entity["_id"]
entity["origin_sent"] = self.entity["sent_index"]
new_mentions.append(entity)
return new_mentions

def get_aliases(self, coll_articles):
return [{"alias": alias, "link": self.link, "ne_class": self.entity["ne_class"],
return [{"alias": alias, "mention_title": self.mention_title, "ne_class": self.entity["ne_class"],
"origin": self.entity["_id"], "origin_sent": self.entity["sent_index"],
"origin_technique": self.entity["technique"]}
for alias in coll_articles.find_one({"title": self.link})["aliases"]]
for alias in coll_articles.find_one({"title": self.mention_title})["aliases"]]

def get_technique(self):
return self.entity["technique"]
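A minimal usage sketch of the Mention class after this renaming, run against pymongo collections like the ones produced by process_dump.py; the connection string, database name, and the exact mention document fields are assumptions, not taken from the repository.

# Hypothetical usage sketch of entity_utils.Mention (database and collection
# names are assumptions inferred from this commit).
import pymongo
from entity_utils import Mention

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikipedia"]
coll_mentions = db["dump_mentions"]   # named entity mentions
coll_articles = db["dump_articles"]   # per-article metadata (title, aliases, ...)

# Take one anchor mention and expand it.
entity = coll_mentions.find_one({"technique": "anchor"})
mention = Mention(entity)

# Candidate mentions taken from the outlinks of the article this mention points to.
outlink_mentions = mention.get_outlinks(coll_mentions)

# Candidate mentions built from the WikiData aliases of that article.
alias_mentions = mention.get_aliases(coll_articles)

print(len(outlink_mentions), len(alias_mentions))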
3 changes: 3 additions & 0 deletions extract_collection.py
@@ -6,6 +6,9 @@
import multiprocessing
from tqdm import tqdm

"""This script takes all the information about the text and the mentions and puts them all together to extract an
annotated corpus. It requires a configuration file found in the config_files folder of this repository."""

parser = argparse.ArgumentParser()

parser.add_argument("config", help="Path to the YAML extraction configuration file.")
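The schema of the extraction configuration file is not shown in this commit; a minimal sketch of how such a script could load it with PyYAML (no keys are assumed):

# Hypothetical sketch: loading the YAML extraction configuration.
# The configuration keys are not shown in this diff, so none are assumed here.
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument("config", help="Path to the YAML extraction configuration file.")
args = parser.parse_args()

with open(args.config, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

print(config)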
3 changes: 3 additions & 0 deletions process_wikipedia_dump.sh
@@ -1,5 +1,8 @@
#! /bin/bash

# This script covers all the processing of the Wikipedia dump, up to (but not including) the augmentation
# process and the extraction of a corpus.

if [ $# != 1 ]
then
echo "Usage : ./process_wikipedia_dump.sh wikipedia_dump_path"
3 changes: 3 additions & 0 deletions wikidata_scripts/create_id_to_title_mapping.py
@@ -1,6 +1,9 @@
import pymongo
from tqdm import tqdm

"""This script creates a mapping between WikiData IDs and Wikipedia article titles. In order to do so, it uses the
WikiData dump MongoDB collection created by the NECKAr tool."""


def upload_mapping(wikidata_item, collection):
wikidata_id = wikidata_item["id"]
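A sketch of what the mapping upload could look like, assuming the NECKAr collection stores documents with "id" and "en_sitelink" fields (the "en_sitelink" field appears elsewhere in this commit; the connection string and collection names are assumptions):

# Hypothetical sketch: building a WikiData-ID -> Wikipedia-title mapping
# from the NECKAr WikiData collection. Collection names are assumptions.
import pymongo
from tqdm import tqdm

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikidata"]
neckar_coll = db["wikidata_entities"]   # assumed: collection created by NECKAr
mapping_coll = db["id_to_title"]        # assumed: target mapping collection

for item in tqdm(neckar_coll.find({"en_sitelink": {"$exists": True}})):
    mapping_coll.update_one(
        {"wikidata_id": item["id"]},
        {"$set": {"wikipedia_title": item["en_sitelink"]}},
        upsert=True,
    )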
3 changes: 3 additions & 0 deletions wikidata_scripts/select_en.sh
@@ -1,5 +1,8 @@
#!/bin/bash

# This script selects the subset of WikiData pages (items) that are linked to English Wikipedia. In this repository,
# we are only interested in this subset because we seek structured information about English Wikipedia articles.

if [ "$#" != 2 ]
then
echo "Usage : ./select_en.sh wikidata_dump output_dump"
7 changes: 7 additions & 0 deletions wikipedia_scripts/add_metadata_to_dump.py
@@ -6,6 +6,13 @@
from tqdm import tqdm


"""This script adds metadata information to the articles of a Wikipedia dump and uploads it into MongoDB.
Said metadata is :
* the corresponding WikiData ID
* its NER class according to NECKAr
* its list of WikiData aliases if it exists
"""

parser = argparse.ArgumentParser()

parser.add_argument("wikipedia_dump", help="Path to the Wikipedia dump to add the metadata to.")
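A sketch of the metadata attachment the docstring describes, assuming an ID-to-title mapping and a NECKAr collection like the ones handled by the other scripts; field and collection names are assumptions inferred from this commit:

# Hypothetical sketch: enriching one Wikipedia article document with the
# metadata listed in the docstring above. Field names are assumptions,
# except "neClass", which appears elsewhere in this commit.
def add_metadata(article, mapping_coll, neckar_coll):
    """Return the article dict enriched with WikiData metadata, when available."""
    mapping = mapping_coll.find_one({"wikipedia_title": article["title"]})
    if mapping is None:
        return article

    article["wikidata_id"] = mapping["wikidata_id"]

    neckar_item = neckar_coll.find_one({"id": mapping["wikidata_id"]})
    if neckar_item is not None:
        article["ne_class"] = neckar_item.get("neClass")
        article["aliases"] = neckar_item.get("aliases", [])
    return article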
4 changes: 4 additions & 0 deletions wikipedia_scripts/extract_wikipedia_articles.sh
@@ -1,5 +1,9 @@
#! /bin/bash

# This script extracts Wikipedia articles from an English Wikipedia dump and removes all articles with no text
# (redirect pages for the most part). You can pass the compressed xml.bz2 dump directly as the argument to this
# script (no need to decompress the file).

if [ $# != 2 ]
then
echo "Usage : ./extract_wikipedia_articles.sh wiki_dump_path output_file_path"
8 changes: 8 additions & 0 deletions wikipedia_scripts/process_dump.py
@@ -7,6 +7,14 @@
import spacy
from urllib.parse import unquote

"""This script takes the Wikipedia dump collection previously uploaded into MongoDB by the 'add_metadata_to_dump.py
script', processes its text and makes it into 3 MongoDB collections :
* dump_articles : metadata about a given article (title, ner class, ...)
* dump_tokens : contains all the sentences of the Wikipedia dump
* dump_mentions : contains all the mentions of named entities of the dump
"""


parser = argparse.ArgumentParser()

parser.add_argument("workers", type=int, help="Number of workers to use for processing articles.")
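A sketch of how the three resulting collections could be read back together; the field names used below (title, article_title, ne_class, sent_index) appear elsewhere in this commit, while the connection string and database name are assumptions:

# Hypothetical sketch: reading back the three collections that
# process_dump.py is described as producing.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikipedia"]

article = db["dump_articles"].find_one()   # metadata about one article
sentences = db["dump_tokens"].find({"article_title": article["title"]})
mentions = db["dump_mentions"].find({"article_title": article["title"]})

for mention in mentions:
    # Each mention records its NE class and the sentence it belongs to.
    print(mention["ne_class"], mention["sent_index"])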
