
Commit

Added comments to describe the actions of every script.
LucasPages committed May 11, 2022
1 parent 64df124 commit 819d06f
Showing 9 changed files with 49 additions and 13 deletions.
2 changes: 2 additions & 0 deletions augment_mentions.py
@@ -8,6 +8,8 @@

import entity_utils

"""This script is used to create new named entity mentions in the mentions collections."""

parser = argparse.ArgumentParser()

parser.add_argument("workers", type=int, help="Number of workers to use for processing.", default=8)
29 changes: 16 additions & 13 deletions entity_utils.py
@@ -1,6 +1,14 @@
import random
from nltk.corpus import stopwords

"""
This module contains utility functions meant to handle mentions of named entities in this project.
It contains three classes :
* EntityPair : objects made to compare two mentions to one another
* EntitySet : objects made to deal with a set of mentions
* Mention : objects representing one single named entity mention
"""


class EntityPair:

@@ -37,11 +45,6 @@ def get_anchor(self):
return self.entity2

def choose_from_same_span(self):
# technique_ranking = {"anchor": 0, "title-expansion": 1, "outlinks-sentence": 2, "outlinks-article": 3,
# "alias-sentence": 4, "alias-article": 5, "fusion": 6}
# technique_ranking = ["anchor", "title-expansion", "alias-title", "outlinks-sentence", "alias-sentence",
# "outlinks-article", "alias-article", "consecutive-fusion", "overlap-fusion"]

technique_ranking = ["anchor", "title-expansion", "alias-title", "outlinks", "alias",
"consecutive-fusion", "overlap-fusion"]

@@ -100,7 +103,7 @@ def same_class_and_link(self):
return self.same_link() and self.same_class()

def same_link(self):
return self.entity1["link"] == self.entity2["link"]
return self.entity1["mention_title"] == self.entity2["mention_title"]

def same_class(self):
return self.entity1["ne_class"] == self.entity2["ne_class"]
@@ -186,7 +189,7 @@ def filter_multiclass(self, neckar_coll):
counter = 0

for index, mention_iter in enumerate(self.list):
neckar_info = set([info["neClass"] for info in neckar_coll.find({"en_sitelink": mention_iter["link"]})])
neckar_info = set([info["neClass"] for info in neckar_coll.find({"en_sitelink": mention_iter["mention_title"]})])
if len(neckar_info) <= 1:
new_entities.append(mention_iter)
else:
@@ -326,9 +329,9 @@ def filter_not_winer(self, articles_winer, collection_articles, collection_mentions):

for entity in self.list:
if entity["technique"] == "anchor":
link_to_recover = entity["link"]
link_to_recover = entity["mention_title"]
elif entity["technique"] in ["outlinks", "alias"]:
link_to_recover = collection_mentions.find_one({"_id": entity["origin"]})["link"]
link_to_recover = collection_mentions.find_one({"_id": entity["origin"]})["mention_title"]
else:
filtered_list.append(entity)
continue
@@ -343,21 +346,21 @@ class Mention:

def __init__(self, entity):
self.entity = entity
self.link = entity["link"]
self.mention_title = entity["mention_title"]

def get_outlinks(self, coll_mentions):
new_mentions = []
for entity in coll_mentions.find({"article_title": self.link, "technique": "anchor"}):
for entity in coll_mentions.find({"article_title": self.mention_title, "technique": "anchor"}):
entity["origin"] = self.entity["_id"]
entity["origin_sent"] = self.entity["sent_index"]
new_mentions.append(entity)
return new_mentions

def get_aliases(self, coll_articles):
return [{"alias": alias, "link": self.link, "ne_class": self.entity["ne_class"],
return [{"alias": alias, "mention_title": self.mention_title, "ne_class": self.entity["ne_class"],
"origin": self.entity["_id"], "origin_sent": self.entity["sent_index"],
"origin_technique": self.entity["technique"]}
for alias in coll_articles.find_one({"title": self.link})["aliases"]]
for alias in coll_articles.find_one({"title": self.mention_title})["aliases"]]

def get_technique(self):
return self.entity["technique"]
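A minimal usage sketch of the Mention class after this renaming, run against pymongo collections like the ones produced by process_dump.py; the connection string, database name, and the exact mention document fields are assumptions, not taken from the repository.

# Hypothetical usage sketch of entity_utils.Mention (database and collection
# names are assumptions inferred from this commit).
import pymongo
from entity_utils import Mention

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikipedia"]
coll_mentions = db["dump_mentions"]   # named entity mentions
coll_articles = db["dump_articles"]   # per-article metadata (title, aliases, ...)

# Take one anchor mention and expand it.
entity = coll_mentions.find_one({"technique": "anchor"})
mention = Mention(entity)

# Candidate mentions taken from the outlinks of the article this mention points to.
outlink_mentions = mention.get_outlinks(coll_mentions)

# Candidate mentions built from the WikiData aliases of that article.
alias_mentions = mention.get_aliases(coll_articles)

print(len(outlink_mentions), len(alias_mentions))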
3 changes: 3 additions & 0 deletions extract_collection.py
@@ -6,6 +6,9 @@
import multiprocessing
from tqdm import tqdm

"""This script takes all the information about the text and the mentions and puts them all together to extract an
annotated corpus. It requires a configuration file found in the config_files folder of this repository."""

parser = argparse.ArgumentParser()

parser.add_argument("config", help="Path to the YAML extraction configuration file.")
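The schema of the extraction configuration file is not shown in this commit; a minimal sketch of how such a script could load it with PyYAML (no keys are assumed):

# Hypothetical sketch: loading the YAML extraction configuration.
# The configuration keys are not shown in this diff, so none are assumed here.
import argparse
import yaml

parser = argparse.ArgumentParser()
parser.add_argument("config", help="Path to the YAML extraction configuration file.")
args = parser.parse_args()

with open(args.config, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

print(config)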
3 changes: 3 additions & 0 deletions process_wikipedia_dump.sh
@@ -1,5 +1,8 @@
#! /bin/bash

# This script covers all the processing of the Wikipedia dump, up to (but not including) the augmentation
# process and the extraction of a corpus.

if [ $# != 1 ]
then
echo "Usage : ./process_wikipedia_dump.sh wikipedia_dump_path"
3 changes: 3 additions & 0 deletions wikidata_scripts/create_id_to_title_mapping.py
@@ -1,6 +1,9 @@
import pymongo
from tqdm import tqdm

"""This script creates a mapping between WikiData IDs and Wikipedia article titles. In order to do so, it uses the
WikiData dump MongoDB collection created by the NECKAr tool."""


def upload_mapping(wikidata_item, collection):
wikidata_id = wikidata_item["id"]
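A sketch of what the mapping upload could look like, assuming the NECKAr collection stores documents with "id" and "en_sitelink" fields (the "en_sitelink" field appears elsewhere in this commit; the connection string and collection names are assumptions):

# Hypothetical sketch: building a WikiData-ID -> Wikipedia-title mapping
# from the NECKAr WikiData collection. Collection names are assumptions.
import pymongo
from tqdm import tqdm

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikidata"]
neckar_coll = db["wikidata_entities"]   # assumed: collection created by NECKAr
mapping_coll = db["id_to_title"]        # assumed: target mapping collection

for item in tqdm(neckar_coll.find({"en_sitelink": {"$exists": True}})):
    mapping_coll.update_one(
        {"wikidata_id": item["id"]},
        {"$set": {"wikipedia_title": item["en_sitelink"]}},
        upsert=True,
    )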
3 changes: 3 additions & 0 deletions wikidata_scripts/select_en.sh
@@ -1,5 +1,8 @@
#!/bin/bash

# This script selects the subset of WikiData pages (items) that are linked to English Wikipedia. In this repository,
# we are only interested in this subset because we seek structured information about English Wikipedia articles.

if [ "$#" != 2 ]
then
echo "Usage : ./select_en.sh wikidata_dump output_dump"
7 changes: 7 additions & 0 deletions wikipedia_scripts/add_metadata_to_dump.py
@@ -6,6 +6,13 @@
from tqdm import tqdm


"""This script adds metadata information to the articles of a Wikipedia dump and uploads it into MongoDB.
Said metadata is :
* the corresponding WikiData ID
* its NER class according to NECKAr
* its list of WikiData aliases if it exists
"""

parser = argparse.ArgumentParser()

parser.add_argument("wikipedia_dump", help="Path to the Wikipedia dump to add the metadata to.")
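A sketch of the metadata attachment the docstring describes, assuming an ID-to-title mapping and a NECKAr collection like the ones handled by the other scripts; field and collection names are assumptions inferred from this commit:

# Hypothetical sketch: enriching one Wikipedia article document with the
# metadata listed in the docstring above. Field names are assumptions,
# except "neClass", which appears elsewhere in this commit.
def add_metadata(article, mapping_coll, neckar_coll):
    """Return the article dict enriched with WikiData metadata, when available."""
    mapping = mapping_coll.find_one({"wikipedia_title": article["title"]})
    if mapping is None:
        return article

    article["wikidata_id"] = mapping["wikidata_id"]

    neckar_item = neckar_coll.find_one({"id": mapping["wikidata_id"]})
    if neckar_item is not None:
        article["ne_class"] = neckar_item.get("neClass")
        article["aliases"] = neckar_item.get("aliases", [])
    return article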
4 changes: 4 additions & 0 deletions wikipedia_scripts/extract_wikipedia_articles.sh
@@ -1,5 +1,9 @@
#! /bin/bash

# This script extracts Wikipedia articles from an English Wikipedia dump and removes all articles with no text
# (redirect pages for the most part). You can pass the compressed xml.bz2 dump directly as the argument to this
# script (no need to decompress the file).

if [ $# != 2 ]
then
echo "Usage : ./extract_wikipedia_articles.sh wiki_dump_path output_file_path"
8 changes: 8 additions & 0 deletions wikipedia_scripts/process_dump.py
@@ -7,6 +7,14 @@
import spacy
from urllib.parse import unquote

"""This script takes the Wikipedia dump collection previously uploaded into MongoDB by the 'add_metadata_to_dump.py
script', processes its text and makes it into 3 MongoDB collections :
* dump_articles : metadata about a given article (title, ner class, ...)
* dump_tokens : contains all the sentences of the Wikipedia dump
* dump_mentions : contains all the mentions of named entities of the dump
"""


parser = argparse.ArgumentParser()

parser.add_argument("workers", type=int, help="Number of workers to use for processing articles.")
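A sketch of how the three resulting collections could be read back together; the field names used below (title, article_title, ne_class, sent_index) appear elsewhere in this commit, while the connection string and database name are assumptions:

# Hypothetical sketch: reading back the three collections that
# process_dump.py is described as producing.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["wikipedia"]

article = db["dump_articles"].find_one()   # metadata about one article
sentences = db["dump_tokens"].find({"article_title": article["title"]})
mentions = db["dump_mentions"].find({"article_title": article["title"]})

for mention in mentions:
    # Each mention records its NE class and the sentence it belongs to.
    print(mention["ne_class"], mention["sent_index"])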
