Commit 0e84319

added some fine-tuning to most recent PR by Tom Aarsen

davidberenstein1957 committed Nov 10, 2022
1 parent f0e7db5 commit 0e84319
Showing 2 changed files with 47 additions and 17 deletions.
62 changes: 46 additions & 16 deletions concise_concepts/conceptualizer/Conceptualizer.py
@@ -82,10 +82,12 @@ def __init__(
                 "PART",
                 "PRON",
             ]
-        self.match_rule["POS"] = {"NOT_IN": exclude_pos}
+        if exclude_pos:
+            self.match_rule["POS"] = {"NOT_IN": exclude_pos}
         if exclude_dep is None:
             exclude_dep = []
-        self.match_rule["DEP"] = {"NOT_IN": exclude_dep}
+        if exclude_dep:
+            self.match_rule["DEP"] = {"NOT_IN": exclude_dep}
         self.json_path = json_path
         self.check_validity_path()
         self.include_compound_words = include_compound_words
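Note: the two guards added above change the behaviour for empty exclusion lists. Previously an empty exclude_pos or exclude_dep still wrote a vacuous {"NOT_IN": []} constraint into the match rule; now the constraint is only added when there is something to exclude. A minimal standalone sketch, with match_rule standing in for the instance attribute:

    match_rule = {}
    exclude_dep = []  # user excluded nothing

    # Old behaviour: always added an empty, vacuous constraint.
    # match_rule["DEP"] = {"NOT_IN": []}

    # New behaviour: only add the constraint when the list is non-empty.
    if exclude_dep:
        match_rule["DEP"] = {"NOT_IN": exclude_dep}

    print(match_rule)  # {} -- no empty NOT_IN clause ends up in the spaCy pattern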
@@ -102,7 +104,7 @@ def __init__(
         self.run()
         self.data_upper = {k.upper(): v for k, v in data.items()}

-    def run(self):
+    def run(self) -> None:
         self.check_validity_path()
         self.determine_topn()
         self.set_gensim_model()
@@ -119,7 +121,11 @@ def run(self):
         if not self.ent_score:
             del self.kv

-    def check_validity_path(self):
+    def check_validity_path(self) -> None:
+        """
+        If the path is a file, create the parent directory if it doesn't exist. If the path is a directory,
+        create the directory and set the path to the default file name.
+        """
         if self.json_path:
             if Path(self.json_path).suffix:
                 Path(self.json_path).parents[0].mkdir(parents=True, exist_ok=True)
@@ -132,7 +138,7 @@ def check_validity_path(self):
                 f" ´json_path´to {self.json_path}"
             )

-    def determine_topn(self):
+    def determine_topn(self) -> None:
         """
         If the user doesn't specify a topn value for each class,
         then the topn value for each class is set to 100
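As the docstring above says, per-class topn values default to 100 when none are supplied. A hedged sketch of that defaulting logic (the method body is collapsed in this diff; the assert message and the zip follow the context lines visible in the next hunk):

    data = {"fruit": ["apple"], "vegetable": ["carrot"]}
    topn = []  # no per-class values supplied

    if not topn:
        topn_dict = {key: 100 for key in data}
    else:
        num_classes = len(data)
        assert len(topn) == num_classes, (
            f"Provide a topn integer for each of the {num_classes} classes."
        )
        topn_dict = dict(zip(data, topn))

    print(topn_dict)  # {'fruit': 100, 'vegetable': 100}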
@@ -146,7 +152,7 @@ def determine_topn(self):
             ), f"Provide a topn integer for each of the {num_classes} classes."
             self.topn_dict = dict(zip(self.data, self.topn))

-    def set_gensim_model(self):
+    def set_gensim_model(self) -> None:
         """
         If the model_path is not None, then we try to load the model from the path.
         If it's not a valid path, then we raise an exception.
@@ -186,7 +192,7 @@ def set_gensim_model(self):

         self.kv.add_vectors(wordList, vectorList)

-    def verify_data(self, verbose: bool = True):
+    def verify_data(self, verbose: bool = True) -> None:
         """
         It takes a dictionary of lists of words, and returns a dictionary of lists of words,
         where each word in the list is present in the word2vec model
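The self.kv.add_vectors(wordList, vectorList) call in the hunk above uses gensim's public KeyedVectors API to register extra vectors. A small self-contained sketch of that API (toy 3-d random vectors, purely for illustration):

    import numpy as np
    from gensim.models import KeyedVectors

    # An empty KeyedVectors store with 3-dimensional vectors.
    kv = KeyedVectors(vector_size=3)
    word_list = ["apple", "fruit"]
    vector_list = [np.random.rand(3).astype(np.float32) for _ in word_list]

    # Register the new keys and their vectors, then look one up.
    kv.add_vectors(word_list, vector_list)
    print(kv["apple"])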
@@ -222,7 +228,7 @@ def verify_data(self, verbose: bool = True):
                 raise Exception(msg)
         self.data = deepcopy(verified_data)

-    def expand_concepts(self):
+    def expand_concepts(self) -> None:
         """
         For each key in the data dictionary, find the topn most similar words to the key and the values in the data
         dictionary, and add those words to the values in the data dictionary
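The expansion described in the docstring above leans on gensim's most_similar, which returns (word, similarity) pairs; the set comprehension visible in the next hunk keeps only the words. A hedged sketch with a small pretrained model (the model name here is an example choice, not what the library ships with):

    import gensim.downloader as api

    kv = api.load("glove-wiki-gigaword-50")
    concept, seeds = "fruit", ["apple", "pear"]

    # Most similar words to the concept label plus its seed words.
    similar = kv.most_similar(positive=[concept] + seeds, topn=5)
    expanded = {word for word, _ratio in similar}
    print(expanded)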
@@ -244,7 +250,7 @@ def expand_concepts(self):
                 {self.check_presence_vocab(word) for word, _ratio in similar}
             )

-    def resolve_overlapping_concepts(self):
+    def resolve_overlapping_concepts(self) -> None:
         """
         It removes words from the data that are in other concepts, and then removes words that are not closest to the
         centroid of the concept
@@ -256,7 +262,7 @@ def resolve_overlapping_concepts(self):
                 if key == self.kv.most_similar_to_given(word, list(self.data.keys()))
             ]

-    def infer_original_data(self):
+    def infer_original_data(self) -> None:
         """
         It takes the original data and adds the new data to it, then removes the new data from the original data.
         """
@@ -272,7 +278,7 @@ def infer_original_data(self):
                 if word not in self.original_data[key_y]
             ]

-    def lemmatize_concepts(self):
+    def lemmatize_concepts(self) -> None:
         """
         For each key in the data dictionary,
         the function takes the list of concepts associated with that key, and lemmatizes
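The lemmatization pattern (visible in the next hunk's context lines) pipes each concept string through spaCy and takes the lemma of its first token. A small sketch, assuming an installed "en_core_web_sm" model:

    import spacy

    nlp = spacy.load("en_core_web_sm")
    concepts = ["apples", "cars"]

    # doc[0].lemma_ is the lemma of the first token of each piped concept.
    lemmas = set(doc[0].lemma_ for doc in nlp.pipe(concepts))
    print(lemmas)  # {'apple', 'car'}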
@@ -283,7 +289,7 @@ def lemmatize_concepts(self):
                 set([doc[0].lemma_ for doc in self.nlp.pipe(self.data[key])])
             )

-    def create_conceptual_patterns(self):
+    def create_conceptual_patterns(self) -> None:
         """
         For each key in the data dictionary,
         create a pattern for each word in the list of words associated with that key.
@@ -311,7 +317,13 @@ def create_conceptual_patterns(self):
         """
         patterns = []

-        def add_patterns(input_dict):
+        def add_patterns(input_dict: dict) -> None:
+            """
+            It creates a list of dictionaries that can be used for a spaCy entity ruler.
+            :param input_dict: a dictionary
+            :type input_dict: dict
+            """
             for key in input_dict:
                 if self.match_key == "LEMMA":
                     words = [
@@ -383,7 +395,7 @@ def add_patterns(input_dict):
         self.ruler = self.nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
         self.ruler.add_patterns(patterns)

-    def __call__(self, doc: Doc):
+    def __call__(self, doc: Doc) -> Doc:
         """
         It takes a doc object and assigns a score to each entity in the doc object
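The add_pipe / add_patterns lines in the hunk above are spaCy's standard entity-ruler wiring, with overwrite_ents letting ruler matches replace existing entities. A runnable sketch outside the class (the FRUIT pattern is an invented example):

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})
    ruler.add_patterns([{"label": "FRUIT", "pattern": [{"LOWER": "apple"}]}])

    doc = nlp("I ate an apple.")
    print([(ent.text, ent.label_) for ent in doc.ents])  # [('apple', 'FRUIT')]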
@@ -394,7 +406,7 @@ def __call__(self, doc: Doc):
         doc = self.assign_score_to_entities(doc)
         return doc

-    def pipe(self, stream, batch_size=128):
+    def pipe(self, stream, batch_size=128) -> Doc:
         """
         It takes a stream of documents, and for each document,
         it assigns a score to each entity in the document
@@ -408,7 +420,7 @@ def pipe(self, stream, batch_size=128):
             doc = self.assign_score_to_entities(doc)
             yield doc

-    def assign_score_to_entities(self, doc: Doc):
+    def assign_score_to_entities(self, doc: Doc) -> Doc:
         """
         The function takes a spaCy document as input and assigns a score to each entity in the document. The score is
         calculated using the word embeddings of the entity and the concept.
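The scoring implementation itself is collapsed in this diff. Based on the docstring, one plausible reading uses gensim similarity between the entity's tokens and the concept's tokens, along these lines (model name and token choice are assumptions):

    import gensim.downloader as api

    kv = api.load("glove-wiki-gigaword-50")
    entity_tokens = ["fresh", "apple"]
    concept_tokens = ["fruit"]

    # Cosine similarity between the mean vectors of the two token sets.
    print(kv.n_similarity(entity_tokens, concept_tokens))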
@@ -460,6 +472,14 @@ def assign_score_to_entities(self, doc: Doc):
         return doc

     def _check_presence_vocab(self, word: str) -> str:
+        """
+        If the word is in the vocabulary, return the word. If not, replace spaces and dashes with the word
+        delimiter and check whether the new word is in the vocabulary; if so, return the new word.
+        :param word: the word to check
+        :type word: str
+        :return: the word itself, or its delimiter-joined variant
+        """
         if word in self.kv:
             return word
         for op in [" ", "-"]:
@@ -468,6 +488,16 @@ def _check_presence_vocab(self, word: str) -> str:
                 return check_word

     def check_presence_vocab(self, word: str) -> str:
+        """
+        If the word is not lowercase and the case_sensitive flag is set to False, check whether the lowercase
+        version of the word is in the vocabulary. If it is, return the lowercase version; otherwise fall back to
+        checking the word itself.
+        :param word: the word to check for presence in the vocabulary
+        :type word: str
+        :return: the matching vocabulary entry, if any
+        """
         if not word.islower() and not self.case_sensitive:
             present_word = self._check_presence_vocab(word.lower())
             if present_word:
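Taken together, the two helpers above implement a vocabulary lookup with fallbacks. A hedged, self-contained sketch as standalone functions (kv is any gensim KeyedVectors; word_delimiter defaulting to "_" is an assumption):

    def _check_presence_vocab(kv, word, word_delimiter="_"):
        # Try the word as-is, then with spaces/dashes replaced by the delimiter.
        if word in kv:
            return word
        for op in [" ", "-"]:
            check_word = word.replace(op, word_delimiter)
            if check_word in kv:
                return check_word

    def check_presence_vocab(kv, word, case_sensitive=False):
        # Prefer a lowercase match when case-insensitive, then fall back.
        if not word.islower() and not case_sensitive:
            present_word = _check_presence_vocab(kv, word.lower())
            if present_word:
                return present_word
        return _check_presence_vocab(kv, word)

    # Usage with any loaded KeyedVectors instance `kv`:
    # check_presence_vocab(kv, "New York")  ->  "new_york", if that key exists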
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "concise-concepts"
-version = "0.6.3"
+version = "0.7"
 description = "This repository contains an easy and intuitive approach to few-shot NER using most similar expansion over spaCy embeddings. Now with entity confidence scores!"
 authors = ["David Berenstein <[email protected]>"]
 license = "MIT"