Commit

added support for standalone usage

davidberenstein1957 committed Jan 12, 2023
1 parent ed5b74a commit 4610fcd
Showing 6 changed files with 922 additions and 651 deletions.
29 changes: 29 additions & 0 deletions README.md
@@ -29,6 +29,8 @@ pip install concise-concepts

# Quickstart

## Spacy Pipeline Component

```python
import spacy
from spacy import displacy
@@ -80,6 +82,33 @@ displacy.render(doc, style="ent", options=options)
```
![](https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png)

## Standalone

This can be useful when iterating over few-shot training data without having to continuously reload larger models.

```python
import gensim.downloader
import spacy

from concise_concepts import Conceptualizer

model = gensim.downloader.load("fasttext-wiki-news-subwords-300")
nlp = spacy.load("en_core_web_sm")
data = {
"disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"],
"symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"],
}
conceptualizer = Conceptualizer(nlp, data, model)
conceptualizer.nlp("I have a headache and a fever.").ents

# the same model and spaCy pipeline can be reused while iterating on the concept data
data = {
"disease": ["cancer", "diabetes"],
"symptom": ["headache", "fever"],
}
conceptualizer = Conceptualizer(nlp, data, model)
conceptualizer.nlp("I have a headache and a fever.").ents
```
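
Given the changes to `__call__` and `pipe` further down in this commit, a standalone `Conceptualizer` should also accept raw strings directly; a minimal sketch, reusing the `conceptualizer` from the example above:

```python
# Sketch only: relies on the updated __call__/pipe handling in this commit,
# which converts raw strings to Doc objects through the wrapped spaCy pipeline.
doc = conceptualizer("I have a headache and a fever.")
print([(ent.text, ent.label_) for ent in doc.ents])

# pipe() likewise appears to accept a plain iterable of texts and yields processed Docs.
texts = ["I was diagnosed with diabetes.", "The fever and cough would not stop."]
for doc in conceptualizer.pipe(texts, batch_size=32):
    print([(ent.text, ent.label_) for ent in doc.ents])
```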

# Features
## Matching Pattern Rules
A general introduction to the usage of matching patterns is given in the [usage section](#usage).
12 changes: 6 additions & 6 deletions concise_concepts/__init__.py
@@ -12,8 +12,8 @@
"concise_concepts",
default_config={
"data": None,
"topn": [],
"model": None,
"topn": None,
"model_path": None,
"word_delimiter": "_",
"ent_score": False,
"exclude_pos": [
@@ -39,8 +39,8 @@ def make_concise_concepts(
nlp: Language,
name: str,
data: Union[dict, list],
topn: list,
model: Union[str, FastText, Word2Vec, KeyedVectors],
topn: Union[list, None],
model_path: Union[str, FastText, Word2Vec, KeyedVectors, None],
word_delimiter: str,
ent_score: bool,
exclude_pos: List[str],
@@ -52,10 +52,9 @@
):
return Conceptualizer(
nlp=nlp,
name=name,
data=data,
topn=topn,
model=model,
model=model_path,
word_delimiter=word_delimiter,
ent_score=ent_score,
exclude_pos=exclude_pos,
@@ -64,4 +63,5 @@
case_sensitive=case_sensitive,
json_path=json_path,
verbose=verbose,
name=name,
)
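
With the renamed factory arguments above, pipeline-based usage would pass `model_path` (and optionally `topn`) through the component config; a minimal sketch, assuming a gensim model name or path is accepted as a string (the model name below is only illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(
    "concise_concepts",
    config={
        "data": {
            "disease": ["cancer", "diabetes"],
            "symptom": ["headache", "fever"],
        },
        # one topn value per concept key; leaving it as None defaults to 100 per class
        "topn": [75, 75],
        # renamed from "model" in this commit
        "model_path": "glove-wiki-gigaword-50",
    },
)
doc = nlp("I have a headache and a fever.")
print(doc.ents)
```
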
70 changes: 46 additions & 24 deletions concise_concepts/conceptualizer/Conceptualizer.py
@@ -2,6 +2,7 @@
import json
import logging
import re
import types
from copy import deepcopy
from pathlib import Path
from typing import List, Union
@@ -19,10 +20,9 @@ class Conceptualizer:
def __init__(
self,
nlp: Language,
name: str,
data: dict,
topn: list = None,
data: dict = {},
model: Union[str, FastText, KeyedVectors, Word2Vec] = None,
topn: list = None,
word_delimiter: str = "_",
ent_score: bool = False,
exclude_pos: list = None,
@@ -31,6 +31,7 @@ def __init__(
case_sensitive: bool = False,
json_path: str = "./matching_patterns.json",
verbose: bool = True,
name: str = "concise_concepts",
):
"""
The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and
@@ -55,6 +56,7 @@ def __init__(
if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional)
:param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional)
"""
assert data, ValueError("You must provide a dictionary of words to match")
self.verbose = verbose
self.log_cache = {"key": list(), "word": list(), "word_key": list()}
if Span.has_extension("ent_score"):
@@ -70,6 +72,34 @@ def __init__(
self.topn = topn
self.model = model
self.match_rule = {}
self.set_exclude_pos(exclude_pos)
self.set_exclude_dep(exclude_dep)
self.json_path = json_path
self.include_compound_words = include_compound_words
self.case_sensitive = case_sensitive
self.word_delimiter = word_delimiter
if "lemmatizer" not in self.nlp.component_names:
logger.warning(
"No lemmatizer found in spacy pipeline. Consider adding it for matching"
" on LEMMA instead of exact text."
)
self.match_key = "ORTH"
else:
self.match_key = "LEMMA"
if "entity_ruler" in self.nlp.component_names:
logger.warning(
"Entity Ruler already exists in the pipeline. Removing old rulers"
)
self.nlp.remove_pipe("entity_ruler")
self.run()

def set_exclude_dep(self, exclude_dep: list):
if exclude_dep is None:
exclude_dep = []
if exclude_dep:
self.match_rule["DEP"] = {"NOT_IN": exclude_dep}

def set_exclude_pos(self, exclude_pos: list):
if exclude_pos is None:
exclude_pos = [
"VERB",
@@ -85,24 +115,6 @@ def __init__(
]
if exclude_pos:
self.match_rule["POS"] = {"NOT_IN": exclude_pos}
if exclude_dep is None:
exclude_dep = []
if exclude_dep:
self.match_rule["DEP"] = {"NOT_IN": exclude_dep}
self.json_path = json_path
self.check_validity_path()
self.include_compound_words = include_compound_words
self.case_sensitive = case_sensitive
self.word_delimiter = word_delimiter
if "lemmatizer" not in self.nlp.component_names:
logger.warning(
"No lemmatizer found in spacy pipeline. Consider adding it for matching"
" on LEMMA instead of exact text."
)
self.match_key = "ORTH"
else:
self.match_key = "LEMMA"
self.run()

def run(self) -> None:
self.check_validity_path()
@@ -146,7 +158,7 @@ def determine_topn(self) -> None:
If the user doesn't specify a topn value for each class,
then the topn value for each class is set to 100
"""
if not self.topn:
if self.topn is None:
self.topn_dict = {key: 100 for key in self.data}
else:
num_classes = len(self.data)
@@ -413,8 +425,12 @@ def __call__(self, doc: Doc) -> Doc:
:param doc: Doc
:type doc: Doc
"""
if self.ent_score:
doc = self.assign_score_to_entities(doc)
if isinstance(doc, str):
doc = self.nlp(doc)
elif isinstance(doc, Doc):
if self.ent_score:
doc = self.assign_score_to_entities(doc)

return doc

def pipe(self, stream, batch_size=128) -> Doc:
@@ -425,6 +441,12 @@ def pipe(self, stream, batch_size=128) -> Doc:
:param stream: a generator of documents
:param batch_size: The number of documents to be processed at a time, defaults to 128 (optional)
"""
if isinstance(stream, str):
stream = [stream]

if not isinstance(stream, types.GeneratorType):
stream = self.nlp.pipe(stream, batch_size=batch_size)

for docs in util.minibatch(stream, size=batch_size):
for doc in docs:
if self.ent_score:

0 comments on commit 4610fcd
