Commit

added support for standalone usage

davidberenstein1957 committed Jan 12, 2023
1 parent ed5b74a commit 4610fcd
Showing 6 changed files with 922 additions and 651 deletions.
29 changes: 29 additions & 0 deletions README.md
@@ -29,6 +29,8 @@ pip install concise-concepts

# Quickstart

## Spacy Pipeline Component

```python
import spacy
from spacy import displacy
@@ -80,6 +82,33 @@ displacy.render(doc, style="ent", options=options)
```
![](https://raw.githubusercontent.com/Pandora-Intelligence/concise-concepts/master/img/example.png)

## Standalone

This can be useful when iterating over few-shot training data without having to continuously reload larger models.

```python
import gensim.downloader
import spacy

from concise_concepts import Conceptualizer

model = gensim.downloader.load("fasttext-wiki-news-subwords-300")
nlp = spacy.load("en_core_web_sm")
data = {
"disease": ["cancer", "diabetes", "heart disease", "influenza", "pneumonia"],
"symptom": ["headache", "fever", "cough", "nausea", "vomiting", "diarrhea"],
}
conceptualizer = Conceptualizer(nlp, data, model)
conceptualizer.nlp("I have a headache and a fever.").ents

# the same model and spaCy pipeline can be reused while iterating on the concept data
data = {
"disease": ["cancer", "diabetes"],
"symptom": ["headache", "fever"],
}
conceptualizer = Conceptualizer(nlp, data, model)
conceptualizer.nlp("I have a headache and a fever.").ents
```
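
Given the changes to `__call__` and `pipe` further down in this commit, a standalone `Conceptualizer` should also accept raw strings directly; a minimal sketch, reusing the `conceptualizer` from the example above:

```python
# Sketch only: relies on the updated __call__/pipe handling in this commit,
# which converts raw strings to Doc objects through the wrapped spaCy pipeline.
doc = conceptualizer("I have a headache and a fever.")
print([(ent.text, ent.label_) for ent in doc.ents])

# pipe() likewise appears to accept a plain iterable of texts and yields processed Docs.
texts = ["I was diagnosed with diabetes.", "The fever and cough would not stop."]
for doc in conceptualizer.pipe(texts, batch_size=32):
    print([(ent.text, ent.label_) for ent in doc.ents])
```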

# Features
## Matching Pattern Rules
A general introduction to the usage of matching patterns is given in the [usage section](#usage).
12 changes: 6 additions & 6 deletions concise_concepts/__init__.py
@@ -12,8 +12,8 @@
"concise_concepts",
default_config={
"data": None,
"topn": [],
"model": None,
"topn": None,
"model_path": None,
"word_delimiter": "_",
"ent_score": False,
"exclude_pos": [
@@ -39,8 +39,8 @@ def make_concise_concepts(
nlp: Language,
name: str,
data: Union[dict, list],
topn: list,
model: Union[str, FastText, Word2Vec, KeyedVectors],
topn: Union[list, None],
model_path: Union[str, FastText, Word2Vec, KeyedVectors, None],
word_delimiter: str,
ent_score: bool,
exclude_pos: List[str],
@@ -52,10 +52,9 @@
):
return Conceptualizer(
nlp=nlp,
name=name,
data=data,
topn=topn,
model=model,
model=model_path,
word_delimiter=word_delimiter,
ent_score=ent_score,
exclude_pos=exclude_pos,
@@ -64,4 +63,5 @@
case_sensitive=case_sensitive,
json_path=json_path,
verbose=verbose,
name=name,
)
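
With the renamed factory arguments above, pipeline-based usage would pass `model_path` (and optionally `topn`) through the component config; a minimal sketch, assuming a gensim model name or path is accepted as a string (the model name below is only illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(
    "concise_concepts",
    config={
        "data": {
            "disease": ["cancer", "diabetes"],
            "symptom": ["headache", "fever"],
        },
        # one topn value per concept key; leaving it as None defaults to 100 per class
        "topn": [75, 75],
        # renamed from "model" in this commit
        "model_path": "glove-wiki-gigaword-50",
    },
)
doc = nlp("I have a headache and a fever.")
print(doc.ents)
```
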
70 changes: 46 additions & 24 deletions concise_concepts/conceptualizer/Conceptualizer.py
@@ -2,6 +2,7 @@
import json
import logging
import re
import types
from copy import deepcopy
from pathlib import Path
from typing import List, Union
@@ -19,10 +20,9 @@ class Conceptualizer:
def __init__(
self,
nlp: Language,
name: str,
data: dict,
topn: list = None,
data: dict = {},
model: Union[str, FastText, KeyedVectors, Word2Vec] = None,
topn: list = None,
word_delimiter: str = "_",
ent_score: bool = False,
exclude_pos: list = None,
@@ -31,6 +31,7 @@ def __init__(
case_sensitive: bool = False,
json_path: str = "./matching_patterns.json",
verbose: bool = True,
name: str = "concise_concepts",
):
"""
The function takes in a dictionary of words and their synonyms, and then creates a new dictionary of words and
@@ -55,6 +56,7 @@ def __init__(
if the entity is "New York", it will also include "New York City" as an entity, defaults to False (optional)
:param case_sensitive: Whether to match the case of the words in the text, defaults to False (optional)
"""
assert data, ValueError("You must provide a dictionary of words to match")
self.verbose = verbose
self.log_cache = {"key": list(), "word": list(), "word_key": list()}
if Span.has_extension("ent_score"):
@@ -70,6 +72,34 @@ def __init__(
self.topn = topn
self.model = model
self.match_rule = {}
self.set_exclude_pos(exclude_pos)
self.set_exclude_dep(exclude_dep)
self.json_path = json_path
self.include_compound_words = include_compound_words
self.case_sensitive = case_sensitive
self.word_delimiter = word_delimiter
if "lemmatizer" not in self.nlp.component_names:
logger.warning(
"No lemmatizer found in spacy pipeline. Consider adding it for matching"
" on LEMMA instead of exact text."
)
self.match_key = "ORTH"
else:
self.match_key = "LEMMA"
if "entity_ruler" in self.nlp.component_names:
logger.warning(
"Entity Ruler already exists in the pipeline. Removing old rulers"
)
self.nlp.remove_pipe("entity_ruler")
self.run()

def set_exclude_dep(self, exclude_dep: list):
if exclude_dep is None:
exclude_dep = []
if exclude_dep:
self.match_rule["DEP"] = {"NOT_IN": exclude_dep}

def set_exclude_pos(self, exclude_pos: list):
if exclude_pos is None:
exclude_pos = [
"VERB",
@@ -85,24 +115,6 @@ def __init__(
]
if exclude_pos:
self.match_rule["POS"] = {"NOT_IN": exclude_pos}
if exclude_dep is None:
exclude_dep = []
if exclude_dep:
self.match_rule["DEP"] = {"NOT_IN": exclude_dep}
self.json_path = json_path
self.check_validity_path()
self.include_compound_words = include_compound_words
self.case_sensitive = case_sensitive
self.word_delimiter = word_delimiter
if "lemmatizer" not in self.nlp.component_names:
logger.warning(
"No lemmatizer found in spacy pipeline. Consider adding it for matching"
" on LEMMA instead of exact text."
)
self.match_key = "ORTH"
else:
self.match_key = "LEMMA"
self.run()

def run(self) -> None:
self.check_validity_path()
@@ -146,7 +158,7 @@ def determine_topn(self) -> None:
If the user doesn't specify a topn value for each class,
then the topn value for each class is set to 100
"""
if not self.topn:
if self.topn is None:
self.topn_dict = {key: 100 for key in self.data}
else:
num_classes = len(self.data)
@@ -413,8 +425,12 @@ def __call__(self, doc: Doc) -> Doc:
:param doc: Doc
:type doc: Doc
"""
if self.ent_score:
doc = self.assign_score_to_entities(doc)
if isinstance(doc, str):
doc = self.nlp(doc)
elif isinstance(doc, Doc):
if self.ent_score:
doc = self.assign_score_to_entities(doc)

return doc

def pipe(self, stream, batch_size=128) -> Doc:
@@ -425,6 +441,12 @@ def pipe(self, stream, batch_size=128) -> Doc:
:param stream: a generator of documents
:param batch_size: The number of documents to be processed at a time, defaults to 128 (optional)
"""
if isinstance(stream, str):
stream = [stream]

if not isinstance(stream, types.GeneratorType):
stream = self.nlp.pipe(stream, batch_size=batch_size)

for docs in util.minibatch(stream, size=batch_size):
for doc in docs:
if self.ent_score:

0 comments on commit 4610fcd
