This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 3f9bc68

Author: adam-sutton-1992
Message: resolves merge conflict of imports
Parents: 6a820f0 + 7fddac0

22 files changed: +725 -54 lines

.github/workflows/main.yml

Lines changed: 4 additions & 4 deletions

@@ -16,9 +16,9 @@ jobs:
       max-parallel: 4

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
@@ -48,13 +48,13 @@ jobs:

     steps:
       - name: Checkout master
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
           ref: 'master'
           fetch-depth: 0

       - name: Set up Python 3.9
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: 3.9

.github/workflows/production.yml

Lines changed: 2 additions & 2 deletions

@@ -14,13 +14,13 @@ jobs:

     steps:
       - name: Checkout production
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.release.target_commitish }}
          fetch-depth: 0

       - name: Set up Python 3.9
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
        with:
          python-version: 3.9

.readthedocs.yaml

Lines changed: 3 additions & 3 deletions

@@ -7,13 +7,13 @@ version: 2
 build:
   os: ubuntu-20.04
   tools:
-    python: "3.9"
+    python: "3.10"

 sphinx:
   configuration: docs/conf.py

 python:
   install:
+    - requirements: docs/requirements.txt
     - method: setuptools
-      path: .
-    - requirements: docs/requirements.txt
+      path: .

README.md

Lines changed: 12 additions & 0 deletions

@@ -38,8 +38,20 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g
 - **Paper**: [What’s in a Summary? Laying the Groundwork for Advances in Hospital-Course Summarization](https://www.aclweb.org/anthology/2021.naacl-main.382.pdf)
 - ([more...](https://github.com/CogStack/MedCAT/blob/master/media/news.md))

+## Installation
+To install the latest version of MedCAT run the following command:
+```
+pip install medcat
+```
+Normal installations of MedCAT will install torch-gpu and all relevant dependencies (such as CUDA). This can require as much as 10 GB more disk space, which isn't required for CPU-only usage.
+
+To install the latest version of MedCAT without torch GPU support run the following command:
+```
+pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/
+```
 ## Demo
 A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT.
+PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive.

 ## Tutorials
 A guide on how to use MedCAT is available at [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). Read more about MedCAT on [Towards Data Science](https://towardsdatascience.com/medcat-introduction-analyzing-electronic-health-records-e1c420afa13a).

docs/requirements.txt

Lines changed: 102 additions & 4 deletions

@@ -1,6 +1,104 @@
-Sphinx~=4.0
+sphinx==6.2.1
 sphinx-rtd-theme~=1.0
 myst-parser~=0.17
-sphinx-autoapi~=1.8
-setuptools>=60.0
-aiohttp==3.8.5
+sphinx-autoapi~=3.0.0
+MarkupSafe==2.1.3
+accelerate==0.23.0
+aiofiles==23.2.1
+aiohttp==3.8.5
+aiosignal==1.3.1
+asttokens==2.4.0
+async-timeout==4.0.3
+attrs==23.1.0
+backcall==0.2.0
+blis==0.7.11
+catalogue==2.0.10
+certifi==2023.7.22
+charset-normalizer==3.3.0
+click==8.1.7
+comm==0.1.4
+confection==0.1.3
+cymem==2.0.8
+datasets==2.14.5
+decorator==5.1.1
+dill==0.3.7
+exceptiongroup==1.1.3
+executing==2.0.0
+filelock==3.12.4
+flake8==4.0.1
+frozenlist==1.4.0
+fsspec==2023.6.0
+gensim==4.3.2
+huggingface-hub==0.17.3
+idna==3.4
+ipython==8.16.1
+ipywidgets==8.1.1
+jedi==0.19.1
+jinja2==3.1.2
+joblib==1.3.2
+jsonpickle==3.0.2
+jupyterlab-widgets==3.0.9
+langcodes==3.3.0
+matplotlib-inline==0.1.6
+mccabe==0.6.1
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+murmurhash==1.0.10
+mypy==1.0.0
+mypy-extensions==0.4.3
+networkx==3.1
+numpy==1.25.2
+packaging==23.2
+pandas==2.1.1
+parso==0.8.3
+pathy==0.10.2
+pexpect==4.8.0
+pickleshare==0.7.5
+preshed==3.0.9
+prompt-toolkit==3.0.39
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==13.0.0
+pycodestyle==2.8.0
+pydantic==1.10.13
+pyflakes==2.4.0
+pygments==2.16.1
+python-dateutil==2.8.2
+pytz==2023.3.post1
+pyyaml==6.0.1
+regex==2023.10.3
+requests==2.31.0
+safetensors==0.4.0
+scikit-learn==1.3.1
+scipy==1.9.3
+six==1.16.0
+smart-open==6.4.0
+spacy==3.4.4
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+srsly==2.4.8
+stack-data==0.6.3
+sympy==1.12
+thinc==8.1.12
+threadpoolctl==3.2.0
+tokenizers==0.14.1
+tomli==2.0.1
+torch==2.1.0
+tqdm==4.66.1
+traitlets==5.11.2
+transformers==4.34.0
+triton==2.1.0
+typer==0.7.0
+types-PyYAML==6.0.3
+types-aiofiles==0.8.3
+types-setuptools==57.4.10
+typing-extensions==4.8.0
+tzdata==2023.3
+urllib3==2.0.6
+wasabi==0.10.1
+wcwidth==0.2.8
+widgetsnbextension==4.0.9
+xxhash==3.4.1
+yarl==1.9.2

medcat/cat.py

Lines changed: 60 additions & 5 deletions

@@ -271,6 +271,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
         cdb_path = os.path.join(save_dir_path, "cdb.dat")
         self.cdb.save(cdb_path, json_path)

+        # Save the config
+        config_path = os.path.join(save_dir_path, "config.json")
+        self.cdb.config.save(config_path)
+
         # Save the Vocab
         vocab_path = os.path.join(save_dir_path, "vocab.dat")
         if self.vocab is not None:
@@ -362,6 +366,10 @@ def load_model_pack(cls,
         logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format')
         cdb = CDB.load(cdb_path, json_path)

+        # load config
+        config_path = os.path.join(model_pack_path, "config.json")
+        cdb.load_config(config_path)
+
         # TODO load addl_ner

         # Modify the config to contain full path to spacy model
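The two hunks above start persisting the CDB config as a standalone `config.json` when a model pack is created, and reading it back when the pack is loaded. A minimal sketch of that save/load round trip, using a toy `Config` stand-in (the class and its fields here are illustrative, not MedCAT's actual config API):

```python
import json
import os
import tempfile


class Config:
    """Toy stand-in for a model-pack config object (illustrative only)."""

    def __init__(self, data=None):
        self.data = data or {"version": "1.0", "spacy_model": "en_core_web_md"}

    def save(self, path):
        # Serialize the config as JSON next to the other pack files.
        with open(path, "w") as f:
            json.dump(self.data, f)

    @classmethod
    def load(cls, path):
        # Restore the config from the JSON file saved with the pack.
        with open(path) as f:
            return cls(json.load(f))


save_dir_path = tempfile.mkdtemp()
config_path = os.path.join(save_dir_path, "config.json")
Config().save(config_path)
restored = Config.load(config_path)
print(restored.data["spacy_model"])
```

Keeping the config in its own JSON file (rather than only inside the pickled CDB) makes it human-inspectable and editable without deserializing the whole database.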
@@ -832,9 +840,13 @@ def add_and_train_concept(self,
             Refer to medcat.cat.cdb.CDB.add_concept
         """
         names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config)
+        if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P':
+            logger.warning("No names were able to be prepared in CAT.add_and_train_concept "
+                           "method. As such no preferred name will be able to be specified. "
+                           "The CUI: '%s' and raw name: '%s'", cui, name)
         # Only if not negative, otherwise do not add the new name if in fact it should not be detected
         if do_add_concept and not negative:
-            self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
+            self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
                                  full_build=full_build)

         if spacy_entity is not None and spacy_doc is not None:
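The new warning above uses the logging module's deferred `%s`-style arguments rather than an f-string, so the message is only interpolated if a handler actually processes the record. A small self-contained sketch of the same pattern (the CUI and name values here are made up for illustration):

```python
import logging

logger = logging.getLogger("demo")
logger.setLevel(logging.WARNING)

# Capture emitted records so we can inspect the rendered message.
records = []
handler = logging.Handler()
handler.emit = records.append  # type: ignore[assignment]
logger.addHandler(handler)

cui, name = "C0004238", "atrial fibrillation"
# Deferred %-interpolation: args are passed separately and only
# formatted when the record is handled.
logger.warning("No names were prepared for CUI: '%s' and raw name: '%s'",
               cui, name)

print(records[0].getMessage())
```

This matters for hot paths: if the logger's level filters the record out, the string formatting never happens at all.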
@@ -1327,19 +1339,42 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_
         pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb'))
         return part_counter

+    @deprecated(message="Use `multiprocessing_batch_char_size` instead")
     def multiprocessing(self,
                         data: Union[List[Tuple], Iterable[Tuple]],
                         nproc: int = 2,
                         batch_size_chars: int = 5000 * 1000,
                         only_cui: bool = False,
-                        addl_info: List[str] = [],
+                        addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
                         separate_nn_components: bool = True,
                         out_split_size_chars: Optional[int] = None,
                         save_dir_path: str = os.path.abspath(os.getcwd()),
                         min_free_memory=0.1) -> Dict:
+        return self.multiprocessing_batch_char_size(data=data, nproc=nproc,
+                                                    batch_size_chars=batch_size_chars,
+                                                    only_cui=only_cui, addl_info=addl_info,
+                                                    separate_nn_components=separate_nn_components,
+                                                    out_split_size_chars=out_split_size_chars,
+                                                    save_dir_path=save_dir_path,
+                                                    min_free_memory=min_free_memory)
+
+    def multiprocessing_batch_char_size(self,
+                                        data: Union[List[Tuple], Iterable[Tuple]],
+                                        nproc: int = 2,
+                                        batch_size_chars: int = 5000 * 1000,
+                                        only_cui: bool = False,
+                                        addl_info: List[str] = [],
+                                        separate_nn_components: bool = True,
+                                        out_split_size_chars: Optional[int] = None,
+                                        save_dir_path: str = os.path.abspath(os.getcwd()),
+                                        min_free_memory=0.1) -> Dict:
         r"""Run multiprocessing for inference, if out_save_path and out_split_size_chars is used this will also continue annotating
         documents if something is saved in that directory.
+
+        This method batches the data based on the number of characters as specified by user.
+
+        PS: This method is unlikely to work on a Windows machine.
+
         Args:
             data:
                 Iterator or array with format: [(id, text), (id, text), ...]
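The hunk above renames `multiprocessing` to `multiprocessing_batch_char_size` while keeping the old name as a deprecated wrapper that forwards all arguments. A minimal sketch of that deprecation-forwarding pattern, with a toy `deprecated` decorator (the decorator shown here is an assumed implementation, not necessarily MedCAT's):

```python
import functools
import warnings


def deprecated(message: str):
    """Decorator that warns on call, then delegates to the wrapped function."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(f"{func.__name__} is deprecated. {message}",
                          DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)
        return wrapper
    return decorator


class Annotator:
    def multiprocessing_batch_char_size(self, data, nproc=2):
        # New, descriptively named implementation.
        return f"processed {len(data)} items on {nproc} workers"

    @deprecated(message="Use `multiprocessing_batch_char_size` instead")
    def multiprocessing(self, data, nproc=2):
        # Old name simply forwards to the new implementation.
        return self.multiprocessing_batch_char_size(data, nproc=nproc)


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    result = Annotator().multiprocessing([("id1", "text")])
print(result)
```

Callers of the old name keep working, get a `DeprecationWarning` pointing at the replacement, and the behaviour stays in a single place.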
@@ -1523,15 +1558,35 @@ def _multiprocessing_batch(self,

         return docs

-    def multiprocessing_pipe(self,
-                             in_data: Union[List[Tuple], Iterable[Tuple]],
+    @deprecated(message="Use `multiprocessing_batch_docs_size` instead")
+    def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]],
                              nproc: Optional[int] = None,
                              batch_size: Optional[int] = None,
                              only_cui: bool = False,
                              addl_info: List[str] = [],
                              return_dict: bool = True,
                              batch_factor: int = 2) -> Union[List[Tuple], Dict]:
-        """Run multiprocessing NOT FOR TRAINING
+        return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc,
+                                                    batch_size=batch_size,
+                                                    only_cui=only_cui,
+                                                    addl_info=addl_info,
+                                                    return_dict=return_dict,
+                                                    batch_factor=batch_factor)
+
+    def multiprocessing_batch_docs_size(self,
+                                        in_data: Union[List[Tuple], Iterable[Tuple]],
+                                        nproc: Optional[int] = None,
+                                        batch_size: Optional[int] = None,
+                                        only_cui: bool = False,
+                                        addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
+                                        return_dict: bool = True,
+                                        batch_factor: int = 2) -> Union[List[Tuple], Dict]:
+        """Run multiprocessing NOT FOR TRAINING.
+
+        This method batches the data based on the number of documents as specified by the user.
+
+        PS:
+        This method supports Windows.

         Args:
             in_data (Union[List[Tuple], Iterable[Tuple]]): List with format: [(id, text), (id, text), ...]
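The signatures in these hunks use list literals as default values for `addl_info` (first `[]`, then `['cui2icd10', 'cui2ontologies', 'cui2snomed']`). Worth noting the classic Python pitfall this carries: a mutable default is created once at function definition and shared across every call that relies on it. A short sketch of the failure mode and the usual `None`-sentinel fix (the function names here are illustrative, not from MedCAT):

```python
def bad_append(item, acc=[]):
    # The default list is created ONCE and shared by every call
    # that omits `acc`.
    acc.append(item)
    return acc


def good_append(item, acc=None):
    # A fresh list per call avoids the shared-state surprise.
    if acc is None:
        acc = []
    acc.append(item)
    return acc


print(bad_append(1), bad_append(2))    # both names alias one list
print(good_append(1), good_append(2))  # independent lists
```

This is harmless when, as in the methods above, the list is only read and never mutated, but it is a latent trap if a later change appends to the default.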
