CogStack · mart-r · May 13, 2025 · May 13, 2025 · May 13, 2025 · May 13, 2025
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -34,8 +34,8 @@ jobs:
           python -m mypy `git ls-tree --full-tree --name-only -r HEAD | grep ".py$" | grep -v "tests/"` --explicit-package-bases --follow-imports=normal
       - name: Test
         run: |
-          python -m unittest discover
-          python -m unittest discover -s medcat/compare_models
+          python tests/runner/custom_test_runner.py
+          python tests/runner/custom_test_runner.py -s medcat/compare_models
 # TODO - in the future, we might want to add automated tests for notebooks as well
 # though it's not really possible right now since the notebooks are designed
 # in a way that assumes interaction (i.e specifying model pack names)
diff --git a/medcat/1_create_model/create_cdb/create_cdb.py b/medcat/1_create_model/create_cdb/create_cdb.py
@@ -1,7 +1,8 @@
 import os
 import pandas as pd
 from medcat.config import Config
-from medcat.cdb_maker import CDBMaker
+from medcat.model_creation.cdb_maker import CDBMaker
+from medcat.storage.serialisers import serialise, AvailableSerialisers
 
 pd.options.mode.chained_assignment = None  # type: ignore
 
@@ -24,6 +25,10 @@
 
 model_dir = os.path.join(BASE_PATH, "models", "cdb")
 output_cdb = os.path.join(model_dir, f"{release}_SNOMED_cdb.dat")
+os.makedirs(output_cdb, exist_ok=True)
+# NOTE: by default, new models created at the same location will not be saved
+#       so here we allow overwriting
+allow_overwrite = True
 csv = pd.read_csv(csv_path)
 
 # Remove null values
@@ -50,9 +55,9 @@
 
 # Setup config
 config = Config()
-config.general['spacy_model'] = 'en_core_web_md'
-config.cdb_maker['remove_parenthesis'] = 1
-config.general['cdb_source_name'] = f'SNOMED_{release}'
+config.general.nlp.modelname = 'en_core_web_md'
+config.cdb_maker.remove_parenthesis = 1
+# config.general.cdb_source_name = f'SNOMED_{release}'
 
 maker = CDBMaker(config)
 
@@ -64,8 +69,8 @@
 
 # Add type_id pretty names to cdb
 cdb.addl_info['type_id2name'] = pd.Series(csv.description_type_ids.values, index=csv.type_ids.astype(str)).to_dict()
-cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist())  # Add all cuis to filter out legacy terms.
+cdb.config.components.linking.filters.cuis = set(csv['cui'].tolist())  # Add all cuis to filter out legacy terms.
 
 # save model
-cdb.save(output_cdb)
+serialise(AvailableSerialisers.dill, cdb, output_cdb, overwrite=allow_overwrite)
 print(f"CDB Model saved successfully as: {output_cdb}")
diff --git a/medcat/1_create_model/create_cdb/create_umls_cdb.py b/medcat/1_create_model/create_cdb/create_umls_cdb.py
@@ -1,7 +1,8 @@
 import os
 import pandas as pd
 from medcat.config import Config
-from medcat.cdb_maker import CDBMaker
+from medcat.model_creation.cdb_maker import CDBMaker
+from medcat.storage.serialisers import serialise, AvailableSerialisers
 
 pd.options.mode.chained_assignment = None  # type: ignore
 
@@ -28,6 +29,10 @@
 
 model_dir = os.path.join(BASE_PATH, "models", "cdb")
 output_cdb = os.path.join(model_dir, f"{release}_UMLS_cdb.dat")
+os.makedirs(output_cdb, exist_ok=True)
+# NOTE: by default, new models created at the same location will not be saved
+#       so here we allow overwriting
+allow_overwrite = True
 csv = pd.read_csv(csv_path)
 
 # Remove null values
@@ -39,9 +44,9 @@
 
 # Setup config
 config = Config()
-config.general['spacy_model'] = 'en_core_web_md'
-config.cdb_maker['remove_parenthesis'] = 1
-config.general['cdb_source_name'] = f'UMLS_{release}'
+config.general.nlp.modelname = 'en_core_web_md'
+config.cdb_maker.remove_parenthesis = 1
+# config.general.cdb_source_name = f'UMLS_{release}'
 
 maker = CDBMaker(config)
 
@@ -52,8 +57,8 @@
 cdb = maker.prepare_csvs(csv_paths, full_build=True) 
 
 # Add type_id pretty names to cdb
-cdb.config.linking['filters']['cuis'] = set(csv['cui'].tolist())  # Add all cuis to filter out legacy terms.
+cdb.config.components.linking.filters.cuis = set(csv['cui'].tolist())  # Add all cuis to filter out legacy terms.
 
 # save model
-cdb.save(output_cdb)
+serialise(AvailableSerialisers.dill, cdb, output_cdb, overwrite=allow_overwrite)
 print(f"CDB Model saved successfully as: {output_cdb}")
diff --git a/medcat/1_create_model/create_modelpack/create_modelpack.py b/medcat/1_create_model/create_modelpack/create_modelpack.py
@@ -39,27 +39,37 @@ def load_cdb_and_save_modelpack(cdb_path: str,
         str: The model pack path.
     """
     # Load cdb
-    cdb = CDB.load(cdb_path)
+    cdb: CDB
+    try:
+        cdb = CDB.load(cdb_path)
+    except NotADirectoryError:
+        from medcat.utils.legacy.convert_cdb import get_cdb_from_old
+        cdb = get_cdb_from_old(cdb_path)
 
     # Set cdb configuration
     # technically we already created this during the cdb creation
-    cdb.config.ner['min_name_len'] = 2
-    cdb.config.ner['upper_case_limit_len'] = 3
-    cdb.config.general['spell_check'] = True
-    cdb.config.linking['train_count_threshold'] = 10
-    cdb.config.linking['similarity_threshold'] = 0.3
-    cdb.config.linking['train'] = True
-    cdb.config.linking['disamb_length_limit'] = 4
-    cdb.config.general['full_unlink'] = True
+    cdb.config.components.ner.min_name_len = 2
+    cdb.config.components.ner.upper_case_limit_len = 3
+    cdb.config.general.spell_check = True
+    cdb.config.components.linking.train_count_threshold = 10
+    cdb.config.components.linking.similarity_threshold = 0.3
+    cdb.config.components.linking.train = True
+    cdb.config.components.linking.disamb_length_limit = 4
+    cdb.config.general.full_unlink = True
 
     # Load vocab
-    vocab = Vocab.load(vocab_path)
+    vocab: Vocab
+    try:
+        vocab = Vocab.load(vocab_path)
+    except NotADirectoryError:
+        from medcat.utils.legacy.convert_vocab import get_vocab_from_old
+        vocab = get_vocab_from_old(vocab_path)
 
     # Initialise the model
     cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
 
     # Create and save model pack
-    return cat.create_model_pack(save_dir_path=modelpack_path, model_pack_name=modelpack_name)
+    return cat.save_model_pack(modelpack_path, pack_name=modelpack_name)
 
 
 def load_cdb_and_save_modelpack_in_def_location(cdb_name: str,

diff --git a/medcat/1_create_model/create_vocab/create_vocab.py b/medcat/1_create_model/create_vocab/create_vocab.py
@@ -1,4 +1,5 @@
 from medcat.vocab import Vocab
+from medcat.storage.serialisers import serialise, AvailableSerialisers
 import os
 
 vocab = Vocab()
@@ -17,5 +18,6 @@
 # embeddings of 300 dimensions is standard
 
 vocab.add_words(os.path.join(vocab_dir, 'vocab_data.txt'), replace=True)
-vocab.make_unigram_table()
-vocab.save(os.path.join(vocab_dir, "vocab.dat"))
+vocab_folder = os.path.join(vocab_dir, "vocab.dat")
+os.makedirs(vocab_folder, exist_ok=True)
+serialise(AvailableSerialisers.dill, vocab, vocab_folder)
diff --git a/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb b/medcat/2_train_model/1_unsupervised_training/unsupervised training.ipynb
@@ -55,7 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "cat.cdb.print_stats()"
+    "cat.cdb.get_basic_info()"
    ]
   },
   {
@@ -88,21 +88,12 @@
    "outputs": [],
    "source": [
     "# Print statistics on the CDB before training\n",
-    "cat.cdb.print_stats()\n",
+    "cat.cdb.get_basic_info()\n",
     "\n",
     "# Run the annotation procedure over all the documents we have,\n",
     "# given that we have a large number of documents this can take quite some time.\n",
     "\n",
-    "for i, text in enumerate(data['text'].values):\n",
-    "    # This will now run the training in the background \n",
-    "    try:\n",
-    "        _ = cat(text, do_train=True)\n",
-    "    except TypeError:\n",
-    "        pass\n",
-    "        \n",
-    "    # So we know how things are moving\n",
-    "    if i % 10000 == 0:\n",
-    "        print(\"Finished {} - text blocks\".format(i))\n"
+    "cat.trainer.train_unsupervised(data.text)\n"
    ]
   },
   {
@@ -112,7 +103,7 @@
    "outputs": [],
    "source": [
     "# Print statistics on the CDB after training\n",
-    "cat.cdb.print_stats()"
+    "cat.cdb.get_basic_info()"
    ]
   },
   {
@@ -122,7 +113,8 @@
    "outputs": [],
    "source": [
     "# save modelpack\n",
-    "cat.create_model_pack(save_dir_path=model_dir, model_pack_name=output_modelpack)\n"
+    "\n",
+    "cat.save_model_pack(model_dir, pack_name=output_modelpack)\n"
    ]
   },
   {
@@ -135,7 +127,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "medcat",
+   "display_name": "venv_v2",
    "language": "python",
    "name": "python3"
   },
@@ -149,12 +141,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8 (main, Nov 24 2022, 08:08:27) [Clang 14.0.6 ]"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "4e4ccc64ca47f932c34194843713e175cf3a19af3798844e4190152d16ba61ca"
-   }
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,