Skip to content
This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 4a490ce

Browse files
authored
Merge branch 'master' into CU-2e77a31-improve-print_stats
2 parents eb7655e + 22e4aec commit 4a490ce

File tree

20 files changed

+623
-43
lines changed

20 files changed

+623
-43
lines changed

.github/workflows/main.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ jobs:
1616
max-parallel: 4
1717

1818
steps:
19-
- uses: actions/checkout@v2
19+
- uses: actions/checkout@v4
2020
- name: Set up Python ${{ matrix.python-version }}
21-
uses: actions/setup-python@v2
21+
uses: actions/setup-python@v4
2222
with:
2323
python-version: ${{ matrix.python-version }}
2424
- name: Install dependencies
@@ -48,13 +48,13 @@ jobs:
4848

4949
steps:
5050
- name: Checkout master
51-
uses: actions/checkout@v2
51+
uses: actions/checkout@v4
5252
with:
5353
ref: 'master'
5454
fetch-depth: 0
5555

5656
- name: Set up Python 3.9
57-
uses: actions/setup-python@v2
57+
uses: actions/setup-python@v4
5858
with:
5959
python-version: 3.9
6060

.github/workflows/production.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@ jobs:
1414

1515
steps:
1616
- name: Checkout production
17-
uses: actions/checkout@v2
17+
uses: actions/checkout@v4
1818
with:
1919
ref: ${{ github.event.release.target_commitish }}
2020
fetch-depth: 0
2121

2222
- name: Set up Python 3.9
23-
uses: actions/setup-python@v2
23+
uses: actions/setup-python@v4
2424
with:
2525
python-version: 3.9
2626

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,20 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g
3838
- **Paper**: [What’s in a Summary? Laying the Groundwork for Advances in Hospital-Course Summarization](https://www.aclweb.org/anthology/2021.naacl-main.382.pdf)
3939
- ([more...](https://github.com/CogStack/MedCAT/blob/master/media/news.md))
4040

41+
## Installation
42+
To install the latest version of MedCAT run the following command:
43+
```
44+
pip install medcat
45+
```
46+
Normal installations of MedCAT will install torch-gpu and all relevant dependencies (such as CUDA). This can require as much as 10 GB more disk space, which isn't required for CPU-only usage.
47+
48+
To install the latest version of MedCAT without torch GPU support run the following command:
49+
```
50+
pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/
51+
```
4152
## Demo
4253
A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT.
54+
PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive.
4355

4456
## Tutorials
4557
A guide on how to use MedCAT is available at [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). Read more about MedCAT on [Towards Data Science](https://towardsdatascience.com/medcat-introduction-analyzing-electronic-health-records-e1c420afa13a).

medcat/cat.py

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
270270
cdb_path = os.path.join(save_dir_path, "cdb.dat")
271271
self.cdb.save(cdb_path, json_path)
272272

273+
# Save the config
274+
config_path = os.path.join(save_dir_path, "config.json")
275+
self.cdb.config.save(config_path)
276+
273277
# Save the Vocab
274278
vocab_path = os.path.join(save_dir_path, "vocab.dat")
275279
if self.vocab is not None:
@@ -361,6 +365,10 @@ def load_model_pack(cls,
361365
logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format')
362366
cdb = CDB.load(cdb_path, json_path)
363367

368+
# load config
369+
config_path = os.path.join(model_pack_path, "config.json")
370+
cdb.load_config(config_path)
371+
364372
# TODO load addl_ner
365373

366374
# Modify the config to contain full path to spacy model
@@ -640,9 +648,13 @@ def add_and_train_concept(self,
640648
Refer to medcat.cat.cdb.CDB.add_concept
641649
"""
642650
names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config)
651+
if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P':
652+
logger.warning("No names were able to be prepared in CAT.add_and_train_concept "
653+
"method. As such no preferred name will be able to be specifeid. "
654+
"The CUI: '%s' and raw name: '%s'", cui, name)
643655
# Only if not negative, otherwise do not add the new name if in fact it should not be detected
644656
if do_add_concept and not negative:
645-
self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
657+
self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
646658
full_build=full_build)
647659

648660
if spacy_entity is not None and spacy_doc is not None:
@@ -1135,19 +1147,42 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_
11351147
pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb'))
11361148
return part_counter
11371149

1150+
@deprecated(message="Use `multiprocessing_batch_char_size` instead")
11381151
def multiprocessing(self,
11391152
data: Union[List[Tuple], Iterable[Tuple]],
11401153
nproc: int = 2,
11411154
batch_size_chars: int = 5000 * 1000,
11421155
only_cui: bool = False,
1143-
addl_info: List[str] = [],
1156+
addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
11441157
separate_nn_components: bool = True,
11451158
out_split_size_chars: Optional[int] = None,
11461159
save_dir_path: str = os.path.abspath(os.getcwd()),
11471160
min_free_memory=0.1) -> Dict:
1161+
return self.multiprocessing_batch_char_size(data=data, nproc=nproc,
1162+
batch_size_chars=batch_size_chars,
1163+
only_cui=only_cui, addl_info=addl_info,
1164+
separate_nn_components=separate_nn_components,
1165+
out_split_size_chars=out_split_size_chars,
1166+
save_dir_path=save_dir_path,
1167+
min_free_memory=min_free_memory)
1168+
1169+
def multiprocessing_batch_char_size(self,
1170+
data: Union[List[Tuple], Iterable[Tuple]],
1171+
nproc: int = 2,
1172+
batch_size_chars: int = 5000 * 1000,
1173+
only_cui: bool = False,
1174+
addl_info: List[str] = [],
1175+
separate_nn_components: bool = True,
1176+
out_split_size_chars: Optional[int] = None,
1177+
save_dir_path: str = os.path.abspath(os.getcwd()),
1178+
min_free_memory=0.1) -> Dict:
11481179
r"""Run multiprocessing for inference, if out_save_path and out_split_size_chars is used this will also continue annotating
11491180
documents if something is saved in that directory.
11501181
1182+
This method batches the data based on the number of characters as specified by user.
1183+
1184+
PS: This method is unlikely to work on a Windows machine.
1185+
11511186
Args:
11521187
data:
11531188
Iterator or array with format: [(id, text), (id, text), ...]
@@ -1331,15 +1366,35 @@ def _multiprocessing_batch(self,
13311366

13321367
return docs
13331368

1334-
def multiprocessing_pipe(self,
1335-
in_data: Union[List[Tuple], Iterable[Tuple]],
1369+
@deprecated(message="Use `multiprocessing_batch_docs_size` instead")
1370+
def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]],
13361371
nproc: Optional[int] = None,
13371372
batch_size: Optional[int] = None,
13381373
only_cui: bool = False,
13391374
addl_info: List[str] = [],
13401375
return_dict: bool = True,
13411376
batch_factor: int = 2) -> Union[List[Tuple], Dict]:
1342-
"""Run multiprocessing NOT FOR TRAINING
1377+
return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc,
1378+
batch_size=batch_size,
1379+
only_cui=only_cui,
1380+
addl_info=addl_info,
1381+
return_dict=return_dict,
1382+
batch_factor=batch_factor)
1383+
1384+
def multiprocessing_batch_docs_size(self,
1385+
in_data: Union[List[Tuple], Iterable[Tuple]],
1386+
nproc: Optional[int] = None,
1387+
batch_size: Optional[int] = None,
1388+
only_cui: bool = False,
1389+
addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
1390+
return_dict: bool = True,
1391+
batch_factor: int = 2) -> Union[List[Tuple], Dict]:
1392+
"""Run multiprocessing NOT FOR TRAINING.
1393+
1394+
This method batches the data based on the number of documents as specified by the user.
1395+
1396+
PS:
1397+
This method supports Windows.
13431398
13441399
Args:
13451400
in_data (Union[List[Tuple], Iterable[Tuple]]): List with format: [(id, text), (id, text), ...]

0 commit comments

Comments
 (0)