Skip to content
This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit d9a1fac

Browse files
authored
CU-8693b0a61 Add method to get spacy model version (#381)
* CU-8693b0a61: Add method to find spacy folder in model pack along with some tests * CU-8693b0a61: Add test for spacy folder finding (full path) * CU-8693b0a61: Add method for finding spacy model in model pack along with tests * CU-8693b0a61: Add method for finding current spacy version * CU-8693b0a61: Add method for getting spacy model version installed * CU-8693b0a61: Fix getting spacy model folder return path * CU-8693b0a61: Add method to get name and meta of spacy model based on model pack * CU-8693b0a61: Add missing fake spacy model meta * CU-8693b0a61: Add missing docstrings * CU-8693b0a61: Change name of method for clarity * CU-8693b0a61: Add method to get spacy model name and version from model pack path * CU-8693b0a61: Fix a few typing issues * CU-8693b0a61: Add a missing docstring * CU-8693b0a61: Match folder name of fake spacy model to its name * CU-8693b0a61: Make the final method return true name of spacy model instead of folder name * Add additional output to method for getting spacy model version - the compatible spacy versions * CU-8693b0a61: Add method for querying whether the spacy version is compatible with a range * CU-8693b0a61: Add better abstraction for spacy version mocking in tests * CU-8693b0a61: Add some more abstraction for fake model pack in tests * CU-8693b0a61: Add method for checking whethera model pack has a spacy model compatible with installed spacy version * CU-8693b0a61: Improve abstraction within tests * CU-8693b0a61: Add method to check which of two versions is older * CU-8693b0a61: Fix fake spacy model versioning * CU-8693b0a61: Add method for determining whether a model pack has semi-compatible spacy model * CU-8693b0a61: Add missing word in docstring. * CU-8693b0a61: Change some method to protected ones
1 parent abfb1e7 commit d9a1fac

File tree

3 files changed

+521
-0
lines changed

3 files changed

+521
-0
lines changed

medcat/utils/spacy_compatibility.py

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
"""This module attempts to read the spacy compatibility of
2+
a model pack and (if necessary) compare it to the installed
3+
spacy version.
4+
"""
5+
from typing import Tuple, List, cast
6+
import os
7+
import re
8+
from packaging import version
9+
from packaging.specifiers import SpecifierSet
10+
11+
import spacy
12+
13+
14+
# Folder-name pattern used to recognise a spacy model inside a model pack.
# It accepts either a name shaped like
# `<2 word chars>_core_<3-4 word chars>_<suffix>` (e.g. `en_core_web_md`)
# or the literal `spacy_model`.
# NOTE: the trailing `\w+` alternative already covers sm/md/lg/trf/xxl,
#       so any suffix made of word characters is accepted.
SPACY_MODEL_REGEX = re.compile(r"(\w{2}_core_(\w{3,4})_(sm|md|lg|trf|xxl|\w+))|(spacy_model)")
15+
16+
17+
def _is_spacy_model_folder(folder_name: str) -> bool:
    """Tell whether a model pack sub-folder looks like a spacy model.

    The decision is made purely from the folder's name so that the
    model itself never has to be loaded. Loading may not be possible
    if the model's version is incompatible with the installed spacy.

    Args:
        folder_name (str): The folder name (or path) to check.

    Returns:
        bool: Whether the folder's base name matches the spacy model pattern.
    """
    # only the last path component is relevant for name-based detection
    base_name = os.path.basename(folder_name)
    # folders prefixed with "meta_" are (or should be) MetaCat models
    is_meta_cat = base_name.startswith("meta_")
    return not is_meta_cat and SPACY_MODEL_REGEX.match(base_name) is not None
37+
38+
39+
def _find_spacy_model_folder(model_pack_folder: str) -> str:
    """Locate the single spacy model folder inside a model pack folder.

    Every directory directly inside the model pack is checked by name;
    exactly one of them must look like a spacy model.

    Args:
        model_pack_folder (str): The model pack folder

    Raises:
        ValueError: If zero or more than one candidate folder is found.

    Returns:
        str: The full path to the model folder.
    """
    candidates: List[str] = [
        os.path.join(model_pack_folder, entry)
        for entry in os.listdir(model_pack_folder)
        # plain files are ignored; only directories can hold a model
        if os.path.isdir(os.path.join(model_pack_folder, entry))
        and _is_spacy_model_folder(entry)
    ]
    if len(candidates) == 1:
        return candidates[0]
    raise ValueError("Unable to determine spacy folder name from "
                     f"{len(candidates)} ambiguous folders: {candidates}")
62+
63+
64+
def get_installed_spacy_version() -> str:
    """Report the version of the spacy package that is currently installed.

    Returns:
        str: The value of `spacy.__version__`.
    """
    installed_version: str = spacy.__version__
    return installed_version
71+
72+
73+
def get_installed_model_version(model_name: str) -> str:
    """Look up the version of a model that is installed in spacy.

    Args:
        model_name (str): The model name.

    Returns:
        str: The version of the installed model, or 'N/A' if the
            model is not among spacy's installed models.
    """
    if model_name in spacy.util.get_installed_models():
        # NOTE: it is unclear when spacy.info might return a str
        #       instead of a dict, so the result is cast to a dict
        model_info = cast(dict, spacy.info(model_name))
        return model_info['version']
    return 'N/A'
87+
88+
89+
def _get_name_and_meta_of_spacy_model_in_medcat_modelpack(model_pack_path: str) -> Tuple[str, dict]:
    """Fetch the folder name and meta information of the spacy model in a medcat model pack.

    PS: The name returned here is the raw (folder) name of the spacy
    model. While this is usually (in models created after v1.2.4)
    identical to the spacy model name, that may not always be the case.

    Args:
        model_pack_path (str): The model pack path.

    Returns:
        Tuple[str, dict]: The name of the spacy model, and the meta information.
    """
    model_dir = _find_spacy_model_folder(model_pack_path)
    folder_name = os.path.basename(model_dir)
    # NOTE: it is unclear when spacy.info might return a str
    #       instead of a dict, so the result is cast to a dict
    meta = cast(dict, spacy.info(model_dir))
    return folder_name, meta
108+
109+
110+
def get_name_and_version_of_spacy_model_in_medcat_modelpack(model_pack_path: str) -> Tuple[str, str, str]:
    """Read the name, version, and compatible spacy versions of the spacy model in a medcat model pack.

    PS: The name returned here is the real name of the spacy model,
    built from the "lang" and "name" entries of its meta information.
    While this is usually (in models created after v1.2.4) identical
    to the folder name, that may not always be the case.

    Args:
        model_pack_path (str): The model pack path.

    Returns:
        Tuple[str, str, str]: The name of the spacy model, its version, and supported spacy version.
    """
    _, meta = _get_name_and_meta_of_spacy_model_in_medcat_modelpack(model_pack_path)
    # the folder name is ignored; the true name is <lang>_<name> from the meta
    real_name = "_".join((meta["lang"], meta['name']))
    model_version = meta['version']
    supported_spacy = meta["spacy_version"]
    return real_name, model_version, supported_spacy
127+
128+
129+
def _is_spacy_version_within_range(spacy_version_range: str) -> bool:
    """Checks whether the installed spacy version is within the specified range.

    The expected format of the version range is similar to that used
    in requirements and/or pip installs. E.g:
    - >=3.1.0,<3.2.0
    - ==3.1.0
    - >=3.1.0
    - <3.20

    Args:
        spacy_version_range (str): The required spacy version range.

    Returns:
        bool: Whether the installed spacy version satisfies the range.
    """
    installed_version = version.parse(get_installed_spacy_version())
    # deliberately not called `range`, which would shadow the builtin
    allowed_versions = SpecifierSet(spacy_version_range)
    return allowed_versions.contains(installed_version)
148+
149+
150+
def medcat_model_pack_has_compatible_spacy_model(model_pack_path: str) -> bool:
    """Tell whether the spacy model in a medcat model pack is compatible with the installed spacy.

    Compatibility is decided by checking the installed spacy version
    against the spacy version range declared by the model.

    Args:
        model_pack_path (str): The model pack path.

    Returns:
        bool: Whether the spacy model in the model pack is compatible.
    """
    # only the declared spacy version range matters here
    (_model_name,
     _model_version,
     required_spacy_range) = get_name_and_version_of_spacy_model_in_medcat_modelpack(model_pack_path)
    is_compatible = _is_spacy_version_within_range(required_spacy_range)
    return is_compatible
161+
162+
163+
def is_older_spacy_version(model_version: str) -> bool:
    """Checks if the specified version is older than (or equal to) the installed spacy version.

    NOTE: The comparison is inclusive, i.e. a version identical to the
    installed spacy version is also reported as "older".

    Args:
        model_version (str): The specified spacy version.

    Returns:
        bool: Whether the specified version is older than or the same
            as the installed version.
    """
    installed_version = version.parse(get_installed_spacy_version())
    # keep the parsed value in its own variable instead of rebinding the
    # `str`-annotated parameter to a `Version` object
    parsed_model_version = version.parse(model_version)
    return parsed_model_version <= installed_version
175+
176+
177+
def medcat_model_pack_has_semi_compatible_spacy_model(model_pack_path: str) -> bool:
    """Tell whether the spacy model within a medcat model pack is
    compatible with, or older than, the installed spacy version.

    The result is `True` if the installed spacy version falls within
    the range declared by the model, or if the model was released with
    a lower version number than the spacy version currently installed.

    We've found that most of the time older models will work with
    a newer version of spacy. Though there is a warning on spacy's
    side and they do not guarantee 100% compatibility, we've not
    seen issues so far.

    E.g for installed spacy 3.4.4 all the following will be suitable:
    - en_core_web_md-3.1.0
    - en_core_web_md-3.2.0
    - en_core_web_md-3.3.0
    - en_core_web_md-3.4.1
    However, for the same version, the following would not be suitable:
    - en_core_web_md-3.5.0
    - en_core_web_md-3.6.0
    - en_core_web_md-3.7.1

    Args:
        model_pack_path (str): The model pack path.

    Returns:
        bool: Whether the spacy model in the model pack is compatible.
    """
    (_model_name,
     released_version,
     required_spacy_range) = get_name_and_version_of_spacy_model_in_medcat_modelpack(model_pack_path)
    # strict compatibility is tried first; the "older model" check is
    # only the fallback
    return (_is_spacy_version_within_range(required_spacy_range)
            or is_older_spacy_version(released_version))
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"lang":"ff",
3+
"name":"core_fake_dr",
4+
"version":"3.1.0",
5+
"description":"This is a FAKE model",
6+
"author":"Fakio Martimus",
7+
"spacy_version":">=3.1.0,<3.2.0"
8+
}

0 commit comments

Comments
 (0)