Skip to content

Commit e155063

Browse files
committed
Refactored around a new metadata module
1 parent 9d531a2 commit e155063

File tree

3 files changed

+136
-77
lines changed

3 files changed

+136
-77
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313

1414
- `zim.creator.Creator.config_metadata` method (returning Self) exposing all mandatory Metadata, all standard ones and allowing extra text metadata.
1515
- `zim.creator.Creator.config_dev_metadata` method setting stub metadata for all mandatory ones (allowing overrides)
16+
- `zim.metadata` module with a list of per-metadata validation functions
1617
- `zim.creator.Creator.validate_metadata` (called on `start`) to verify metadata respects the spec (and its recommendations)
1718
- `zim.filesystem.make_zim_file` accepts a new optional `long_description` param.
1819
- `i18n.is_valid_iso_639_3` to check ISO-639-3 codes

src/zimscraperlib/zim/creator.py

Lines changed: 23 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -22,24 +22,31 @@
2222
import pathlib
2323
import re
2424
import weakref
25-
from collections.abc import Iterable as IterableT
2625
from typing import Any, Callable, Iterable, Optional, Tuple, Union
2726

2827
import libzim.writer
2928

3029
from ..constants import (
3130
DEFAULT_DEV_ZIM_METADATA,
3231
FRONT_ARTICLE_MIMETYPES,
33-
ILLUSTRATIONS_METADATA_RE,
3432
MANDATORY_ZIM_METADATA_KEYS,
35-
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
36-
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
3733
)
3834
from ..filesystem import delete_callback, get_content_mimetype, get_file_mimetype
3935
from ..i18n import is_valid_iso_639_3
40-
from ..image.probing import is_valid_image
4136
from ..types import get_mime_for_name
4237
from .items import StaticItem
38+
from .metadata import (
39+
validate_counter,
40+
validate_date,
41+
validate_description,
42+
validate_illustrations,
43+
validate_language,
44+
validate_longdescription,
45+
validate_required_values,
46+
validate_standard_str_types,
47+
validate_tags,
48+
validate_title,
49+
)
4350

4451
DUPLICATE_EXC_STR = re.compile(
4552
r"^Impossible to add(.+)"
@@ -155,78 +162,17 @@ def validate_metadata(
155162
Also enforces recommendations
156163
See https://wiki.openzim.org/wiki/Metadata"""
157164

158-
# spec doesnt require any value but empty strings are not useful
159-
if name in MANDATORY_ZIM_METADATA_KEYS and not value:
160-
raise ValueError(f"Missing value for {name}")
161-
162-
# most require/standard and al
163-
if name in (
164-
"Name",
165-
"Title",
166-
"Creator",
167-
"Publisher",
168-
"Description",
169-
"LongDescription",
170-
"License",
171-
"Relation",
172-
"Relation",
173-
"Flavour",
174-
"Source",
175-
"Scraper",
176-
) and not isinstance(value, str):
177-
raise ValueError(f"Invalid type for {name}")
178-
179-
if name == "Title" and len(value) > 30:
180-
raise ValueError(f"{name} is too long.")
181-
182-
if name == "Date":
183-
if not isinstance(value, (datetime.datetime, datetime.date, str)):
184-
raise ValueError(f"Invalid type for {name}.")
185-
elif isinstance(value, str):
186-
match = re.match(
187-
r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", value
188-
)
189-
try:
190-
datetime.date(**{k: int(v) for k, v in match.groupdict().items()})
191-
except Exception as exc:
192-
raise ValueError(f"Invalid {name} format: {exc}")
193-
194-
if name == "Language" and not is_valid_iso_639_3(value):
195-
raise ValueError(f"{value} is not ISO-639-3.")
196-
197-
if name == "Counter":
198-
raise ValueError(f"{name} cannot be set. libzim sets it.")
199-
200-
if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
201-
raise ValueError(f"{name} is too long.")
202-
203-
if (
204-
name == "LongDescription"
205-
and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
206-
):
207-
raise ValueError(f"{name} is too long.")
208-
209-
if name == "Tags" and (
210-
not isinstance(value, IterableT)
211-
or not all([isinstance(tag, str) for tag in value])
212-
):
213-
raise ValueError(f"Invalid type(s) for {name}")
214-
215-
if name.startswith("Illustration_"):
216-
match = ILLUSTRATIONS_METADATA_RE.match(name)
217-
if match and not is_valid_image(
218-
image=value,
219-
imformat="PNG",
220-
size=(
221-
int(match.groupdict()["width"]),
222-
int(match.groupdict()["height"]),
223-
),
224-
):
225-
raise ValueError(
226-
f"{name} is not a "
227-
f"{match.groupdict()['width']}x{match.groupdict()['height']} "
228-
"PNG Image"
229-
)
165+
validate_required_values(name, value)
166+
validate_standard_str_types(name, value)
167+
168+
validate_title(name, value)
169+
validate_date(name, value)
170+
validate_language(name, value)
171+
validate_counter(name, value)
172+
validate_description(name, value)
173+
validate_longdescription(name, value)
174+
validate_tags(name, value)
175+
validate_illustrations(name, value)
230176

231177
def add_metadata(
232178
self,

src/zimscraperlib/zim/metadata.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import datetime
2+
import re
3+
from collections.abc import Iterable as IterableT
4+
from typing import Any, Iterable, Union
5+
6+
from ..constants import (
7+
ILLUSTRATIONS_METADATA_RE,
8+
MANDATORY_ZIM_METADATA_KEYS,
9+
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
10+
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
11+
)
12+
from ..i18n import is_valid_iso_639_3
13+
from ..image.probing import is_valid_image
14+
15+
16+
def validate_required_values(name: str, value: Any):
    """Refuse empty values for mandatory metadata.

    The spec does not demand a value, but an empty mandatory metadata
    is of no use, so it is rejected here.

    Raises:
        ValueError: `name` is in MANDATORY_ZIM_METADATA_KEYS and `value`
            is falsy (empty string, None, ...).
    """
    if name not in MANDATORY_ZIM_METADATA_KEYS:
        return
    if not value:
        raise ValueError(f"Missing value for {name}")
20+
21+
22+
def validate_standard_str_types(name: str, value: str):
    """Ensure standard text metadata are given as `str`.

    Names outside the list below are ignored by this validator.

    Raises:
        ValueError: `name` is one of the standard text metadata and
            `value` is not a `str` instance.
    """
    # each name appears exactly once (the list used to repeat "Relation")
    standard_str_names = (
        "Name",
        "Title",
        "Creator",
        "Publisher",
        "Description",
        "LongDescription",
        "License",
        "Relation",
        "Flavour",
        "Source",
        "Scraper",
    )
    if name in standard_str_names and not isinstance(value, str):
        raise ValueError(f"Invalid type for {name}")
39+
40+
41+
def validate_title(name: str, value: str):
    """Ensure the Title metadata is at most 30 items long (per `len`).

    Any other metadata name is ignored.

    Raises:
        ValueError: `name` is "Title" and `len(value)` exceeds 30.
    """
    if name != "Title":
        return
    if len(value) > 30:
        raise ValueError(f"{name} is too long.")
45+
46+
47+
def validate_date(name: str, value: Union[datetime.datetime, datetime.date, str]):
    """Ensure the Date metadata is a date object or a YYYY-MM-DD string.

    Any other metadata name is ignored. `datetime.date` and
    `datetime.datetime` values are accepted as-is. A string must start
    with a `YYYY-MM-DD` pattern designating a real calendar date
    (trailing characters after the day are tolerated, as before).

    Raises:
        ValueError: `name` is "Date" and `value` has an unsupported
            type, does not start with YYYY-MM-DD, or designates an
            impossible date (e.g. month 13).
    """
    if name != "Date":
        return
    if not isinstance(value, (datetime.datetime, datetime.date, str)):
        raise ValueError(f"Invalid type for {name}.")
    if not isinstance(value, str):
        return

    match = re.match(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", value)
    if match is None:
        # explicit failure instead of relying on an AttributeError raised
        # by calling .groupdict() on None
        raise ValueError(
            f"Invalid {name} format: {value!r} does not start with YYYY-MM-DD"
        )
    try:
        # building the date validates ranges (month 1-12, day fits the month…)
        datetime.date(**{k: int(v) for k, v in match.groupdict().items()})
    except ValueError as exc:
        raise ValueError(f"Invalid {name} format: {exc}") from exc
58+
59+
60+
def validate_language(name: str, value: Union[Iterable[str], str]):
    """Ensure the Language metadata is accepted by `is_valid_iso_639_3`.

    Any other metadata name is ignored. `value` is handed over untouched
    to `is_valid_iso_639_3`.
    NOTE(review): the annotation allows an iterable of codes; whether the
    helper supports more than a single code is not visible here — confirm.

    Raises:
        ValueError: `name` is "Language" and the helper rejects `value`.
    """
    if name != "Language":
        return
    if not is_valid_iso_639_3(value):
        raise ValueError(f"{value} is not ISO-639-3.")
64+
65+
66+
def validate_counter(name: str, value: str):
    """Forbid setting the Counter metadata by hand.

    `value` is never inspected: the mere presence of a "Counter" name is
    an error, as libzim is the one setting it.

    Raises:
        ValueError: `name` is "Counter".
    """
    if name != "Counter":
        return
    raise ValueError(f"{name} cannot be set. libzim sets it.")
70+
71+
72+
def validate_description(name: str, value: str):
    """Ensure the Description metadata is within the allowed length.

    Any other metadata name is ignored. The limit is
    MAXIMUM_DESCRIPTION_METADATA_LENGTH, compared against `len(value)`.

    Raises:
        ValueError: `name` is "Description" and `value` is longer than
            the limit.
    """
    if name != "Description":
        return
    if len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
        raise ValueError(f"{name} is too long.")
76+
77+
78+
def validate_longdescription(name: str, value: str):
    """Ensure the LongDescription metadata is within the allowed length.

    Any other metadata name is ignored. The limit is
    MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, compared against
    `len(value)`.

    Raises:
        ValueError: `name` is "LongDescription" and `value` is longer
            than the limit.
    """
    if name != "LongDescription":
        return
    if len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH:
        raise ValueError(f"{name} is too long.")
85+
86+
87+
def validate_tags(name: str, value: Union[Iterable[str], str]):
    """Ensure the Tags metadata is an iterable made only of strings.

    Any other metadata name is ignored. A plain `str` passes as well: it
    is an iterable whose items (its characters) are all `str`.
    Note that checking the items iterates over `value`.

    Raises:
        ValueError: `name` is "Tags" and `value` is not iterable or
            holds at least one non-`str` item.
    """
    if name != "Tags":
        return
    # generator expression: no throwaway list, stops at first offending item
    if not isinstance(value, IterableT) or not all(
        isinstance(tag, str) for tag in value
    ):
        raise ValueError(f"Invalid type(s) for {name}")
94+
95+
96+
def validate_illustrations(name: str, value: bytes):
    """Ensure Illustration_* metadata are PNG images of the advertised size.

    Only names starting with "Illustration_" and matching
    ILLUSTRATIONS_METADATA_RE are checked; anything else is ignored.
    The width and height captured from the name are passed, along with
    the "PNG" format, to `is_valid_image`.

    Raises:
        ValueError: `is_valid_image` rejects `value` for the format and
            size derived from `name`.
    """
    if not name.startswith("Illustration_"):
        return

    match = ILLUSTRATIONS_METADATA_RE.match(name)
    if not match:
        # prefixed but not matching the pattern: left unchecked
        return

    dimensions = match.groupdict()
    width, height = dimensions["width"], dimensions["height"]
    if not is_valid_image(
        image=value, imformat="PNG", size=(int(width), int(height))
    ):
        raise ValueError(f"{name} is not a {width}x{height} PNG Image")

0 commit comments

Comments
 (0)