Skip to content

Commit 9d531a2

Browse files
committed
Improvements to Creator API
- Requiring libzim 3.0 to support setting LongDescription metadata - Exposing Creator.validate_metadata - Expanded validate_metadata to support all current checks for mandatory and recommendations - Added tests for all validate_metadata cases - Exposing max length for (Long)Description so scrapers can use it in their entrypoints - Marking `Creator.config_indexing`'s `language` argument optional (when disabling) - Only indexing on start() if config_indexing() was not already called manually - Fixed default dev illustration to be 48x48 - Added new image format and size validation check in image.probing module - Fixed test image (commons.png) to be 48x48 - Adapted tests for commons48
1 parent cd9c63f commit 9d531a2

File tree

13 files changed

+312
-57
lines changed

13 files changed

+312
-57
lines changed

CHANGELOG.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,36 @@ All notable changes to this project are documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.5.0).
77

8+
## [Unreleased]
9+
10+
⚠️ Warning: this release introduces several API changes to `zim.creator.Creator` and `zim.filesystem.make_zim_file`
11+
12+
### Added
13+
14+
- `zim.creator.Creator.config_metadata` method (returning Self) exposing all mandatory Metadata, all standard ones and allowing extra text metadata.
15+
- `zim.creator.Creator.config_dev_metadata` method setting stub metadata for all mandatory ones (allowing overrides)
16+
- `zim.creator.Creator.validate_metadata` (called on `start`) to verify metadata respects the spec (and its recommendations)
17+
- `zim.filesystem.make_zim_file` accepts a new optional `long_description` param.
18+
- `i18n.is_valid_iso_639_3` to check ISO-639-3 codes
19+
- `image.probing.is_valid_image` to check Image format and size
20+
21+
### Changed
22+
23+
- `zim.creator.Creator` `main_path` argument now mandatory
24+
- `zim.creator.Creator.start` now fails on missing required or invalid metadata
25+
- `zim.creator.Creator.add_metadata` now enforces validation checks
26+
- `zim.filesystem.make_zim_file` renamed its `favicon_path` param to `illustration_path`
27+
- `zim.creator.Creator.config_indexing` `language` argument now optional when `indexing=False`
28+
- `zim.creator.Creator.config_indexing` now validates `language` is ISO-639-3 when `indexing=True`
29+
30+
### Removed
31+
32+
- `zim.creator.Creator.update_metadata`. See `.config_metadata()` instead
33+
- `zim.creator.Creator` `language` argument. See `.config_metadata()` instead
34+
- `zim.creator.Creator` keyword arguments. See `.config_metadata()` instead
35+
- `zim.creator.Creator.add_default_illustration`. See `.config_metadata()` instead
36+
- `zim.archive.Archive.media_counter` (deprecated in `2.0.0`)
37+
838
## [2.1.0] - 2023-03-06
939

1040
## Added

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ colorthief==0.2.1
44
python-resize-image>=1.1.19,<1.2
55
Babel>=2.9,<3.0
66
file-magic>=0.4.0,<0.5
7-
libzim>=2.1.0,<3.0
7+
libzim>=3.0.0,<3.1
88
beautifulsoup4>=4.9.3,<4.10
99
lxml>=4.6.3,<4.10
1010
optimize-images>=1.3.6,<1.6

src/zimscraperlib/constants.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import base64
66
import pathlib
7+
import re
78

89
ROOT_DIR = pathlib.Path(__file__).parent
910
NAME = pathlib.Path(__file__).parent.name
@@ -39,9 +40,17 @@
3940
"Date": "2023-01-01",
4041
"Description": "Test Description",
4142
"Language": "fra",
43+
# blank 48x48 transparent PNG
4244
"Illustration_48x48_at_1": base64.b64decode(
43-
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAGXRFWHRTb2Z0d2FyZQBB"
44-
"ZG9iZSBJbWFnZVJlYWR5ccllPAAAAA9JREFUeNpi+P//P0CAAQAF/gL+Lc6J7gAAAABJ"
45-
"RU5ErkJggg=="
45+
"iVBORw0KGgoAAAANSUhEUgAAADAAAAAwAQMAAABtzGvEAAAAGXRFWHRTb2Z0d2FyZQBB"
46+
"ZG9iZSBJbWFnZVJlYWR5ccllPAAAAANQTFRFR3BMgvrS0gAAAAF0Uk5TAEDm2GYAAAAN"
47+
"SURBVBjTY2AYBdQEAAFQAAGn4toWAAAAAElFTkSuQmCC"
4648
),
4749
}
50+
51+
# Maximum number of characters accepted for the Description metadata
MAXIMUM_DESCRIPTION_METADATA_LENGTH = 80
# Maximum number of characters accepted for the LongDescription metadata
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH = 4000

# Matches illustration metadata names such as "Illustration_48x48@1".
# The openZIM spec names them Illustration_{width}x{height}@{scale}, so the
# first number is the width and the second one the height.
ILLUSTRATIONS_METADATA_RE = re.compile(
    r"^Illustration_(?P<width>\d+)x(?P<height>\d+)@(?P<scale>\d+)$"
)

src/zimscraperlib/image/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# flake8: noqa
66
from .convertion import convert_image
77
from .optimization import optimize_image
8+
from .probing import is_valid_image
89
from .transformation import resize_image
910

10-
__all__ = ["convert_image", "optimize_image", "resize_image"]
11+
__all__ = ["convert_image", "is_valid_image", "optimize_image", "resize_image"]

src/zimscraperlib/image/probing.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,22 @@ def format_for(src: Union[pathlib.Path, io.BytesIO], from_suffix: bool = True) -
6060

6161
init_pil()
6262
return ext_fmt_map[src.suffix] # might raise KeyError on unknown extension
63+
64+
65+
def is_valid_image(
    image: Union[pathlib.Path, io.IOBase, bytes],
    imformat: str,
    size: Optional[Tuple[int, int]] = None,
) -> bool:
    """Whether image is a valid imformat (PNG) image, optionally of requested size

    Parameters:
        image: path to, file-like object of, or raw bytes of the image to probe
        imformat: format name PIL is expected to report for the image (ex. "PNG")
        size: exact size PIL is expected to report, as (width, height).
            Not checked when omitted.

    Returns:
        True if PIL opens the image and both format and (if requested) size
        match. False otherwise, including when opening the image fails."""
    if isinstance(image, bytes):
        image = io.BytesIO(image)
    try:
        # context manager releases the file handle PIL opens when given a path
        with PIL.Image.open(image) as img:
            if img.format != imformat:
                return False
            if size and img.size != size:
                return False
    except Exception:
        # deliberate best-effort: whatever PIL cannot open is not a valid image
        return False
    return True

src/zimscraperlib/zim/creator.py

Lines changed: 114 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,22 @@
2222
import pathlib
2323
import re
2424
import weakref
25+
from collections.abc import Iterable as IterableT
2526
from typing import Any, Callable, Iterable, Optional, Tuple, Union
2627

2728
import libzim.writer
2829

2930
from ..constants import (
3031
DEFAULT_DEV_ZIM_METADATA,
3132
FRONT_ARTICLE_MIMETYPES,
33+
ILLUSTRATIONS_METADATA_RE,
3234
MANDATORY_ZIM_METADATA_KEYS,
35+
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
36+
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
3337
)
3438
from ..filesystem import delete_callback, get_content_mimetype, get_file_mimetype
39+
from ..i18n import is_valid_iso_639_3
40+
from ..image.probing import is_valid_image
3541
from ..types import get_mime_for_name
3642
from .items import StaticItem
3743

@@ -91,6 +97,7 @@ def __init__(
9197
):
9298
super().__init__(filename=filename)
9399
self._metadata = dict()
100+
self.__indexing_configured = False
94101
self.can_finish = True
95102

96103
self.set_mainpath(main_path)
@@ -105,18 +112,28 @@ def __init__(
105112
self.workaround_nocancel = workaround_nocancel
106113
self.ignore_duplicates = ignore_duplicates
107114

115+
def config_indexing(self, indexing: bool, language: Optional[str] = None):
    """Enable or disable full-text and title indexing of entries

    When language is not given, the value of the Language metadata is used
    (or "" if that is not set either). Enabling indexing requires the
    resulting language to be a valid ISO-639-3 code (ValueError otherwise).

    Returns self to allow chaining"""
    if not language:
        language = self._metadata.get("Language", "")

    if indexing:
        # language only matters (and is only checked) when indexing is on
        if not is_valid_iso_639_3(language):
            raise ValueError("Not a valid ISO-639-3 language code")

    super().config_indexing(indexing, language)
    # remembered so start() does not configure indexing a second time
    self.__indexing_configured = True
    return self
125+
108126
def start(self):
109-
if not all(
110-
[
111-
key in self._metadata.keys() and self._metadata.get(key, None)
112-
for key in MANDATORY_ZIM_METADATA_KEYS
113-
]
114-
):
127+
if not all([self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS]):
115128
raise ValueError("Mandatory metadata are not all set.")
116129

117130
for name, value in self._metadata.items():
118131
if value:
119-
self._validate_metadata(name, value)
132+
self.validate_metadata(name, value)
133+
134+
language = self._metadata.get("Language", "").split(",")
135+
if language[0] and not self.__indexing_configured:
136+
self.config_indexing(True, language[0])
120137

121138
super().__enter__()
122139

@@ -128,15 +145,97 @@ def start(self):
128145

129146
return self
130147

131-
def _validate_metadata(self, name, value):
148+
def validate_metadata(
    self,
    name: str,
    value: Union[bytes, str, datetime.datetime, datetime.date, Iterable[str]],
):
    """Ensures metadata value for name is conform with the openZIM spec on Metadata

    Also enforces recommendations
    See https://wiki.openzim.org/wiki/Metadata

    Parameters:
        name: name of the metadata (ex. "Title" or "Illustration_48x48@1")
        value: value intended to be set for that metadata

    Raises:
        ValueError: value is missing, of an unexpected type, too long or
            malformed for that metadata name"""

    # spec doesnt require any value but empty strings are not useful
    if name in MANDATORY_ZIM_METADATA_KEYS and not value:
        raise ValueError(f"Missing value for {name}")

    # most required/standard metadata are plain text
    if name in (
        "Name",
        "Title",
        "Creator",
        "Publisher",
        "Description",
        "LongDescription",
        "Language",
        "License",
        "Relation",
        "Flavour",
        "Source",
        "Scraper",
    ) and not isinstance(value, str):
        raise ValueError(f"Invalid type for {name}")

    if name == "Title" and len(value) > 30:
        raise ValueError(f"{name} is too long.")

    if name == "Date":
        if not isinstance(value, (datetime.datetime, datetime.date, str)):
            raise ValueError(f"Invalid type for {name}.")
        if isinstance(value, str):
            # fullmatch so that trailing characters are rejected as well
            match = re.fullmatch(
                r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})", value
            )
            if match is None:
                raise ValueError(f"Invalid {name} format: expected YYYY-MM-DD")
            try:
                # building the date rejects impossible ones (ex. 2023-02-30)
                datetime.date(**{k: int(v) for k, v in match.groupdict().items()})
            except ValueError as exc:
                raise ValueError(f"Invalid {name} format: {exc}") from exc

    if name == "Language":
        # value can be a comma-separated list of codes (start() indexes using
        # the first one) so every single code has to be valid
        for code in value.split(","):
            if not is_valid_iso_639_3(code):
                raise ValueError(f"{value} is not ISO-639-3.")

    if name == "Counter":
        raise ValueError(f"{name} cannot be set. libzim sets it.")

    if name == "Description" and len(value) > MAXIMUM_DESCRIPTION_METADATA_LENGTH:
        raise ValueError(f"{name} is too long.")

    if (
        name == "LongDescription"
        and len(value) > MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH
    ):
        raise ValueError(f"{name} is too long.")

    if name == "Tags" and (
        not isinstance(value, IterableT)
        or not all(isinstance(tag, str) for tag in value)
    ):
        raise ValueError(f"Invalid type(s) for {name}")

    if name.startswith("Illustration_"):
        # only names matching the illustration pattern carry a size to check
        match = ILLUSTRATIONS_METADATA_RE.match(name)
        if match:
            width = match.groupdict()["width"]
            height = match.groupdict()["height"]
            if not is_valid_image(
                image=value,
                imformat="PNG",
                size=(int(width), int(height)),
            ):
                raise ValueError(f"{name} is not a {width}x{height} PNG Image")
231+
def add_metadata(
    self,
    name: str,
    content: Union[str, bytes, datetime.date, datetime.datetime],
    mimetype: str = "text/plain;charset=UTF-8",
):
    """Checks content with validate_metadata() then adds it as metadata name

    Validation happens first: should it raise, nothing gets added"""
    self.validate_metadata(name=name, value=content)
    super().add_metadata(name, content, mimetype)
140239

141240
def config_metadata(
142241
self,
@@ -158,17 +257,7 @@ def config_metadata(
158257
Relation: Optional[str] = None,
159258
**extras: str,
160259
):
161-
"""
162-
A chaining functions which configures the metadata of the Creator class.
163-
You must set all mandatory metadata in this phase.
164-
165-
Parameters:
166-
check out: https://wiki.openzim.org/wiki/Metadata
167-
all the extra metadata must be plain text.
168-
169-
Returns:
170-
Self
171-
"""
260+
"""Sets all mandatory Metadata as well as standard and any other text ones"""
172261
self._metadata.update(
173262
{
174263
"Name": Name,
@@ -189,18 +278,10 @@ def config_metadata(
189278
}
190279
)
191280
self._metadata.update(extras)
192-
language = self._metadata.get("Language", "").split(",")
193-
self.config_indexing(True, language[0])
194-
195281
return self
196282

197283
def config_dev_metadata(self, **extras: str):
    """Configures metadata from the dev defaults, extras taking precedence

    Returns what config_metadata() returns"""
    # merged into a new dict so DEFAULT_DEV_ZIM_METADATA is left untouched
    return self.config_metadata(**{**DEFAULT_DEV_ZIM_METADATA, **extras})

tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,11 @@ def file_src(fname):
8585

8686
@pytest.fixture(scope="module")
def png_image():
    """file_src() of the commons48.png test file"""
    fname = "commons48.png"
    return file_src(fname)
89+
90+
91+
@pytest.fixture(scope="module")
def png_image2():
    """file_src() of the commons.png test file"""
    fname = "commons.png"
    return file_src(fname)
8994

9095

tests/files/commons48.png

3.2 KB
Loading

0 commit comments

Comments
 (0)