diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dad70bf..bbbefc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,15 +8,15 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
- rev: '25.1.0'
+ rev: '25.9.0'
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.8
+ rev: v0.13.1
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
- rev: v1.1.400
+ rev: v1.1.405
hooks:
- id: pyright
name: pyright (system)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ee536d4..40aca14 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,18 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
-### Added
+### Added
- New `zim.dedup.Deduplicator` class to handle automatic deduplication of content before adding to the ZIM (#33)
### Changed
-- Upgrade dependencies, especially wombat 3.8.12 (#262)
+- Upgrade dependencies, especially wombat 3.9.1 (#262, #263)
- Backport changes in wabac.js around JS rewriting rules (#259)
### Fixed
- JS rewriting abusively rewrite import function (#255)
+- Test about badly escaped `src` attribute in HTML rewriting is failing (#264)
### Added
diff --git a/openzim.toml b/openzim.toml
index c01d4b0..6bea206 100644
--- a/openzim.toml
+++ b/openzim.toml
@@ -6,5 +6,5 @@ execute_after=[
[files.assets.actions."wombat.js"]
action="get_file"
-source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.8.12/dist/wombat.js"
+source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.9.1/dist/wombat.js"
target_file="wombat.js"
diff --git a/pyproject.toml b/pyproject.toml
index c2f64c7..754e150 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
"python-magic>=0.4.3,<0.5",
"libzim>=3.4.0,<4.0",
"beautifulsoup4>=4.9.3,<5.0",
- "lxml>=4.6.3,<6.0",
+ "lxml>=4.6.3,<7.0",
"optimize-images>=1.3.6,<2.0",
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
# promise is that they will never (or always) break the API, and the API is very
@@ -26,11 +26,11 @@ dependencies = [
"regex>=2020.7.14",
"pymupdf>=1.24.0,<2.0",
"CairoSVG>=2.2.0,<3.0",
- "beartype>=0.19,<0.21",
+ "beartype>=0.19,<0.22",
# youtube-dl should be updated as frequently as possible
"yt-dlp",
"pillow>=7.0.0,<12.0",
- "urllib3>=1.26.5,<2.5.0",
+ "urllib3>=1.26.5,<2.6.0",
"piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway
"idna>=2.5,<4.0",
"xxhash>=2.0,<4.0",
@@ -58,30 +58,30 @@ scripts = [
]
lint = [
- "black==25.1.0",
- "ruff==0.11.8",
+ "black==25.9.0",
+ "ruff==0.13.1",
]
check = [
- "pyright==1.1.400",
- "pytest==8.3.5",
+ "pyright==1.1.405",
+ "pytest==8.4.2",
]
test = [
- "pytest==8.3.5",
- "pytest-mock==3.14.0",
- "coverage==7.8.0",
+ "pytest==8.4.2",
+ "pytest-mock==3.15.1",
+ "coverage==7.10.7",
]
docs = [
"mkdocs==1.6.1",
- "mkdocstrings[python]==0.29.1",
- "mkdocs-material==9.6.12",
- "pymdown-extensions==10.15",
+ "mkdocs-include-markdown-plugin==7.1.7",
+ "mkdocs-material==9.6.20",
+ "mkdocstrings[python]==0.30.1",
+ "pymdown-extensions==10.16.1",
"mkdocs-gen-files==0.5.0",
"mkdocs-literate-nav==0.6.2",
- "mkdocs-include-markdown-plugin==7.1.5",
]
dev = [
- "ipython==9.2.0",
- "pre-commit==4.2.0",
+ "ipython==9.5.0",
+ "pre-commit==4.3.0",
"zimscraperlib[scripts]",
"zimscraperlib[lint]",
"zimscraperlib[test]",
diff --git a/src/zimscraperlib/download.py b/src/zimscraperlib/download.py
index ca0a962..7b08e0e 100644
--- a/src/zimscraperlib/download.py
+++ b/src/zimscraperlib/download.py
@@ -34,7 +34,9 @@ def shutdown(self) -> None:
self.executor.shutdown(wait=True)
def _run_youtube_dl(self, url: str, options: dict[str, Any]) -> None:
- with youtube_dl.YoutubeDL(options) as ydl:
+ with youtube_dl.YoutubeDL(
+ options # pyright: ignore[reportArgumentType]
+ ) as ydl:
ydl.download([url]) # pyright: ignore[reportUnknownMemberType]
def download(
diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py
index 79beae7..7396ecc 100644
--- a/src/zimscraperlib/i18n.py
+++ b/src/zimscraperlib/i18n.py
@@ -164,6 +164,26 @@ def __eq__(self, value: object) -> bool:
and self.native == getattr(value, "native", None)
)
+    def __hash__(self) -> int:
+        """Hash of the ISO 639 codes plus the english and native names.
+
+        Attributes that are not set count as None. A tuple is hashed instead of
+        a "$"-joined string so that field boundaries stay unambiguous.
+
+        NOTE(review): keep this field list in sync with the fields `__eq__` compares.
+        """
+        return hash(
+            (
+                getattr(self, "iso_639_1", None),
+                getattr(self, "iso_639_2b", None),
+                getattr(self, "iso_639_2t", None),
+                getattr(self, "iso_639_3", None),
+                getattr(self, "iso_639_5", None),
+                getattr(self, "english", None),
+                getattr(self, "native", None),
+            )
+        )
+
def find_language_names(query: str) -> tuple[str, str]:
"""(native, english) language names for query"""
diff --git a/src/zimscraperlib/image/probing.py b/src/zimscraperlib/image/probing.py
index c84e823..03a331d 100644
--- a/src/zimscraperlib/image/probing.py
+++ b/src/zimscraperlib/image/probing.py
@@ -5,9 +5,16 @@
import colorthief # pyright: ignore[reportMissingTypeStubs]
import PIL.Image
+from PIL.Image import EXTENSION as PIL_FMT_EXTENSION
+from PIL.Image import init as init_pil
from zimscraperlib.filesystem import get_content_mimetype, get_file_mimetype
+init_pil() # populate the PIL_FMT_EXTENSION dictionary
+
+known_extensions = {".svg": "SVG"}
+known_extensions.update(PIL_FMT_EXTENSION)
+
def get_colors(
src: pathlib.Path, *, use_palette: bool | None = True
@@ -82,13 +89,6 @@ def format_for(
"Cannot guess image format from file suffix when byte array is passed"
)
- from PIL.Image import EXTENSION as PIL_FMT_EXTENSION
- from PIL.Image import init as init_pil
-
- init_pil() # populate the PIL_FMT_EXTENSION dictionary
-
- known_extensions = {".svg": "SVG"}
- known_extensions.update(PIL_FMT_EXTENSION)
return known_extensions[src.suffix] if src.suffix in known_extensions else None
diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py
index 2f76c06..1947286 100644
--- a/src/zimscraperlib/inputs.py
+++ b/src/zimscraperlib/inputs.py
@@ -2,7 +2,6 @@
import shutil
import tempfile
from collections.abc import Iterable
-from typing import TypeVar
from zimscraperlib import logger
from zimscraperlib.constants import DEFAULT_USER_AGENT
@@ -14,8 +13,6 @@
)
from zimscraperlib.download import stream_file
-T = TypeVar("T")
-
def handle_user_provided_file(
source: pathlib.Path | str | None = None,
@@ -136,6 +133,6 @@ def compute_tags(
}
-def unique_values(items: list[T]) -> list[T]:
+def unique_values[T](items: list[T]) -> list[T]:
"""Return unique values in input list while preserving list order"""
return list(dict.fromkeys(items))
diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py
index c58c45c..ca03ba0 100644
--- a/src/zimscraperlib/zim/creator.py
+++ b/src/zimscraperlib/zim/creator.py
@@ -381,6 +381,7 @@ def add_item_for(
def add_item( # pyright: ignore[reportIncompatibleMethodOverride]
self,
item: libzim.writer.Item,
+ *,
duplicate_ok: bool | None = None,
callbacks: list[Callback] | Callback | None = None,
):
@@ -417,6 +418,7 @@ def add_redirect(
path: str,
target_path: str,
title: str | None = "",
+ *,
is_front: bool | None = None,
duplicate_ok: bool | None = None,
):
diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py
index f0c4802..767747e 100644
--- a/src/zimscraperlib/zim/items.py
+++ b/src/zimscraperlib/zim/items.py
@@ -235,6 +235,7 @@ def __init__(
title: str | None = None,
mimetype: str | None = None,
hints: dict[libzim.writer.Hint, int] | None = None,
+ *,
use_disk: bool | None = None,
**kwargs: Any,
):
diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py
index 24bed69..1b3cb12 100644
--- a/src/zimscraperlib/zim/metadata.py
+++ b/src/zimscraperlib/zim/metadata.py
@@ -163,11 +163,6 @@ def get_libzim_value(self) -> bytes: ...
# Alias for convenience when function accept any metadata
AnyMetadata = MetadataBase[Any]
-# TypeVar bounded to subclasses of GenericMetadata, used by class decorators so that
-# they properly accommodate to the class they are used on while still knowing they have
-# access to all attributes of the MetadataBase class
-U = TypeVar("U", bound=AnyMetadata)
-
def clean_str(value: str) -> str:
"""Clean a string value for unwanted control characters and strip white chars"""
@@ -179,39 +174,39 @@ def nb_grapheme_for(value: str) -> int:
return len(regex.findall(r"\X", value))
-def mandatory(cls: type[U]):
+def mandatory[U: AnyMetadata](cls: type[U]):
"""Marks a Metadata mandatory: must be set to please Creator and cannot be empty"""
cls.is_required = True
cls.empty_allowed = False
return cls
-def allow_empty(cls: type[U]):
+def allow_empty[U: AnyMetadata](cls: type[U]):
"""Whether input can be blank"""
cls.empty_allowed = True
return cls
-def allow_duplicates(cls: type[U]):
+def allow_duplicates[U: AnyMetadata](cls: type[U]):
"""Whether list input can accept duplicate values"""
cls.duplicates_allowed = True
return cls
-def deduplicate(cls: type[U]):
+def deduplicate[U: AnyMetadata](cls: type[U]):
"""Whether duplicates in list inputs should be reduced"""
cls.duplicates_allowed = True
cls.require_deduplication = True
return cls
-def only_lang_codes(cls: type[U]):
+def only_lang_codes[U: AnyMetadata](cls: type[U]):
"""Whether list input should be checked to only accept ISO-639-1 codes"""
cls.oz_only_iso636_3_allowed = True
return cls
-def x_protected(cls: type[U]):
+def x_protected[U: AnyMetadata](cls: type[U]):
"""Whether metadata name should be checked for collision with reserved names
when applying recommendations"""
@@ -219,7 +214,7 @@ def x_protected(cls: type[U]):
return cls
-def x_prefixed(cls: type[U]):
+def x_prefixed[U: AnyMetadata](cls: type[U]):
"""Whether metadata names should be automatically X-Prefixed"""
cls.oz_x_protected = False
cls.oz_x_prefixed = True
diff --git a/tests/conftest.py b/tests/conftest.py
index 58e8daa..510ff3d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,8 @@
import pytest
+from zimscraperlib.download import stream_file
+
def pytest_addoption(parser: pytest.Parser):
parser.addoption(
@@ -62,12 +64,12 @@ def timeout_url() -> str:
@pytest.fixture(scope="module")
def png_image_url() -> str:
- return "https://commons.wikimedia.org/static/images/project-logos/commonswiki.png"
+ return "https://farm.openzim.org/assets/favicon-96x96.png"
@pytest.fixture(scope="module")
def gzip_html_url() -> str:
- return "https://en.wikipedia.org/wiki/Main_Page"
+ return "https://kiwix.org/en"
@pytest.fixture(scope="module")
@@ -162,7 +164,6 @@ def valid_user_agent():
@pytest.fixture(scope="session")
def small_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
- from zimscraperlib.download import stream_file
dst = pathlib.Path(tmpdir_factory.mktemp("data") / "small.zim")
stream_file(
@@ -174,7 +175,6 @@ def small_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
@pytest.fixture(scope="session")
def ns_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
- from zimscraperlib.download import stream_file
dst = pathlib.Path(tmpdir_factory.mktemp("data") / "ns.zim")
stream_file(
@@ -187,7 +187,6 @@ def ns_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
@pytest.fixture(scope="session")
def real_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
- from zimscraperlib.download import stream_file
dst = pathlib.Path(tmpdir_factory.mktemp("data") / "small.zim")
stream_file(
diff --git a/tests/download/test_download.py b/tests/download/test_download.py
index 8f138ef..3c47357 100644
--- a/tests/download/test_download.py
+++ b/tests/download/test_download.py
@@ -8,7 +8,7 @@
import pytest
import requests
import requests.structures
-from yt_dlp import DownloadError # pyright: ignore[reportMissingTypeStubs]
+from yt_dlp.utils import DownloadError
from zimscraperlib.constants import DEFAULT_WEB_REQUESTS_TIMEOUT
from zimscraperlib.download import (
diff --git a/tests/image/test_illustration.py b/tests/image/test_illustration.py
index b9ae0a7..461b659 100644
--- a/tests/image/test_illustration.py
+++ b/tests/image/test_illustration.py
@@ -40,7 +40,7 @@ def test_get_zim_illustration(
def test_get_missing_user_zim_illustration():
- with pytest.raises(Exception, match="missing.png could not be found"):
+ with pytest.raises(Exception, match=r"missing\.png could not be found"):
get_zim_illustration("./missing.png")
diff --git a/tests/rewriting/test_html_rewriting.py b/tests/rewriting/test_html_rewriting.py
index 48c6db1..36d7841 100644
--- a/tests/rewriting/test_html_rewriting.py
+++ b/tests/rewriting/test_html_rewriting.py
@@ -789,7 +789,7 @@ def test_rewrite_base_href(rewrite_base_href_content: ContentForTests):
),
pytest.param(
"""
""",
- """
""",
+ """
""",
id="badly_escaped_src",
),
],
@@ -1067,7 +1067,7 @@ def test_html_drop_rules(
def test_bad_html_drop_rules_argument_name():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* is unsupported in function"):
+ with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"):
@bad_rules.drop_attribute()
def bad_signature(foo: str) -> bool: # pyright: ignore[reportUnusedFunction]
@@ -1077,7 +1077,7 @@ def bad_signature(foo: str) -> bool: # pyright: ignore[reportUnusedFunction]
def test_bad_html_drop_rules_argument_type():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"):
+ with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"):
@bad_rules.drop_attribute()
def bad_signature( # pyright: ignore[reportUnusedFunction]
@@ -1231,7 +1231,7 @@ def notify(path: ZimPath):
def test_bad_html_attribute_rewrite_rules_argument_name():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* is unsupported in function"):
+ with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"):
@bad_rules.rewrite_attribute()
def bad_signature( # pyright: ignore[reportUnusedFunction]
@@ -1243,7 +1243,7 @@ def bad_signature( # pyright: ignore[reportUnusedFunction]
def test_bad_html_attribute_rewrite_rules_argument_type():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"):
+ with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"):
@bad_rules.rewrite_attribute()
def bad_signature( # pyright: ignore[reportUnusedFunction]
@@ -1375,7 +1375,7 @@ def test_html_tag_rewrite_rules(
def test_bad_html_tag_rewrite_rules_argument_name():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* is unsupported in function"):
+ with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"):
@bad_rules.rewrite_tag()
def bad_signature(foo: str) -> str: # pyright: ignore[reportUnusedFunction]
@@ -1385,7 +1385,7 @@ def bad_signature(foo: str) -> str: # pyright: ignore[reportUnusedFunction]
def test_bad_html_tag_rewrite_rules_argument_type():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"):
+ with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"):
@bad_rules.rewrite_tag()
def bad_signature(attrs: int) -> str: # pyright: ignore[reportUnusedFunction]
@@ -1463,7 +1463,7 @@ def notify(path: ZimPath):
def test_bad_html_data_rewrite_rules_argument_name():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* is unsupported in function"):
+ with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"):
@bad_rules.rewrite_data()
def bad_signature( # pyright: ignore[reportUnusedFunction]
@@ -1475,7 +1475,7 @@ def bad_signature( # pyright: ignore[reportUnusedFunction]
def test_bad_html_data_rewrite_rules_argument_type():
bad_rules = HTMLRewritingRules()
- with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"):
+ with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"):
@bad_rules.rewrite_data()
def bad_signature( # pyright: ignore[reportUnusedFunction]
diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py
index 7cc1d4c..ad2bf7b 100644
--- a/tests/zim/test_zim_creator.py
+++ b/tests/zim/test_zim_creator.py
@@ -331,7 +331,7 @@ def test_urlitem_html(tmp_path: pathlib.Path, gzip_html_url: str):
creator.add_item(URLItem(url=gzip_html_url))
zim = Archive(fpath)
- assert bytes(zim.get_item("wiki/Main_Page").content) == file_bytes
+ assert bytes(zim.get_item("en").content) == file_bytes
def test_urlitem_nonhtmlgzip(tmp_path: pathlib.Path, gzip_nonhtml_url: str):
@@ -362,10 +362,7 @@ def test_urlitem_binary(tmp_path: pathlib.Path, png_image_url: str):
creator.add_item(URLItem(url=png_image_url))
zim = Archive(fpath)
- assert (
- bytes(zim.get_item("static/images/project-logos/commonswiki.png").content)
- == file_bytes
- )
+ assert bytes(zim.get_item("assets/favicon-96x96.png").content) == file_bytes
def test_urlitem_staticcontent(tmp_path: pathlib.Path, gzip_nonhtml_url: str):
@@ -600,7 +597,7 @@ def test_ignore_duplicates(tmp_path: pathlib.Path):
def test_without_metadata(tmp_path: pathlib.Path):
- with pytest.raises(ValueError, match="Mandatory metadata are not all set."):
+ with pytest.raises(ValueError, match=r"Mandatory metadata are not all set\."):
Creator(tmp_path, "").start()
@@ -629,7 +626,7 @@ def test_start_logs_metadata_log_contents(
tmp_path: pathlib.Path,
ignore_metadata_conventions: NoneType, # noqa: ARG001
):
- mocked_logger.isEnabledFor.side_effect = ( # pyright: ignore[reportFunctionMemberAccess]
+ mocked_logger.isEnabledFor.side_effect = ( # pyright: ignore[reportAttributeAccessIssue]
lambda level: level == logging.DEBUG # pyright: ignore[reportUnknownLambdaType]
)
fpath = tmp_path / "test_config.zim"
@@ -690,7 +687,7 @@ def __str__(self):
"BadRawValue"
] = "Value"
creator._log_metadata() # pyright: ignore[reportPrivateUsage]
- mocked_logger.debug.assert_has_calls( # pyright: ignore[reportFunctionMemberAccess]
+ mocked_logger.debug.assert_has_calls( # pyright: ignore[reportUnknownMemberType, reportAttributeAccessIssue]
[
call("Metadata: BadRawValue is improper metadata type: str: Value"),
call("Metadata: Chars = šɔɛ"),