diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dad70bf..bbbefc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,15 +8,15 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: '25.1.0' + rev: '25.9.0' hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.8 + rev: v0.13.1 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.400 + rev: v1.1.405 hooks: - id: pyright name: pyright (system) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee536d4..40aca14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,18 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -### Added +### Added - New `zim.dedup.Deduplicator` class to handle automatic deduplication of content before adding to the ZIM (#33) ### Changed -- Upgrade dependencies, especially wombat 3.8.12 (#262) +- Upgrade dependencies, especially wombat 3.9.1 (#262, #263) - Backport changes in wabac.js around JS rewriting rules (#259) ### Fixed - JS rewriting abusively rewrite import function (#255) +- Test about badly escaped src in HTML is failing (#264) ### Added diff --git a/openzim.toml b/openzim.toml index c01d4b0..6bea206 100644 --- a/openzim.toml +++ b/openzim.toml @@ -6,5 +6,5 @@ execute_after=[ [files.assets.actions."wombat.js"] action="get_file" -source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.8.12/dist/wombat.js" +source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.9.1/dist/wombat.js" target_file="wombat.js" diff --git a/pyproject.toml b/pyproject.toml index c2f64c7..754e150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "python-magic>=0.4.3,<0.5", "libzim>=3.4.0,<4.0", "beautifulsoup4>=4.9.3,<5.0", - "lxml>=4.6.3,<6.0", + "lxml>=4.6.3,<7.0", "optimize-images>=1.3.6,<2.0", # regex has no upper-bound due to "date-based" release numbers, no semver, so their # promise is that they will never (or always) break the API, and the API is very @@ -26,11 +26,11 @@ dependencies = [ "regex>=2020.7.14", "pymupdf>=1.24.0,<2.0", "CairoSVG>=2.2.0,<3.0", - "beartype>=0.19,<0.21", + "beartype>=0.19,<0.22", # youtube-dl should be updated as frequently as possible "yt-dlp", "pillow>=7.0.0,<12.0", - "urllib3>=1.26.5,<2.5.0", + "urllib3>=1.26.5,<2.6.0", "piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway "idna>=2.5,<4.0", "xxhash>=2.0,<4.0", @@ -58,30 +58,30 @@ scripts = [ ] lint = [ - "black==25.1.0", - "ruff==0.11.8", + "black==25.9.0", + "ruff==0.13.1", ] check = [ - "pyright==1.1.400", - "pytest==8.3.5", + "pyright==1.1.405", + "pytest==8.4.2", ] test = [ - "pytest==8.3.5", - "pytest-mock==3.14.0", - "coverage==7.8.0", + "pytest==8.4.2", + "pytest-mock==3.15.1", + "coverage==7.10.7", ] docs = [ "mkdocs==1.6.1", - "mkdocstrings[python]==0.29.1", - "mkdocs-material==9.6.12", - "pymdown-extensions==10.15", + "mkdocs-include-markdown-plugin==7.1.7", + "mkdocs-material==9.6.20", + "mkdocstrings[python]==0.30.1", + "pymdown-extensions==10.16.1", "mkdocs-gen-files==0.5.0", "mkdocs-literate-nav==0.6.2", - "mkdocs-include-markdown-plugin==7.1.5", ] dev = [ - "ipython==9.2.0", - "pre-commit==4.2.0", + "ipython==9.5.0", + "pre-commit==4.3.0", "zimscraperlib[scripts]", "zimscraperlib[lint]", "zimscraperlib[test]", diff --git a/src/zimscraperlib/download.py b/src/zimscraperlib/download.py index ca0a962..7b08e0e 100644 --- a/src/zimscraperlib/download.py +++ b/src/zimscraperlib/download.py @@ -34,7 +34,9 @@ def shutdown(self) -> None: self.executor.shutdown(wait=True) def _run_youtube_dl(self, url: str, options: dict[str, Any]) -> None: - with youtube_dl.YoutubeDL(options) as ydl: + with youtube_dl.YoutubeDL( + options # pyright: ignore[reportArgumentType] + ) as ydl: ydl.download([url]) # pyright: ignore[reportUnknownMemberType] def download( diff --git a/src/zimscraperlib/i18n.py b/src/zimscraperlib/i18n.py index 79beae7..7396ecc 100644 --- a/src/zimscraperlib/i18n.py +++ b/src/zimscraperlib/i18n.py @@ -164,6 +164,17 @@ def __eq__(self, value: object) -> bool: and self.native == getattr(value, "native", None) ) + def __hash__(self): + return hash( + f"{getattr(self, "iso_639_1", None)}$" + f"{getattr(self, "iso_639_2b", None)}$" + f"{getattr(self, "iso_639_2t", None)}$" + f"{getattr(self, "iso_639_3", None)}$" + f"{getattr(self, "iso_639_5", None)}$" + f"{getattr(self, "english", None)}$" + f"{getattr(self, "native", None)}" + ) + def find_language_names(query: str) -> tuple[str, str]: """(native, english) language names for query""" diff --git a/src/zimscraperlib/image/probing.py b/src/zimscraperlib/image/probing.py index c84e823..03a331d 100644 --- a/src/zimscraperlib/image/probing.py +++ b/src/zimscraperlib/image/probing.py @@ -5,9 +5,16 @@ import colorthief # pyright: ignore[reportMissingTypeStubs] import PIL.Image +from PIL.Image import EXTENSION as PIL_FMT_EXTENSION +from PIL.Image import init as init_pil from zimscraperlib.filesystem import get_content_mimetype, get_file_mimetype +init_pil() # populate the PIL_FMT_EXTENSION dictionary + +known_extensions = {".svg": "SVG"} +known_extensions.update(PIL_FMT_EXTENSION) + def get_colors( src: pathlib.Path, *, use_palette: bool | None = True @@ -82,13 +89,6 @@ def format_for( "Cannot guess image format from file suffix when byte array is passed" ) - from PIL.Image import EXTENSION as PIL_FMT_EXTENSION - from PIL.Image import init as init_pil - - init_pil() # populate the PIL_FMT_EXTENSION dictionary - - known_extensions = {".svg": "SVG"} - known_extensions.update(PIL_FMT_EXTENSION) return known_extensions[src.suffix] if src.suffix in known_extensions else None diff --git a/src/zimscraperlib/inputs.py b/src/zimscraperlib/inputs.py index 2f76c06..1947286 100644 --- a/src/zimscraperlib/inputs.py +++ b/src/zimscraperlib/inputs.py @@ -2,7 +2,6 @@ import shutil import tempfile from collections.abc import Iterable -from typing import TypeVar from zimscraperlib import logger from zimscraperlib.constants import DEFAULT_USER_AGENT @@ -14,8 +13,6 @@ ) from zimscraperlib.download import stream_file -T = TypeVar("T") - def handle_user_provided_file( source: pathlib.Path | str | None = None, @@ -136,6 +133,6 @@ def compute_tags( } -def unique_values(items: list[T]) -> list[T]: +def unique_values[T](items: list[T]) -> list[T]: """Return unique values in input list while preserving list order""" return list(dict.fromkeys(items)) diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index c58c45c..ca03ba0 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -381,6 +381,7 @@ def add_item_for( def add_item( # pyright: ignore[reportIncompatibleMethodOverride] self, item: libzim.writer.Item, + *, duplicate_ok: bool | None = None, callbacks: list[Callback] | Callback | None = None, ): @@ -417,6 +418,7 @@ def add_redirect( path: str, target_path: str, title: str | None = "", + *, is_front: bool | None = None, duplicate_ok: bool | None = None, ): diff --git a/src/zimscraperlib/zim/items.py b/src/zimscraperlib/zim/items.py index f0c4802..767747e 100644 --- a/src/zimscraperlib/zim/items.py +++ b/src/zimscraperlib/zim/items.py @@ -235,6 +235,7 @@ def __init__( title: str | None = None, mimetype: str | None = None, hints: dict[libzim.writer.Hint, int] | None = None, + *, use_disk: bool | None = None, **kwargs: Any, ): diff --git a/src/zimscraperlib/zim/metadata.py b/src/zimscraperlib/zim/metadata.py index 24bed69..1b3cb12 100644 --- a/src/zimscraperlib/zim/metadata.py +++ b/src/zimscraperlib/zim/metadata.py @@ -163,11 +163,6 @@ def get_libzim_value(self) -> bytes: ... # Alias for convenience when function accept any metadata AnyMetadata = MetadataBase[Any] -# TypeVar bounded to subclasses of GenericMetadata, used by class decorators so that -# they properly accommodate to the class they are used on while still knowing they have -# access to all attributes of the MetadataBase class -U = TypeVar("U", bound=AnyMetadata) - def clean_str(value: str) -> str: """Clean a string value for unwanted control characters and strip white chars""" @@ -179,39 +174,39 @@ def nb_grapheme_for(value: str) -> int: return len(regex.findall(r"\X", value)) -def mandatory(cls: type[U]): +def mandatory[U: AnyMetadata](cls: type[U]): """Marks a Metadata mandatory: must be set to please Creator and cannot be empty""" cls.is_required = True cls.empty_allowed = False return cls -def allow_empty(cls: type[U]): +def allow_empty[U: AnyMetadata](cls: type[U]): """Whether input can be blank""" cls.empty_allowed = True return cls -def allow_duplicates(cls: type[U]): +def allow_duplicates[U: AnyMetadata](cls: type[U]): """Whether list input can accept duplicate values""" cls.duplicates_allowed = True return cls -def deduplicate(cls: type[U]): +def deduplicate[U: AnyMetadata](cls: type[U]): """Whether duplicates in list inputs should be reduced""" cls.duplicates_allowed = True cls.require_deduplication = True return cls -def only_lang_codes(cls: type[U]): +def only_lang_codes[U: AnyMetadata](cls: type[U]): """Whether list input should be checked to only accept ISO-639-1 codes""" cls.oz_only_iso636_3_allowed = True return cls -def x_protected(cls: type[U]): +def x_protected[U: AnyMetadata](cls: type[U]): """Whether metadata name should be checked for collision with reserved names when applying recommendations""" @@ -219,7 +214,7 @@ def x_protected(cls: type[U]): return cls -def x_prefixed(cls: type[U]): +def x_prefixed[U: AnyMetadata](cls: type[U]): """Whether metadata names should be automatically X-Prefixed""" cls.oz_x_protected = False cls.oz_x_prefixed = True diff --git a/tests/conftest.py b/tests/conftest.py index 58e8daa..510ff3d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,8 @@ import pytest +from zimscraperlib.download import stream_file + def pytest_addoption(parser: pytest.Parser): parser.addoption( @@ -62,12 +64,12 @@ def timeout_url() -> str: @pytest.fixture(scope="module") def png_image_url() -> str: - return "https://commons.wikimedia.org/static/images/project-logos/commonswiki.png" + return "https://farm.openzim.org/assets/favicon-96x96.png" @pytest.fixture(scope="module") def gzip_html_url() -> str: - return "https://en.wikipedia.org/wiki/Main_Page" + return "https://kiwix.org/en" @pytest.fixture(scope="module") @@ -162,7 +164,6 @@ def valid_user_agent(): @pytest.fixture(scope="session") def small_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path: - from zimscraperlib.download import stream_file dst = pathlib.Path(tmpdir_factory.mktemp("data") / "small.zim") stream_file( @@ -174,7 +175,6 @@ def small_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path: @pytest.fixture(scope="session") def ns_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path: - from zimscraperlib.download import stream_file dst = pathlib.Path(tmpdir_factory.mktemp("data") / "ns.zim") stream_file( @@ -187,7 +187,6 @@ def ns_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path: @pytest.fixture(scope="session") def real_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path: - from zimscraperlib.download import stream_file dst = pathlib.Path(tmpdir_factory.mktemp("data") / "small.zim") stream_file( diff --git a/tests/download/test_download.py b/tests/download/test_download.py index 8f138ef..3c47357 100644 --- a/tests/download/test_download.py +++ b/tests/download/test_download.py @@ -8,7 +8,7 @@ import pytest import requests import requests.structures -from yt_dlp import DownloadError # pyright: ignore[reportMissingTypeStubs] +from yt_dlp.utils import DownloadError from zimscraperlib.constants import DEFAULT_WEB_REQUESTS_TIMEOUT from zimscraperlib.download import ( diff --git a/tests/image/test_illustration.py b/tests/image/test_illustration.py index b9ae0a7..461b659 100644 --- a/tests/image/test_illustration.py +++ b/tests/image/test_illustration.py @@ -40,7 +40,7 @@ def test_get_zim_illustration( def test_get_missing_user_zim_illustration(): - with pytest.raises(Exception, match="missing.png could not be found"): + with pytest.raises(Exception, match=r"missing\.png could not be found"): get_zim_illustration("./missing.png") diff --git a/tests/rewriting/test_html_rewriting.py b/tests/rewriting/test_html_rewriting.py index 48c6db1..36d7841 100644 --- a/tests/rewriting/test_html_rewriting.py +++ b/tests/rewriting/test_html_rewriting.py @@ -789,7 +789,7 @@ def test_rewrite_base_href(rewrite_base_href_content: ContentForTests): ), pytest.param( """""", - """""", + """""", id="badly_escaped_src", ), ], @@ -1067,7 +1067,7 @@ def test_html_drop_rules( def test_bad_html_drop_rules_argument_name(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"): @bad_rules.drop_attribute() def bad_signature(foo: str) -> bool: # pyright: ignore[reportUnusedFunction] @@ -1077,7 +1077,7 @@ def bad_signature(foo: str) -> bool: # pyright: ignore[reportUnusedFunction] def test_bad_html_drop_rules_argument_type(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"): @bad_rules.drop_attribute() def bad_signature( # pyright: ignore[reportUnusedFunction] @@ -1231,7 +1231,7 @@ def notify(path: ZimPath): def test_bad_html_attribute_rewrite_rules_argument_name(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"): @bad_rules.rewrite_attribute() def bad_signature( # pyright: ignore[reportUnusedFunction] @@ -1243,7 +1243,7 @@ def bad_signature( # pyright: ignore[reportUnusedFunction] def test_bad_html_attribute_rewrite_rules_argument_type(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"): @bad_rules.rewrite_attribute() def bad_signature( # pyright: ignore[reportUnusedFunction] @@ -1375,7 +1375,7 @@ def test_html_tag_rewrite_rules( def test_bad_html_tag_rewrite_rules_argument_name(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"): @bad_rules.rewrite_tag() def bad_signature(foo: str) -> str: # pyright: ignore[reportUnusedFunction] @@ -1385,7 +1385,7 @@ def bad_signature(foo: str) -> str: # pyright: ignore[reportUnusedFunction] def test_bad_html_tag_rewrite_rules_argument_type(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"): @bad_rules.rewrite_tag() def bad_signature(attrs: int) -> str: # pyright: ignore[reportUnusedFunction] @@ -1463,7 +1463,7 @@ def notify(path: ZimPath): def test_bad_html_data_rewrite_rules_argument_name(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* is unsupported in function"): + with pytest.raises(TypeError, match=r"Parameter .* is unsupported in function"): @bad_rules.rewrite_data() def bad_signature( # pyright: ignore[reportUnusedFunction] @@ -1475,7 +1475,7 @@ def bad_signature( # pyright: ignore[reportUnusedFunction] def test_bad_html_data_rewrite_rules_argument_type(): bad_rules = HTMLRewritingRules() - with pytest.raises(TypeError, match="Parameter .* in function .* must be of type"): + with pytest.raises(TypeError, match=r"Parameter .* in function .* must be of type"): @bad_rules.rewrite_data() def bad_signature( # pyright: ignore[reportUnusedFunction] diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 7cc1d4c..ad2bf7b 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -331,7 +331,7 @@ def test_urlitem_html(tmp_path: pathlib.Path, gzip_html_url: str): creator.add_item(URLItem(url=gzip_html_url)) zim = Archive(fpath) - assert bytes(zim.get_item("wiki/Main_Page").content) == file_bytes + assert bytes(zim.get_item("en").content) == file_bytes def test_urlitem_nonhtmlgzip(tmp_path: pathlib.Path, gzip_nonhtml_url: str): @@ -362,10 +362,7 @@ def test_urlitem_binary(tmp_path: pathlib.Path, png_image_url: str): creator.add_item(URLItem(url=png_image_url)) zim = Archive(fpath) - assert ( - bytes(zim.get_item("static/images/project-logos/commonswiki.png").content) - == file_bytes - ) + assert bytes(zim.get_item("assets/favicon-96x96.png").content) == file_bytes def test_urlitem_staticcontent(tmp_path: pathlib.Path, gzip_nonhtml_url: str): @@ -600,7 +597,7 @@ def test_ignore_duplicates(tmp_path: pathlib.Path): def test_without_metadata(tmp_path: pathlib.Path): - with pytest.raises(ValueError, match="Mandatory metadata are not all set."): + with pytest.raises(ValueError, match=r"Mandatory metadata are not all set\."): Creator(tmp_path, "").start() @@ -629,7 +626,7 @@ def test_start_logs_metadata_log_contents( tmp_path: pathlib.Path, ignore_metadata_conventions: NoneType, # noqa: ARG001 ): - mocked_logger.isEnabledFor.side_effect = ( # pyright: ignore[reportFunctionMemberAccess] + mocked_logger.isEnabledFor.side_effect = ( # pyright: ignore[reportAttributeAccessIssue] lambda level: level == logging.DEBUG # pyright: ignore[reportUnknownLambdaType] ) fpath = tmp_path / "test_config.zim" @@ -690,7 +687,7 @@ def __str__(self): "BadRawValue" ] = "Value" creator._log_metadata() # pyright: ignore[reportPrivateUsage] - mocked_logger.debug.assert_has_calls( # pyright: ignore[reportFunctionMemberAccess] + mocked_logger.debug.assert_has_calls( # pyright: ignore[reportUnknownMemberType, reportAttributeAccessIssue] [ call("Metadata: BadRawValue is improper metadata type: str: Value"), call("Metadata: Chars = šɔɛ"),