Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
rev: '25.1.0'
rev: '25.9.0'
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.8
rev: v0.13.1
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.400
rev: v1.1.405
hooks:
- id: pyright
name: pyright (system)
Expand Down
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added
### Added

- New `zim.dedup.Deduplicator` class to handle automatic deduplication of content before adding to the ZIM (#33)

### Changed

- Upgrade dependencies, especially wombat 3.8.12 (#262)
- Upgrade dependencies, especially wombat 3.9.1 (#262, #263)
- Backport changes in wabac.js around JS rewriting rules (#259)

### Fixed

- JS rewriting abusively rewrites import function (#255)
- Test about badly escaped src in HTML is failing (#264)

### Added

Expand Down
2 changes: 1 addition & 1 deletion openzim.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ execute_after=[

[files.assets.actions."wombat.js"]
action="get_file"
source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.8.12/dist/wombat.js"
source="https://cdn.jsdelivr.net/npm/@webrecorder/wombat@3.9.1/dist/wombat.js"
target_file="wombat.js"
32 changes: 16 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,19 @@ dependencies = [
"python-magic>=0.4.3,<0.5",
"libzim>=3.4.0,<4.0",
"beautifulsoup4>=4.9.3,<5.0",
"lxml>=4.6.3,<6.0",
"lxml>=4.6.3,<7.0",
"optimize-images>=1.3.6,<2.0",
# regex has no upper-bound due to "date-based" release numbers, no semver, so their
# promise is that they will never (or always) break the API, and the API is very
# limited and we use only a very small subset of it.
"regex>=2020.7.14",
"pymupdf>=1.24.0,<2.0",
"CairoSVG>=2.2.0,<3.0",
"beartype>=0.19,<0.21",
"beartype>=0.19,<0.22",
# youtube-dl should be updated as frequently as possible
"yt-dlp",
"pillow>=7.0.0,<12.0",
"urllib3>=1.26.5,<2.5.0",
"urllib3>=1.26.5,<2.6.0",
"piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway
"idna>=2.5,<4.0",
"xxhash>=2.0,<4.0",
Expand Down Expand Up @@ -58,30 +58,30 @@ scripts = [

]
lint = [
"black==25.1.0",
"ruff==0.11.8",
"black==25.9.0",
"ruff==0.13.1",
]
check = [
"pyright==1.1.400",
"pytest==8.3.5",
"pyright==1.1.405",
"pytest==8.4.2",
]
test = [
"pytest==8.3.5",
"pytest-mock==3.14.0",
"coverage==7.8.0",
"pytest==8.4.2",
"pytest-mock==3.15.1",
"coverage==7.10.7",
]
docs = [
"mkdocs==1.6.1",
"mkdocstrings[python]==0.29.1",
"mkdocs-material==9.6.12",
"pymdown-extensions==10.15",
"mkdocs-include-markdown-plugin==7.1.7",
"mkdocs-material==9.6.20",
"mkdocstrings[python]==0.30.1",
"pymdown-extensions==10.16.1",
"mkdocs-gen-files==0.5.0",
"mkdocs-literate-nav==0.6.2",
"mkdocs-include-markdown-plugin==7.1.5",
]
dev = [
"ipython==9.2.0",
"pre-commit==4.2.0",
"ipython==9.5.0",
"pre-commit==4.3.0",
"zimscraperlib[scripts]",
"zimscraperlib[lint]",
"zimscraperlib[test]",
Expand Down
4 changes: 3 additions & 1 deletion src/zimscraperlib/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def shutdown(self) -> None:
self.executor.shutdown(wait=True)

def _run_youtube_dl(self, url: str, options: dict[str, Any]) -> None:
with youtube_dl.YoutubeDL(options) as ydl:
with youtube_dl.YoutubeDL(
options # pyright: ignore[reportArgumentType]
) as ydl:
ydl.download([url]) # pyright: ignore[reportUnknownMemberType]

def download(
Expand Down
11 changes: 11 additions & 0 deletions src/zimscraperlib/i18n.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,17 @@ def __eq__(self, value: object) -> bool:
and self.native == getattr(value, "native", None)
)

def __hash__(self) -> int:
    """Hash built from the language codes and names, intended to agree with `__eq__`

    The hash is computed from the tuple of the ISO-639 codes plus the english and
    native names. An attribute that is not set counts as None, so partially
    populated instances remain hashable.

    Hashing the tuple of values (instead of a "$"-joined string of them) guarantees
    that values comparing equal contribute equally to the hash and avoids any
    ambiguity caused by a separator appearing inside a value."""
    # NOTE(review): assumes every attribute value is hashable (str or None) — confirm
    return hash(
        tuple(
            getattr(self, name, None)
            for name in (
                "iso_639_1",
                "iso_639_2b",
                "iso_639_2t",
                "iso_639_3",
                "iso_639_5",
                "english",
                "native",
            )
        )
    )


def find_language_names(query: str) -> tuple[str, str]:
"""(native, english) language names for query"""
Expand Down
14 changes: 7 additions & 7 deletions src/zimscraperlib/image/probing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,16 @@

import colorthief # pyright: ignore[reportMissingTypeStubs]
import PIL.Image
from PIL.Image import EXTENSION as PIL_FMT_EXTENSION
from PIL.Image import init as init_pil

from zimscraperlib.filesystem import get_content_mimetype, get_file_mimetype

init_pil() # populate the PIL_FMT_EXTENSION dictionary

known_extensions = {".svg": "SVG"}
known_extensions.update(PIL_FMT_EXTENSION)


def get_colors(
src: pathlib.Path, *, use_palette: bool | None = True
Expand Down Expand Up @@ -82,13 +89,6 @@ def format_for(
"Cannot guess image format from file suffix when byte array is passed"
)

from PIL.Image import EXTENSION as PIL_FMT_EXTENSION
from PIL.Image import init as init_pil

init_pil() # populate the PIL_FMT_EXTENSION dictionary

known_extensions = {".svg": "SVG"}
known_extensions.update(PIL_FMT_EXTENSION)
return known_extensions[src.suffix] if src.suffix in known_extensions else None


Expand Down
5 changes: 1 addition & 4 deletions src/zimscraperlib/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import shutil
import tempfile
from collections.abc import Iterable
from typing import TypeVar

from zimscraperlib import logger
from zimscraperlib.constants import DEFAULT_USER_AGENT
Expand All @@ -14,8 +13,6 @@
)
from zimscraperlib.download import stream_file

T = TypeVar("T")


def handle_user_provided_file(
source: pathlib.Path | str | None = None,
Expand Down Expand Up @@ -136,6 +133,6 @@ def compute_tags(
}


def unique_values(items: list[T]) -> list[T]:
def unique_values[T](items: list[T]) -> list[T]:
"""Return unique values in input list while preserving list order"""
return list(dict.fromkeys(items))
2 changes: 2 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ def add_item_for(
def add_item( # pyright: ignore[reportIncompatibleMethodOverride]
self,
item: libzim.writer.Item,
*,
duplicate_ok: bool | None = None,
callbacks: list[Callback] | Callback | None = None,
):
Expand Down Expand Up @@ -417,6 +418,7 @@ def add_redirect(
path: str,
target_path: str,
title: str | None = "",
*,
is_front: bool | None = None,
duplicate_ok: bool | None = None,
):
Expand Down
1 change: 1 addition & 0 deletions src/zimscraperlib/zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ def __init__(
title: str | None = None,
mimetype: str | None = None,
hints: dict[libzim.writer.Hint, int] | None = None,
*,
use_disk: bool | None = None,
**kwargs: Any,
):
Expand Down
19 changes: 7 additions & 12 deletions src/zimscraperlib/zim/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,11 +163,6 @@ def get_libzim_value(self) -> bytes: ...
# Alias for convenience when a function accepts any metadata
AnyMetadata = MetadataBase[Any]

# TypeVar bounded to subclasses of GenericMetadata, used by class decorators so that
# they properly accommodate to the class they are used on while still knowing they have
# access to all attributes of the MetadataBase class
U = TypeVar("U", bound=AnyMetadata)


def clean_str(value: str) -> str:
"""Clean a string value for unwanted control characters and strip white chars"""
Expand All @@ -179,47 +174,47 @@ def nb_grapheme_for(value: str) -> int:
return len(regex.findall(r"\X", value))


def mandatory(cls: type[U]):
def mandatory[U: AnyMetadata](cls: type[U]):
    """Marks a Metadata mandatory: must be set to please Creator and cannot be empty

    Class decorator: sets `is_required` to True and `empty_allowed` to False on
    `cls`, then returns that very same class (no wrapper class is created)."""
    cls.is_required = True
    cls.empty_allowed = False
    return cls


def allow_empty(cls: type[U]):
def allow_empty[U: AnyMetadata](cls: type[U]):
    """Marks a Metadata as accepting blank input

    Class decorator: sets `empty_allowed` to True on `cls` and returns that same
    class."""
    cls.empty_allowed = True
    return cls


def allow_duplicates(cls: type[U]):
def allow_duplicates[U: AnyMetadata](cls: type[U]):
    """Marks a Metadata whose list input can accept duplicate values

    Class decorator: sets `duplicates_allowed` to True on `cls` and returns that
    same class."""
    cls.duplicates_allowed = True
    return cls


def deduplicate(cls: type[U]):
def deduplicate[U: AnyMetadata](cls: type[U]):
    """Marks a Metadata whose list input should have its duplicates reduced

    Class decorator: sets both `duplicates_allowed` and `require_deduplication` to
    True on `cls` and returns that same class. Only the flags are set here; the
    reduction itself is presumably done by the code reading
    `require_deduplication` (not visible from this function)."""
    # duplicates have to be accepted on input for them to be reduced afterwards
    cls.duplicates_allowed = True
    cls.require_deduplication = True
    return cls


def only_lang_codes(cls: type[U]):
def only_lang_codes[U: AnyMetadata](cls: type[U]):
    """Marks a Metadata whose list input should only accept ISO-639-3 codes

    Class decorator: sets `oz_only_iso636_3_allowed` to True on `cls` and returns
    that same class."""
    # NOTE(review): the flag name spells "iso636" (sic) and refers to part 3 of the
    # standard; this docstring used to say ISO-639-1, which contradicted the flag —
    # confirm against the code validating the values
    cls.oz_only_iso636_3_allowed = True
    return cls


def x_protected(cls: type[U]):
def x_protected[U: AnyMetadata](cls: type[U]):
    """Marks a Metadata whose name should be checked for collision with reserved names

    when applying recommendations

    Class decorator: sets `oz_x_protected` to True on `cls` and returns that same
    class."""
    cls.oz_x_protected = True
    return cls


def x_prefixed(cls: type[U]):
def x_prefixed[U: AnyMetadata](cls: type[U]):
"""Whether metadata names should be automatically X-Prefixed"""
cls.oz_x_protected = False
cls.oz_x_prefixed = True
Expand Down
9 changes: 4 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import pytest

from zimscraperlib.download import stream_file


def pytest_addoption(parser: pytest.Parser):
parser.addoption(
Expand Down Expand Up @@ -62,12 +64,12 @@ def timeout_url() -> str:

@pytest.fixture(scope="module")
def png_image_url() -> str:
return "https://commons.wikimedia.org/static/images/project-logos/commonswiki.png"
return "https://farm.openzim.org/assets/favicon-96x96.png"


@pytest.fixture(scope="module")
def gzip_html_url() -> str:
return "https://en.wikipedia.org/wiki/Main_Page"
return "https://kiwix.org/en"


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -162,7 +164,6 @@ def valid_user_agent():

@pytest.fixture(scope="session")
def small_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
from zimscraperlib.download import stream_file

dst = pathlib.Path(tmpdir_factory.mktemp("data") / "small.zim")
stream_file(
Expand All @@ -174,7 +175,6 @@ def small_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:

@pytest.fixture(scope="session")
def ns_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
from zimscraperlib.download import stream_file

dst = pathlib.Path(tmpdir_factory.mktemp("data") / "ns.zim")
stream_file(
Expand All @@ -187,7 +187,6 @@ def ns_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:

@pytest.fixture(scope="session")
def real_zim_file(tmpdir_factory: pytest.TempdirFactory) -> pathlib.Path:
from zimscraperlib.download import stream_file

dst = pathlib.Path(tmpdir_factory.mktemp("data") / "small.zim")
stream_file(
Expand Down
2 changes: 1 addition & 1 deletion tests/download/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pytest
import requests
import requests.structures
from yt_dlp import DownloadError # pyright: ignore[reportMissingTypeStubs]
from yt_dlp.utils import DownloadError

from zimscraperlib.constants import DEFAULT_WEB_REQUESTS_TIMEOUT
from zimscraperlib.download import (
Expand Down
2 changes: 1 addition & 1 deletion tests/image/test_illustration.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_get_zim_illustration(


def test_get_missing_user_zim_illustration():
with pytest.raises(Exception, match="missing.png could not be found"):
with pytest.raises(Exception, match=r"missing\.png could not be found"):
get_zim_illustration("./missing.png")


Expand Down
Loading