Skip to content

Commit 81283d1

Browse files
committed
Fix #3515 by preserving image quality by default
1 parent 85b53d8 commit 81283d1

File tree

5 files changed

+46
-9
lines changed

5 files changed

+46
-9
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,4 @@ mypy:
2626
mypy pypdf --ignore-missing-imports --check-untyped --strict
2727

2828
ruff:
29-
ruff check pypdf tests make_release.py
29+
ruff check --fix pypdf tests make_release.py

pypdf/_page.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -389,16 +389,18 @@ def replace(self, new_image: Image, **kwargs: Any) -> None:
389389
b = BytesIO()
390390
new_image.save(b, "PDF", **kwargs)
391391
reader = PdfReader(b)
392-
assert reader.pages[0].images[0].indirect_reference is not None
392+
page_image = reader.pages[0].images[0]
393+
assert page_image.indirect_reference is not None
393394
self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
394-
reader.pages[0].images[0].indirect_reference.get_object()
395+
page_image.indirect_reference.get_object()
395396
)
396397
cast(
397398
PdfObject, self.indirect_reference.get_object()
398399
).indirect_reference = self.indirect_reference
399400
# change the object attributes
400401
extension, byte_stream, img = _xobj_to_image(
401-
cast(DictionaryObject, self.indirect_reference.get_object())
402+
cast(DictionaryObject, self.indirect_reference.get_object()),
403+
pillow_parameters=kwargs,
402404
)
403405
assert extension is not None
404406
self.name = self.name[: self.name.rfind(".")] + extension

pypdf/filters.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -791,15 +791,20 @@ def decode_stream_data(stream: Any) -> bytes:
791791
return data
792792

793793

794-
def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]:
794+
def _xobj_to_image(
795+
x_object: dict[str, Any],
796+
pillow_parameters: Union[dict[str, Any], None] = None
797+
) -> tuple[Optional[str], bytes, Any]:
795798
"""
796799
Users need to have the pillow package installed.
797800
798801
It's unclear if pypdf will keep this function here, hence it's private.
799802
It might get removed at any point.
800803
801804
Args:
802-
x_object:
805+
x_object:
806+
pillow_parameters: parameters provided to Pillow Image.save() method,
807+
cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>
803808
804809
Returns:
805810
Tuple[file extension, bytes, PIL.Image.Image]
@@ -947,10 +952,18 @@ def _apply_alpha(
947952
img, x_object, obj_as_text, image_format, extension
948953
)
949954

955+
if pillow_parameters is None:
956+
pillow_parameters = {}
957+
# Preserve JPEG image quality - see issue #3515.
958+
if image_format == "JPEG" and "quality" not in pillow_parameters:
959+
pillow_parameters["quality"] = "keep"
960+
# This prevents the error: Cannot use 'keep' when original image is not a JPEG:
961+
img.format = image_format # type: ignore
962+
950963
# Save image to bytes
951964
img_byte_arr = BytesIO()
952965
try:
953-
img.save(img_byte_arr, format=image_format)
966+
img.save(img_byte_arr, format=image_format, **pillow_parameters)
954967
except OSError: # pragma: no cover # covered with pillow 10.3
955968
# in case of an OSError, we convert to RGBA and then to PNG
956969
img1 = img.convert("RGBA")

pypdf/generic/_data_structures.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,10 +1046,14 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
10461046
retval._data = FlateDecode.encode(self._data, level)
10471047
return retval
10481048

1049-
def decode_as_image(self) -> Any:
1049+
def decode_as_image(self, pillow_parameters: Union[dict[str, Any], None] = None) -> Any:
10501050
"""
10511051
Try to decode the stream object as an image
10521052
1053+
Args:
1054+
pillow_parameters: parameters provided to Pillow Image.save() method,
1055+
cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>
1056+
10531057
Returns:
10541058
a PIL image if proper decoding has been found
10551059
Raises:
@@ -1066,7 +1070,7 @@ def decode_as_image(self) -> Any:
10661070
except AttributeError:
10671071
msg = f"{self.__repr__()} object does not seem to be an Image" # pragma: no cover
10681072
logger_warning(msg, __name__)
1069-
extension, _, img = _xobj_to_image(self)
1073+
extension, _, img = _xobj_to_image(self, pillow_parameters)
10701074
if extension is None:
10711075
return None # pragma: no cover
10721076
return img

tests/test_images.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ def open_image(path: Union[Path, Image.Image, BytesIO]) -> Image.Image:
3939
return img
4040

4141

42+
def image_size(image: Image.Image):
43+
buffer = BytesIO()
44+
image.save(buffer, format=image.format)
45+
return buffer.tell()
46+
47+
4248
def image_similarity(
4349
path1: Union[Path, Image.Image, BytesIO], path2: Union[Path, Image.Image, BytesIO]
4450
) -> float:
@@ -474,6 +480,18 @@ def test_extract_image_from_object(caplog):
474480
assert "does not seem to be an Image" in caplog.text
475481

476482

483+
def test_extract_jpeg_with_explicit_quality():
484+
reader = PdfReader(RESOURCE_ROOT / "side-by-side-subfig.pdf")
485+
page = reader.pages[0]
486+
x_object = page["/Resources"]["/XObject"]["/Im1"]
487+
assert x_object["/Filter"] == "/DCTDecode"
488+
image = x_object.decode_as_image()
489+
assert isinstance(image, Image.Image)
490+
assert image.format == "JPEG"
491+
small_image = x_object.decode_as_image({"quality": 75})
492+
assert image_size(small_image) < image_size(image)
493+
494+
477495
@pytest.mark.enable_socket
478496
def test_4bits_images(caplog):
479497
url = "https://github.com/user-attachments/files/16624406/tt.pdf"

0 commit comments

Comments
 (0)