diff --git a/pypdf/_page.py b/pypdf/_page.py
index 3409dc16f..6c04e8dd2 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -389,16 +389,18 @@ def replace(self, new_image: Image, **kwargs: Any) -> None:
         b = BytesIO()
         new_image.save(b, "PDF", **kwargs)
         reader = PdfReader(b)
-        assert reader.pages[0].images[0].indirect_reference is not None
+        page_image = reader.pages[0].images[0]
+        assert page_image.indirect_reference is not None
         self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
-            reader.pages[0].images[0].indirect_reference.get_object()
+            page_image.indirect_reference.get_object()
         )
         cast(
             PdfObject, self.indirect_reference.get_object()
         ).indirect_reference = self.indirect_reference
         # change the object attributes
         extension, byte_stream, img = _xobj_to_image(
-            cast(DictionaryObject, self.indirect_reference.get_object())
+            cast(DictionaryObject, self.indirect_reference.get_object()),
+            pillow_parameters=kwargs,
         )
         assert extension is not None
         self.name = self.name[: self.name.rfind(".")] + extension
diff --git a/pypdf/filters.py b/pypdf/filters.py
index a3f87ad40..1a5611824 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -791,7 +791,10 @@ def decode_stream_data(stream: Any) -> bytes:
     return data
 
 
-def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]:
+def _xobj_to_image(
+    x_object: dict[str, Any],
+    pillow_parameters: Union[dict[str, Any], None] = None
+) -> tuple[Optional[str], bytes, Any]:
     """
     Users need to have the pillow package installed.
 
@@ -799,7 +802,9 @@ def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]
     It might get removed at any point.
 
     Args:
-      x_object:
+        x_object:
+        pillow_parameters: parameters provided to Pillow Image.save() method,
+            cf. https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save
 
     Returns:
         Tuple[file extension, bytes, PIL.Image.Image]
@@ -947,10 +952,20 @@ def _apply_alpha(
         img, x_object, obj_as_text, image_format, extension
     )
 
+    # Copy: the defaults added below must not leak into the caller's dict.
+    pillow_parameters = dict(pillow_parameters or {})
+    # Preserve JPEG image quality - see issue #3515.
+    if image_format == "JPEG":
+        # This prevents: Cannot use 'keep' when original image is not a JPEG:
+        # "JPEG" is the value of PIL.JpegImagePlugin.JpegImageFile.format
+        img.format = "JPEG"  # type: ignore[misc]
+        if "quality" not in pillow_parameters:
+            pillow_parameters["quality"] = "keep"
+
     # Save image to bytes
     img_byte_arr = BytesIO()
     try:
-        img.save(img_byte_arr, format=image_format)
+        img.save(img_byte_arr, format=image_format, **pillow_parameters)
     except OSError:  # pragma: no cover  # covered with pillow 10.3
         # in case of we convert to RGBA and then to PNG
         img1 = img.convert("RGBA")
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 5573a6d2d..78d39af95 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -1046,10 +1046,14 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
         retval._data = FlateDecode.encode(self._data, level)
         return retval
 
-    def decode_as_image(self) -> Any:
+    def decode_as_image(self, pillow_parameters: Union[dict[str, Any], None] = None) -> Any:
         """
         Try to decode the stream object as an image
 
+        Args:
+            pillow_parameters: parameters provided to Pillow Image.save() method,
+                cf. https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save
+
         Returns:
             a PIL image if proper decoding has been found
         Raises:
@@ -1066,7 +1070,7 @@ def decode_as_image(self) -> Any:
             except AttributeError:
                 msg = f"{self.__repr__()} object does not seem to be an Image"  # pragma: no cover
             logger_warning(msg, __name__)
-        extension, _, img = _xobj_to_image(self)
+        extension, _, img = _xobj_to_image(self, pillow_parameters)
         if extension is None:
             return None  # pragma: no cover
         return img
diff --git a/tests/test_images.py b/tests/test_images.py
index dd4ccebef..1adbe0d02 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -39,6 +39,12 @@ def open_image(path: Union[Path, Image.Image, BytesIO]) -> Image.Image:
     return img
 
 
+def image_size(image: Image.Image) -> int:
+    buffer = BytesIO()
+    image.save(buffer, format=image.format)
+    return buffer.tell()
+
+
 def image_similarity(
     path1: Union[Path, Image.Image, BytesIO], path2: Union[Path, Image.Image, BytesIO]
 ) -> float:
@@ -474,6 +480,18 @@ def test_extract_image_from_object(caplog):
     assert "does not seem to be an Image" in caplog.text
 
 
+def test_extract_jpeg_with_explicit_quality():
+    reader = PdfReader(RESOURCE_ROOT / "side-by-side-subfig.pdf")
+    page = reader.pages[0]
+    x_object = page["/Resources"]["/XObject"]["/Im1"]
+    assert x_object["/Filter"] == "/DCTDecode"
+    image = x_object.decode_as_image()
+    assert isinstance(image, Image.Image)
+    assert image.format == "JPEG"
+    small_image = x_object.decode_as_image(pillow_parameters={"quality": 75})
+    assert image_size(small_image) < image_size(image)
+
+
 @pytest.mark.enable_socket
 def test_4bits_images(caplog):
     url = "https://github.com/user-attachments/files/16624406/tt.pdf"