Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,16 +389,18 @@ def replace(self, new_image: Image, **kwargs: Any) -> None:
b = BytesIO()
new_image.save(b, "PDF", **kwargs)
reader = PdfReader(b)
assert reader.pages[0].images[0].indirect_reference is not None
page_image = reader.pages[0].images[0]
assert page_image.indirect_reference is not None
self.indirect_reference.pdf._objects[self.indirect_reference.idnum - 1] = (
reader.pages[0].images[0].indirect_reference.get_object()
page_image.indirect_reference.get_object()
)
cast(
PdfObject, self.indirect_reference.get_object()
).indirect_reference = self.indirect_reference
# change the object attributes
extension, byte_stream, img = _xobj_to_image(
cast(DictionaryObject, self.indirect_reference.get_object())
cast(DictionaryObject, self.indirect_reference.get_object()),
pillow_parameters=kwargs,
)
assert extension is not None
self.name = self.name[: self.name.rfind(".")] + extension
Expand Down
21 changes: 18 additions & 3 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,15 +791,20 @@ def decode_stream_data(stream: Any) -> bytes:
return data


def _xobj_to_image(x_object: dict[str, Any]) -> tuple[Optional[str], bytes, Any]:
def _xobj_to_image(
x_object: dict[str, Any],
pillow_parameters: Union[dict[str, Any], None] = None
) -> tuple[Optional[str], bytes, Any]:
"""
Users need to have the pillow package installed.

It's unclear if pypdf will keep this function here, hence it's private.
It might get removed at any point.

Args:
x_object:
x_object:
pillow_parameters: parameters provided to Pillow Image.save() method,
cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>

Returns:
Tuple[file extension, bytes, PIL.Image.Image]
Expand Down Expand Up @@ -947,10 +952,20 @@ def _apply_alpha(
img, x_object, obj_as_text, image_format, extension
)

if pillow_parameters is None:
pillow_parameters = {}
# Preserve JPEG image quality - see issue #3515.
if image_format == "JPEG":
# This prevents: Cannot use 'keep' when original image is not a JPEG:
# "JPEG" is the value of PIL.JpegImagePlugin.JpegImageFile.format
img.format = "JPEG" # type: ignore[misc]
if "quality" not in pillow_parameters:
pillow_parameters["quality"] = "keep"

# Save image to bytes
img_byte_arr = BytesIO()
try:
img.save(img_byte_arr, format=image_format)
img.save(img_byte_arr, format=image_format, **pillow_parameters)
except OSError: # pragma: no cover # covered with pillow 10.3
# in that case, we convert to RGBA and then to PNG
img1 = img.convert("RGBA")
Expand Down
8 changes: 6 additions & 2 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,10 +1046,14 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
retval._data = FlateDecode.encode(self._data, level)
return retval

def decode_as_image(self) -> Any:
def decode_as_image(self, pillow_parameters: Union[dict[str, Any], None] = None) -> Any:
"""
Try to decode the stream object as an image

Args:
pillow_parameters: parameters provided to Pillow Image.save() method,
cf. <https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save>

Returns:
a PIL image if proper decoding has been found
Raises:
Expand All @@ -1066,7 +1070,7 @@ def decode_as_image(self) -> Any:
except AttributeError:
msg = f"{self.__repr__()} object does not seem to be an Image" # pragma: no cover
logger_warning(msg, __name__)
extension, _, img = _xobj_to_image(self)
extension, _, img = _xobj_to_image(self, pillow_parameters)
if extension is None:
return None # pragma: no cover
return img
Expand Down
18 changes: 18 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ def open_image(path: Union[Path, Image.Image, BytesIO]) -> Image.Image:
return img


def image_size(image: Image.Image) -> int:
    """
    Return the number of bytes produced when ``image`` is saved in its own format.

    Args:
        image: the image to encode; its ``format`` attribute selects the
            encoder passed to ``save()``.

    Returns:
        The size in bytes of the encoded data written to an in-memory buffer.
    """
    buffer = BytesIO()
    image.save(buffer, format=image.format)
    # Measure the buffer contents rather than the cursor position: tell()
    # only equals the size if the writer left the cursor at the very end.
    return buffer.getbuffer().nbytes


def image_similarity(
path1: Union[Path, Image.Image, BytesIO], path2: Union[Path, Image.Image, BytesIO]
) -> float:
Expand Down Expand Up @@ -474,6 +480,18 @@ def test_extract_image_from_object(caplog):
assert "does not seem to be an Image" in caplog.text


def test_extract_jpeg_with_explicit_quality():
    """Decoding a JPEG XObject with an explicit quality of 75 gives a smaller image than the default decoding."""
    pdf = PdfReader(RESOURCE_ROOT / "side-by-side-subfig.pdf")
    first_page = pdf.pages[0]
    jpeg_xobject = first_page["/Resources"]["/XObject"]["/Im1"]
    # The fixture image must be DCT (JPEG) encoded for the comparison below.
    assert jpeg_xobject["/Filter"] == "/DCTDecode"

    # Decode without any Pillow parameters.
    default_image = jpeg_xobject.decode_as_image()
    assert isinstance(default_image, Image.Image)
    assert default_image.format == "JPEG"

    # Decode again with an explicit quality and compare the re-encoded sizes.
    reduced_image = jpeg_xobject.decode_as_image(pillow_parameters={"quality": 75})
    assert image_size(reduced_image) < image_size(default_image)


@pytest.mark.enable_socket
def test_4bits_images(caplog):
url = "https://github.com/user-attachments/files/16624406/tt.pdf"
Expand Down
Loading