Skip to content

Commit 1b80078

Browse files
committed
Exposed PDF backend argument in Docling parse_pdf_to_pages
1 parent 1b57725 commit 1b80078

File tree

1 file changed

+11
-2
lines changed
  • packages/paper-qa-docling/src/paperqa_docling

1 file changed

+11
-2
lines changed

packages/paper-qa-docling/src/paperqa_docling/reader.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
from collections.abc import Mapping
66
from importlib.metadata import version
77
from pathlib import Path
8-
from typing import Any, cast
8+
from typing import TYPE_CHECKING, Any, cast
99

1010
import docling
11+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
1112
from docling.datamodel.base_models import ConversionStatus
1213
from docling.datamodel.pipeline_options import PdfPipelineOptions
1314
from docling.datamodel.settings import DEFAULT_PAGE_RANGE
@@ -24,6 +25,9 @@
2425
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
2526
from paperqa.utils import ImpossibleParsingError
2627

28+
if TYPE_CHECKING:
29+
from docling.backend.abstract_backend import AbstractDocumentBackend
30+
2731
DOCLING_VERSION = version(docling.__name__)
2832
DOCLING_IMAGES_SCALE_PER_DPI = (
2933
72 # SEE: https://github.com/docling-project/docling/issues/2405
@@ -38,6 +42,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
3842
pipeline_cls: type = StandardPdfPipeline,
3943
dpi: int | None = None,
4044
custom_pipeline_options: Mapping[str, Any] | None = None,
45+
backend: "type[AbstractDocumentBackend]" = DoclingParseV4DocumentBackend,
4146
**_,
4247
) -> ParsedText:
4348
"""Parse a PDF.
@@ -56,6 +61,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
5661
page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
5762
to parse only specific pages, where pages are one-indexed.
5863
Leaving as the default of None will parse all pages.
64+
backend: PDF backend class to use for parsing. Defaults to the docling-parse v4 backend (DoclingParseV4DocumentBackend).
5965
**_: Thrown away kwargs.
6066
"""
6167
path = Path(path)
@@ -73,7 +79,9 @@ def parse_pdf_to_pages( # noqa: PLR0912
7379
converter = DocumentConverter(
7480
format_options={
7581
InputFormat.PDF: PdfFormatOption(
76-
pipeline_options=pipeline_options, pipeline_cls=pipeline_cls
82+
pipeline_options=pipeline_options,
83+
pipeline_cls=pipeline_cls,
84+
backend=backend,
7785
)
7886
}
7987
)
@@ -253,6 +261,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
253261
name=(
254262
f"pdf|pipeline={pipeline_cls.__name__}"
255263
f"|page_range={str(page_range).replace(' ', '')}" # Remove space in tuple
264+
f"|backend={backend.__name__}"
256265
f"{multimodal_string if parse_media else ''}"
257266
),
258267
)

0 commit comments

Comments
 (0)