Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
495a4f8
Moving away from poetry and towards uv
vinodganesan Apr 2, 2025
44cd249
Bumping local changes towards iterating on pdftext
vinodganesan Apr 10, 2025
9f2c8e1
Adding changes to schema
vinodganesan Apr 10, 2025
a03ae7c
should work
Nischith-Shadagopan-M-N Apr 10, 2025
ec6f7ef
small change
Nischith-Shadagopan-M-N Apr 10, 2025
79cfc24
Adding pydantic variables for Page
vinodganesan Apr 10, 2025
4fde3ca
Merge branch 'master' of https://github.com/disk-brakes/pdftext
vinodganesan Apr 10, 2025
7d211c0
Update package configuration for distribution
vinodganesan Apr 10, 2025
d937e9c
Adding functionality for extracting non-text from the pdfium objects
vinodganesan Apr 10, 2025
5761cff
First full version of extracting non text objects out of a given page
vinodganesan Apr 13, 2025
e98e691
Changes to pdftext schema with more member functions
vinodganesan Apr 14, 2025
c90a869
Bug fix to get page bbox
vinodganesan Apr 14, 2025
494a3de
mypy fixes for pdftext
vinodganesan Apr 15, 2025
f00b24b
added pytest dependency
Nischith-Shadagopan-M-N Apr 15, 2025
d2aa38b
Merge branch 'master' of github.com:disk-brakes/pdftext
Nischith-Shadagopan-M-N Apr 15, 2025
c61a9d9
fixed some dependencies
Nischith-Shadagopan-M-N Apr 15, 2025
0ca637b
fixed all mypy errors
Nischith-Shadagopan-M-N Apr 15, 2025
c53fa2a
changes to top extract file for all the massive random changes with …
vinodganesan Apr 15, 2025
583ef26
Fixing random af cast errors. cast(x,y) is a scam function
vinodganesan Apr 15, 2025
e2736c5
added setup for visual testing
Nischith-Shadagopan-M-N Apr 16, 2025
45712d8
removed no text spans
Nischith-Shadagopan-M-N Apr 16, 2025
2472ea9
fixed span not splitting issue
Nischith-Shadagopan-M-N Apr 19, 2025
03f2004
fixed the inside function for bbox
Nischith-Shadagopan-M-N Apr 23, 2025
a5981d0
Merge branch 'master' of https://github.com/datalab-to/pdftext
Nischith-Shadagopan-M-N Jul 12, 2025
cea6a19
small fix
Nischith-Shadagopan-M-N Jul 12, 2025
6fa6c42
small change
Nischith-Shadagopan-M-N Jul 17, 2025
7d1a91e
small change to allow control characters
Nischith-Shadagopan-M-N Jul 19, 2025
5399a62
small change
Nischith-Shadagopan-M-N Jul 19, 2025
31f8b35
fixed unicode surrogate pairs
Nischith-Shadagopan-M-N Jul 19, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ build/
develop-eggs/
dist/
downloads/
input/
output/*
eggs/
.eggs/
lib/
Expand Down Expand Up @@ -165,4 +167,8 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.idea/

# Random inputs and outputs
random_inputs/
random_outputs/
34 changes: 31 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,33 @@ Text extraction like [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without

# Installation

You'll need python 3.9+ first. Then run `pip install pdftext`.
You'll need python 3.10+ first. Then run `pip install pdftext`.

# Development

To set up the development environment:

1. Clone the repository:
```shell
git clone https://github.com/VikParuchuri/pdftext.git
cd pdftext
```

2. Create and activate a virtual environment:
```shell
python -m venv .venv
source .venv/bin/activate # On Windows use `.venv\Scripts\activate`
```

3. Install dependencies using uv:
```shell
uv pip install -r requirements-dev.txt
```

4. Run tests:
```shell
uv run pytest
```

# Usage

Expand Down Expand Up @@ -132,12 +158,14 @@ For the alignment score, I extracted the text, then used the rapidfuzz library t

## Running benchmarks

You can run the benchmarks yourself. To do so, you have to first install pdftext manually. The install assumes you have poetry and Python 3.9+ installed.
You can run the benchmarks yourself. To do so, you first have to install pdftext manually. The install assumes you have Python 3.10+ and uv installed.

```shell
git clone https://github.com/VikParuchuri/pdftext.git
cd pdftext
poetry install
python -m venv .venv
source .venv/bin/activate # On Windows use `.venv\Scripts\activate`
uv pip install -r requirements-dev.txt
python benchmark.py # Will download the benchmark pdfs automatically
```

Expand Down
Binary file added input/reliance_shares.pdf
Binary file not shown.
25 changes: 25 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[mypy]
python_version = 3.13
warn_return_any = True
warn_unused_configs = True
disallow_untyped_defs = True
disallow_incomplete_defs = True
check_untyped_defs = True
ignore_missing_imports = True
# We'll progressively remove these as we fix the errors
disable_error_code = no-redef,misc,no-any-return

# For third-party libraries without type stubs
[mypy-pypdfium2.*]
ignore_missing_imports = True

[mypy-pypdfium2]
ignore_missing_imports = True

# Ignore type errors in the root-level extract_text.py file
[mypy-extract_text]
ignore_errors = True

# Ignore errors in pdftext.tables
[mypy-pdftext.tables]
ignore_errors = True
Binary file added output/bbox_images/page_1_visualization.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions output/reliance_bbox.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pdftext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""pdftext package."""
184 changes: 130 additions & 54 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,104 +3,163 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from itertools import repeat
from typing import List
from typing import List, Optional, Any, Union, cast

import pypdfium2 as pdfium

from pdftext.pdf.links import add_links_and_refs
from pdftext.pdf.pages import get_pages
from pdftext.postprocessing import handle_hyphens, merge_text, postprocess_text, sort_blocks
from pdftext.schema import Pages, TableInputs, Tables
from pdftext.postprocessing import (
handle_hyphens,
merge_text,
postprocess_text,
sort_blocks,
)
from pdftext.schema import Bbox, Pages, Span, TableInputs, Tables
from pdftext.settings import settings
from pdftext.tables import table_cell_text

# Define global variable used in worker processes
pdf_doc: Any = None

def _load_pdf(pdf, flatten_pdf):
pdf = pdfium.PdfDocument(pdf)

def _load_pdf(pdf: str, flatten_pdf: bool) -> Any:
pdf_doc = pdfium.PdfDocument(pdf)

# Must be called on the parent pdf, before the page was retrieved
if flatten_pdf:
pdf.init_forms()
pdf_doc.init_forms()

return pdf
return pdf_doc


def _get_page_range(page_range, flatten_pdf=False, quote_loosebox=True) -> Pages:
def _get_page_range(
    page_range: List[int], flatten_pdf: bool = False, quote_loosebox: bool = True
) -> Pages:
    """Extract one chunk of pages from the module-level ``pdf_doc``.

    This is the function mapped over page-index chunks by the process pool;
    it reads the ``pdf_doc`` global that ``worker_init`` assigns in each
    worker process.

    Args:
        page_range: Page indices to extract in this call.
        flatten_pdf: Forwarded unchanged to ``get_pages``.
        quote_loosebox: Forwarded unchanged to ``get_pages``.

    Returns:
        Whatever ``get_pages`` returns for the requested indices.
    """
    # ``pdf_doc`` is only read here, so no ``global`` declaration is required.
    # The index list is passed through to get_pages as-is (no conversion).
    return get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)


def worker_shutdown(pdf_doc):
def worker_shutdown(pdf_doc: Any) -> None:
    """Close the document handle owned by a worker process.

    Args:
        pdf_doc: Any object exposing a ``close()`` method; in this module it
            is the document returned by ``_load_pdf``.
    """
    pdf_doc.close()


def worker_init(pdf_path, flatten_pdf):
def worker_init(pdf_path: str, flatten_pdf: bool) -> None:
    """Open the PDF once per worker process and store it in ``pdf_doc``.

    Intended as a process-pool ``initializer``: it rebinds the module-level
    ``pdf_doc`` so that later calls in the same process can reuse the handle.

    Args:
        pdf_path: Path of the PDF to open.
        flatten_pdf: Forwarded to ``_load_pdf``.
    """
    global pdf_doc

    pdf_doc = _load_pdf(pdf_path, flatten_pdf)

    # Bind the handle now so the exit hook closes exactly this document.
    # NOTE(review): confirm that atexit hooks actually fire when pool workers
    # exit; a process that ends via os._exit() skips them.
    atexit.register(partial(worker_shutdown, pdf_doc))


def _get_pages(pdf_path, page_range=None, flatten_pdf=False, quote_loosebox=True, workers=None) -> Pages:
def _get_pages(
pdf_path: str,
page_range: Optional[List[int]] = None,
flatten_pdf: bool = False,
quote_loosebox: bool = True,
workers: Optional[int] = None,
) -> Pages:
pdf_doc = _load_pdf(pdf_path, flatten_pdf)
if page_range is None:
page_range = range(len(pdf_doc))
page_range_obj = list(range(len(pdf_doc)))
else:
page_range_obj = page_range

if workers is not None:
workers = min(workers, len(page_range) // settings.WORKER_PAGE_THRESHOLD) # It's inefficient to have too many workers, since we batch in inference
workers = min(
workers, len(page_range_obj) // settings.WORKER_PAGE_THRESHOLD
) # It's inefficient to have too many workers, since we batch in inference

if workers is None or workers <= 1:
pages = get_pages(pdf_doc, page_range, flatten_pdf, quote_loosebox)
pages = get_pages(pdf_doc, page_range_obj, flatten_pdf, quote_loosebox)
pdf_doc.close()
return pages

pdf_doc.close()
page_range = list(page_range)

pages_per_worker = math.ceil(len(page_range) / workers)
page_range_chunks = [page_range[i * pages_per_worker:(i + 1) * pages_per_worker] for i in range(workers)]

with ProcessPoolExecutor(max_workers=workers, initializer=worker_init, initargs=(pdf_path, flatten_pdf)) as executor:
pages = list(executor.map(_get_page_range, page_range_chunks, repeat(flatten_pdf), repeat(quote_loosebox)))

ordered_pages = [page for sublist in pages for page in sublist]
pages_per_worker = math.ceil(len(page_range_obj) / workers)
page_range_chunks = [
page_range_obj[i * pages_per_worker : (i + 1) * pages_per_worker]
for i in range(workers)
]

with ProcessPoolExecutor(
max_workers=workers, initializer=worker_init, initargs=(pdf_path, flatten_pdf)
) as executor:
pages_lists = list(
executor.map(
_get_page_range,
page_range_chunks,
repeat(flatten_pdf),
repeat(quote_loosebox),
)
)

ordered_pages = [page for sublist in pages_lists for page in sublist]
return ordered_pages


def plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flatten_pdf=False, workers=None) -> str:
text = paginated_plain_text_output(pdf_path, sort=sort, hyphens=hyphens, page_range=page_range, workers=workers, flatten_pdf=flatten_pdf)
def plain_text_output(
    pdf_path: str,
    sort: bool = False,
    hyphens: bool = False,
    page_range: Optional[List[int]] = None,
    flatten_pdf: bool = False,
    workers: Optional[int] = None,
) -> str:
    """Return the text of the selected pages as a single string.

    Thin wrapper over ``paginated_plain_text_output``: every argument is
    forwarded unchanged and the per-page strings are joined with ``"\\n"``.

    Args:
        pdf_path: Path of the PDF to read.
        sort: Forwarded to ``paginated_plain_text_output``.
        hyphens: Forwarded to ``paginated_plain_text_output``.
        page_range: Page indices to extract; ``None`` is forwarded as-is.
        flatten_pdf: Forwarded to ``paginated_plain_text_output``.
        workers: Forwarded to ``paginated_plain_text_output``.

    Returns:
        The per-page texts joined by newline characters.
    """
    page_texts = paginated_plain_text_output(
        pdf_path,
        sort=sort,
        hyphens=hyphens,
        page_range=page_range,
        workers=workers,
        flatten_pdf=flatten_pdf,
    )
    return "\n".join(page_texts)


def paginated_plain_text_output(pdf_path, sort=False, hyphens=False, page_range=None, flatten_pdf=False, workers=None) -> List[str]:
pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf)
def paginated_plain_text_output(
    pdf_path: str,
    sort: bool = False,
    hyphens: bool = False,
    page_range: Optional[List[int]] = None,
    flatten_pdf: bool = False,
    workers: Optional[int] = None,
) -> List[str]:
    """Return one stripped text string per extracted page.

    Args:
        pdf_path: Path of the PDF to read.
        sort: Forwarded to ``merge_text`` for every page.
        hyphens: Forwarded to ``merge_text`` for every page.
        page_range: Page indices to extract; forwarded to ``_get_pages``.
        flatten_pdf: Forwarded to ``_get_pages``.
        workers: Forwarded to ``_get_pages``.

    Returns:
        A list with the ``merge_text`` result of each page, with leading and
        trailing whitespace removed, in the order ``_get_pages`` yields them.
    """
    pages: Pages = _get_pages(
        pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf
    )
    return [merge_text(page, sort=sort, hyphens=hyphens).strip() for page in pages]


def _process_span(span, page_width, page_height, keep_chars):
span["bbox"] = span["bbox"].bbox
def _process_span(
span: Span, page_width: int, page_height: int, keep_chars: bool
) -> None:
span["text"] = handle_hyphens(postprocess_text(span["text"]), keep_hyphens=True)
if not keep_chars:
del span["chars"]
else:
for char in span["chars"]:
char["bbox"] = char["bbox"].bbox


def dictionary_output(
pdf_path,
sort=False,
page_range=None,
keep_chars=False,
flatten_pdf=False,
quote_loosebox=True,
disable_links=False,
workers=None
pdf_path: str,
sort: bool = False,
page_range: Optional[List[int]] = None,
keep_chars: bool = False,
flatten_pdf: bool = False,
quote_loosebox: bool = True,
disable_links: bool = False,
workers: Optional[int] = None,
) -> Pages:
pages: Pages = _get_pages(pdf_path, page_range, workers=workers, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox)
pages: Pages = _get_pages(
pdf_path,
page_range,
workers=workers,
flatten_pdf=flatten_pdf,
quote_loosebox=quote_loosebox,
)

if not disable_links:
pdf = _load_pdf(pdf_path, False)
Expand All @@ -113,12 +172,10 @@ def dictionary_output(
for k in list(block.keys()):
if k not in ["lines", "bbox"]:
del block[k]
block["bbox"] = block["bbox"].bbox
for line in block["lines"]:
for k in list(line.keys()):
if k not in ["spans", "bbox"]:
del line[k]
line["bbox"] = line["bbox"].bbox
for span in line["spans"]:
_process_span(span, page_width, page_height, keep_chars)

Expand All @@ -127,29 +184,48 @@ def dictionary_output(

if page["rotation"] == 90 or page["rotation"] == 270:
page["width"], page["height"] = page["height"], page["width"]
page["bbox"] = [page["bbox"][2], page["bbox"][3], page["bbox"][0], page["bbox"][1]]

# Create a new Bbox instance from the list of floats
bbox_list = [
page["bbox"][2],
page["bbox"][3],
page["bbox"][0],
page["bbox"][1],
]
page["bbox"] = Bbox(bbox_list)
return pages


def table_output(
pdf_path: str,
table_inputs: TableInputs,
page_range=None,
flatten_pdf=False,
quote_loosebox=True,
workers=None,
pages: Pages | None = None
page_range: Optional[List[int]] = None,
flatten_pdf: bool = False,
quote_loosebox: bool = True,
workers: Optional[int] = None,
pages: Optional[Pages] = None,
) -> List[Tables]:
# Extract pages if they don't exist
if not pages:
pages: Pages = dictionary_output(pdf_path, page_range=page_range, flatten_pdf=flatten_pdf, quote_loosebox=quote_loosebox, workers=workers, keep_chars=True)

assert len(pages) == len(table_inputs), "Number of pages and table inputs must match"
if pages is None:
pages = dictionary_output(
pdf_path,
page_range=page_range,
flatten_pdf=flatten_pdf,
quote_loosebox=quote_loosebox,
workers=workers,
keep_chars=True,
)

assert len(pages) == len(
table_inputs
), "Number of pages and table inputs must match"

# Extract table cells per page
out_tables = []
for page, table_input in zip(pages, table_inputs):
tables = table_cell_text(table_input["tables"], page, table_input["img_size"])
assert len(tables) == len(table_input["tables"]), "Number of tables and table inputs must match"
assert len(tables) == len(
table_input["tables"]
), "Number of tables and table inputs must match"
out_tables.append(tables)
return out_tables
Loading