From 201f8ed17d5a07301b348d005fcf5891651b99e6 Mon Sep 17 00:00:00 2001
From: CesarPetrescu
Date: Mon, 18 Aug 2025 03:54:46 +0300
Subject: [PATCH] docs: add PDF invocation examples

---
 README.md                          |  40 +++++++++++---
 invoice-extract/Dockerfile         |   9 ++++
 invoice-extract/app/main.py        | 109 +++++++++++++++++++++++++++++++++
 invoice-extract/docker-compose.yml |  11 ++++
 invoice-extract/requirements.txt   |   8 +++
 5 files changed, 171 insertions(+), 6 deletions(-)
 create mode 100644 invoice-extract/Dockerfile
 create mode 100644 invoice-extract/app/main.py
 create mode 100644 invoice-extract/docker-compose.yml
 create mode 100644 invoice-extract/requirements.txt

diff --git a/README.md b/README.md
index 8b04801..b781477 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,37 @@
 # dots.ocr-api
 
-invoice-extract/
- Dockerfile
- docker-compose.yml
- requirements.txt
- app/
- main.py
+This repository contains a minimal setup for extracting invoice fields with the [dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) model using a CPU-only stack.
 
+## Running the API
+
+Build and start the service with Docker Compose:
+
+```bash
+cd invoice-extract
+docker compose up -d --build
+```
+
+The API will be available at `http://localhost:8000`.
+
+## Example: extract from a PDF
+
+Send a PDF file to the `/extract` endpoint using `curl`:
+
+```bash
+curl -F "file=@/path/to/invoice.pdf" http://localhost:8000/extract
+```
+
+The response is a JSON object containing the parsed invoice fields.
+
+## Batch processing
+
+Process all PDF or image files in a folder and save the outputs:
+
+```bash
+mkdir -p out
+for f in /invoices/*.{pdf,jpg,png,jpeg}; do
+  [ -e "$f" ] || continue
+  b=$(basename "$f")
+  curl -s -F "file=@$f" http://localhost:8000/extract > "out/${b%.*}.json"
+done
+```
diff --git a/invoice-extract/Dockerfile b/invoice-extract/Dockerfile
new file mode 100644
index 0000000..2c7f5e7
--- /dev/null
+++ b/invoice-extract/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.11-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY app /app
+ENV MODEL_ID=rednote-hilab/dots.ocr
+ENV TORCH_DTYPE=float32
+EXPOSE 8000
+CMD ["uvicorn","main:app","--host","0.0.0.0","--port","8000"]
diff --git a/invoice-extract/app/main.py b/invoice-extract/app/main.py
new file mode 100644
index 0000000..80d81bc
--- /dev/null
+++ b/invoice-extract/app/main.py
@@ -0,0 +1,109 @@
+"""Invoice field extraction API built on the dots.ocr model."""
+import os, io, re, json
+from typing import List, Optional
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from pydantic import BaseModel
+from PIL import Image
+import fitz
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+MODEL_ID = os.getenv("MODEL_ID", "rednote-hilab/dots.ocr")
+DTYPE = os.getenv("TORCH_DTYPE", "bfloat16")
+
+app = FastAPI()
+model = None
+processor = None
+device = "cpu"
+
+def load_model():
+    """Load the model and processor into the module globals.
+
+    bfloat16 is used only on CUDA when TORCH_DTYPE starts with "bf"; every
+    other combination loads in float32.
+    """
+    global model, processor, device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.bfloat16 if device == "cuda" and DTYPE.lower().startswith("bf") else torch.float32
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID, trust_remote_code=True,
+        torch_dtype=dtype, device_map="auto" if device=="cuda" else None
+    )
+    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+@app.on_event("startup")
+def _startup():
+    load_model()
+
+def pdf_or_image_to_pils(data: bytes, filename: Optional[str]) -> List[Image.Image]:
+    """Return one RGB image per PDF page, or a single RGB image otherwise."""
+    # UploadFile.filename can be None when the client sends no filename.
+    if (filename or "").lower().endswith(".pdf"):
+        zoom = fitz.Matrix(200/72, 200/72)
+        # The context manager closes the document even if rendering fails.
+        with fitz.open(stream=data, filetype="pdf") as doc:
+            pages = [p.get_pixmap(matrix=zoom, alpha=False).tobytes("png") for p in doc]
+        return [Image.open(io.BytesIO(png)).convert("RGB") for png in pages]
+    return [Image.open(io.BytesIO(data)).convert("RGB")]
+
+PROMPT = """
+Extrage câmpurile de factură și returnează UN SINGUR obiect JSON valid, fără alt text.
+
+Schema:
+{
+  "invoice_number": null,
+  "issue_date": null,
+  "due_date": null,
+  "seller": {"name": null, "vat_id": null, "iban": null, "address": null},
+  "buyer": {"name": null, "vat_id": null, "address": null},
+  "currency": null,
+  "line_items": [{"description": null, "quantity": null, "unit_price": null, "line_total": null, "tax_rate": null}],
+  "subtotal": null, "tax": null, "total": null
+}
+Reguli: păstrează limba originală a textelor; nu traduce; nu inventa; dacă lipsesc date, lasă null; numere cu punct zecimal și fără separatori de mii.
+"""
+
+def infer(images: List[Image.Image]) -> dict:
+    """Run the model on the images and return its reply parsed as a JSON object.
+
+    Raises HTTPException(502) when the reply is not a parseable JSON object.
+    """
+    content = [{"type":"image","image":img} for img in images] + [{"type":"text","text":PROMPT}]
+    messages = [{"role":"user","content":content}]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=[text], images=images, padding=True, return_tensors="pt")
+    if device == "cuda":
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    out = model.generate(**inputs, max_new_tokens=4096, temperature=0.01)
+    resp = processor.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
+    m = re.search(r"\{.*\}", resp, flags=re.S)
+    payload = m.group(0) if m else resp
+    try:
+        import json5 as parser
+    except ImportError:
+        # json5 only adds leniency; strict JSON still handles well-formed replies.
+        parser = json
+    try:
+        data = parser.loads(payload)
+    except ValueError as err:
+        raise HTTPException(status_code=502, detail="Model reply is not valid JSON") from err
+    if not isinstance(data, dict):
+        raise HTTPException(status_code=502, detail="Model reply is not a JSON object")
+    return data
+
+class ExtractResponse(BaseModel):
+    pages: int
+    data: dict
+
+@app.post("/extract", response_model=ExtractResponse)
+async def extract(file: UploadFile = File(...)):
+    """Extract invoice fields from an uploaded PDF or image."""
+    blob = await file.read()
+    try:
+        images = pdf_or_image_to_pils(blob, file.filename)
+    except (OSError, RuntimeError, ValueError) as err:
+        raise HTTPException(status_code=400, detail="Unreadable PDF or image") from err
+    if not images:
+        raise HTTPException(status_code=400, detail="Document has no pages")
+    data = infer(images)
+    return {"pages": len(images), "data": data}
diff --git a/invoice-extract/docker-compose.yml b/invoice-extract/docker-compose.yml
new file mode 100644
index 0000000..77530c7
--- /dev/null
+++ b/invoice-extract/docker-compose.yml
@@ -0,0 +1,11 @@
+services:
+  api:
+    build: .
+    ports: ["8000:8000"]
+    environment:
+      - MODEL_ID=rednote-hilab/dots.ocr
+      - TORCH_DTYPE=float32
+    volumes:
+      - hf_cache:/root/.cache/huggingface
+volumes:
+  hf_cache: {}
diff --git a/invoice-extract/requirements.txt b/invoice-extract/requirements.txt
new file mode 100644
index 0000000..687f5b8
--- /dev/null
+++ b/invoice-extract/requirements.txt
@@ -0,0 +1,8 @@
+fastapi
+uvicorn[standard]
+transformers>=4.42
+accelerate
+pillow
+pymupdf
+json5
+torch