Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
# dots.ocr-api

invoice-extract/
Dockerfile
docker-compose.yml
requirements.txt
app/
main.py
This repository contains a minimal setup for extracting invoice fields with the [dots.ocr](https://huggingface.co/rednote-hilab/dots.ocr) model using a CPU-only stack.

## Running the API

Build and start the service with Docker Compose:

```bash
cd invoice-extract
docker compose up -d --build
```

The API will be available at `http://localhost:8000`.

## Example: extract from a PDF

Send a PDF file to the `/extract` endpoint using `curl`:

```bash
curl -F "file=@/path/to/invoice.pdf" http://localhost:8000/extract
```

The response is a JSON object with two keys: `pages` (the number of pages processed) and `data` (the parsed invoice fields).

## Batch processing

Process all PDF or image files in a folder and save the outputs:

```bash
mkdir -p out
for f in /invoices/*.{pdf,jpg,png,jpeg}; do
  # A pattern that matched nothing is left as a literal string by bash; skip it.
  [ -e "$f" ] || continue
  b=$(basename "$f")
  # Keep the original extension in the output name so that invoice.pdf and
  # invoice.jpg do not overwrite each other's result. --fail makes curl exit
  # non-zero on HTTP errors so they are reported instead of saved as results.
  curl -sS --fail -F "file=@$f" -o "out/$b.json" http://localhost:8000/extract \
    || echo "extract failed: $f" >&2
done
```
9 changes: 9 additions & 0 deletions invoice-extract/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Container image for the invoice-extraction API (FastAPI app served by uvicorn).
FROM python:3.11-slim
WORKDIR /app
# Dependencies are copied and installed before the application code,
# presumably so this layer is reused when only the code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copies the contents of app/ into /app, so main.py sits in the working
# directory and can be addressed as "main:app" in CMD below.
COPY app /app
# Defaults for the environment variables that main.py reads with os.getenv().
ENV MODEL_ID=rednote-hilab/dots.ocr
ENV TORCH_DTYPE=float32
EXPOSE 8000
# Listen on all interfaces so the published port is reachable from the host.
CMD ["uvicorn","main:app","--host","0.0.0.0","--port","8000"]
85 changes: 85 additions & 0 deletions invoice-extract/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os, io, re, json
from typing import List
from fastapi import FastAPI, UploadFile, File
from pydantic import BaseModel
from PIL import Image
import fitz
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Hugging Face model identifier; overridable via the MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "rednote-hilab/dots.ocr")
# Requested torch dtype name. load_model() only honours a value starting with
# "bf" when CUDA is available; every other combination loads as float32.
DTYPE = os.getenv("TORCH_DTYPE", "bfloat16")

app = FastAPI()
# Populated by load_model() during application startup; None until then.
model = None
processor = None
# Replaced by load_model() with "cuda" when torch reports a CUDA device.
device = "cpu"

def load_model():
    """Load the model and processor named by MODEL_ID into the module globals.

    Sets ``device`` to "cuda" when torch reports a CUDA device, otherwise
    "cpu". The weights are loaded as bfloat16 only when running on CUDA and
    TORCH_DTYPE starts with "bf"; in every other case float32 is used.
    """
    global model, processor, device

    has_cuda = torch.cuda.is_available()
    device = "cuda" if has_cuda else "cpu"

    if has_cuda and DTYPE.lower().startswith("bf"):
        dtype = torch.bfloat16
    else:
        dtype = torch.float32

    # device_map is only passed as "auto" on CUDA; on CPU it stays None.
    placement = "auto" if has_cuda else None
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        torch_dtype=dtype,
        device_map=placement,
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

@app.on_event("startup")
def _startup():
    """Application startup hook: load the model and processor once."""
    load_model()

def pdf_or_image_to_pils(data: bytes, filename: str) -> List[Image.Image]:
    """Decode an uploaded PDF or image into a list of RGB PIL images.

    Args:
        data: Raw bytes of the uploaded file.
        filename: Name supplied with the upload. Only its extension is
            inspected; a missing (None/empty) name is treated as an image.

    Returns:
        One RGB image per page when the name ends with ".pdf"
        (case-insensitive), otherwise a single-element list holding the
        decoded image.

    Raises:
        Whatever PyMuPDF or Pillow raise for data they cannot decode.
    """
    # Uploads may arrive without a filename; fall back to the image path
    # instead of failing on None.lower().
    if (filename or "").lower().endswith(".pdf"):
        # Scale factor 200/72 on both axes, i.e. render pages at 200 DPI.
        zoom = fitz.Matrix(200 / 72, 200 / 72)
        pages = []
        # The context manager closes the document even if rendering fails.
        with fitz.open(stream=data, filetype="pdf") as doc:
            for page in doc:
                pix = page.get_pixmap(matrix=zoom, alpha=False)
                # convert() returns a fully loaded copy, so the image does not
                # depend on the document or the PNG buffer afterwards.
                pages.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
        return pages
    return [Image.open(io.BytesIO(data)).convert("RGB")]

# Instruction prompt (written in Romanian) sent to the model after the page
# images. It asks for a single valid JSON object following the schema below,
# with no extra text, and sets the rules: keep the original language, do not
# translate or invent values, use null for missing data, and write numbers
# with a decimal point and no thousands separators.
PROMPT = """
Extrage câmpurile de factură și returnează UN SINGUR obiect JSON valid, fără alt text.

Schema:
{
"invoice_number": null,
"issue_date": null,
"due_date": null,
"seller": {"name": null, "vat_id": null, "iban": null, "address": null},
"buyer": {"name": null, "vat_id": null, "address": null},
"currency": null,
"line_items": [{"description": null, "quantity": null, "unit_price": null, "line_total": null, "tax_rate": null}],
"subtotal": null, "tax": null, "total": null
}
Reguli: păstrează limba originală a textelor; nu traduce; nu inventa; dacă lipsesc date, lasă null; numere cu punct zecimal și fără separatori de mii.
"""

def infer(images: List[Image.Image]) -> dict:
    """Run the model on the page images and return the parsed JSON answer.

    All images are sent in a single user message followed by PROMPT. The
    first ``{...}`` span of the decoded answer (or the whole answer when no
    braces are found) is parsed with json5, falling back to the standard
    json module if json5 is unavailable or rejects the text.

    Raises:
        The json module's decode error when the fallback parse fails too.
    """
    image_parts = [{"type": "image", "image": page} for page in images]
    user_message = {
        "role": "user",
        "content": image_parts + [{"type": "text", "text": PROMPT}],
    }
    chat_text = processor.apply_chat_template(
        [user_message], tokenize=False, add_generation_prompt=True
    )
    inputs = processor(text=[chat_text], images=images, padding=True, return_tensors="pt")
    if device == "cuda":
        inputs = {name: tensor.to("cuda") for name, tensor in inputs.items()}

    generated = model.generate(**inputs, max_new_tokens=4096, temperature=0.01)

    # Keep only the positions after the input length, presumably because
    # generate() returns the prompt tokens followed by the new ones.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = processor.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)
    resp = decoded[0]

    # Greedy match from the first "{" to the last "}", across newlines.
    match = re.search(r"\{.*\}", resp, flags=re.S)
    payload = resp if match is None else match.group(0)

    try:
        import json5
        return json5.loads(payload)
    except Exception:
        return json.loads(payload)

# Response body of the /extract endpoint.
class ExtractResponse(BaseModel):
    # Number of images handed to the model (one per PDF page, or 1 for an image upload).
    pages: int
    # Parsed JSON returned by infer().
    data: dict

@app.post("/extract", response_model=ExtractResponse)
async def extract(file: UploadFile = File(...)):
    # Read the whole upload, turn it into page images, and run the model on them.
    payload = await file.read()
    pages = pdf_or_image_to_pils(payload, file.filename)
    parsed = infer(pages)
    # Shape matches ExtractResponse: page count plus the parsed fields.
    return {"pages": len(pages), "data": parsed}
11 changes: 11 additions & 0 deletions invoice-extract/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Single-service stack for the invoice-extraction API.
services:
  api:
    build: .
    ports:
      - "8000:8000"
    environment:
      MODEL_ID: rednote-hilab/dots.ocr
      TORCH_DTYPE: float32
    volumes:
      # Named volume over the Hugging Face cache directory, presumably so
      # downloaded model files survive container re-creation.
      - hf_cache:/root/.cache/huggingface

volumes:
  hf_cache: {}
8 changes: 8 additions & 0 deletions invoice-extract/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Web service
fastapi
uvicorn[standard]
# Model loading and inference
transformers>=4.42
accelerate
# Image decoding and PDF rendering (pymupdf provides the "fitz" module)
pillow
pymupdf
# Lenient JSON parser tried first by infer(); the stdlib json module is the fallback
json5
torch