This repository was archived by the owner on Apr 1, 2026. It is now read-only.
forked from datalab-to/marker
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhandler.py
More file actions
147 lines (120 loc) · 4.5 KB
/
handler.py
File metadata and controls
147 lines (120 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import runpod
import shutil
import tempfile
import base64
from pathlib import Path
import subprocess
import json
# Install Marker if needed (will run once when container starts)
def install_marker():
    """Make sure the ``marker`` package is importable, installing it if needed.

    Tries ``import marker`` first. On ``ImportError`` it installs the
    ``marker-pdf`` distribution with pip, invoked through the interpreter that
    is running this module so the package lands in the environment that will
    import it (a bare ``pip`` found on ``PATH`` may belong to another Python).

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    # Function-scope imports: nothing else in the module needs these names.
    import importlib
    import sys

    try:
        import marker  # noqa: F401 -- imported only to probe availability
    except ImportError:
        print("Installing marker-pdf...")
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "marker-pdf"]
        )
        # The import system caches directory listings; refresh them so the
        # freshly installed package can be imported by this same process.
        importlib.invalidate_caches()
        print("Marker installed successfully")
    else:
        print("Marker already installed")
# Run the dependency check at import time, before the handler defined below
# is registered with the serverless runtime.
install_marker()
# Image format to serialise to, chosen from the extension of the image key.
_IMAGE_FORMATS_BY_SUFFIX = {
    ".jpg": "JPEG",
    ".jpeg": "JPEG",
    ".png": "PNG",
    ".webp": "WEBP",
}


def _encode_images(images):
    """Return ``{image_id: base64 string}`` for every entry of *images*.

    Bytes-like values are base64-encoded as they are. Any other value is
    expected to offer a PIL-style ``save(fp, format=...)`` method and is first
    serialised to the format implied by the key's file extension (PNG when the
    extension is unknown).
    """
    import io

    encoded = {}
    for img_id, img_data in images.items():
        if isinstance(img_data, (bytes, bytearray, memoryview)):
            raw = bytes(img_data)
        else:
            # NOTE(review): marker is understood to hand back PIL images here,
            # which base64.b64encode cannot consume directly -- confirm against
            # the installed marker version.
            suffix = os.path.splitext(str(img_id))[1].lower()
            image_format = _IMAGE_FORMATS_BY_SUFFIX.get(suffix, "PNG")
            mode = getattr(img_data, "mode", "RGB")
            if image_format == "JPEG" and mode not in ("RGB", "L", "CMYK"):
                # JPEG cannot store alpha or palette modes.
                img_data = img_data.convert("RGB")
            buffer = io.BytesIO()
            img_data.save(buffer, format=image_format)
            raw = buffer.getvalue()
        encoded[img_id] = base64.b64encode(raw).decode("utf-8")
    return encoded


def process_file(file_path, output_format="json", use_llm=False, force_ocr=False):
    """Process a file with Marker and return the results.

    Args:
        file_path: Path of the local document to convert.
        output_format: Value passed to Marker as ``output_format``. ``"json"``
            returns the rendered model dump; any other value returns text.
        use_llm: Request Marker's LLM pass. It is only enabled when the
            ``GOOGLE_API_KEY`` environment variable is also set; otherwise the
            request is silently ignored.
        force_ocr: Value passed to Marker as ``force_ocr``.

    Returns:
        A dict holding ``"output"`` (for ``"json"``) or ``"text"`` (for every
        other format), plus ``"images"`` mapping image ids to base64 strings
        when the rendered document carries images.

    Raises:
        Whatever the Marker converter raises; the temporary output directory
        is removed either way.
    """
    # Imported lazily so the module can be loaded before marker is installed.
    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered

    # Enable the LLM only if it was requested and an API key is available.
    llm_enabled = bool(use_llm and os.environ.get("GOOGLE_API_KEY"))

    # Create a temporary directory for output
    output_dir = tempfile.mkdtemp()
    try:
        # Configure Marker
        config = {
            "output_format": output_format,
            "output_dir": output_dir,
            "force_ocr": force_ocr,
            "use_llm": llm_enabled,
        }

        # Set up the converter
        # NOTE(review): create_model_dict() is called on every invocation; if
        # it is expensive, consider building it once per process.
        config_parser = ConfigParser(config)
        converter = PdfConverter(
            config=config_parser.generate_config_dict(),
            artifact_dict=create_model_dict(),
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service() if llm_enabled else None,
        )

        # Process the file
        print(f"Processing file: {file_path}")
        rendered = converter(file_path)

        # Extract results
        result = {}
        if output_format == "json":
            # For JSON, return the raw JSON and any images on the object
            result["output"] = rendered.model_dump()
            images = getattr(rendered, "images", None)
        else:
            # For markdown or HTML, extract the text and images
            text, _, images = text_from_rendered(rendered)
            result["text"] = text

        if images:
            result["images"] = _encode_images(images)
        return result
    finally:
        # Clean up even when conversion fails, so the directory never leaks.
        shutil.rmtree(output_dir, ignore_errors=True)
# Only these URL schemes may be downloaded; job input is untrusted.
_ALLOWED_URL_SCHEMES = ("http", "https")


def _filename_from_url(url, default="downloaded_file"):
    """Derive a usable local file name from the path component of *url*."""
    from urllib.parse import unquote, urlsplit

    name = os.path.basename(unquote(urlsplit(url).path))
    # An empty name or a directory reference cannot be used as a file target.
    if name in ("", ".", ".."):
        return default
    return name


def handler(job):
    """
    RunPod handler function for processing documents with Marker

    Input job format:
    {
        "input": {
            "file_url": "URL to download the file",
            "output_format": "json|markdown|html", (optional, default: json)
            "use_llm": true|false, (optional, default: false)
            "force_ocr": true|false (optional, default: false)
        }
    }

    ``file_url`` must be an ``http`` or ``https`` URL.

    Returns:
        The dict produced by ``process_file`` on success, otherwise
        ``{"error": "<message>"}``. Failures are reported in the return value
        rather than raised.
    """
    from urllib.parse import urlsplit

    # A missing or null "input" is treated like an empty one so the caller
    # gets the regular validation error instead of a KeyError.
    job_input = job.get("input") or {}

    # Get parameters
    file_url = job_input.get("file_url")
    output_format = job_input.get("output_format", "json")
    use_llm = job_input.get("use_llm", False)
    force_ocr = job_input.get("force_ocr", False)

    # Validate input
    if not file_url:
        return {"error": "No file_url provided"}
    if (
        not isinstance(file_url, str)
        or urlsplit(file_url).scheme.lower() not in _ALLOWED_URL_SCHEMES
    ):
        # Rejects file:// and friends, and anything curl could read as a flag.
        return {"error": "file_url must be an http or https URL"}

    # Create a temporary directory for the file
    temp_dir = tempfile.mkdtemp()
    try:
        file_path = os.path.join(temp_dir, _filename_from_url(file_url))
        print(f"Downloading file from {file_url}")

        # Use curl to download the file.
        download_cmd = [
            "curl",
            "--fail",  # HTTP errors exit non-zero instead of saving the error page
            "--location",
            "--proto", "=http,https",
            "--proto-redir", "=http,https",  # redirects may not switch protocol
            "--output", file_path,
            "--url", file_url,  # passed as an option value, never parsed as a flag
        ]
        try:
            subprocess.check_call(download_cmd)
        except subprocess.CalledProcessError as exc:
            return {
                "error": (
                    f"Failed to download file from {file_url} "
                    f"(curl exit code {exc.returncode})"
                )
            }

        if not os.path.exists(file_path):
            return {"error": f"Failed to download file from {file_url}"}

        # Process the file
        return process_file(file_path, output_format, use_llm, force_ocr)
    except Exception as e:
        # Top-level boundary: report every failure to the caller as data.
        import traceback

        error_msg = f"Error processing file: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return {"error": error_msg}
    finally:
        # Clean up
        shutil.rmtree(temp_dir, ignore_errors=True)
# Start the serverless handler: hand handler() to the RunPod serverless
# runtime as the job callback. This call executes at import time.
runpod.serverless.start({"handler": handler})