diff --git a/.gitignore b/.gitignore
index 65b85c4..e593f5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,15 @@ dist/
 # Environments
 .env
 .venv
+venv/
+
+# Python caches / local test artifacts
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.hypothesis/
+data/run/
 
 # API keys
-doubao_api.txt
\ No newline at end of file
+doubao_api.txt
+.gitnexus
diff --git a/README.md b/README.md
index 1fbb0cd..bd11d8b 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,7 @@ As shown above, our method produces results that are more accurate, visually ali
 - `html_generator.py`: Takes the detected component data and generates a complete HTML layout with generated code for each module.
 - `image_replacer.py`: A script to replace placeholder divs in the final HTML with actual cropped images.
 - `mapping.py`: Maps the detected UIED components to logical page regions.
+- `docs/asset-aware-html-generation.md`: Design plan for asset-aware screenshot-to-HTML generation and handoff to a reference UI asset system.
 - `requirements.txt`: Lists all the necessary Python dependencies for the project.
 - `doubao_api.txt`: API key file for the Doubao model (should be kept private and is included in `.gitignore`).
 
@@ -112,6 +113,64 @@ As shown above, our method produces results that are more accurate, visually ali
 
 The typical workflow is a multi-step process as follows:
 
+### Convert a Custom Screenshot to a Rough Asset-Aware HTML Preview
+
+For the current ScreenCoder handoff workflow, use the asset-aware preview path when you need a fast end-to-end result from a UI design image without calling a model API. It reads the input image size, builds a semantic UI schema, extracts crop-source assets, and writes a standalone HTML preview plus handoff files for `web-ui-reference`:
+
+```bash
+python screen_to_schema.py \
+  --image data/input/menu.png \
+  --output data/run/menu/screen-schema.json \
+  --preview data/run/menu/index.html \
+  --handoff data/run/menu/handoff.md \
+  --reference-json data/run/menu/reference-handoff.json
+```
+
+Optional hosted OCR can enrich region content with recognized text blocks. The preferred open-source-model path is PaddleOCR PP-OCRv4 hosted on Replicate:
+
+```bash
+export REPLICATE_API_TOKEN=...
+python screen_to_schema.py \
+  --image data/input/menu.png \
+  --ocr-provider replicate_paddleocr \
+  --ocr-output data/run/menu/ocr.json \
+  --output data/run/menu/screen-schema.json \
+  --preview data/run/menu/index.html \
+  --handoff data/run/menu/handoff.md \
+  --reference-json data/run/menu/reference-handoff.json
+```
+
+If the token is missing, ScreenCoder records an OCR manual gate in the schema handoff notes and still writes the non-OCR preview package.
+
+Key outputs:
+
+- `screen-schema.json`: structured page/region/component/asset-role schema.
+- `asset-crops/`: source-image snippets for `crop-source` roles.
+- `index.html`: rough standalone HTML/CSS reconstruction preview.
+- `handoff.md` and `reference-handoff.json`: component/asset intake notes for `web-ui-reference`.
+
+This path is intentionally approximate: ScreenCoder identifies and packages an editable first draft; `web-ui-reference` remains responsible for final reusable component implementations, asset licensing, visual regression, and Godot contracts.
+
+### Convert a Custom Screenshot through the Legacy Model/UIED Pipeline
+
+Put the target screenshot in the project, for example:
+
+```bash
+mkdir -p data/input
+cp your-menu.png data/input/menu.png
+```
+
+Then run the unified pipeline:
+
+```bash
+python screen_to_html.py \
+  --image data/input/menu.png \
+  --work-dir data/run/menu \
+  --api-key doubao_api.txt
+```
+
+The final HTML is written to `data/run/menu/menu_layout_final.html`. Intermediate files in the same directory include detected layout boxes, gray placeholder HTML, UIED component boxes, mapping overlays, and cropped image assets.
+
 1.  **Initial Generation with Placeholders:**
     Run the Python script to generate the initial HTML code for a given screenshot.
     - Block Detection:
@@ -157,5 +216,3 @@ The typical workflow is a multi-step process as follows:
 ## Acknowledgements
 
 This project builds upon several outstanding open-source efforts. We would like to thank the authors and contributors of the following projects: [UIED](https://github.com/MulongXie/UIED), [DCGen](https://github.com/WebPAI/DCGen), [Design2Code](https://github.com/NoviScl/Design2Code)
-
-
diff --git a/UIED/run_single.py b/UIED/run_single.py
index ea8efa1..927a093 100644
--- a/UIED/run_single.py
+++ b/UIED/run_single.py
@@ -1,8 +1,10 @@
 from os.path import join as pjoin
+import argparse
 import cv2
 import os
 import numpy as np
 import multiprocessing
+from pathlib import Path
 
 
 def resize_height_by_longest_edge(img_path, resize_length=800):
@@ -30,6 +32,13 @@ def color_tips():
 
 
 if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Run UIED component detection for one screenshot.")
+    parser.add_argument("--image", default="data/input/test4.png", help="Input screenshot path.")
+    parser.add_argument("--output-root", default="data/tmp", help="Output root directory.")
+    parser.add_argument("--output-json", default=None, help="Expected or copied output JSON path.")
+    parser.add_argument("--show", action="store_true", help="Show debug windows when supported.")
+    args = parser.parse_args()
+
     # Set multiprocessing start method to 'spawn' for macOS compatibility.
     # This must be done at the very beginning of the main block.
     try:
@@ -61,14 +70,14 @@ def color_tips():
     key_params = {'min-grad':10, 'ffl-block':5, 'min-ele-area':50,
                   'merge-contained-ele':True, 'merge-line-to-paragraph':False, 'remove-bar':True}
 
-    # set input image path
-    input_path_img = 'data/input/test4.png'
-    output_root = 'data/tmp'
+    input_path_img = args.image
+    output_root = args.output_root
 
     resized_height = resize_height_by_longest_edge(input_path_img, resize_length=800)
-    color_tips()
+    if args.show:
+        color_tips()
 
-    is_ip = False
+    is_ip = True
     is_clf = False
     is_ocr = False
     is_merge = False
@@ -90,7 +99,7 @@ def color_tips():
             classifier['Elements'] = CNN('Elements')
             # classifier['Noise'] = CNN('Noise')
         ip.compo_detection(input_path_img, output_root, key_params,
-                           classifier=classifier, resize_by_height=resized_height, show=False)
+                           classifier=classifier, resize_by_height=resized_height, show=args.show)
 
     if is_merge:
         import detect_merge.merge as merge
@@ -99,4 +108,13 @@ def color_tips():
         compo_path = pjoin(output_root, 'ip', str(name) + '.json')
         ocr_path = pjoin(output_root, 'ocr', str(name) + '.json')
         merge.merge(input_path_img, compo_path, ocr_path, pjoin(output_root, 'merge'),
-                    is_remove_bar=key_params['remove-bar'], is_paragraph=key_params['merge-line-to-paragraph'], show=True)
+                    is_remove_bar=key_params['remove-bar'], is_paragraph=key_params['merge-line-to-paragraph'], show=args.show)
+
+    if args.output_json:  # mirror <output_root>/ip/<stem>.json to the caller-requested path
+        expected = Path(output_root) / "ip" / f"{Path(input_path_img).stem}.json"
+        requested = Path(args.output_json)
+        if expected.exists() and expected.resolve() != requested.resolve():
+            requested.parent.mkdir(parents=True, exist_ok=True)
+            requested.write_bytes(expected.read_bytes())  # byte-exact copy; avoids locale-dependent text re-encoding
+        if not requested.exists():  # fail loudly when detection produced no JSON at either location
+            raise FileNotFoundError(f"UIED output JSON was not created: {requested}")
diff --git a/asset_roles.py b/asset_roles.py
new file mode 100644
index 0000000..dc1f7bd
--- /dev/null
+++ b/asset_roles.py
@@ -0,0 +1,154 @@
+"""Asset role constants and prefab defaults for asset-aware HTML generation."""
+
+from __future__ import annotations
+
+
+ALLOWED_ASSET_ROLES = frozenset(
+    {
+        "ritual-background",
+        "ornate-border",
+        "paper-panel",
+        "brass-plate",
+        "red-seal",
+        "corner-rivet",
+        "symbolic-icon",
+        "portrait-frame",
+        "quick-card-illustration",
+        "title-logotype",
+        "ink-divider",
+        "blood-smear",
+        "rope-knot",
+        "hanging-tag",
+        "progress-petal",
+        "damage-scratch",
+    }
+)
+
+
+ALLOWED_ASSET_STRATEGIES = frozenset(
+    {
+        "css-procedural",
+        "crop-source",
+        "reference-asset",
+        "manual-art",
+    }
+)
+
+
+_DEFAULT_STRATEGY_BY_ROLE = {
+    "ritual-background": "css-procedural",
+    "ornate-border": "reference-asset",
+    "paper-panel": "reference-asset",
+    "brass-plate": "reference-asset",
+    "red-seal": "css-procedural",
+    "corner-rivet": "reference-asset",
+    "symbolic-icon": "reference-asset",
+    "portrait-frame": "reference-asset",
+    "quick-card-illustration": "crop-source",
+    "title-logotype": "manual-art",
+    "ink-divider": "css-procedural",
+    "blood-smear": "css-procedural",
+    "rope-knot": "reference-asset",
+    "hanging-tag": "reference-asset",
+    "progress-petal": "css-procedural",
+    "damage-scratch": "css-procedural",
+}
+
+
+PREFAB_ASSET_ROLE_HINTS = {
+    "ritual-title-stack": [
+        "title-logotype",
+        "ink-divider",
+        "red-seal",
+        "blood-smear",
+    ],
+    "ornate-action-plaque": [
+        "paper-panel",
+        "brass-plate",
+        "ornate-border",
+        "red-seal",
+        "corner-rivet",
+    ],
+    "journey-status-panel": [
+        "paper-panel",
+        "ornate-border",
+        "hanging-tag",
+        "progress-petal",
+        "damage-scratch",
+    ],
+    "relic-quick-card": [
+        "paper-panel",
+        "portrait-frame",
+        "quick-card-illustration",
+        "symbolic-icon",
+        "corner-rivet",
+    ],
+    "title-stack": [
+        "title-logotype",
+        "ink-divider",
+        "red-seal",
+    ],
+    "primary-actions": [
+        "paper-panel",
+        "brass-plate",
+        "ornate-border",
+        "red-seal",
+        "corner-rivet",
+    ],
+    "recent-run-panel": [
+        "paper-panel",
+        "ornate-border",
+        "hanging-tag",
+        "progress-petal",
+        "damage-scratch",
+    ],
+    "quick-links": [
+        "paper-panel",
+        "symbolic-icon",
+        "rope-knot",
+        "hanging-tag",
+    ],
+    "corner-system-actions": [
+        "brass-plate",
+        "corner-rivet",
+        "symbolic-icon",
+    ],
+    "background-ornament": [
+        "ritual-background",
+        "ornate-border",
+        "blood-smear",
+        "damage-scratch",
+    ],
+}
+
+
+DEFAULT_PREFABS = frozenset(PREFAB_ASSET_ROLE_HINTS)
+
+
+def normalize_asset_role(role: str) -> str:
+    """Normalize and validate an asset role name."""
+
+    if not isinstance(role, str):
+        raise TypeError("asset role must be a string")
+
+    normalized = role.strip().lower().replace("_", "-")
+    if normalized not in ALLOWED_ASSET_ROLES:
+        raise ValueError(f"unknown asset role: {role!r}")
+
+    return normalized
+
+
+def default_roles_for_prefab(prefab: str) -> list[str]:
+    """Return default asset roles for a prefab, or an empty list when unknown."""
+
+    if not isinstance(prefab, str):
+        raise TypeError("prefab must be a string")
+
+    return list(PREFAB_ASSET_ROLE_HINTS.get(prefab.strip(), ()))
+
+
+def default_strategy_for_role(role: str) -> str:
+    """Return the default generation strategy for an asset role."""
+
+    normalized = normalize_asset_role(role)
+    return _DEFAULT_STRATEGY_BY_ROLE[normalized]
diff --git a/block_parsor.py b/block_parsor.py
index 7e879b1..42e4c09 100644
--- a/block_parsor.py
+++ b/block_parsor.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import cv2
 import json
@@ -161,7 +162,7 @@ def parse_bboxes(bbox_input: str, image_path: str) -> dict[str, tuple[int, int,
     print("Final parsed bboxes:", bboxes)
     return bboxes
 
-def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]]) -> str:
+def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]], output_path: str | None = None) -> str:
     """Draw bounding boxes on image and save with different colors for each component"""
     image = cv2.imread(image_path)
     if image is None:
@@ -191,13 +192,13 @@ def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]]) -
         cv2.putText(image, component, (x_min, y_min - 10),
                     cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)
     
-    # Output directory
-    output_dir = "data/tmp"
-    os.makedirs(output_dir, exist_ok=True)
-    
-    # Get the original filename without path
-    original_filename = os.path.basename(image_path)
-    output_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_with_bboxes.png")
+    if output_path is None:  # no explicit target: fall back to data/tmp/<image stem>_with_bboxes.png
+        output_dir = "data/tmp"
+        os.makedirs(output_dir, exist_ok=True)
+        original_filename = os.path.basename(image_path)
+        output_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_with_bboxes.png")
+    else:
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)  # dirname is "" for a bare filename; makedirs("") raises
     
     if cv2.imwrite(output_path, image):
         print(f"Successfully saved annotated image: {output_path}")
@@ -205,14 +206,15 @@ def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]]) -
     print("Error: Failed to save image")
     return ""
 
-def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path: str) -> str:
+def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path: str, json_path: str | None = None) -> str:
     """Save bounding boxes information to a JSON file"""
-    # Output directory
-    output_dir = "data/tmp"
-    os.makedirs(output_dir, exist_ok=True)
-    
-    original_filename = os.path.basename(image_path)
-    json_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_bboxes.json")
+    if json_path is None:  # no explicit target: fall back to data/tmp/<image stem>_bboxes.json
+        output_dir = "data/tmp"
+        os.makedirs(output_dir, exist_ok=True)
+        original_filename = os.path.basename(image_path)
+        json_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_bboxes.json")
+    else:
+        os.makedirs(os.path.dirname(json_path) or ".", exist_ok=True)  # dirname is "" for a bare filename; makedirs("") raises
     
     bboxes_dict = {k: list(v) for k, v in bboxes.items()}
     
@@ -310,8 +312,15 @@ def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path
     
     
 if __name__ == "__main__":
-    image_path = DEFAULT_IMAGE_PATH
-    api_path = DEFAULT_API_PATH
+    parser = argparse.ArgumentParser(description="Detect major layout regions in a webpage screenshot.")
+    parser.add_argument("--image", default=DEFAULT_IMAGE_PATH, help="Input screenshot path.")
+    parser.add_argument("--api-key", default=DEFAULT_API_PATH, help="API key file for the selected vision model.")
+    parser.add_argument("--json", default=None, help="Output JSON path for detected bboxes.")
+    parser.add_argument("--debug", default=None, help="Output debug image path with drawn bboxes.")
+    args = parser.parse_args()
+
+    image_path = args.image
+    api_path = args.api_key
 
     print("=== Starting Simple Component Detection ===")
     print(f"Input image: {image_path}")
@@ -333,8 +342,8 @@ def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path
         print(f"Found bounding boxes for components: {list(bboxes.keys())}")
         print(f"Total components detected: {len(bboxes)}")
         
-        json_path = save_bboxes_to_json(bboxes, image_path)
-        draw_bboxes(image_path, bboxes)
+        json_path = save_bboxes_to_json(bboxes, image_path, args.json)
+        draw_bboxes(image_path, bboxes, args.debug)
         
         print(f"\n=== Results ===")
         for component, bbox in bboxes.items():
diff --git a/component_schema.py b/component_schema.py
new file mode 100644
index 0000000..5e236bc
--- /dev/null
+++ b/component_schema.py
@@ -0,0 +1,269 @@
+"""Dataclass schema helpers for asset-aware screen descriptions."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from asset_roles import (
+    ALLOWED_ASSET_STRATEGIES,
+    default_strategy_for_role,
+    normalize_asset_role,
+)
+
+
+BBox = list[float]
+
+
+@dataclass
+class Viewport:
+    width: int
+    height: int
+
+
+@dataclass
+class AssetRoleBinding:
+    role: str
+    strategy: str
+    crop_id: str | None = None
+    source_bbox: BBox | None = None
+    notes: str | None = None
+
+
+@dataclass
+class RegionSchema:
+    id: str
+    prefab: str
+    bbox: BBox
+    label: str | None = None
+    content_priority: str | None = None
+    content: dict[str, Any] = field(default_factory=dict)
+    asset_roles: list[AssetRoleBinding] = field(default_factory=list)
+
+
+@dataclass
+class ReferenceHandoff:
+    composition: str | None = None
+    prefabs: list[str] = field(default_factory=list)
+    missing_assets: list[str] = field(default_factory=list)
+    notes: str | None = None
+
+
+@dataclass
+class ScreenSchema:
+    page_type: str
+    viewport: Viewport
+    theme_hint: str | None = None
+    template_hint: str | None = None
+    regions: list[RegionSchema] = field(default_factory=list)
+    reference_handoff: ReferenceHandoff | None = None
+
+
+def _require_mapping(value: Any, name: str) -> dict[str, Any]:
+    if not isinstance(value, dict):
+        raise TypeError(f"{name} must be a dict")
+    return value
+
+
+def _require_string(value: Any, name: str) -> str:
+    if not isinstance(value, str):
+        raise TypeError(f"{name} must be a string")
+    return value
+
+
+def _optional_string(value: Any, name: str) -> str | None:
+    if value is None:
+        return None
+    return _require_string(value, name)
+
+
+def _parse_bbox(value: Any, name: str) -> BBox:
+    if not isinstance(value, (list, tuple)) or len(value) != 4:
+        raise ValueError(f"{name} must contain exactly four numbers")
+
+    bbox: BBox = []
+    for item in value:
+        if not isinstance(item, (int, float)) or isinstance(item, bool):
+            raise TypeError(f"{name} values must be numbers")
+        bbox.append(float(item))
+    return bbox
+
+
+def _parse_viewport(value: Any) -> Viewport:
+    data = _require_mapping(value, "viewport")
+    width = data.get("width")
+    height = data.get("height")
+    if not isinstance(width, int) or isinstance(width, bool):
+        raise TypeError("viewport.width must be an integer")
+    if not isinstance(height, int) or isinstance(height, bool):
+        raise TypeError("viewport.height must be an integer")
+    if width <= 0 or height <= 0:
+        raise ValueError("viewport width and height must be positive")
+    return Viewport(width=width, height=height)
+
+
+def _parse_asset_role_binding(value: Any) -> AssetRoleBinding:
+    if isinstance(value, str):
+        role = normalize_asset_role(value)
+        return AssetRoleBinding(role=role, strategy=default_strategy_for_role(role))
+
+    data = _require_mapping(value, "asset_roles[]")
+    role = normalize_asset_role(data.get("role"))
+    strategy = _require_string(data.get("strategy"), "asset_roles[].strategy")
+    strategy = strategy.strip()
+    if strategy not in ALLOWED_ASSET_STRATEGIES:
+        raise ValueError(f"unknown asset strategy: {strategy!r}")
+
+    source_bbox = None
+    if data.get("source_bbox") is not None:
+        source_bbox = _parse_bbox(data.get("source_bbox"), "asset_roles[].source_bbox")
+
+    return AssetRoleBinding(
+        role=role,
+        strategy=strategy,
+        crop_id=_optional_string(data.get("crop_id"), "asset_roles[].crop_id"),
+        source_bbox=source_bbox,
+        notes=_optional_string(data.get("notes"), "asset_roles[].notes"),
+    )
+
+
+def _parse_region(value: Any) -> RegionSchema:
+    data = _require_mapping(value, "regions[]")
+    prefab = data.get("prefab", data.get("component_prefab", ""))
+    asset_roles = data.get("asset_roles", [])
+    if not isinstance(asset_roles, list):
+        raise TypeError("regions[].asset_roles must be a list")
+
+    content = data.get("content", {})
+    if content is None:
+        content = {}
+    if not isinstance(content, dict):
+        raise TypeError("regions[].content must be a dict")
+
+    return RegionSchema(
+        id=_require_string(data.get("id"), "regions[].id"),
+        prefab=_require_string(prefab, "regions[].prefab"),
+        bbox=_parse_bbox(data.get("bbox"), "regions[].bbox"),
+        label=_optional_string(data.get("label"), "regions[].label"),
+        content_priority=_optional_string(data.get("content_priority"), "regions[].content_priority"),
+        content=dict(content),
+        asset_roles=[_parse_asset_role_binding(item) for item in asset_roles],
+    )
+
+
+def _parse_reference_handoff(value: Any) -> ReferenceHandoff:
+    data = _require_mapping(value, "reference_handoff")
+
+    prefabs = data.get("prefabs", [])
+    if not isinstance(prefabs, list) or not all(isinstance(item, str) for item in prefabs):
+        raise TypeError("reference_handoff.prefabs must be a list of strings")
+
+    missing_assets = data.get("missing_assets", [])
+    if not isinstance(missing_assets, list):
+        raise TypeError("reference_handoff.missing_assets must be a list")
+
+    return ReferenceHandoff(
+        composition=_optional_string(data.get("composition"), "reference_handoff.composition"),
+        prefabs=list(prefabs),
+        missing_assets=[normalize_asset_role(item) for item in missing_assets],
+        notes=_optional_string(data.get("notes"), "reference_handoff.notes"),
+    )
+
+
+def screen_schema_from_dict(data: dict) -> ScreenSchema:
+    """Build a validated ScreenSchema from a plain dictionary."""
+
+    source = _require_mapping(data, "screen schema")
+    regions = source.get("regions", [])
+    if not isinstance(regions, list):
+        raise TypeError("regions must be a list")
+
+    reference_handoff = None
+    if source.get("reference_handoff") is not None:
+        reference_handoff = _parse_reference_handoff(source.get("reference_handoff"))
+
+    return ScreenSchema(
+        page_type=_require_string(source.get("page_type"), "page_type"),
+        viewport=_parse_viewport(source.get("viewport")),
+        theme_hint=_optional_string(source.get("theme_hint"), "theme_hint"),
+        template_hint=_optional_string(source.get("template_hint"), "template_hint"),
+        regions=[_parse_region(item) for item in regions],
+        reference_handoff=reference_handoff,
+    )
+
+
+def _asset_role_to_dict(binding: AssetRoleBinding) -> dict[str, Any]:
+    result: dict[str, Any] = {
+        "role": normalize_asset_role(binding.role),
+        "strategy": binding.strategy,
+    }
+    if result["strategy"] not in ALLOWED_ASSET_STRATEGIES:
+        raise ValueError(f"unknown asset strategy: {result['strategy']!r}")
+    if binding.crop_id is not None:
+        result["crop_id"] = binding.crop_id
+    if binding.source_bbox is not None:
+        result["source_bbox"] = _parse_bbox(binding.source_bbox, "asset_roles[].source_bbox")
+    if binding.notes is not None:
+        result["notes"] = binding.notes
+    return result
+
+
+def screen_schema_to_dict(schema: ScreenSchema) -> dict:
+    """Convert a ScreenSchema into a JSON-serializable dictionary."""
+
+    if not isinstance(schema, ScreenSchema):
+        raise TypeError("schema must be a ScreenSchema")
+
+    result: dict[str, Any] = {
+        "page_type": schema.page_type,
+        "viewport": {
+            "width": schema.viewport.width,
+            "height": schema.viewport.height,
+        },
+        "theme_hint": schema.theme_hint,
+        "template_hint": schema.template_hint,
+        "regions": [
+            {
+                "id": region.id,
+                "label": region.label,
+                "prefab": region.prefab,
+                "bbox": _parse_bbox(region.bbox, "regions[].bbox"),
+                "content_priority": region.content_priority,
+                "content": dict(region.content),
+                "asset_roles": [_asset_role_to_dict(binding) for binding in region.asset_roles],
+            }
+            for region in schema.regions
+        ],
+    }
+
+    if schema.reference_handoff is not None:
+        handoff = schema.reference_handoff
+        result["reference_handoff"] = {
+            "composition": handoff.composition,
+            "prefabs": list(handoff.prefabs),
+            "missing_assets": [normalize_asset_role(role) for role in handoff.missing_assets],
+            "notes": handoff.notes,
+        }
+
+    return result
+
+
+def load_screen_schema(path) -> ScreenSchema:
+    """Load a screen schema JSON file."""
+
+    with Path(path).open("r", encoding="utf-8") as handle:
+        data = json.load(handle)
+    return screen_schema_from_dict(data)
+
+
+def save_screen_schema(schema, path) -> None:
+    """Save a screen schema JSON file with stable formatting."""
+
+    data = screen_schema_to_dict(schema)
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    with target.open("w", encoding="utf-8") as handle:
+        json.dump(data, handle, ensure_ascii=False, indent=2)
+        handle.write("\n")
diff --git a/docs/asset-aware-html-generation.md b/docs/asset-aware-html-generation.md
new file mode 100644
index 0000000..5bd69c9
--- /dev/null
+++ b/docs/asset-aware-html-generation.md
@@ -0,0 +1,314 @@
+# Asset-Aware HTML Generation Design
+
+本文档定义 ScreenCoder 如何从“截图转 HTML”升级为“资产感知的截图转 HTML”。目标是让 ScreenCoder 能识别复杂游戏 UI 中的旧纸、金属边框、铆钉、朱印、挂签、角色框和插图等装饰资产，并把它们映射到可复用的 Web/Godot UI 组件语义，而不是只生成普通 `div`、Tailwind class 和灰色图片占位。
+
+## 背景
+
+当前流水线主要覆盖三类能力：
+
+- `block_parsor.py`：让视觉模型框出大区域，例如 sidebar、header、navigation、main content。
+- `html_generator.py`：针对区域截图生成 HTML/Tailwind 片段。
+- `image_box_detection.py`、`UIED/run_single.py`、`mapping.py`、`image_replacer.py`：找出 HTML 中的灰色图片占位，将其映射到原图 UIED 检测框，并裁剪替换为真实图片。
+
+这套流程适合普通网页截图，但对游戏主菜单、HUD、RPG 面板等复杂 UI 会遇到三个问题：
+
+- 纸张破损、金属边框、铆钉、红绳、朱印等装饰不是普通内容块，不能只靠 `border`、`shadow` 和背景色稳定复现。
+- 现有替换逻辑只识别 `.bg-gray-400` 占位块，无法表达“这个边框应该用 NinePatch/StyleBoxTexture 资产”。
+- 生成 HTML 缺少组件语义，后续无法可靠移交到 `web-ui-reference` 或 Godot。
+
+## 目标
+
+- 对输入截图输出结构化 UI schema，包含页面、语义区、组件 prefab、资产角色和内容字段。
+- 将截图中的复杂装饰分为三类：可程序化 CSS、可裁剪位图、需引用设计资产库。
+- 让生成 HTML 引用稳定组件语义，例如 `ornate-action-plaque`、`paper-panel`、`brass-plate`、`red-seal`。
+- 保留当前研究型流水线的快速原型能力，同时新增可审计、可回归的资产感知路径。
+- 与 `web-ui-reference` 分工清楚：ScreenCoder 负责识别、裁剪、映射和生成初稿；`web-ui-reference` 负责资产治理、组件实现、视觉回归和 Godot 合同。
+
+## 非目标
+
+- 不在 ScreenCoder 内维护完整设计系统。
+- 不把外部资产包、许可证、Storybook 或 Godot Theme 合同放到本仓库长期治理。
+- 不要求自动生成达到像素级一致；像素级或艺术级修正应进入 `web-ui-reference` 的组件和截图回归流程。
+- 不把游戏运行时状态、Godot `.tscn` 或业务逻辑复制到 ScreenCoder。
+
+## 目标架构
+
+```text
+input screenshot
+  -> optional hosted OCR text detection
+  -> layout detection
+  -> semantic region classification
+  -> asset role detection
+  -> component schema generation
+  -> optional source crop extraction
+  -> HTML preview generation
+  -> visual diff and handoff package
+```
+
+### Optional Hosted OCR
+
+ScreenCoder can enrich schema region `content` with OCR text and OCR boxes before writing preview/handoff files. The current preferred hosted open-source-model option is PaddleOCR PP-OCRv4 on Replicate:
+
+- Provider flag: `--ocr-provider replicate_paddleocr`
+- Required manual gate: `REPLICATE_API_TOKEN`
+- Output shape normalized internally to `{provider, text, blocks[]}` where each block has `text`, `bbox`, optional `confidence`, and optional `polygon`.
+
+Additional experimental/fallback providers are available through the same abstraction:
+
+- `--ocr-provider hf_ocr`: Hugging Face hosted open-model OCR, gated by `HF_TOKEN`; useful for plain text but often lacks boxes.
+- `--ocr-provider ocr_space`: OCR.space hosted fallback, gated by `OCR_SPACE_API_KEY`; practical but not open-source.
+
+If a provider is requested but its credential is missing, ScreenCoder keeps generating the non-OCR preview package and writes the manual gate into handoff notes instead of failing the whole run.
+
+### Stage 1: Layout Detection
+
+继续复用 `block_parsor.py` 的大区域识别能力，但 prompt 不应固定在普通网页区域。新 prompt 应允许输出游戏 UI 区域，例如：
+
+- `title-stack`
+- `primary-actions`
+- `recent-run-panel`
+- `quick-links`
+- `corner-system-actions`
+- `background-ornament`
+
+输出仍使用归一化 bbox，但需要增加 `confidence`、`reason` 和 `expected_component`。
+
+### Stage 2: Semantic Region Classification
+
+新增语义分类，让每个区域有明确职责。
+
+示例输出：
+
+```json
+{
+  "id": "primary-actions",
+  "label": "继续旅程 / 新的旅程",
+  "bbox": [88, 260, 918, 520],
+  "component_prefab": "ornate-action-plaque",
+  "content_priority": "L0-primary-action",
+  "asset_roles": ["paper-panel", "brass-plate", "ornate-border", "red-seal", "corner-rivet"]
+}
+```
+
+### Stage 3: Asset Role Detection
+
+资产检测不是简单 OCR 或 UIED 检测。它需要把视觉元素映射到可复用资产角色。
+
+初始角色集合应对齐 `web-ui-reference`：
+
+- `ritual-background`
+- `ornate-border`
+- `paper-panel`
+- `brass-plate`
+- `red-seal`
+- `corner-rivet`
+- `symbolic-icon`
+- `portrait-frame`
+- `quick-card-illustration`
+- `title-logotype`
+- `ink-divider`
+- `blood-smear`
+- `rope-knot`
+- `hanging-tag`
+- `progress-petal`
+- `damage-scratch`
+
+每个角色输出 `strategy`：
+
+| Strategy | Meaning | Example |
+| --- | --- | --- |
+| `css-procedural` | 可用 CSS 渐变、阴影、伪元素模拟 | 暗角、噪点、普通纸纹 |
+| `crop-source` | 从源图裁剪可复用片段 | 角色头像、底部卡片插图 |
+| `reference-asset` | 应引用外部/内部资产库 | 金属九宫格边框、铆钉、挂签 |
+| `manual-art` | 需要人工或图像生成资产 | 手写题字、复杂插画 |
+
+### Stage 4: Component Schema Generation
+
+新增 ScreenCoder 输出，不直接只产 HTML。
+
+建议文件：
+
+```text
+data/run/<case>/screen-schema.json
+data/run/<case>/asset-crops/
+data/run/<case>/preview.html
+data/run/<case>/handoff.md
+```
+
+`screen-schema.json` 示例：
+
+```json
+{
+  "page_type": "game-main-menu",
+  "viewport": { "width": 820, "height": 1600 },
+  "theme_hint": "japanese-horror",
+  "template_hint": "ornamental-mobile",
+  "regions": [
+    {
+      "id": "primary-actions",
+      "prefab": "ornate-action-plaque",
+      "bbox": [64, 410, 756, 840],
+      "content": {
+        "primary_title": "继续旅程",
+        "secondary_title": "新的旅程",
+        "subtitle": "返回当前的破宫之旅"
+      },
+      "asset_roles": [
+        { "role": "paper-panel", "strategy": "reference-asset" },
+        { "role": "brass-plate", "strategy": "reference-asset" },
+        { "role": "red-seal", "strategy": "css-procedural" }
+      ]
+    }
+  ]
+}
+```
+
+### Stage 5: HTML Preview Generation
+
+HTML 预览分两种模式：
+
+| Mode | Use case | Output |
+| --- | --- | --- |
+| `standalone` | 快速查看生成效果 | 单文件 `preview.html`，内联 CSS 和裁剪图 |
+| `component-reference` | 移交 `web-ui-reference` | 输出组件/slot schema，不复制组件实现 |
+
+`standalone` 可以继续用 `html_generator.py` 生成，但 prompt 应要求按 prefab 生成，而不是按 `sidebar/header/navigation/main content` 生成。
+
+`component-reference` 应生成可导入 `web-ui-reference` 的候选配置，例如：
+
+```json
+{
+  "composition": "horror-ornamental",
+  "prefabs": ["ritual-title-stack", "ornate-action-plaque", "journey-status-panel", "relic-quick-card"],
+  "missing_assets": ["ornate-border", "brass-plate", "portrait-frame"]
+}
+```
+
+## 与 web-ui-reference 的边界
+
+ScreenCoder 输出的资产角色和 prefab 必须对齐 `web-ui-reference` 的命名。推荐边界：
+
+| Responsibility | ScreenCoder | web-ui-reference |
+| --- | --- | --- |
+| 从截图识别区域 | Yes | No |
+| 从截图裁剪候选素材 | Yes | Optional review only |
+| 维护资产许可证 | No | Yes |
+| 定义 NinePatch/StyleBoxTexture | No | Yes |
+| 实现可复用组件 | No | Yes |
+| 生成一次性 HTML 预览 | Yes | Optional |
+| Storybook/Playwright 回归 | No | Yes |
+| Godot handoff contract | Draft only | Source of truth |
+
+ScreenCoder 的 handoff 文档只应说明“识别到了什么”和“建议映射到什么”，最终合同由 `web-ui-reference` 管理。
+
+## 新增模块建议
+
+```text
+asset_roles.py
+component_schema.py
+screen_to_schema.py
+schema_to_html.py
+handoff_writer.py
+```
+
+### `asset_roles.py`
+
+维护 ScreenCoder 侧可识别资产角色的轻量枚举。该文件应从 `web-ui-reference` 的文档或导出 JSON 同步，不应自行发明不同命名。
+
+### `component_schema.py`
+
+定义 Pydantic 或 dataclass schema，用于约束 region、component、asset role、crop 和 handoff。
+
+### `screen_to_schema.py`
+
+新主入口，负责把截图转成 `screen-schema.json`。它可以复用当前 `block_parsor.py`、OCR、UIED 和视觉模型。
+
+### `schema_to_html.py`
+
+把 `screen-schema.json` 变成可打开的 `preview.html`。该模块只用于预览，不作为设计系统源代码。
+
+### `handoff_writer.py`
+
+生成 `handoff.md`，包含：
+
+- 输入截图。
+- 识别到的页面类型。
+- 区域和 prefab 映射。
+- 裁剪资产清单。
+- 缺失资产角色。
+- 建议在 `web-ui-reference` 中实现或补齐的组件。
+
+## Prompt 改造
+
+旧 prompt 关注“还原容器内 HTML/Tailwind”。新 prompt 应要求模型输出结构化 JSON，再由代码生成 HTML。
+
+推荐 prompt 目标：
+
+- 先识别页面类型和主任务。
+- 再识别语义区域和交互优先级。
+- 最后识别资产角色及生成策略。
+- 不要求模型凭空生成复杂纹理；复杂纹理必须标记为 `reference-asset` 或 `manual-art`。
+
+## 评估指标
+
+### 结构评估
+
+- 主区域 bbox 是否覆盖正确。
+- prefab 分类是否正确。
+- 文案 OCR/视觉识别是否足够支撑内容模型。
+- 主行动、次行动、状态面板和底部入口是否层级清楚。
+
+### 资产评估
+
+- 资产角色召回率。
+- `css-procedural`、`crop-source`、`reference-asset` 分类是否合理。
+- 裁剪图是否可用，是否包含过多背景或文字。
+
+### 视觉评估
+
+- Playwright 截图与源图的粗粒度布局差异。
+- 主操作可发现性。
+- 文字可读性。
+- 装饰是否误导为可点击区域。
+
+## 实施阶段
+
+### Phase 1: Schema First
+
+- 新增 `screen_to_schema.py`。
+- 将 `block_parsor.py` 的输出转换为区域 schema。
+- 手工配置一组游戏 UI prefab 和 asset roles。
+- 输出 `screen-schema.json` 和 `handoff.md`。
+
+### Phase 2: Asset Role Detection
+
+- 增加资产角色识别 prompt。
+- 识别 `paper-panel`、`brass-plate`、`red-seal`、`ornate-border`、`portrait-frame`。
+- 输出每个资产角色的推荐策略。
+
+### Phase 3: Preview HTML
+
+- 新增 `schema_to_html.py`。
+- 生成一版 standalone HTML 预览。
+- 对 `crop-source` 资产生成 `asset-crops/`。
+
+### Phase 4: Reference Handoff
+
+- 输出可被 `web-ui-reference` 读取的 JSON。
+- 在 handoff 中列出缺失组件和缺失资产。
+- 与 `web-ui-reference` 的 Storybook/Playwright 验收对接。
+
+## 风险
+
+- VLM 对装饰资产的语义识别不稳定，需要 schema 校验和人工修正入口。
+- 直接裁剪原图可能包含版权或混合背景，不应默认进入长期资产库。
+- 如果 ScreenCoder 内部实现完整组件系统，会与 `web-ui-reference` 分叉，导致重复维护。
+- 过度追求像素级自动复现会拖慢研究迭代；本项目应优先输出可移交的结构化初稿。
+
+## 验收标准
+
+- 给定一张复杂游戏主菜单截图，能输出 `screen-schema.json`。
+- `screen-schema.json` 至少包含标题区、主行动区、状态面板、快捷入口四类区域。
+- 每类区域能映射到 prefab 和 asset roles。
+- `handoff.md` 能清楚列出哪些资产应由 `web-ui-reference` 维护。
+- standalone `preview.html` 可打开，并保留主要信息层级。
diff --git a/docs/pogong-ui-image-prompts.md b/docs/pogong-ui-image-prompts.md
new file mode 100644
index 0000000..7db084e
--- /dev/null
+++ b/docs/pogong-ui-image-prompts.md
@@ -0,0 +1,28 @@
+# 破宫十重奏风格 UI 图像生成 Prompt Pack
+
+用途：为 ScreenCoder 生成多种游戏 UI 设计图输入，再由 `screen_to_schema.py` 生成粗略 HTML preview / handoff。
+
+## 共通风格约束
+
+- Mobile portrait game UI screenshot, 9:16 aspect ratio, clean flat 2D interface, not a mockup on a device.
+- Style reference: 《破宫十重奏》现有主菜单 UI：米白宣纸背景、细灰线框、克制红棕点缀、朱印、黄铜/金色小标签、宋体/书法标题感、极简几何图形、淡墨插画、轻微纸纹和柔和阴影。
+- UI must be readable by OCR: use clear Simplified Chinese labels, avoid overdecorated text, avoid tiny unreadable glyphs.
+- Keep layout structured for later HTML reconstruction: distinct panels, visible bounding boxes, consistent spacing, no photorealism.
+- Avoid gore, explicit violence, or real-world logos. Use abstract ritual / palace / music / card-battle motifs.
+- Palette: warm ivory `#F4EFE3`, ink brown `#2E211A`, muted red `#A5352C`, brass gold `#B78A42`, pale gray `#B7B1A7`.
+
+## Prompt 1 — 主菜单 / Main Menu
+
+Create a mobile portrait game main menu UI screenshot for a Chinese narrative card roguelite called “破宫十重奏”. Off-white textured paper background, thin gray borders, muted red seal accents, brass number plaques, elegant Chinese typography. Layout: top left square button “设置” with gear icon, top right square button “成就”; centered large title “破宫之十重奏” and subtitle “一桌无人知晓的局，一场无法回头的演奏。”; two large horizontal action cards: “继续旅程 / 返回当前的破宫之旅” with number badge “10” and red circular door icon, “新的旅程 / 开启一场新的破宫之旅” with badge “01”; a bordered “最近旅程” panel with character name “沈伶音”, floor “07/10”, round “03/05”, progress petals; bottom three cards “规则说明”, “卡牌图鉴”, “结局回顾”. Minimal ink illustrations, clean OCR-readable Simplified Chinese text, flat UI screenshot, no device frame.
+
+## Prompt 2 — 设置 / Settings
+
+Create a mobile portrait game settings screen UI screenshot in the same “破宫十重奏” paper-and-ink style. Off-white parchment background, thin gray dividers, muted red circular seals, brass toggle knobs, elegant Chinese typography. Top bar: back arrow “返回”, centered title “设置”, small red seal “静”. Main sections: “声音” with sliders “主音量 72”, “乐音 58”, “音效 64”; “画面” with segmented options “低 / 中 / 高” and selected “高”, toggle “纸纹效果 开”, toggle “动态朱印 开”; “游戏” with toggle “自动存档 开”, toggle “战斗提示 开”, language row “语言 简体中文”; bottom brass outlined buttons “恢复默认” and red primary button “保存设置”. Layout must be clean and structured, distinct panels and rows, OCR-readable Chinese, subtle geometric ink icons, no photorealism.
+
+## Prompt 3 — 档案 / Archive / Codex
+
+Create a mobile portrait game archive / character dossier UI screenshot for “破宫十重奏”. Same off-white paper, ink-brown linework, muted red seals, brass labels. Top bar: “返回” left, title “档案馆”, right small tab “筛选”. Layout: left vertical navigation rail with tabs “角色”, “卡牌”, “事件”, “结局”, selected “角色” with red mark. Main panel titled “角色档案” with search field “搜索姓名或乐章”; large dossier card for “沈伶音” with pale ink portrait frame, tags “第七楼”, “琴弦”, “未完成”; stats rows “记忆 42”, “执念 68”, “共鸣 31”; notes panel titled “残页记录” containing short readable lines “她在第三次演奏后失去了名字。”; bottom grid of three smaller locked/available cards: “陆简”, “白鹿”, “？？？”. Strong panel boundaries for HTML reconstruction, legible Simplified Chinese, minimalist ink illustrations.
+
+## Prompt 4 — 战斗界面 / Combat HUD
+
+Create a mobile portrait tactical card battle UI screenshot in the same “破宫十重奏” style. Off-white parchment battlefield with thin gray grid and ink shadows, muted red enemy seals, brass resource counters. Top status bar: left player “沈伶音 HP 32/45”, right enemy “宫中影 HP 28/40”, center round seal “第 03 轮”. Middle: abstract palace-room battlefield with three enemy intent cards labeled “屏”, “聚”, “裂”, small damage markers, timeline ribbon “先手 → 敌意 → 演奏”. Lower panel: player hand of four cards with readable names “断弦”, “回声”, “朱印”, “退步”, each in bordered paper card style with cost circles. Bottom action bar: resource “气 4/6”, red primary button “结束回合”, small buttons “弃牌”, “查看规则”. Clean flat UI screenshot, structured panels, OCR-readable Chinese, no blood or gore, no device frame.
diff --git a/handoff_writer.py b/handoff_writer.py
new file mode 100644
index 0000000..43b489b
--- /dev/null
+++ b/handoff_writer.py
@@ -0,0 +1,314 @@
+"""Write asset-aware handoff documents for web-ui-reference."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import asdict, is_dataclass
+from pathlib import Path
+from typing import Any, Mapping
+
+
+WEB_UI_REFERENCE_NOTE = (
+    "这些资产和组件的长期维护、许可证、Storybook/Playwright 回归与 Godot 合同"
+    "应由 web-ui-reference 维护；ScreenCoder 只输出识别和映射初稿。"
+)
+
+
+def _schema_to_dict(schema: Any) -> dict[str, Any]:
+    """Coerce a schema (mapping, schema object, or dataclass) to a plain dict.
+
+    Mappings are shallow-copied. Otherwise ``component_schema`` is asked to
+    serialise the object when it can be imported; failing that, dataclasses
+    go through ``asdict`` and anything else is read attribute by attribute.
+    """
+    if isinstance(schema, Mapping):
+        return dict(schema)
+    # Imported lazily so this module still works without component_schema.
+    try:
+        from component_schema import screen_schema_to_dict as serialise
+    except ImportError:
+        serialise = None
+    if serialise is not None:
+        return serialise(schema)
+    if is_dataclass(schema):
+        return asdict(schema)
+    defaults = {"page_type": "unknown", "viewport": {}, "theme_hint": "",
+                "template_hint": "", "regions": [], "reference_handoff": None}
+    return {key: getattr(schema, key, default) for key, default in defaults.items()}
+
+
+def _load_schema(path: str | Path) -> Any:
+    """Load a schema via component_schema, or as raw JSON if it is unavailable."""
+    try:
+        from component_schema import load_screen_schema as loader
+    except ImportError:
+        return json.loads(Path(path).read_text(encoding="utf-8"))
+    return loader(path)
+
+
+def _normalise_region(region: Any) -> dict[str, Any]:
+    """Return *region* as a plain dict.
+
+    Dataclasses are converted with ``asdict``, mappings are shallow-copied,
+    and any other object is read attribute by attribute with defaults.
+    """
+    if is_dataclass(region):
+        return asdict(region)
+    if isinstance(region, Mapping):
+        return dict(region)
+    # Duck-typed objects: pick known fields, defaulting the missing ones.
+    fallbacks = {"id": "", "label": "", "bbox": [0, 0, 0, 0], "prefab": "",
+                 "content_priority": "", "content": {}, "asset_roles": []}
+    return {name: getattr(region, name, value) for name, value in fallbacks.items()}
+
+
+def _normalise_role(role: Any) -> dict[str, Any]:
+    """Return *role* as a plain dict.
+
+    Accepts a dataclass, a mapping, or any object exposing role attributes.
+    """
+    if is_dataclass(role):
+        return asdict(role)
+    if isinstance(role, Mapping):
+        return dict(role)
+    # Duck-typed objects: pick known fields, defaulting the missing ones.
+    fallbacks = {"role": "", "strategy": "", "crop_id": None, "source_bbox": None, "notes": ""}
+    return {name: getattr(role, name, value) for name, value in fallbacks.items()}
+
+
+def _unique(values: list[str]) -> list[str]:
+    """Drop falsy and repeated entries from *values*, keeping first-seen order.
+
+    Implemented with ``dict.fromkeys``, whose keys preserve insertion order
+    and collapse duplicates.
+    """
+    ordered = dict.fromkeys(value for value in values if value)
+    return list(ordered)
+
+
+def _regions(schema_dict: Mapping[str, Any]) -> list[dict[str, Any]]:  # normalised copy of every region
+    return list(map(_normalise_region, schema_dict.get("regions") or []))
+
+
+def build_reference_handoff(schema: Any) -> dict[str, Any]:
+    """Build a compact JSON handoff for web-ui-reference intake.
+
+    A role counts as a missing asset when its strategy is ``reference-asset``
+    or ``manual-art``, or when it is ``crop-source`` without a ``crop_id``.
+    A region without a prefab is reported as a missing component by its id.
+    """
+    source = _schema_to_dict(schema)
+    seen_prefabs: list[str] = []
+    assets_todo: list[str] = []
+    components_todo: list[str] = []
+    entries: list[dict[str, Any]] = []
+    for region in _regions(source):
+        prefab = region.get("prefab") or ""
+        if not prefab:
+            components_todo.append(str(region.get("id") or "unmapped-region"))
+        else:
+            seen_prefabs.append(str(prefab))
+
+        roles: list[dict[str, Any]] = []
+        for item in region.get("asset_roles") or []:
+            role = _normalise_role(item)
+            name = str(role.get("role") or "unknown-role")
+            strategy = str(role.get("strategy") or "unknown-strategy")
+            crop_id = role.get("crop_id")
+            roles.append(
+                {
+                    "role": name,
+                    "strategy": strategy,
+                    "crop_id": crop_id,
+                    "source_bbox": role.get("source_bbox"),
+                    "notes": role.get("notes") or "",
+                    "maintainer": "web-ui-reference",
+                }
+            )
+            needs_external_art = strategy in {"reference-asset", "manual-art"}
+            crop_not_extracted = strategy == "crop-source" and not crop_id
+            if needs_external_art or crop_not_extracted:
+                assets_todo.append(name)
+
+        entries.append({
+            "id": region.get("id") or "",
+            "label": region.get("label") or "",
+            "bbox": region.get("bbox") or [],
+            "prefab": prefab,
+            "content_priority": region.get("content_priority") or "",
+            "asset_roles": roles,
+        })
+
+    return {
+        "composition": {
+            "page_type": source.get("page_type") or "unknown-page",
+            "theme_hint": source.get("theme_hint") or "",
+            "template_hint": source.get("template_hint") or "",
+            "viewport": source.get("viewport") or {},
+            "target_owner": "web-ui-reference",
+        },
+        "prefabs": _unique(seen_prefabs),
+        "missing_assets": _unique(assets_todo),
+        "missing_components": _unique(components_todo),
+        "regions": entries,
+        "maintenance_boundary": WEB_UI_REFERENCE_NOTE,
+    }
+
+
+def _fmt(value: Any) -> str:
+    """Render *value* for Markdown: "-" if empty, JSON for containers; "|" and newlines are neutralised."""
+    if value is None or value == "":
+        return "-"
+    text = json.dumps(value, ensure_ascii=False) if isinstance(value, (dict, list)) else str(value)
+    return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")  # keep table rows intact
+
+
+def render_handoff_markdown(schema: Any, source_image: str | None = None) -> str:
+    """Render a human-readable handoff document.
+
+    Sections, in order: page overview, region/prefab table, asset-role table,
+    missing assets, missing components, and a JSON summary of the reference
+    handoff. ``source_image`` is listed in the overview only when given.
+
+    Table cells and overview values are formatted with ``_fmt``.
+    """
+
+    def table_row(cells: list[str]) -> str:
+        # One Markdown table row with pipe-separated cells.
+        return "| " + " | ".join(cells) + " |"
+
+    overview = _schema_to_dict(schema)
+    reference = build_reference_handoff(schema)
+    regions = reference["regions"]
+
+    # --- Page overview ---
+    doc: list[str] = [
+        "# Asset-Aware UI Handoff",
+        "",
+        "## 页面概览",
+        "",
+        f"- 页面类型：{_fmt(overview.get('page_type'))}",
+        f"- 视口：{_fmt(overview.get('viewport'))}",
+        f"- 主题提示：{_fmt(overview.get('theme_hint'))}",
+        f"- 模板提示：{_fmt(overview.get('template_hint'))}",
+    ]
+    if source_image:
+        doc.append(f"- 输入截图：{source_image}")
+    embedded = overview.get("reference_handoff") or {}
+    remarks = embedded.get("notes") if isinstance(embedded, Mapping) else None
+    if remarks:
+        doc.append(f"- 备注：{_fmt(remarks)}")
+    doc += [
+        "",
+        f"> {WEB_UI_REFERENCE_NOTE}",
+        "",
+    ]
+
+    # --- Region / prefab table ---
+    doc += [
+        "## 区域与 Prefab 映射",
+        "",
+        "| Region | Label | BBox | Prefab | Priority |",
+        "| --- | --- | --- | --- | --- |",
+    ]
+    region_fields = ("id", "label", "bbox", "prefab", "content_priority")
+    region_rows = [
+        table_row([_fmt(region.get(name)) for name in region_fields])
+        for region in regions
+    ]
+    # An empty table still gets one placeholder row.
+    doc += region_rows or ["| - | - | - | - | - |"]
+
+    # --- Asset-role table ---
+    doc += [
+        "",
+        "## 资产角色策略",
+        "",
+        "| Region | Role | Strategy | Crop | Source BBox | Notes | Maintainer |",
+        "| --- | --- | --- | --- | --- | --- | --- |",
+    ]
+    role_fields = ("role", "strategy", "crop_id", "source_bbox", "notes")
+    role_rows = [
+        table_row(
+            [_fmt(region.get("id"))]
+            + [_fmt(role.get(name)) for name in role_fields]
+            + ["web-ui-reference"]
+        )
+        for region in regions
+        for role in region.get("asset_roles") or []
+    ]
+    doc += role_rows or ["| - | - | - | - | - | - | web-ui-reference |"]
+
+    # --- Missing assets ---
+    doc += ["", "## 缺失资产", ""]
+    absent_assets = reference.get("missing_assets") or []
+    if absent_assets:
+        doc += [
+            f"- {name}：应由 web-ui-reference 补齐或维护。"
+            for name in absent_assets
+        ]
+    else:
+        doc.append("- 未检测到需要补齐的 reference-asset、manual-art 或缺 crop 的 crop-source 资产。")
+
+    # --- Missing components ---
+    doc += ["", "## 缺失组件", ""]
+    absent_components = reference.get("missing_components") or []
+    if absent_components:
+        doc += [
+            f"- {name}：该区域缺少 prefab 映射，应在 web-ui-reference 中确定组件归属。"
+            for name in absent_components
+        ]
+    else:
+        doc.append("- 未检测到缺少 prefab 映射的区域。")
+
+    # --- JSON summary ---
+    summary = {
+        "composition": reference["composition"],
+        "prefabs": reference["prefabs"],
+        "missing_assets": reference["missing_assets"],
+    }
+    doc += [
+        "",
+        "## Reference JSON 摘要",
+        "",
+        "```json",
+        json.dumps(summary, ensure_ascii=False, indent=2),
+        "```",
+        "",
+    ]
+    return "\n".join(doc)
+
+
+def write_handoff(schema: Any, output_path: str | Path, source_image: str | None = None) -> None:
+    """Write handoff.md to *output_path*, creating parent directories as needed."""
+    target = Path(output_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(render_handoff_markdown(schema, source_image=source_image), encoding="utf-8")
+
+
+def write_reference_json(schema: Any, output_path: str | Path) -> None:
+    """Write the reference handoff as indented UTF-8 JSON plus a trailing newline."""
+    target = Path(output_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(
+        build_reference_handoff(schema), ensure_ascii=False, indent=2
+    )
+    target.write_text(payload + "\n", encoding="utf-8")
+
+
+def main() -> None:
+    """Parse CLI arguments, load the schema, and write both handoff outputs."""
+    cli = argparse.ArgumentParser(description="Write handoff.md and reference handoff JSON.")
+    cli.add_argument("--schema", required=True, help="Path to screen-schema.json.")
+    cli.add_argument("--output", required=True, help="Path to write handoff.md.")
+    cli.add_argument("--reference-json", required=True, help="Path to write reference handoff JSON.")
+    cli.add_argument("--source-image", default=None, help="Optional source screenshot path to mention in handoff.md.")
+    options = cli.parse_args()
+    loaded = _load_schema(options.schema)
+    write_handoff(loaded, options.output, source_image=options.source_image)
+    write_reference_json(loaded, options.reference_json)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/html_generator.py b/html_generator.py
index 3a14e5f..6bd3872 100644
--- a/html_generator.py
+++ b/html_generator.py
@@ -1,5 +1,6 @@
 from utils import encode_image, Doubao, Qwen, GPT, Gemini
 from PIL import Image
+import argparse
 import bs4
 from threading import Thread
 import time
@@ -346,17 +347,7 @@ def code_substitution(html_file, code_dict):
 #     except Exception as e:
 #         print(f"An error occurred during HTML refinement: {e}")
 
-# Main
-if __name__ == "__main__":
-    import json
-    import time
-    from PIL import Image
-    
-    # Load bboxes from block_parsing.py output
-    boxes_data = json.load(open("data/tmp/test1_bboxes.json"))
-
-
-    img_path = "data/input/test1.png"
+def build_layout_from_bboxes(boxes_data, img_path):
     with Image.open(img_path) as img:
         width, height = img.size
     
@@ -382,7 +373,6 @@ def code_substitution(html_file, code_dict):
         }
         root["children"].append(child)
     
-    # Assign IDs to all nodes
     def assign_id(node, id):
         node["id"] = id
         for child in node.get("children", []):
@@ -390,14 +380,44 @@ def assign_id(node, id):
         return id
     
     assign_id(root, 0)
+    return root
+
+
+def create_bot(provider, api_key, model):
+    """Instantiate the vision-model client selected by *provider*.
+
+    Raises ValueError for any provider other than doubao, qwen, gpt, gemini.
+    """
+    known = (("doubao", Doubao), ("qwen", Qwen), ("gpt", GPT), ("gemini", Gemini))
+    for name, client in known:
+        if provider == name:
+            return client(api_key, model=model)
+    raise ValueError(f"Unsupported provider: {provider}")
+
+
+# Main
+if __name__ == "__main__":
+    import json
+
+    parser = argparse.ArgumentParser(description="Generate HTML for detected layout regions.")
+    parser.add_argument("--image", default="data/input/test1.png", help="Input screenshot path.")
+    parser.add_argument("--bboxes", default="data/tmp/test1_bboxes.json", help="Region bbox JSON from block_parsor.py.")
+    parser.add_argument("--output", default="data/tmp/test1_layout.html", help="Output gray placeholder HTML path.")
+    parser.add_argument("--api-key", default="doubao_api.txt", help="API key file for the selected model.")
+    parser.add_argument("--provider", default="doubao", choices=["doubao", "qwen", "gpt", "gemini"], help="Vision model provider.")
+    parser.add_argument("--model", default="doubao-1.5-thinking-vision-pro-250428", help="Vision model name.")
+    args = parser.parse_args()
+
+    boxes_data = json.load(open(args.bboxes))
+    img_path = args.image
+    root = build_layout_from_bboxes(boxes_data, img_path)
 
-    # print(root)
     # Generate initial HTML layout
-    generate_html(root, 'data/tmp/test1_layout.html')
+    generate_html(root, args.output)
 
     # Initialize the bot
     # Change your model & API ket path according to your needs
-    bot = Doubao("doubao_api.txt", model = "doubao-1.5-thinking-vision-pro-250428")
+    bot = create_bot(args.provider, args.api_key, args.model)
     # bot = Qwen("qwen_api.txt", model="qwen2.5-vl-72b-instruct")
     # bot = GPT("gpt_api.txt", model="gpt-4o")
     # bot = Gemini("gemini_api.txt", model="gemini-1.5-flash-latest")
@@ -408,7 +428,7 @@ def assign_id(node, id):
     code_dict = generate_code_parallel(root, img_path, bot)
     
     # Substitute the generated code into the HTML
-    code_substitution('data/tmp/test1_layout.html', code_dict)
+    code_substitution(args.output, code_dict)
 
     # Refine the html file
     # html_refinement('data/tmp/test1_layout.html', 'data/tmp/test1_layout_refined.html', img_path, bot)
diff --git a/ocr_client.py b/ocr_client.py
new file mode 100644
index 0000000..7eb92cb
--- /dev/null
+++ b/ocr_client.py
@@ -0,0 +1,308 @@
+"""Optional hosted OCR API clients for ScreenCoder.
+
+The clients normalize different OCR API responses to one lightweight shape so
+ScreenCoder can enrich screen schemas without depending on local OCR runtimes.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import mimetypes
+import os
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Mapping
+
+import requests
+
+
+@dataclass
+class OCRBlock:  # one recognised text fragment with optional geometry
+    text: str  # recognised text
+    bbox: list[float] | None = None  # four floats; helper-derived boxes are [x1, y1, x2, y2]
+    confidence: float | None = None  # provider score when one was reported
+    polygon: list[list[float]] | None = None  # outline as [x, y] points when provided
+
+
+@dataclass
+class OCRResult:  # normalised result shape shared by all providers
+    provider: str  # name of the provider that produced the result
+    text: str = ""  # full recognised text
+    blocks: list[OCRBlock] = field(default_factory=list)  # per-fragment details; may be empty
+    raw: dict[str, Any] | None = None  # provider response the result was built from
+
+
+@dataclass
+class OCRClient:  # base interface; subclasses implement recognize()
+    provider: str  # provider identifier
+    is_configured: bool  # False for the ManualGateOCRClient stand-in
+    manual_gate: str = ""  # setup hint; ManualGateOCRClient raises it as its error message
+
+    def recognize(self, image_path: Path) -> OCRResult:
+        raise NotImplementedError  # concrete clients override this
+
+
+def _compact_text(lines: list[str]) -> str:  # stripped, non-blank lines joined by newlines
+    return "\n".join(filter(None, (line.strip() for line in lines if line)))
+
+
+def _as_float(value: Any) -> float | None:  # float(value), or None when not convertible
+    try:
+        return float(value)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: int too large for a float
+        return None
+
+
+def _bbox_from_polygon(points: Any) -> list[float] | None:
+    """Return ``[min_x, min_y, max_x, max_y]`` for a list of points.
+
+    Unusable entries are skipped; None means nothing usable was found.
+    """
+    if not isinstance(points, list) or not points:
+        return None
+    usable: list[tuple[float, float]] = []
+    for entry in points:
+        if isinstance(entry, (list, tuple)) and len(entry) >= 2:
+            x, y = _as_float(entry[0]), _as_float(entry[1])
+            if x is not None and y is not None:
+                usable.append((x, y))
+    if not usable:
+        return None
+    xs, ys = zip(*usable)
+    return [min(xs), min(ys), max(xs), max(ys)]
+
+
+def _normalise_polygon(points: Any) -> list[list[float]] | None:
+    """Return *points* as ``[[x, y], ...]`` floats, or None if nothing is usable."""
+    if not isinstance(points, list):
+        return None
+    # Convert candidate pairs lazily; unconvertible coordinates become None.
+    pairs = (
+        (_as_float(entry[0]), _as_float(entry[1]))
+        for entry in points
+        if isinstance(entry, (list, tuple)) and len(entry) >= 2
+    )
+    # An empty result collapses to None.
+    return [[x, y] for x, y in pairs if x is not None and y is not None] or None
+
+
+def _bbox_from_xywh(item: Mapping[str, Any]) -> list[float] | None:
+    """Convert Left/Top/Width/Height (or lowercase) keys to ``[x1, y1, x2, y2]``, else None."""
+    keys = ("Left", "Top", "Width", "Height")
+    dims = [_as_float(item.get(key, item.get(key.lower()))) for key in keys]
+    if any(dim is None for dim in dims):
+        return None
+    left, top, width, height = dims
+    return [left, top, left + width, top + height]
+
+
+def _data_uri_for_image(image_path: Path) -> str:
+    """Return the file's bytes as a base64 ``data:`` URI (MIME type guessed, PNG fallback)."""
+    mime, _encoding = mimetypes.guess_type(str(image_path))
+    return f"data:{mime or 'image/png'};base64,{base64.b64encode(Path(image_path).read_bytes()).decode('ascii')}"
+
+
+def normalize_replicate_paddleocr_response(payload: Mapping[str, Any]) -> OCRResult:
+    """Normalize PaddleOCR-like Replicate prediction output.
+
+    The output may be a string, a list of items, or a mapping holding items.
+    Items with null text are skipped and malformed bbox values are ignored,
+    instead of yielding the text "None" or raising.
+    """
+    output = payload.get("output", payload)
+    if isinstance(output, str):
+        return OCRResult(provider="replicate_paddleocr", text=output.strip(), raw=dict(payload))
+    candidates: Any = []
+    if isinstance(output, Mapping):
+        candidates = output.get("results", output.get("ocr", output.get("blocks", output.get("text", []))))
+    elif isinstance(output, list):
+        candidates = output
+
+    blocks: list[OCRBlock] = []
+    text_lines: list[str] = []
+    if isinstance(candidates, str):
+        text_lines.append(candidates)
+    for item in candidates if isinstance(candidates, list) else []:
+        if isinstance(item, str):
+            text_lines.append(item)
+            blocks.append(OCRBlock(text=item))
+            continue
+        if not isinstance(item, Mapping):
+            continue
+        raw_text = item.get("text", item.get("transcription", item.get("label", "")))
+        # A JSON null must not become the literal string "None".
+        text = "" if raw_text is None else str(raw_text).strip()
+        if not text:
+            continue
+        polygon = _normalise_polygon(item.get("box", item.get("polygon", item.get("points"))))
+        bbox = None
+        raw_bbox = item.get("bbox", item.get("bounding_box"))
+        if isinstance(raw_bbox, list) and len(raw_bbox) == 4:
+            values = [_as_float(value) for value in raw_bbox]
+            # Ignore a bbox with non-numeric entries rather than raising.
+            bbox = None if any(value is None for value in values) else values
+        if bbox is None and polygon:
+            bbox = _bbox_from_polygon(polygon)
+        confidence = _as_float(item.get("score", item.get("confidence", item.get("probability"))))
+        text_lines.append(text)
+        blocks.append(OCRBlock(text=text, bbox=bbox, confidence=confidence, polygon=polygon))
+
+    if not text_lines and isinstance(output, Mapping):
+        generated = output.get("text", output.get("generated_text"))
+        if isinstance(generated, str):
+            text_lines.append(generated)
+    return OCRResult(provider="replicate_paddleocr", text=_compact_text(text_lines), blocks=blocks, raw=dict(payload))
+
+
+def normalize_ocr_space_response(payload: Mapping[str, Any]) -> OCRResult:
+    """Normalize OCR.space response as a non-open-source hosted fallback.
+
+    Page text comes from each ``ParsedText``; one block per overlay word is
+    built from ``WordText`` and its Left/Top/Width/Height box. Malformed
+    entries and words whose text is null or blank are skipped.
+    """
+    results = payload.get("ParsedResults") or []
+    text_lines: list[str] = []
+    blocks: list[OCRBlock] = []
+    for parsed in results if isinstance(results, list) else []:
+        if not isinstance(parsed, Mapping):
+            continue
+        parsed_text = parsed.get("ParsedText")
+        if isinstance(parsed_text, str) and parsed_text.strip():
+            text_lines.append(parsed_text.strip())
+        overlay = parsed.get("TextOverlay") or {}
+        lines = overlay.get("Lines") if isinstance(overlay, Mapping) else []
+        for line in lines if isinstance(lines, list) else []:
+            words = line.get("Words") if isinstance(line, Mapping) else []
+            for word in words if isinstance(words, list) else []:
+                if not isinstance(word, Mapping):
+                    continue
+                raw_text = word.get("WordText")
+                # A JSON null must not become the literal string "None".
+                text = "" if raw_text is None else str(raw_text).strip()
+                if text:
+                    blocks.append(OCRBlock(text=text, bbox=_bbox_from_xywh(word)))
+    return OCRResult(provider="ocr_space", text=_compact_text(text_lines), blocks=blocks, raw=dict(payload))
+
+
+class ManualGateOCRClient(OCRClient):  # stand-in client that only reports a setup hint
+    def __init__(self, provider: str, manual_gate: str) -> None:
+        super().__init__(provider=provider, is_configured=False, manual_gate=manual_gate)  # never configured
+
+    def recognize(self, image_path: Path) -> OCRResult:
+        raise RuntimeError(self.manual_gate)  # surface the setup hint to the caller
+
+
+class ReplicatePaddleOCRClient(OCRClient):
+    """Hosted PaddleOCR through the Replicate predictions API."""
+    DEFAULT_VERSION = "084b779cb09bc2462335a5768fabaeaaba53bb3f70afd0d2fe48fad71fdc4d5a"
+    FAILED_STATUSES = frozenset({"failed", "canceled"})  # terminal states without usable output
+
+    def __init__(self, token: str, version: str | None = None, timeout: int = 120) -> None:
+        super().__init__(provider="replicate_paddleocr", is_configured=True)
+        self.token = token
+        self.version = version or os.getenv("SCREENCODER_REPLICATE_PADDLEOCR_VERSION", self.DEFAULT_VERSION)
+        self.timeout = timeout
+
+    def recognize(self, image_path: Path) -> OCRResult:
+        """Create a prediction for *image_path*, poll while it is still running, and normalize it."""
+        response = requests.post(
+            "https://api.replicate.com/v1/predictions",
+            headers={"Authorization": f"Bearer {self.token}", "Content-Type": "application/json", "Prefer": "wait"},
+            json={"version": self.version, "input": {"image": _data_uri_for_image(image_path), "lang": "ch"}},
+            timeout=self.timeout,
+        )
+        response.raise_for_status()
+        payload = response.json()
+        poll_url = (payload.get("urls") or {}).get("get")  # tolerate "urls": null
+        if payload.get("status") in {"starting", "processing"} and poll_url:
+            payload = self._poll(poll_url)
+        if payload.get("status") in self.FAILED_STATUSES:
+            raise RuntimeError(f"Replicate OCR {payload.get('status')}: {payload.get('error')}")
+        return normalize_replicate_paddleocr_response(payload)
+
+    def _poll(self, url: str) -> dict[str, Any]:
+        """Fetch *url* every two seconds until the prediction ends or ``timeout`` elapses."""
+        deadline = time.monotonic() + self.timeout
+        while time.monotonic() < deadline:
+            response = requests.get(url, headers={"Authorization": f"Bearer {self.token}"}, timeout=30)
+            response.raise_for_status()
+            payload = response.json()
+            if payload.get("status") in {"succeeded", "successful"}:
+                return payload
+            if payload.get("status") in self.FAILED_STATUSES:  # stop early instead of polling to timeout
+                raise RuntimeError(f"Replicate OCR {payload.get('status')}: {payload.get('error')}")
+            time.sleep(2)
+        raise TimeoutError("Timed out waiting for Replicate OCR prediction")
+
+
+class OCRSpaceClient(OCRClient):
+    """Client for the hosted OCR.space ``parse/image`` endpoint."""
+
+    def __init__(self, api_key: str, timeout: int = 120) -> None:
+        super().__init__(provider="ocr_space", is_configured=True)
+        self.api_key = api_key
+        self.timeout = timeout
+
+    def recognize(self, image_path: Path) -> OCRResult:
+        """Upload *image_path* for OCR and return the normalized result."""
+        source = Path(image_path)
+        form = {"language": "chs", "isOverlayRequired": "true", "OCREngine": "2"}
+        with source.open("rb") as stream:
+            reply = requests.post("https://api.ocr.space/parse/image", headers={"apikey": self.api_key},
+                                  data=form, files={"file": (source.name, stream)}, timeout=self.timeout)
+        reply.raise_for_status()
+        body = reply.json()
+        if body.get("IsErroredOnProcessing"):
+            raise RuntimeError(f"OCR.space failed: {body.get('ErrorMessage')}")
+        return normalize_ocr_space_response(body)
+
+
+class HuggingFaceOCRClient(OCRClient):
+    """Client that posts raw image bytes to a Hugging Face hosted OCR model."""
+
+    def __init__(self, token: str, model: str | None = None, timeout: int = 120) -> None:
+        super().__init__(provider="hf_ocr", is_configured=True)
+        self.token = token
+        self.model = model or os.getenv("SCREENCODER_HF_OCR_MODEL", "stepfun-ai/GOT-OCR-2.0-hf")
+        self.timeout = timeout
+
+    def recognize(self, image_path: Path) -> OCRResult:
+        """Send the image to the model endpoint and return its text; no blocks are produced."""
+        content_type = mimetypes.guess_type(str(image_path))[0] or "image/png"
+        url = f"https://api-inference.huggingface.co/models/{self.model}"
+        headers = {"Authorization": f"Bearer {self.token}", "Content-Type": content_type}
+        reply = requests.post(url, headers=headers, data=Path(image_path).read_bytes(), timeout=self.timeout)
+        reply.raise_for_status()
+        body = reply.json()
+        text = ""
+        if isinstance(body, list) and body and isinstance(body[0], Mapping):
+            text = str(body[0].get("generated_text", body[0].get("text", ""))).strip()
+        elif isinstance(body, Mapping):
+            text = str(body.get("generated_text", body.get("text", json.dumps(body, ensure_ascii=False)))).strip()
+        elif isinstance(body, str):
+            text = body.strip()
+        return OCRResult(provider="hf_ocr", text=text, blocks=[], raw={"response": body})
+
+
+def build_ocr_client(provider: str | None) -> OCRClient | None:
+    """Return the OCR client for *provider*, None when OCR is disabled; ValueError if unknown."""
+    key = (provider or "none").strip().lower().replace("-", "_")
+    if key in {"", "none", "off", "disabled"}:
+        return None
+    # A provider whose credential variable is unset gets a ManualGateOCRClient
+    # carrying the setup hint instead of a working client.
+    if key == "replicate_paddleocr":
+        token = os.getenv("REPLICATE_API_TOKEN")
+        hint = "Set REPLICATE_API_TOKEN to enable hosted PaddleOCR via Replicate."
+        return ReplicatePaddleOCRClient(token) if token else ManualGateOCRClient(key, hint)
+    if key == "ocr_space":
+        api_key = os.getenv("OCR_SPACE_API_KEY")
+        hint = "Set OCR_SPACE_API_KEY to enable OCR.space fallback OCR."
+        return OCRSpaceClient(api_key) if api_key else ManualGateOCRClient(key, hint)
+    if key in {"hf", "hf_ocr", "huggingface"}:
+        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        hint = "Set HF_TOKEN to enable Hugging Face hosted open-model OCR."
+        return HuggingFaceOCRClient(token) if token else ManualGateOCRClient("hf_ocr", hint)
+    raise ValueError(f"Unsupported OCR provider: {key}")
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..1021641
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,5 @@
+[pytest]
+testpaths = tests
+norecursedirs = post-training .git .venv venv __pycache__ .hypothesis .pytest_cache data tmp
+python_files = test_*.py
+addopts = -q
diff --git a/schema_to_html.py b/schema_to_html.py
new file mode 100644
index 0000000..8460657
--- /dev/null
+++ b/schema_to_html.py
@@ -0,0 +1,451 @@
+"""Render asset-aware screen schemas as standalone HTML previews."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from dataclasses import asdict, is_dataclass
+from html import escape
+from pathlib import Path
+from typing import Any, Mapping, Sequence
+
+
+def _schema_to_dict(schema: Any) -> dict[str, Any]:
+    """Return a plain dict for ``schema``; non-mappings use component_schema's converter when it is importable."""
+    if isinstance(schema, Mapping):
+        return dict(schema)
+    try:
+        from component_schema import screen_schema_to_dict
+    except ImportError:
+        pass  # optional project module; fall back to the generic conversions below
+    else:
+        return screen_schema_to_dict(schema)
+    if is_dataclass(schema):
+        return asdict(schema)
+    fallbacks = (
+        ("page_type", "unknown"),
+        ("viewport", {}),
+        ("theme_hint", ""),
+        ("template_hint", ""),
+        ("regions", []),
+        ("reference_handoff", None),
+    )
+    return {field: getattr(schema, field, default) for field, default in fallbacks}
+
+
+def _load_schema(path: str | Path) -> Any:
+    """Load a schema with component_schema.load_screen_schema, or as plain JSON when that module is missing."""
+    try:
+        from component_schema import load_screen_schema as load_typed
+    except ImportError:
+        return json.loads(Path(path).read_text(encoding="utf-8"))
+    return load_typed(path)
+
+
+def _slug(value: Any) -> str:
+    """Lowercase ``value`` into a CSS-class-safe token; falsy or empty input yields "unknown"."""
+    collapsed = re.sub(r"[^a-z0-9_-]+", "-", str(value or "").strip().lower())
+    return collapsed.strip("-") or "unknown"
+
+
+def _attr(value: Any) -> str:  # HTML-escape str(value), quotes included, for use inside an attribute value
+    return escape(str(value), quote=True)
+
+
+def _region_bbox(region: Mapping[str, Any]) -> tuple[float, float, float, float]:
+    """Convert the region's [x1, y1, x2, y2] bbox to (x, y, width, height) with non-negative sizes."""
+    corners = region.get("bbox") or [0, 0, 0, 0]
+    # A box that does not have exactly four coordinates collapses to an empty box at the origin.
+    left, top, right, bottom = (float(item or 0) for item in corners) if len(corners) == 4 else (0.0,) * 4
+    return left, top, max(0.0, right - left), max(0.0, bottom - top)
+
+
+def _viewport(schema_dict: Mapping[str, Any]) -> tuple[int, int]:
+    """Return the schema's (width, height) in pixels, defaulting to 820x1600 and never below 1."""
+    raw = schema_dict.get("viewport") or {}
+    size = asdict(raw) if is_dataclass(raw) else raw
+    width = max(1, int(size.get("width") or 820))
+    height = max(1, int(size.get("height") or 1600))
+    return width, height
+
+
+def _normalise_region(region: Any) -> dict[str, Any]:
+    """Return ``region`` as a plain dict: dataclasses via asdict, mappings copied, other objects read by attribute."""
+    if is_dataclass(region) or isinstance(region, Mapping):
+        return asdict(region) if is_dataclass(region) else dict(region)
+    fallbacks = (
+        ("id", ""),
+        ("label", ""),
+        ("bbox", [0, 0, 0, 0]),
+        ("prefab", ""),
+        ("content_priority", ""),
+        ("content", {}),
+        ("asset_roles", []),
+    )
+    return {field: getattr(region, field, default) for field, default in fallbacks}
+
+
+def _normalise_role(role: Any) -> dict[str, Any]:
+    """Return ``role`` as a plain dict: dataclasses via asdict, mappings copied, other objects read by attribute."""
+    if is_dataclass(role) or isinstance(role, Mapping):
+        return asdict(role) if is_dataclass(role) else dict(role)
+    fallbacks = (
+        ("role", ""),
+        ("strategy", ""),
+        ("crop_id", None),
+        ("source_bbox", None),
+        ("notes", ""),
+    )
+    return {field: getattr(role, field, default) for field, default in fallbacks}
+
+
+def _render_content(content: Any) -> str:
+    """Render a region's content payload as HTML.
+
+    Falsy content renders as "", a non-mapping as a single <p>, and a mapping as one
+    <div class="content-item"> per entry; entries whose value is None or "" are skipped.
+    All keys and values are HTML-escaped.
+    """
+    if not content:
+        return ""
+    if not isinstance(content, Mapping):
+        return f'<p class="content-value">{escape(str(content))}</p>'
+
+    blocks: list[str] = []
+    for key, value in content.items():
+        if value is None or value == "":
+            continue
+        # Every entry shares the wrapper and key label; only the value markup differs.
+        heading = f'<span class="content-key">{escape(str(key).replace("_", " "))}</span>'
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
+            # List-like values become a bullet list with one escaped item per <li>.
+            bullets = "".join(f"<li>{escape(str(item))}</li>" for item in value)
+            body = f"<ul>{bullets}</ul>"
+        else:
+            if isinstance(value, Mapping):
+                # Nested mappings are flattened to "key: value" pairs on one line.
+                value = ", ".join(f"{k}: {v}" for k, v in value.items())
+            body = f'<span class="content-value">{escape(str(value))}</span>'
+        blocks.append(
+            f'<div class="content-item content-{_slug(key)}">'
+            f"{heading}{body}</div>"
+        )
+    return "\n".join(blocks)
+
+
+def _render_role_layers(roles: list[dict[str, Any]]) -> str:
+    """Emit one <span class="asset-layer"> per role; crop-source roles with a crop_id get a background image."""
+    spans: list[str] = []
+    for binding in roles:
+        name = binding.get("role") or "unknown-role"
+        strategy = binding.get("strategy") or "unknown-strategy"
+        crop_id = binding.get("crop_id")
+        style = f' style="background-image:url({_attr(crop_id)})"' if strategy == "crop-source" and crop_id else ""
+        spans.append(
+            f'<span class="asset-layer asset-{_slug(name)} strategy-{_slug(strategy)}" '
+            f'data-role="{_attr(name)}" data-strategy="{_attr(strategy)}"'
+            f'{_optional_attr("data-crop-id", crop_id)}{style}></span>'
+        )
+    return "\n".join(spans)
+
+
+def _optional_attr(name: str, value: Any) -> str:
+    """Return the escaped attribute with a leading space, or an empty string when value is None or empty."""
+    is_blank = value is None or value == ""
+    return "" if is_blank else f' {name}="{_attr(value)}"'
+
+
+def _render_region(region: Mapping[str, Any], index: int) -> str:
+    """Render one region as an absolutely positioned <section> tagged with its prefab and asset roles."""
+    x, y, width, height = _region_bbox(region)
+    region_id = region.get("id") or f"region-{index + 1}"
+    label = region.get("label") or region_id
+    prefab = region.get("prefab") or "unmapped-prefab"
+    priority = region.get("content_priority") or "unprioritized"
+    roles = [_normalise_role(role) for role in region.get("asset_roles") or []]
+    # Coerce to str so non-string values from hand-written JSON cannot break escape() or ",".join().
+    role_names = [str(role.get("role") or "unknown-role") for role in roles]
+    strategy_names = [str(role.get("strategy") or "unknown-strategy") for role in roles]
+    classes = [
+        "region",
+        f"region-{_slug(region_id)}",
+        f"prefab-{_slug(prefab)}",
+        f"priority-{_slug(priority)}",
+        *(f"has-{_slug(role)}" for role in role_names),
+    ]
+    style = f"left:{x:.2f}px;top:{y:.2f}px;width:{width:.2f}px;height:{height:.2f}px;"
+    role_chips = "".join(
+        f'<span class="role-chip strategy-{_slug(strategy)}">{escape(name)}</span>'
+        for name, strategy in zip(role_names, strategy_names)
+    )
+    return f"""
+      <section class="{' '.join(classes)}"
+        style="{style}"
+        data-region-id="{_attr(region_id)}"
+        data-prefab="{_attr(prefab)}"
+        data-label="{_attr(label)}"
+        data-content-priority="{_attr(priority)}"
+        data-asset-roles="{_attr(','.join(role_names))}"
+        data-asset-strategies="{_attr(','.join(strategy_names))}">
+        {_render_role_layers(roles)}
+        <div class="region-body">
+          <div class="region-meta">
+            <span class="prefab-label">{escape(str(prefab))}</span>
+            <span class="region-label">{escape(str(label))}</span>
+          </div>
+          <div class="region-content">
+            {_render_content(region.get("content") or {})}
+          </div>
+          <div class="asset-roles">{role_chips}</div>
+        </div>
+      </section>"""
+
+
+def _base_css(width: int, height: int) -> str:  # preview stylesheet; doubled braces are literal CSS braces
+    return f"""
+    :root {{
+      --paper: #d7b980;
+      --paper-light: #f0d59b;
+      --ink: #2c1a14;
+      --brass: #a97825;
+      --brass-light: #e2b557;
+      --seal: #9f1f18;
+      --shadow: rgba(28, 14, 8, 0.36);
+    }}
+    * {{ box-sizing: border-box; }}
+    body {{
+      margin: 0;
+      min-height: 100vh;
+      display: grid;
+      place-items: start center;
+      padding: 24px;
+      color: var(--ink);
+      background:
+        radial-gradient(circle at 20% 10%, rgba(178, 40, 32, 0.20), transparent 22rem),
+        radial-gradient(circle at 80% 80%, rgba(166, 120, 38, 0.24), transparent 26rem),
+        linear-gradient(135deg, #18110f, #352017 58%, #140d0b);
+      font-family: Georgia, "Times New Roman", serif;
+    }}
+    .screen-preview {{
+      position: relative;
+      width: {width}px;
+      height: {height}px;
+      overflow: hidden;
+      background:
+        radial-gradient(circle at 50% -10%, rgba(226, 181, 87, 0.22), transparent 30%),
+        linear-gradient(180deg, #332018 0%, #201411 100%);
+      border: 1px solid rgba(226, 181, 87, 0.34);
+      box-shadow: 0 24px 80px rgba(0, 0, 0, 0.45);
+      transform-origin: top center;
+    }}
+    .screen-preview::after {{
+      content: "";
+      position: absolute;
+      inset: 0;
+      pointer-events: none;
+      background-image:
+        linear-gradient(rgba(255,255,255,0.035) 1px, transparent 1px),
+        linear-gradient(90deg, rgba(255,255,255,0.025) 1px, transparent 1px);
+      background-size: 18px 18px;
+      mix-blend-mode: overlay;
+    }}
+    .region {{
+      position: absolute;
+      display: flex;
+      padding: 12px;
+      border: 1px solid rgba(239, 205, 135, 0.36);
+      background: rgba(86, 53, 32, 0.58);
+      box-shadow: 0 10px 28px var(--shadow), inset 0 0 0 1px rgba(255,255,255,0.08);
+      overflow: hidden;
+    }}
+    .region-body {{
+      position: relative;
+      z-index: 2;
+      width: 100%;
+      display: flex;
+      flex-direction: column;
+      gap: 8px;
+      justify-content: center;
+    }}
+    .region-meta {{
+      display: flex;
+      flex-wrap: wrap;
+      gap: 6px;
+      align-items: center;
+      text-transform: uppercase;
+      letter-spacing: 0.08em;
+      font-size: 11px;
+      opacity: 0.88;
+    }}
+    .prefab-label, .region-label, .role-chip {{
+      padding: 3px 7px;
+      border-radius: 999px;
+      background: rgba(35, 20, 15, 0.62);
+      color: #f2dca9;
+    }}
+    .region-label {{ background: rgba(159, 31, 24, 0.42); }}
+    .region-content {{
+      display: grid;
+      gap: 5px;
+      font-size: clamp(12px, 2.2vw, 20px);
+      line-height: 1.18;
+      text-shadow: 0 1px 0 rgba(255,255,255,0.14);
+    }}
+    .content-key {{
+      display: block;
+      font-size: 0.58em;
+      letter-spacing: 0.08em;
+      text-transform: uppercase;
+      opacity: 0.62;
+    }}
+    .content-value {{ font-weight: 700; }}
+    .content-primary-title .content-value,
+    .content-title .content-value {{
+      font-size: 1.5em;
+      color: #2d160f;
+    }}
+    .asset-roles {{
+      display: flex;
+      flex-wrap: wrap;
+      gap: 5px;
+      margin-top: auto;
+      font-size: 10px;
+    }}
+    .strategy-reference-asset {{ border: 1px solid rgba(226, 181, 87, 0.55); }}
+    .strategy-crop-source {{ border: 1px solid rgba(132, 174, 116, 0.65); }}
+    .strategy-manual-art {{ border: 1px dashed rgba(203, 72, 56, 0.75); }}
+    .has-paper-panel, .prefab-paper-panel {{
+      background:
+        linear-gradient(90deg, rgba(96, 53, 28, 0.15), transparent 10%, transparent 90%, rgba(96, 53, 28, 0.16)),
+        radial-gradient(circle at 24% 20%, rgba(255,255,255,0.20), transparent 22%),
+        linear-gradient(135deg, var(--paper-light), var(--paper));
+      border-color: rgba(91, 47, 25, 0.48);
+    }}
+    .has-brass-plate, .prefab-brass-plate {{
+      background:
+        linear-gradient(160deg, rgba(255,255,255,0.30), transparent 20%),
+        linear-gradient(180deg, var(--brass-light), var(--brass) 52%, #6a4318);
+      border: 2px solid rgba(66, 39, 14, 0.68);
+      color: #24130b;
+    }}
+    .has-ornate-border, .prefab-ornate-action-plaque {{
+      border: 4px double rgba(226, 181, 87, 0.82);
+      border-radius: 18px;
+    }}
+    .has-ornate-border::before, .prefab-ornate-action-plaque::before {{
+      content: "";
+      position: absolute;
+      inset: 7px;
+      border: 1px solid rgba(255, 231, 172, 0.38);
+      border-radius: 12px;
+      pointer-events: none;
+    }}
+    .has-red-seal::after, .prefab-red-seal::after {{
+      content: "";
+      position: absolute;
+      right: 12px;
+      bottom: 12px;
+      width: min(54px, 22%);
+      aspect-ratio: 1;
+      border-radius: 50%;
+      background:
+        radial-gradient(circle, transparent 42%, rgba(255,255,255,0.16) 43% 49%, transparent 50%),
+        radial-gradient(circle at 45% 42%, #b72b22, var(--seal));
+      box-shadow: 0 2px 8px rgba(77, 10, 8, 0.38);
+      opacity: 0.82;
+    }}
+    .has-corner-rivet::before {{
+      box-shadow:
+        10px 10px 0 0 rgba(99, 57, 18, 0.75),
+        calc(100% - 10px) 10px 0 0 rgba(99, 57, 18, 0.75),
+        10px calc(100% - 10px) 0 0 rgba(99, 57, 18, 0.75),
+        calc(100% - 10px) calc(100% - 10px) 0 0 rgba(99, 57, 18, 0.75);
+    }}
+    .has-portrait-frame, .prefab-portrait-frame {{
+      border-radius: 22px;
+      background: radial-gradient(ellipse at 50% 35%, rgba(240, 213, 155, 0.35), rgba(39, 22, 17, 0.7));
+    }}
+    .has-ink-divider .region-content {{
+      border-top: 1px solid rgba(44, 26, 20, 0.42);
+      border-bottom: 1px solid rgba(44, 26, 20, 0.24);
+      padding-block: 8px;
+    }}
+    .asset-layer {{
+      position: absolute;
+      inset: 0;
+      pointer-events: none;
+      z-index: 1;
+    }}
+    .asset-layer.strategy-crop-source {{
+      background-size: cover;
+      background-position: center;
+      opacity: 0.34;
+      mix-blend-mode: multiply;
+    }}
+    .asset-crop-source {{
+      background: repeating-linear-gradient(135deg, rgba(118, 157, 103, 0.15) 0 8px, transparent 8px 16px);
+    }}
+    .asset-manual-art {{
+      background: repeating-linear-gradient(45deg, rgba(159, 31, 24, 0.12) 0 7px, transparent 7px 15px);
+    }}
+    @media (max-width: {width + 48}px) {{
+      body {{ padding: 0; place-items: start; }}
+      .screen-preview {{
+        transform: scale(calc(100vw / {width}));
+      }}
+    }}
+    """
+
+
+def render_preview_html(schema: Any) -> str:
+    """Build the complete standalone HTML preview document for a ScreenSchema-like object."""
+    data = _schema_to_dict(schema)
+    width, height = _viewport(data)
+    normalised = [_normalise_region(raw) for raw in data.get("regions") or []]
+    template_hint = data.get("template_hint") or ""
+    theme_hint = data.get("theme_hint") or ""
+    page_type = data.get("page_type") or "unknown-page"
+    rendered_regions = "\n".join(_render_region(item, position) for position, item in enumerate(normalised))
+
+    return f"""<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>{escape(str(page_type))} preview</title>
+  <style>{_base_css(width, height)}</style>
+</head>
+<body>
+  <main class="screen-preview theme-{_slug(theme_hint)} template-{_slug(template_hint)}"
+    data-page-type="{_attr(page_type)}"
+    data-theme-hint="{_attr(theme_hint)}"
+    data-template-hint="{_attr(template_hint)}"
+    data-viewport-width="{width}"
+    data-viewport-height="{height}">
+    {rendered_regions}
+  </main>
+</body>
+</html>
+"""
+
+
+def write_preview_html(schema: Any, output_path: str | Path) -> None:
+    """Render ``schema`` and save it to ``output_path`` as UTF-8, creating parent directories first."""
+    target = Path(output_path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(render_preview_html(schema), encoding="utf-8")
+
+
+def main() -> None:
+    """CLI entry point: load the schema named by --schema and write its preview to --output."""
+    cli = argparse.ArgumentParser(description="Render screen-schema.json to standalone preview.html.")
+    cli.add_argument("--schema", required=True, help="Path to screen-schema.json.")
+    cli.add_argument("--output", required=True, help="Path to write preview.html.")
+    options = cli.parse_args()
+
+    write_preview_html(_load_schema(options.schema), options.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/screen_to_html.py b/screen_to_html.py
new file mode 100644
index 0000000..2ad58d4
--- /dev/null
+++ b/screen_to_html.py
@@ -0,0 +1,172 @@
+import argparse
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class PipelineConfig:  # immutable settings for one screenshot-to-HTML pipeline run
+    image: Path  # source screenshot; build_steps passes it to every step
+    work_dir: Path = Path("data/run")  # directory for intermediate files and the final HTML
+    api_key: Path = Path("doubao_api.txt")  # key file path, forwarded as --api-key
+    model: str = "doubao-1.5-thinking-vision-pro-250428"  # forwarded as --model to html_generator.py
+
+
+@dataclass(frozen=True)
+class PipelineStep:  # one named subprocess invocation in the pipeline
+    name: str  # label printed by run_pipeline before the command
+    command: list[str]  # argv list handed to subprocess.run
+
+
+def _stem(path: Path) -> str:  # file name without its final suffix; used to name every pipeline artefact
+    return path.stem
+
+
+def build_steps(config: PipelineConfig) -> list[PipelineStep]:  # ordered commands for the full pipeline
+    image = Path(config.image)
+    work_dir = Path(config.work_dir)
+    api_key = Path(config.api_key)
+    stem = _stem(image)
+    # Intermediate and final artefact paths: all live under work_dir and are named after the image stem.
+    region_bboxes = work_dir / f"{stem}_bboxes.json"
+    region_debug = work_dir / f"{stem}_with_bboxes.png"
+    gray_html = work_dir / f"{stem}_layout.html"
+    gray_bboxes = work_dir / f"{stem}_gray_bboxes.json"
+    uied_json = work_dir / "ip" / f"{stem}.json"
+    mapping_json = work_dir / f"mapping_full_{stem}.json"
+    mapping_debug = work_dir / f"overlay_{stem}.png"
+    final_html = work_dir / f"{stem}_layout_final.html"
+    # Each step runs a separate script with the current interpreter; script paths are relative to the cwd.
+    return [
+        PipelineStep(
+            "detect layout regions",
+            [
+                sys.executable,
+                "block_parsor.py",
+                "--image",
+                str(image),
+                "--api-key",
+                str(api_key),
+                "--json",
+                str(region_bboxes),
+                "--debug",
+                str(region_debug),
+            ],
+        ),
+        PipelineStep(
+            "generate gray html",
+            [
+                sys.executable,
+                "html_generator.py",
+                "--model",
+                config.model,
+                "--image",
+                str(image),
+                "--bboxes",
+                str(region_bboxes),
+                "--output",
+                str(gray_html),
+                "--api-key",
+                str(api_key),
+            ],
+        ),
+        PipelineStep(
+            "detect gray placeholders",
+            [
+                sys.executable,
+                "image_box_detection.py",
+                "--html",
+                str(gray_html),
+                "--screenshot",
+                str(image),
+                "--out",
+                str(work_dir),
+                "--json",
+                str(gray_bboxes),
+            ],
+        ),
+        PipelineStep(
+            "detect source UI elements",
+            [
+                sys.executable,
+                "UIED/run_single.py",
+                "--image",
+                str(image),
+                "--output-root",
+                str(work_dir),
+                "--output-json",
+                str(uied_json),
+            ],
+        ),
+        PipelineStep(
+            "map placeholders to source elements",
+            [
+                sys.executable,
+                "mapping.py",
+                "--gray",
+                str(gray_bboxes),
+                "--uied",
+                str(uied_json),
+                "--out",
+                str(mapping_json),
+                "--debug",
+                str(mapping_debug),
+                "--debug-src",
+                str(image),
+            ],
+        ),
+        PipelineStep(
+            "replace placeholders",
+            [
+                sys.executable,
+                "image_replacer.py",
+                "--mapping",
+                str(mapping_json),
+                "--uied",
+                str(uied_json),
+                "--original-image",
+                str(image),
+                "--gray-html",
+                str(gray_html),
+                "--output-html",
+                str(final_html),
+            ],
+        ),
+    ]
+
+
+def run_pipeline(config: PipelineConfig, dry_run: bool = False) -> None:
+    """Echo and run each pipeline step in order; ``dry_run`` only echoes (work_dir is created either way)."""
+    config.work_dir.mkdir(parents=True, exist_ok=True)
+    for step in build_steps(config):
+        print(f"\n=== {step.name} ===", " ".join(step.command), sep="\n")
+        if not dry_run:
+            subprocess.run(step.command, check=True)
+
+
+def parse_args() -> argparse.Namespace:  # CLI options mirror the PipelineConfig fields, plus --dry-run
+    cli = argparse.ArgumentParser(description="Convert a UI screenshot to HTML with the ScreenCoder pipeline.")
+    cli.add_argument("--image", required=True, type=Path, help="Source UI screenshot.")
+    cli.add_argument("--work-dir", default=Path("data/run"), type=Path, help="Directory for intermediate files and final HTML.")
+    cli.add_argument("--api-key", default=Path("doubao_api.txt"), type=Path, help="Doubao API key file.")
+    cli.add_argument("--model", default="doubao-1.5-thinking-vision-pro-250428", help="Doubao vision model name.")
+    cli.add_argument("--dry-run", action="store_true", help="Print commands without executing them.")
+    return cli.parse_args()
+
+
+def main() -> None:
+    """CLI entry point: turn the parsed arguments into a PipelineConfig and run the pipeline."""
+    args = parse_args()
+    config = PipelineConfig(
+        image=args.image,
+        work_dir=args.work_dir,
+        api_key=args.api_key,
+        model=args.model,
+    )
+    # With --dry-run the commands are only printed, not executed.
+    run_pipeline(config, dry_run=args.dry_run)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/screen_to_schema.py b/screen_to_schema.py
new file mode 100644
index 0000000..768c1bb
--- /dev/null
+++ b/screen_to_schema.py
@@ -0,0 +1,384 @@
+import argparse
+from pathlib import Path
+import json
+import re
+import sys
+
+from PIL import Image
+
+from asset_roles import default_roles_for_prefab, default_strategy_for_role
+from component_schema import (
+    AssetRoleBinding,
+    ReferenceHandoff,
+    RegionSchema,
+    ScreenSchema,
+    Viewport,
+    save_screen_schema,
+)
+
+
+DEFAULT_REGION_LAYOUTS = {  # region id -> (x1, y1, x2, y2) as fractions of the viewport width/height
+    "title-stack": (0.10, 0.05, 0.90, 0.22),
+    "primary-actions": (0.08, 0.26, 0.92, 0.52),
+    "recent-run-panel": (0.10, 0.55, 0.90, 0.72),
+    "quick-links": (0.06, 0.76, 0.94, 0.94),
+}
+
+LEGACY_REGION_TO_PREFAB = {  # region id -> prefab name; ids not listed are used as their own prefab
+    "header": "ritual-title-stack",
+    "title-stack": "ritual-title-stack",
+    "sidebar": "ornate-action-plaque",
+    "primary-actions": "ornate-action-plaque",
+    "main content": "journey-status-panel",
+    "main-content": "journey-status-panel",
+    "recent-run-panel": "journey-status-panel",
+    "navigation": "relic-quick-card",
+    "quick-links": "relic-quick-card",
+    "corner-system-actions": "relic-quick-card",
+    "background-ornament": "ritual-background",
+}
+
+REGION_CONTENT_PRIORITIES = {  # region id -> content_priority; _make_region falls back to "L2-supporting"
+    "title-stack": "L0-branding",
+    "primary-actions": "L0-primary-action",
+    "recent-run-panel": "L1-status",
+    "quick-links": "L2-shortcuts",
+    "corner-system-actions": "L2-system",
+    "background-ornament": "L3-decoration",
+}
+
+REGION_LABELS = {  # region id -> human-readable label; _make_region falls back to the id it was given
+    "title-stack": "Title area",
+    "primary-actions": "Primary actions",
+    "recent-run-panel": "Run status panel",
+    "quick-links": "Quick links",
+    "corner-system-actions": "System actions",
+    "background-ornament": "Background ornament",
+}
+
+
+def _slugify(value: str) -> str:  # trim, lowercase, and turn "_" and " " into "-"
+    return value.strip().lower().translate(str.maketrans("_ ", "--"))
+
+
+def _bbox_from_fraction(fraction_bbox: tuple[float, float, float, float], width: int, height: int) -> list[int]:
+    """Scale a fractional box to pixel coordinates.
+
+    ``fraction_bbox`` is (x1, y1, x2, y2) as fractions of ``width``/``height``; each result goes through round().
+    """
+    left, top, right, bottom = fraction_bbox
+    scaled = (left * width, top * height, right * width, bottom * height)
+    return [round(value) for value in scaled]
+
+
+def _bbox_to_pixels(
+    bbox: list[int] | tuple[int, int, int, int],
+    viewport: Viewport,
+    bbox_format: str,
+) -> list[int]:
+    """Convert ``bbox`` to integer pixel coordinates.
+
+    "pixel" returns the coordinates as given, "normalized" rescales them from a 0-1000 grid, and "auto"
+    treats a box whose x2 or y2 exceeds the viewport as normalized. Other formats raise ValueError.
+    """
+    if len(bbox) != 4:
+        raise ValueError(f"Expected bbox with 4 coordinates, got {bbox!r}")
+
+    coords = [int(value) for value in bbox]
+    if bbox_format == "auto":
+        # Resolve "auto" to a concrete format: only boxes that overflow the viewport are rescaled.
+        overflows = coords[2] > viewport.width or coords[3] > viewport.height
+        bbox_format = "normalized" if overflows else "pixel"
+    if bbox_format == "pixel":
+        return coords
+    if bbox_format != "normalized":
+        raise ValueError(f"Unsupported bbox format: {bbox_format}")
+    # x coordinates scale with the viewport width, y coordinates with its height.
+    extents = (viewport.width, viewport.height, viewport.width, viewport.height)
+    return [round(value * extent / 1000) for value, extent in zip(coords, extents)]
+
+
+def _prefab_for_region(region_id: str) -> str:  # region id -> prefab; unknown ids map to themselves
+    return LEGACY_REGION_TO_PREFAB[region_id] if region_id in LEGACY_REGION_TO_PREFAB else region_id
+
+
+def _asset_bindings_for_prefab(prefab: str) -> list[AssetRoleBinding]:  # one binding per default role
+    bindings: list[AssetRoleBinding] = []
+    for role in default_roles_for_prefab(prefab):
+        bindings.append(AssetRoleBinding(role=role, strategy=default_strategy_for_role(role)))
+    return bindings
+
+
+def _safe_filename(value: str) -> str:  # lowercase; runs outside [a-z0-9_.-] become "-"; empty -> "asset"
+    return re.compile(r"[^a-z0-9_.-]+").sub("-", value.lower()).strip("-") or "asset"
+
+
+def _make_region(region_id: str, bbox: list[int], prefab: str | None = None, source: str = "heuristic") -> RegionSchema:
+    """Build a RegionSchema for ``region_id`` with its label, content priority and default asset bindings."""
+    semantic_id = _slugify(region_id)
+    label = REGION_LABELS.get(semantic_id, region_id)
+    # An explicit (truthy) prefab wins; otherwise it is derived from the slugified region id.
+    chosen_prefab = prefab if prefab else _prefab_for_region(semantic_id)
+
+    return RegionSchema(
+        id=semantic_id,
+        label=label,
+        bbox=bbox,
+        prefab=chosen_prefab,
+        content_priority=REGION_CONTENT_PRIORITIES.get(semantic_id, "L2-supporting"),
+        content={"label": label, "source": source},
+        asset_roles=_asset_bindings_for_prefab(chosen_prefab),
+    )
+
+
+def _clamp_bbox(bbox: list[float], width: int, height: int) -> tuple[int, int, int, int]:
+    """Round ``bbox`` and clamp it into a ``width`` x ``height`` image.
+
+    A box lying entirely outside the image becomes (0, 0, 1, 1); an empty or inverted side is widened to 1px.
+    """
+    left, top, right, bottom = [round(float(value)) for value in bbox]
+    if right <= 0 or bottom <= 0 or left >= width or top >= height:
+        return 0, 0, max(1, min(width, 1)), max(1, min(height, 1))
+
+    left, right = (max(0, min(width, edge)) for edge in (left, right))
+    top, bottom = (max(0, min(height, edge)) for edge in (top, bottom))
+    right = right if right > left else min(width, left + 1)
+    bottom = bottom if bottom > top else min(height, top + 1)
+    return left, top, right, bottom
+
+
+def _bbox_intersection_area(a: list[float], b: list[float]) -> float:
+    """Return the overlap area of two [x1, y1, x2, y2] boxes, or 0.0 when they do not overlap."""
+    a_left, a_top, a_right, a_bottom = [float(value) for value in a]
+    b_left, b_top, b_right, b_bottom = [float(value) for value in b]
+    overlap_w = max(0.0, min(a_right, b_right) - max(a_left, b_left))
+    return overlap_w * max(0.0, min(a_bottom, b_bottom) - max(a_top, b_top))
+
+
+def apply_ocr_to_schema(schema: ScreenSchema, ocr_result) -> ScreenSchema:
+    """Attach OCR text to every region whose bbox overlaps an OCR block, then return the same schema.
+
+    Matching regions gain content["ocr_text"], content["ocr_blocks"] and content["ocr_provider"];
+    blocks with no bbox or blank text are ignored. The schema is modified in place.
+    """
+    for region in schema.regions:
+        hits = []
+        for block in getattr(ocr_result, "blocks", []) or []:
+            box = getattr(block, "bbox", None)
+            text = str(getattr(block, "text", "")).strip()
+            if not box or not text or _bbox_intersection_area(region.bbox, box) <= 0:
+                continue
+            hits.append(
+                {
+                    "text": text,
+                    "bbox": [float(value) for value in box],
+                    "confidence": getattr(block, "confidence", None),
+                }
+            )
+        # Regions without any overlapping text are left untouched.
+        if hits:
+            region.content["ocr_text"] = [hit["text"] for hit in hits]
+            region.content["ocr_blocks"] = hits
+            region.content["ocr_provider"] = getattr(ocr_result, "provider", "unknown")
+    return schema
+
+
+def extract_crop_source_assets(
+    schema: ScreenSchema,
+    image_path: Path,
+    output_dir: Path,
+    crop_id_prefix: str = "asset-crops",
+) -> list[Path]:
+    """Save a PNG crop for each crop-source binding, set its crop_id/source_bbox, and return the written paths."""
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    written: list[Path] = []
+    with Image.open(image_path) as image:
+        width, height = image.size
+        for region in schema.regions:
+            for index, binding in enumerate(region.asset_roles):
+                if binding.strategy != "crop-source":
+                    continue
+                x1, y1, x2, y2 = _clamp_bbox(binding.source_bbox or region.bbox, width, height)  # else region box
+                filename = f"{_safe_filename(region.id)}-{_safe_filename(binding.role)}-{index}.png"
+                crop_path = output_dir / filename
+                image.crop((x1, y1, x2, y2)).save(crop_path)
+                binding.crop_id = (Path(crop_id_prefix) / filename).as_posix()  # "/" even on Windows: used as a URL
+                binding.source_bbox = [x1, y1, x2, y2]
+                written.append(crop_path)
+    return written
+
+
+def _load_bbox_regions(bboxes_path: Path, viewport: Viewport, bbox_format: str) -> list[RegionSchema]:
+    """Read a JSON object of region name -> bbox and build one layout-detection region per entry.
+
+    Names are slugified and boxes converted using ``bbox_format``; raises ValueError if the JSON is not an object.
+    """
+    raw_bboxes = json.loads(bboxes_path.read_text(encoding="utf-8"))
+    if not isinstance(raw_bboxes, dict):
+        raise ValueError(f"Expected a bbox mapping in {bboxes_path}, got {type(raw_bboxes).__name__}")
+    return [
+        _make_region(_slugify(name), _bbox_to_pixels(bbox, viewport, bbox_format), source="layout-detection")
+        for name, bbox in raw_bboxes.items()
+    ]
+
+
+def _heuristic_regions(viewport: Viewport) -> list[RegionSchema]:  # default layout scaled to the viewport
+    regions: list[RegionSchema] = []
+    for name, fractions in DEFAULT_REGION_LAYOUTS.items():
+        regions.append(_make_region(name, _bbox_from_fraction(fractions, viewport.width, viewport.height)))
+    return regions
+
+
+def _ensure_required_regions(regions: list[RegionSchema], viewport: Viewport) -> list[RegionSchema]:
+    """Return a new list: ``regions`` followed by a default-layout region for each required id that is absent."""
+    present = {region.id for region in regions}
+    required = ("title-stack", "primary-actions", "recent-run-panel", "quick-links")
+    fallbacks = [
+        _make_region(
+            region_id,
+            _bbox_from_fraction(DEFAULT_REGION_LAYOUTS[region_id], viewport.width, viewport.height),
+        )
+        for region_id in required
+        if region_id not in present
+    ]
+    return [*regions, *fallbacks]
+
+
+def _reference_handoff_for(regions: list[RegionSchema]) -> ReferenceHandoff:
+    """Summarise regions for handoff: the distinct prefabs, plus roles that use reference or manual art."""
+    prefabs = sorted({region.prefab for region in regions})
+    unresolved_strategies = {"reference-asset", "manual-art"}
+    pending_roles = set()
+    for region in regions:
+        for binding in region.asset_roles:
+            if binding.strategy in unresolved_strategies:
+                pending_roles.add(binding.role)
+    # Both lists are sorted, so the handoff does not depend on region order.
+    return ReferenceHandoff(
+        composition="horror-ornamental",
+        prefabs=prefabs,
+        missing_assets=sorted(pending_roles),
+    )
+
+
+def build_screen_schema(
+    image_path: Path,
+    bboxes_path: Path | None = None,
+    bbox_format: str = "auto",
+    page_type: str = "game-main-menu",
+    theme_hint: str = "japanese-horror",
+    template_hint: str = "ornamental-mobile",
+) -> ScreenSchema:
+    """Build a ScreenSchema whose viewport matches the size of the image at ``image_path``.
+
+    Regions come from ``bboxes_path`` (completed with any missing required regions) or, without it, heuristics.
+    """
+    with Image.open(image_path) as source:
+        viewport = Viewport(width=source.size[0], height=source.size[1])
+    if bboxes_path:
+        regions = _ensure_required_regions(_load_bbox_regions(bboxes_path, viewport, bbox_format), viewport)
+    else:
+        regions = _heuristic_regions(viewport)
+    return ScreenSchema(
+        page_type=page_type,
+        viewport=viewport,
+        theme_hint=theme_hint,
+        template_hint=template_hint,
+        regions=regions,
+        reference_handoff=_reference_handoff_for(regions),
+    )
+
+
+def parse_args() -> argparse.Namespace:  # CLI options for schema building, crops, OCR and optional outputs
+    cli = argparse.ArgumentParser(description="Build an asset-aware ScreenCoder schema for a UI screenshot.")
+    cli.add_argument("--image", required=True, type=Path, help="Source screenshot path.")
+    cli.add_argument("--bboxes", type=Path, default=None, help="Optional region bbox JSON from block_parsor.py.")
+    cli.add_argument(
+        "--bbox-format",
+        choices=["auto", "normalized", "pixel"],
+        default="auto",
+        help="Coordinate format for --bboxes. Auto treats boxes outside the viewport as normalized 0-1000 boxes.",
+    )
+    cli.add_argument("--output", type=Path, default=None, help="Output screen-schema.json path.")
+    cli.add_argument("--preview", type=Path, default=None, help="Optional standalone preview.html output path.")
+    cli.add_argument("--handoff", type=Path, default=None, help="Optional handoff.md output path.")
+    cli.add_argument("--reference-json", type=Path, default=None, help="Optional web-ui-reference handoff JSON path.")
+    cli.add_argument("--asset-crops", type=Path, default=None, help="Directory for crop-source asset snippets.")
+    cli.add_argument("--no-crops", action="store_true", help="Do not extract crop-source assets.")
+    cli.add_argument(
+        "--ocr-provider",
+        default="none",
+        choices=["none", "replicate_paddleocr", "hf_ocr", "ocr_space"],
+        help="Optional hosted OCR provider. replicate_paddleocr uses an open-source PaddleOCR model on Replicate and requires REPLICATE_API_TOKEN.",
+    )
+    cli.add_argument("--ocr-output", type=Path, default=None, help="Optional normalized OCR JSON output path.")
+    cli.add_argument("--page-type", default="game-main-menu", help="Schema page_type.")
+    cli.add_argument("--theme-hint", default="japanese-horror", help="Schema theme_hint.")
+    cli.add_argument("--template-hint", default="ornamental-mobile", help="Schema template_hint.")
+    return cli.parse_args()
+
+
+def main() -> None:  # CLI entry point: build the schema, optionally apply OCR and extract crops, then write the requested outputs.
+    args = parse_args()
+    output_path = args.output or Path("data/run") / args.image.stem / "screen-schema.json"  # default: data/run/<image stem>/screen-schema.json
+
+    schema = build_screen_schema(
+        image_path=args.image,
+        bboxes_path=args.bboxes,
+        bbox_format=args.bbox_format,
+        page_type=args.page_type,
+        theme_hint=args.theme_hint,
+        template_hint=args.template_hint,
+    )
+
+    if args.ocr_provider != "none":
+        from ocr_client import build_ocr_client  # imported only when an OCR provider is requested
+
+        ocr_client = build_ocr_client(args.ocr_provider)
+        if ocr_client is not None and ocr_client.is_configured:  # configured client: run OCR and merge it into the schema
+            ocr_result = ocr_client.recognize(args.image)
+            apply_ocr_to_schema(schema, ocr_result)
+            if args.ocr_output:  # optionally dump the OCR result as JSON
+                args.ocr_output.parent.mkdir(parents=True, exist_ok=True)
+                args.ocr_output.write_text(
+                    json.dumps(
+                        {
+                            "provider": ocr_result.provider,
+                            "text": ocr_result.text,
+                            "blocks": [block.__dict__ for block in ocr_result.blocks],  # assumes block objects expose __dict__ — TODO confirm
+                        },
+                        ensure_ascii=False,  # keep non-ASCII text readable in the file
+                        indent=2,
+                    )
+                    + "\n",
+                    encoding="utf-8",
+                )
+        elif ocr_client is not None:  # client exists but is not configured: record its manual gate instead of running OCR
+            note = f"OCR manual gate: {ocr_client.manual_gate}"
+            if schema.reference_handoff is not None:
+                schema.reference_handoff.notes = note  # replaces any previous notes value
+            print(note, file=sys.stderr)
+
+    if not args.no_crops:
+        crop_dir = args.asset_crops or output_path.parent / "asset-crops"  # default: asset-crops/ next to the schema file
+        extract_crop_source_assets(schema, args.image, crop_dir)
+
+    save_screen_schema(schema, output_path)  # runs after the OCR and crop steps above
+
+    if args.preview:
+        from schema_to_html import write_preview_html  # imported only when --preview is given
+
+        write_preview_html(schema, args.preview)
+    if args.handoff:
+        from handoff_writer import write_handoff  # imported only when --handoff is given
+
+        write_handoff(schema, args.handoff, source_image=args.image)
+    if args.reference_json:
+        from handoff_writer import write_reference_json  # imported only when --reference-json is given
+
+        write_reference_json(schema, args.reference_json)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..86a1a5a
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,7 @@
+import sys
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]  # parent of the directory that contains this conftest.py
+if str(ROOT) not in sys.path:  # avoid inserting a duplicate entry
+    sys.path.insert(0, str(ROOT))  # front of sys.path so modules under ROOT are importable from the tests
diff --git a/tests/test_asset_aware_schema.py b/tests/test_asset_aware_schema.py
new file mode 100644
index 0000000..b600718
--- /dev/null
+++ b/tests/test_asset_aware_schema.py
@@ -0,0 +1,162 @@
+import json
+from pathlib import Path
+
+from PIL import Image
+
+from asset_roles import default_roles_for_prefab, default_strategy_for_role, normalize_asset_role
+from component_schema import load_screen_schema, screen_schema_from_dict, screen_schema_to_dict
+from handoff_writer import build_reference_handoff, render_handoff_markdown
+from schema_to_html import render_preview_html
+from screen_to_schema import build_screen_schema, extract_crop_source_assets
+
+
+def test_asset_role_defaults_cover_core_prefabs():  # Spot-checks role normalization and the default role/strategy lookups.
+    assert normalize_asset_role("paper_panel") == "paper-panel"  # underscore form maps to the hyphenated role name
+    assert "paper-panel" in default_roles_for_prefab("ornate-action-plaque")
+    assert default_strategy_for_role("quick-card-illustration") == "crop-source"
+
+
+def test_screen_schema_round_trips_asset_roles(tmp_path):  # Round trip: dict -> schema -> JSON file -> schema -> dict.
+    schema_path = tmp_path / "screen-schema.json"
+    data = {
+        "page_type": "game-main-menu",
+        "viewport": {"width": 820, "height": 1600},
+        "theme_hint": "japanese-horror",
+        "template_hint": "ornamental-mobile",
+        "regions": [
+            {
+                "id": "primary-actions",
+                "label": "Primary actions",
+                "prefab": "ornate-action-plaque",
+                "bbox": [64, 410, 756, 840],
+                "content_priority": "L0-primary-action",
+                "content": {"primary_title": "继续旅程"},
+                "asset_roles": [
+                    {"role": "paper-panel", "strategy": "reference-asset"},
+                    {"role": "red-seal", "strategy": "css-procedural"},
+                ],
+            }
+        ],
+    }
+
+    schema = screen_schema_from_dict(data)
+    schema_path.write_text(json.dumps(screen_schema_to_dict(schema), ensure_ascii=False), encoding="utf-8")  # serialize to disk
+    loaded = load_screen_schema(schema_path)  # read back through the file-based loader
+
+    assert loaded.regions[0].label == "Primary actions"
+    assert loaded.regions[0].content_priority == "L0-primary-action"
+    assert screen_schema_to_dict(loaded)["regions"][0]["asset_roles"][0]["role"] == "paper-panel"  # first role survives the round trip in order
+
+
+def test_build_screen_schema_from_legacy_bboxes_adds_required_regions(tmp_path):  # One detected box in, all required regions out.
+    image_path = tmp_path / "menu.png"
+    bbox_path = tmp_path / "bboxes.json"
+    Image.new("RGB", (820, 1600), color=(20, 12, 10)).save(image_path)
+    bbox_path.write_text(json.dumps({"header": [100, 50, 900, 200]}), encoding="utf-8")  # a single "header" box; read below as bbox_format="normalized"
+
+    schema = build_screen_schema(image_path, bbox_path, bbox_format="normalized")
+    region_ids = {region.id for region in schema.regions}
+
+    assert schema.viewport.width == 820  # viewport comes from the image size
+    assert schema.regions[0].prefab == "ritual-title-stack"
+    assert schema.regions[0].content["source"] == "layout-detection"
+    assert {"title-stack", "primary-actions", "recent-run-panel", "quick-links"}.issubset(region_ids)  # present although only one box was supplied
+
+
+def test_build_screen_schema_marks_default_regions_as_heuristic(tmp_path):  # Without a bbox file, every region must be tagged "heuristic".
+    image_path = tmp_path / "menu.png"
+    Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path)
+
+    schema = build_screen_schema(image_path)  # no bboxes_path argument
+
+    assert {region.content["source"] for region in schema.regions} == {"heuristic"}  # set equality: no other source value allowed
+
+
+def test_preview_and_handoff_render_semantic_assets(tmp_path):  # Heuristic schema -> crops, preview HTML, handoff markdown, reference dict.
+    image_path = tmp_path / "menu.png"
+    Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path)
+    schema = build_screen_schema(image_path)
+    crop_paths = extract_crop_source_assets(schema, image_path, tmp_path / "asset-crops")  # called before rendering; the HTML assertion below expects a crop path
+
+    html = render_preview_html(schema)
+    handoff = render_handoff_markdown(schema, source_image="data/input/menu.png")
+    reference = build_reference_handoff(schema)
+
+    assert crop_paths  # at least one crop was produced
+    assert crop_paths[0].exists()
+    assert 'data-prefab="ornate-action-plaque"' in html
+    assert "asset-crops/quick-links-quick-card-illustration" in html  # preview references the extracted crop by relative path
+    assert "paper-panel" in html
+    assert "web-ui-reference" in handoff
+    assert "ornate-action-plaque" in reference["prefabs"]
+    assert "paper-panel" in reference["missing_assets"]
+
+
+def test_preview_and_handoff_accept_plain_dict_schema():  # Both renderers are called with a plain dict instead of a ScreenSchema.
+    data = {
+        "page_type": "game-main-menu",
+        "viewport": {"width": 320, "height": 640},
+        "regions": [
+            {
+                "id": "primary-actions",
+                "prefab": "ornate-action-plaque",
+                "bbox": [10, 20, 300, 180],
+                "asset_roles": [{"role": "paper-panel", "strategy": "reference-asset"}],
+            }
+        ],
+    }  # note: no theme_hint, template_hint, label or content keys in this input
+
+    assert 'data-page-type="game-main-menu"' in render_preview_html(data)
+    assert build_reference_handoff(data)["prefabs"] == ["ornate-action-plaque"]
+
+
+def test_crop_source_assets_handle_fully_out_of_bounds_bbox(tmp_path):  # A bbox entirely outside the image must still yield a crop file.
+    image_path = tmp_path / "menu.png"
+    Image.new("RGB", (64, 64), color=(20, 12, 10)).save(image_path)
+    schema = build_screen_schema(image_path)
+    crop_binding = schema.regions[-1].asset_roles[2]  # NOTE(review): relies on heuristic region/role ordering — confirm this index is the crop-source binding
+    crop_binding.source_bbox = [1000, 1000, 1200, 1200]  # far outside the 64x64 image
+
+    crop_paths = extract_crop_source_assets(schema, image_path, tmp_path / "asset-crops")
+
+    assert crop_paths[0].exists()
+    assert crop_binding.source_bbox == [0, 0, 1, 1]  # extraction is expected to rewrite the bbox on the binding in place
+
+
+def test_screen_to_schema_cli_writes_complete_preview_package(tmp_path):  # End-to-end run of the CLI in a subprocess.
+    import subprocess
+    import sys
+
+    image_path = tmp_path / "menu.png"
+    output_dir = tmp_path / "run" / "menu"  # not created here; the CLI is expected to create it
+    Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path)
+
+    result = subprocess.run(
+        [
+            sys.executable,  # same interpreter that runs the tests
+            "screen_to_schema.py",
+            "--image",
+            str(image_path),
+            "--output",
+            str(output_dir / "screen-schema.json"),
+            "--preview",
+            str(output_dir / "index.html"),
+            "--handoff",
+            str(output_dir / "handoff.md"),
+            "--reference-json",
+            str(output_dir / "reference-handoff.json"),
+        ],
+        check=True,  # a non-zero exit raises CalledProcessError
+        cwd=Path(__file__).resolve().parents[1],  # parent of tests/, where the relative script path is resolved
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.stderr == ""  # NOTE(review): also fails on any warning written to stderr
+    assert (output_dir / "screen-schema.json").exists()
+    assert (output_dir / "index.html").exists()
+    assert (output_dir / "handoff.md").exists()
+    assert (output_dir / "reference-handoff.json").exists()
+    assert list((output_dir / "asset-crops").glob("*.png"))  # default crop dir sits beside the schema file; at least one PNG expected
+    assert 'data-page-type="game-main-menu"' in (output_dir / "index.html").read_text(encoding="utf-8")
+    assert "web-ui-reference" in (output_dir / "handoff.md").read_text(encoding="utf-8")
diff --git a/tests/test_ocr_integration.py b/tests/test_ocr_integration.py
new file mode 100644
index 0000000..e9822cb
--- /dev/null
+++ b/tests/test_ocr_integration.py
@@ -0,0 +1,112 @@
+from pathlib import Path
+
+from PIL import Image
+
+from handoff_writer import render_handoff_markdown
+from ocr_client import (
+    OCRBlock,
+    OCRResult,
+    build_ocr_client,
+    normalize_ocr_space_response,
+    normalize_replicate_paddleocr_response,
+)
+from screen_to_schema import apply_ocr_to_schema, build_screen_schema
+
+
+def test_normalize_replicate_paddleocr_response_returns_text_and_boxes():  # Two result entries using different key spellings.
+    payload = {
+        "status": "successful",
+        "output": {
+            "results": [
+                {
+                    "text": "继续旅程",
+                    "score": 0.98,
+                    "box": [[80, 180], [220, 180], [220, 215], [80, 215]],  # four corner points
+                },
+                {
+                    "text": "新的旅程",
+                    "confidence": 0.87,  # this entry uses "confidence"/"bbox" instead of "score"/"box"
+                    "bbox": [90, 260, 210, 295],
+                },
+            ]
+        },
+    }
+
+    result = normalize_replicate_paddleocr_response(payload)
+
+    assert result.provider == "replicate_paddleocr"
+    assert result.text == "继续旅程\n新的旅程"  # block texts joined by newline, in input order
+    assert result.blocks[0].bbox == [80, 180, 220, 215]  # corner points collapsed to [x1, y1, x2, y2]
+    assert result.blocks[0].confidence == 0.98  # taken from the "score" field
+
+
+def test_normalize_ocr_space_response_keeps_overlay_word_boxes():  # One overlay line with two words -> two blocks.
+    payload = {
+        "ParsedResults": [
+            {
+                "ParsedText": "设置\n成就\n",
+                "TextOverlay": {
+                    "Lines": [
+                        {
+                            "Words": [
+                                {"WordText": "设置", "Left": 10, "Top": 20, "Width": 30, "Height": 12},
+                                {"WordText": "成就", "Left": 300, "Top": 20, "Width": 30, "Height": 12},
+                            ]
+                        }
+                    ]
+                },
+            }
+        ],
+        "IsErroredOnProcessing": False,
+    }
+
+    result = normalize_ocr_space_response(payload)
+
+    assert result.provider == "ocr_space"
+    assert result.text == "设置\n成就"  # trailing newline of ParsedText is not kept
+    assert [block.text for block in result.blocks] == ["设置", "成就"]  # one block per word
+    assert result.blocks[1].bbox == [300, 20, 330, 32]  # [Left, Top, Left + Width, Top + Height]
+
+
+def test_apply_ocr_to_schema_attaches_text_to_intersecting_regions(tmp_path):  # OCR blocks should land in the region their bbox falls in.
+    image_path = tmp_path / "menu.png"
+    Image.new("RGB", (400, 800), color=(20, 12, 10)).save(image_path)
+    schema = build_screen_schema(image_path)  # heuristic regions for a 400x800 viewport
+    ocr = OCRResult(
+        provider="test",
+        text="破宫之十重奏\n继续旅程\n设置",
+        blocks=[
+            OCRBlock(text="破宫之十重奏", bbox=[50, 40, 350, 90], confidence=0.99),
+            OCRBlock(text="继续旅程", bbox=[80, 250, 260, 290], confidence=0.95),
+            OCRBlock(text="设置", bbox=[5, 5, 40, 25], confidence=0.7),  # asserted below not to be attached to title-stack
+        ],
+    )
+
+    apply_ocr_to_schema(schema, ocr)  # mutates the schema in place; the return value is not used
+    by_id = {region.id: region for region in schema.regions}
+
+    assert by_id["title-stack"].content["ocr_text"] == ["破宫之十重奏"]
+    assert by_id["primary-actions"].content["ocr_text"] == ["继续旅程"]
+    assert by_id["title-stack"].content["ocr_provider"] == "test"
+    assert "设置" not in by_id["title-stack"].content["ocr_text"]
+
+
+def test_build_ocr_client_preserves_manual_gate_when_token_missing(monkeypatch):  # Without the token the client still exists but is unconfigured.
+    monkeypatch.delenv("REPLICATE_API_TOKEN", raising=False)  # raising=False: no error if the variable is already unset
+
+    client = build_ocr_client("replicate_paddleocr")
+
+    assert client.is_configured is False
+    assert "REPLICATE_API_TOKEN" in client.manual_gate  # the gate message names the variable to set
+
+
+def test_handoff_renders_ocr_manual_gate_note(tmp_path):  # A note stored on reference_handoff must appear in the rendered markdown.
+    image_path = tmp_path / "menu.png"
+    Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path)
+    schema = build_screen_schema(image_path)
+    schema.reference_handoff.notes = "OCR manual gate: Set REPLICATE_API_TOKEN to enable hosted PaddleOCR."  # set by hand; no OCR client is involved
+
+    handoff = render_handoff_markdown(schema)
+
+    assert "OCR manual gate" in handoff
+    assert "REPLICATE_API_TOKEN" in handoff
diff --git a/tests/test_optional_provider_imports.py b/tests/test_optional_provider_imports.py
new file mode 100644
index 0000000..6eac5e5
--- /dev/null
+++ b/tests/test_optional_provider_imports.py
@@ -0,0 +1,3 @@
+def test_core_scripts_import_without_all_provider_sdks():  # Passes if both modules import without raising.
+    import block_parsor  # noqa: F401
+    import html_generator  # noqa: F401
diff --git a/tests/test_screen_to_html.py b/tests/test_screen_to_html.py
new file mode 100644
index 0000000..b18bb17
--- /dev/null
+++ b/tests/test_screen_to_html.py
@@ -0,0 +1,50 @@
+import sys
+from pathlib import Path
+
+from screen_to_html import PipelineConfig, build_steps
+
+
+def test_build_steps_uses_consistent_paths_for_input_image():  # Checks step names and that output paths derive from work_dir and the image stem.
+    config = PipelineConfig(
+        image=Path("examples/menu.png"),
+        work_dir=Path("out/menu"),
+        api_key=Path("keys/doubao.txt"),
+        model="demo-model",
+    )
+
+    steps = build_steps(config)  # only builds the step list; nothing is executed in this test
+
+    assert [step.name for step in steps] == [
+        "detect layout regions",
+        "generate gray html",
+        "detect gray placeholders",
+        "detect source UI elements",
+        "map placeholders to source elements",
+        "replace placeholders",
+    ]
+    assert steps[0].command == [  # full command of the first step, compared exactly
+        sys.executable,
+        "block_parsor.py",
+        "--image",
+        "examples/menu.png",
+        "--api-key",
+        "keys/doubao.txt",
+        "--json",
+        "out/menu/menu_bboxes.json",
+        "--debug",
+        "out/menu/menu_with_bboxes.png",
+    ]
+    assert steps[1].command[-8:] == [  # only the trailing arguments are pinned for this step
+        "--image",
+        "examples/menu.png",
+        "--bboxes",
+        "out/menu/menu_bboxes.json",
+        "--output",
+        "out/menu/menu_layout.html",
+        "--api-key",
+        "keys/doubao.txt",
+    ]
+    assert steps[1].command[-2:] == ["--api-key", "keys/doubao.txt"]  # already implied by the [-8:] assertion above
+    assert steps[2].command[-2:] == ["--json", "out/menu/menu_gray_bboxes.json"]
+    assert "out/menu/ip/menu.json" in steps[3].command
+    assert steps[-1].command[-2:] == ["--output-html", "out/menu/menu_layout_final.html"]
diff --git a/utils.py b/utils.py
index a7ab403..89898fb 100644
--- a/utils.py
+++ b/utils.py
@@ -1,8 +1,5 @@
 import os
 import time
-from openai import OpenAI
-import google.generativeai as genai
-from volcenginesdkarkruntime import Ark
 import base64
 import io
 from PIL import Image, ImageDraw
@@ -10,6 +7,14 @@
 import numpy as np
 
 
+def _missing_dependency(package_name, install_name=None):  # Always raises ModuleNotFoundError with an install hint; never returns.
+    install_name = install_name or package_name  # pip name falls back to the import name
+    raise ModuleNotFoundError(
+        f"Missing optional provider dependency '{package_name}'. "
+        f"Install it with `pip install {install_name}` before using this provider."
+    )
+
+
 def encode_image(image):
     if type(image) == str:
         try: 
@@ -227,6 +232,10 @@ def try_ask(self, question, image_encoding=None, verbose=False):
 class Doubao(Bot):
     def __init__(self, key_path, patience=3, model="doubao-1.5-thinking-vision-pro-250428") -> None:
         super().__init__(key_path, patience)
+        try:
+            from volcenginesdkarkruntime import Ark
+        except ModuleNotFoundError:
+            _missing_dependency("volcenginesdkarkruntime", "volcengine-python-sdk[ark]")
         self.client = Ark(api_key=self.key)
         self.model = model
     
@@ -270,6 +279,10 @@ def ask(self, question, image_encoding=None, verbose=False):
 class Qwen(Bot):
     def __init__(self, key_path, patience=3, model="qwen2.5-vl-32b-instruct") -> None:
         super().__init__(key_path, patience)
+        try:
+            from openai import OpenAI
+        except ModuleNotFoundError:
+            _missing_dependency("openai")
         self.client = OpenAI(api_key=self.key, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")
         self.name = model
 
@@ -309,6 +322,10 @@ def ask(self, question, image_encoding=None, verbose=False):
 class GPT(Bot):
     def __init__(self, key_path, patience=3, model="gpt-4o") -> None:
         super().__init__(key_path, patience)
+        try:
+            from openai import OpenAI
+        except ModuleNotFoundError:
+            _missing_dependency("openai")
         self.client = OpenAI(api_key=self.key)
         self.name="gpt4"
         self.model = model
@@ -354,6 +371,10 @@ def ask(self, question, image_encoding=None, verbose=False):
 class Gemini(Bot):
     def __init__(self, key_path, patience=3, model="gemini-1.5-flash-latest") -> None:
         super().__init__(key_path, patience)
+        try:
+            import google.generativeai as genai
+        except ModuleNotFoundError:
+            _missing_dependency("google.generativeai", "google-generativeai")
         GOOGLE_API_KEY= self.key
         genai.configure(api_key=GOOGLE_API_KEY)
         self.name = "Gemini"