diff --git a/.gitignore b/.gitignore index 65b85c4..e593f5e 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,15 @@ dist/ # Environments .env .venv +venv/ + +# Python caches / local test artifacts +__pycache__/ +*.py[cod] +.pytest_cache/ +.hypothesis/ +data/run/ # API keys -doubao_api.txt \ No newline at end of file +doubao_api.txt +.gitnexus diff --git a/README.md b/README.md index 1fbb0cd..bd11d8b 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ As shown above, our method produces results that are more accurate, visually ali - `html_generator.py`: Takes the detected component data and generates a complete HTML layout with generated code for each module. - `image_replacer.py`: A script to replace placeholder divs in the final HTML with actual cropped images. - `mapping.py`: Maps the detected UIED components to logical page regions. +- `docs/asset-aware-html-generation.md`: Design plan for asset-aware screenshot-to-HTML generation and handoff to a reference UI asset system. - `requirements.txt`: Lists all the necessary Python dependencies for the project. - `doubao_api.txt`: API key file for the Doubao model (should be kept private and is included in `.gitignore`). @@ -112,6 +113,64 @@ As shown above, our method produces results that are more accurate, visually ali The typical workflow is a multi-step process as follows: +### Convert a Custom Screenshot to a Rough Asset-Aware HTML Preview + +For the current ScreenCoder handoff workflow, use the asset-aware preview path when you need a fast end-to-end result from a UI design image without calling a model API. 
It reads the input image size, builds a semantic UI schema, extracts crop-source assets, and writes a standalone HTML preview plus handoff files for `web-ui-reference`: + +```bash +python screen_to_schema.py \ + --image data/input/menu.png \ + --output data/run/menu/screen-schema.json \ + --preview data/run/menu/index.html \ + --handoff data/run/menu/handoff.md \ + --reference-json data/run/menu/reference-handoff.json +``` + +Optional hosted OCR can enrich region content with recognized text blocks. The preferred open-source-model path is PaddleOCR PP-OCRv4 hosted on Replicate: + +```bash +export REPLICATE_API_TOKEN=... +python screen_to_schema.py \ + --image data/input/menu.png \ + --ocr-provider replicate_paddleocr \ + --ocr-output data/run/menu/ocr.json \ + --output data/run/menu/screen-schema.json \ + --preview data/run/menu/index.html \ + --handoff data/run/menu/handoff.md \ + --reference-json data/run/menu/reference-handoff.json +``` + +If the token is missing, ScreenCoder records an OCR manual gate in the schema handoff notes and still writes the non-OCR preview package. + +Key outputs: + +- `screen-schema.json`: structured page/region/component/asset-role schema. +- `asset-crops/`: source-image snippets for `crop-source` roles. +- `index.html`: rough standalone HTML/CSS reconstruction preview. +- `handoff.md` and `reference-handoff.json`: component/asset intake notes for `web-ui-reference`. + +This path is intentionally approximate: ScreenCoder identifies and packages an editable first draft; `web-ui-reference` remains responsible for final reusable component implementations, asset licensing, visual regression, and Godot contracts. 
+ +### Convert a Custom Screenshot through the Legacy Model/UIED Pipeline + +Put the target screenshot in the project, for example: + +```bash +mkdir -p data/input +cp your-menu.png data/input/menu.png +``` + +Then run the unified pipeline: + +```bash +python screen_to_html.py \ + --image data/input/menu.png \ + --work-dir data/run/menu \ + --api-key doubao_api.txt +``` + +The final HTML is written to `data/run/menu/menu_layout_final.html`. Intermediate files in the same directory include detected layout boxes, gray placeholder HTML, UIED component boxes, mapping overlays, and cropped image assets. + 1. **Initial Generation with Placeholders:** Run the Python script to generate the initial HTML code for a given screenshot. - Block Detection: @@ -157,5 +216,3 @@ The typical workflow is a multi-step process as follows: ## Acknowledgements This project builds upon several outstanding open-source efforts. We would like to thank the authors and contributors of the following projects: [UIED](https://github.com/MulongXie/UIED), [DCGen](https://github.com/WebPAI/DCGen), [Design2Code](https://github.com/NoviScl/Design2Code) - - diff --git a/UIED/run_single.py b/UIED/run_single.py index ea8efa1..927a093 100644 --- a/UIED/run_single.py +++ b/UIED/run_single.py @@ -1,8 +1,10 @@ from os.path import join as pjoin +import argparse import cv2 import os import numpy as np import multiprocessing +from pathlib import Path def resize_height_by_longest_edge(img_path, resize_length=800): @@ -30,6 +32,13 @@ def color_tips(): if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Run UIED component detection for one screenshot.") + parser.add_argument("--image", default="data/input/test4.png", help="Input screenshot path.") + parser.add_argument("--output-root", default="data/tmp", help="Output root directory.") + parser.add_argument("--output-json", default=None, help="Expected or copied output JSON path.") + parser.add_argument("--show", action="store_true", 
help="Show debug windows when supported.") + args = parser.parse_args() + # Set multiprocessing start method to 'spawn' for macOS compatibility. # This must be done at the very beginning of the main block. try: @@ -61,14 +70,14 @@ def color_tips(): key_params = {'min-grad':10, 'ffl-block':5, 'min-ele-area':50, 'merge-contained-ele':True, 'merge-line-to-paragraph':False, 'remove-bar':True} - # set input image path - input_path_img = 'data/input/test4.png' - output_root = 'data/tmp' + input_path_img = args.image + output_root = args.output_root resized_height = resize_height_by_longest_edge(input_path_img, resize_length=800) - color_tips() + if args.show: + color_tips() - is_ip = False + is_ip = True is_clf = False is_ocr = False is_merge = False @@ -90,7 +99,7 @@ def color_tips(): classifier['Elements'] = CNN('Elements') # classifier['Noise'] = CNN('Noise') ip.compo_detection(input_path_img, output_root, key_params, - classifier=classifier, resize_by_height=resized_height, show=False) + classifier=classifier, resize_by_height=resized_height, show=args.show) if is_merge: import detect_merge.merge as merge @@ -99,4 +108,13 @@ def color_tips(): compo_path = pjoin(output_root, 'ip', str(name) + '.json') ocr_path = pjoin(output_root, 'ocr', str(name) + '.json') merge.merge(input_path_img, compo_path, ocr_path, pjoin(output_root, 'merge'), - is_remove_bar=key_params['remove-bar'], is_paragraph=key_params['merge-line-to-paragraph'], show=True) + is_remove_bar=key_params['remove-bar'], is_paragraph=key_params['merge-line-to-paragraph'], show=args.show) + + if args.output_json: + expected = Path(output_root) / "ip" / f"{Path(input_path_img).stem}.json" + requested = Path(args.output_json) + if expected.exists() and expected.resolve() != requested.resolve(): + requested.parent.mkdir(parents=True, exist_ok=True) + requested.write_text(expected.read_text()) + if not requested.exists(): + raise FileNotFoundError(f"UIED output JSON was not created: {requested}") diff --git 
"""Asset role constants and prefab defaults for asset-aware HTML generation."""

from __future__ import annotations


# Closed vocabulary of decorative asset roles that a region may reference.
# Any name outside this set is rejected by ``normalize_asset_role``.
ALLOWED_ASSET_ROLES = frozenset(
    {
        "ritual-background",
        "ornate-border",
        "paper-panel",
        "brass-plate",
        "red-seal",
        "corner-rivet",
        "symbolic-icon",
        "portrait-frame",
        "quick-card-illustration",
        "title-logotype",
        "ink-divider",
        "blood-smear",
        "rope-knot",
        "hanging-tag",
        "progress-petal",
        "damage-scratch",
    }
)


# Closed vocabulary of ways an asset role can be realised in the output.
ALLOWED_ASSET_STRATEGIES = frozenset(
    {
        "css-procedural",
        "crop-source",
        "reference-asset",
        "manual-art",
    }
)


# Default strategy per role. Must contain exactly one entry for every member
# of ALLOWED_ASSET_ROLES, because ``default_strategy_for_role`` indexes it
# directly after validation.
_DEFAULT_STRATEGY_BY_ROLE = {
    "ritual-background": "css-procedural",
    "ornate-border": "reference-asset",
    "paper-panel": "reference-asset",
    "brass-plate": "reference-asset",
    "red-seal": "css-procedural",
    "corner-rivet": "reference-asset",
    "symbolic-icon": "reference-asset",
    "portrait-frame": "reference-asset",
    "quick-card-illustration": "crop-source",
    "title-logotype": "manual-art",
    "ink-divider": "css-procedural",
    "blood-smear": "css-procedural",
    "rope-knot": "reference-asset",
    "hanging-tag": "reference-asset",
    "progress-petal": "css-procedural",
    "damage-scratch": "css-procedural",
}


# Asset roles suggested for each known prefab / semantic region name.
# Keys are lower-case, hyphen-separated; values are ordered role lists.
PREFAB_ASSET_ROLE_HINTS = {
    "ritual-title-stack": [
        "title-logotype",
        "ink-divider",
        "red-seal",
        "blood-smear",
    ],
    "ornate-action-plaque": [
        "paper-panel",
        "brass-plate",
        "ornate-border",
        "red-seal",
        "corner-rivet",
    ],
    "journey-status-panel": [
        "paper-panel",
        "ornate-border",
        "hanging-tag",
        "progress-petal",
        "damage-scratch",
    ],
    "relic-quick-card": [
        "paper-panel",
        "portrait-frame",
        "quick-card-illustration",
        "symbolic-icon",
        "corner-rivet",
    ],
    "title-stack": [
        "title-logotype",
        "ink-divider",
        "red-seal",
    ],
    "primary-actions": [
        "paper-panel",
        "brass-plate",
        "ornate-border",
        "red-seal",
        "corner-rivet",
    ],
    "recent-run-panel": [
        "paper-panel",
        "ornate-border",
        "hanging-tag",
        "progress-petal",
        "damage-scratch",
    ],
    "quick-links": [
        "paper-panel",
        "symbolic-icon",
        "rope-knot",
        "hanging-tag",
    ],
    "corner-system-actions": [
        "brass-plate",
        "corner-rivet",
        "symbolic-icon",
    ],
    "background-ornament": [
        "ritual-background",
        "ornate-border",
        "blood-smear",
        "damage-scratch",
    ],
}


# Names of every prefab that has role hints.
DEFAULT_PREFABS = frozenset(PREFAB_ASSET_ROLE_HINTS)


def _normalize_name(value: str) -> str:
    """Return *value* trimmed, lower-cased, with underscores turned into hyphens."""

    return value.strip().lower().replace("_", "-")


def normalize_asset_role(role: str) -> str:
    """Normalize and validate an asset role name.

    Args:
        role: Role name; surrounding whitespace, letter case and ``_`` vs
            ``-`` are tolerated (``" Paper_Panel "`` -> ``"paper-panel"``).

    Returns:
        The canonical role name, guaranteed to be in ``ALLOWED_ASSET_ROLES``.

    Raises:
        TypeError: If *role* is not a string.
        ValueError: If the normalized name is not an allowed role.
    """

    if not isinstance(role, str):
        raise TypeError("asset role must be a string")

    normalized = _normalize_name(role)
    if normalized not in ALLOWED_ASSET_ROLES:
        raise ValueError(f"unknown asset role: {role!r}")

    return normalized


def default_roles_for_prefab(prefab: str) -> list[str]:
    """Return default asset roles for a prefab, or an empty list when unknown.

    The prefab name is normalized the same way role names are (trimmed,
    lower-cased, ``_`` -> ``-``) so that ``"Title_Stack"`` and
    ``"title-stack"`` resolve to the same hints instead of the former
    silently yielding no roles.

    Args:
        prefab: Prefab / region name to look up.

    Returns:
        A fresh list on every call; mutating it does not affect
        ``PREFAB_ASSET_ROLE_HINTS``.

    Raises:
        TypeError: If *prefab* is not a string.
    """

    if not isinstance(prefab, str):
        raise TypeError("prefab must be a string")

    return list(PREFAB_ASSET_ROLE_HINTS.get(_normalize_name(prefab), ()))


def default_strategy_for_role(role: str) -> str:
    """Return the default generation strategy for an asset role.

    Args:
        role: Role name, normalized via ``normalize_asset_role``.

    Returns:
        One of ``ALLOWED_ASSET_STRATEGIES``.

    Raises:
        TypeError: If *role* is not a string.
        ValueError: If *role* is not an allowed asset role.
    """

    normalized = normalize_asset_role(role)
    return _DEFAULT_STRATEGY_BY_ROLE[normalized]
each component""" image = cv2.imread(image_path) if image is None: @@ -191,13 +192,13 @@ def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]]) - cv2.putText(image, component, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2) - # Output directory - output_dir = "data/tmp" - os.makedirs(output_dir, exist_ok=True) - - # Get the original filename without path - original_filename = os.path.basename(image_path) - output_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_with_bboxes.png") + if output_path is None: + output_dir = "data/tmp" + os.makedirs(output_dir, exist_ok=True) + original_filename = os.path.basename(image_path) + output_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_with_bboxes.png") + else: + os.makedirs(os.path.dirname(output_path), exist_ok=True) if cv2.imwrite(output_path, image): print(f"Successfully saved annotated image: {output_path}") @@ -205,14 +206,15 @@ def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]]) - print("Error: Failed to save image") return "" -def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path: str) -> str: +def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path: str, json_path: str | None = None) -> str: """Save bounding boxes information to a JSON file""" - # Output directory - output_dir = "data/tmp" - os.makedirs(output_dir, exist_ok=True) - - original_filename = os.path.basename(image_path) - json_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_bboxes.json") + if json_path is None: + output_dir = "data/tmp" + os.makedirs(output_dir, exist_ok=True) + original_filename = os.path.basename(image_path) + json_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_bboxes.json") + else: + os.makedirs(os.path.dirname(json_path), exist_ok=True) bboxes_dict = {k: list(v) for k, v in bboxes.items()} @@ -310,8 +312,15 @@ 
"""Dataclass schema helpers for asset-aware screen descriptions."""

from __future__ import annotations

import json
import math
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from asset_roles import (
    ALLOWED_ASSET_STRATEGIES,
    default_strategy_for_role,
    normalize_asset_role,
)


# Four finite numbers. The coordinate convention is not fixed by this module.
# NOTE(review): sample schemas look like [x_min, y_min, x_max, y_max] - confirm.
BBox = list[float]


@dataclass
class Viewport:
    """Pixel size of the source screenshot."""

    width: int
    height: int


@dataclass
class AssetRoleBinding:
    """One asset role attached to a region, plus how it should be produced."""

    role: str  # canonical name from ALLOWED_ASSET_ROLES
    strategy: str  # one of ALLOWED_ASSET_STRATEGIES
    crop_id: str | None = None
    source_bbox: BBox | None = None
    notes: str | None = None


@dataclass
class RegionSchema:
    """A semantic region of the screen mapped to a component prefab."""

    id: str
    prefab: str
    bbox: BBox
    label: str | None = None
    content_priority: str | None = None
    content: dict[str, Any] = field(default_factory=dict)
    asset_roles: list[AssetRoleBinding] = field(default_factory=list)


@dataclass
class ReferenceHandoff:
    """Summary handed to the reference UI system."""

    composition: str | None = None
    prefabs: list[str] = field(default_factory=list)
    missing_assets: list[str] = field(default_factory=list)
    notes: str | None = None


@dataclass
class ScreenSchema:
    """Top-level description of one screenshot."""

    page_type: str
    viewport: Viewport
    theme_hint: str | None = None
    template_hint: str | None = None
    regions: list[RegionSchema] = field(default_factory=list)
    reference_handoff: ReferenceHandoff | None = None


def _require_mapping(value: Any, name: str) -> dict[str, Any]:
    """Return *value* if it is a dict, else raise ``TypeError`` naming *name*."""
    if not isinstance(value, dict):
        raise TypeError(f"{name} must be a dict")
    return value


def _require_string(value: Any, name: str) -> str:
    """Return *value* if it is a str, else raise ``TypeError`` naming *name*."""
    if not isinstance(value, str):
        raise TypeError(f"{name} must be a string")
    return value


def _optional_string(value: Any, name: str) -> str | None:
    """Like ``_require_string`` but lets ``None`` pass through unchanged."""
    if value is None:
        return None
    return _require_string(value, name)


def _parse_bbox(value: Any, name: str) -> BBox:
    """Validate a bounding box and return it as a list of four floats.

    Raises:
        ValueError: If *value* is not a 4-item list/tuple, or if any item is
            NaN or infinite.
        TypeError: If any item is not an int or float (bools are rejected).
    """
    if not isinstance(value, (list, tuple)) or len(value) != 4:
        raise ValueError(f"{name} must contain exactly four numbers")

    bbox: BBox = []
    for item in value:
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(item, bool) or not isinstance(item, (int, float)):
            raise TypeError(f"{name} values must be numbers")
        number = float(item)
        # json.dump would write NaN/Infinity as bare tokens, which are not
        # valid JSON and break strict consumers of the saved schema.
        if not math.isfinite(number):
            raise ValueError(f"{name} values must be finite numbers")
        bbox.append(number)
    return bbox


def _parse_viewport(value: Any) -> Viewport:
    """Build a ``Viewport`` from a dict with positive integer width/height."""
    data = _require_mapping(value, "viewport")
    width = data.get("width")
    height = data.get("height")
    if not isinstance(width, int) or isinstance(width, bool):
        raise TypeError("viewport.width must be an integer")
    if not isinstance(height, int) or isinstance(height, bool):
        raise TypeError("viewport.height must be an integer")
    if width <= 0 or height <= 0:
        raise ValueError("viewport width and height must be positive")
    return Viewport(width=width, height=height)


def _parse_asset_role_binding(value: Any) -> AssetRoleBinding:
    """Parse one ``asset_roles`` entry.

    A bare string is shorthand for that role with its default strategy; a
    dict must carry an explicit ``role`` and ``strategy``.
    """
    if isinstance(value, str):
        role = normalize_asset_role(value)
        return AssetRoleBinding(role=role, strategy=default_strategy_for_role(role))

    data = _require_mapping(value, "asset_roles[]")
    role = normalize_asset_role(data.get("role"))
    strategy = _require_string(data.get("strategy"), "asset_roles[].strategy")
    strategy = strategy.strip()
    if strategy not in ALLOWED_ASSET_STRATEGIES:
        raise ValueError(f"unknown asset strategy: {strategy!r}")

    source_bbox = None
    if data.get("source_bbox") is not None:
        source_bbox = _parse_bbox(data.get("source_bbox"), "asset_roles[].source_bbox")

    return AssetRoleBinding(
        role=role,
        strategy=strategy,
        crop_id=_optional_string(data.get("crop_id"), "asset_roles[].crop_id"),
        source_bbox=source_bbox,
        notes=_optional_string(data.get("notes"), "asset_roles[].notes"),
    )


def _parse_region(value: Any) -> RegionSchema:
    """Parse one ``regions`` entry into a ``RegionSchema``."""
    data = _require_mapping(value, "regions[]")
    # "component_prefab" is accepted as an alternative key; when neither key
    # is present the prefab is recorded as an empty string.
    prefab = data.get("prefab", data.get("component_prefab", ""))
    asset_roles = data.get("asset_roles", [])
    if not isinstance(asset_roles, list):
        raise TypeError("regions[].asset_roles must be a list")

    content = data.get("content", {})
    if content is None:
        content = {}
    if not isinstance(content, dict):
        raise TypeError("regions[].content must be a dict")

    return RegionSchema(
        id=_require_string(data.get("id"), "regions[].id"),
        prefab=_require_string(prefab, "regions[].prefab"),
        bbox=_parse_bbox(data.get("bbox"), "regions[].bbox"),
        label=_optional_string(data.get("label"), "regions[].label"),
        content_priority=_optional_string(data.get("content_priority"), "regions[].content_priority"),
        content=dict(content),
        asset_roles=[_parse_asset_role_binding(item) for item in asset_roles],
    )


def _parse_reference_handoff(value: Any) -> ReferenceHandoff:
    """Parse the optional ``reference_handoff`` section."""
    data = _require_mapping(value, "reference_handoff")

    prefabs = data.get("prefabs", [])
    if not isinstance(prefabs, list) or not all(isinstance(item, str) for item in prefabs):
        raise TypeError("reference_handoff.prefabs must be a list of strings")

    missing_assets = data.get("missing_assets", [])
    if not isinstance(missing_assets, list):
        raise TypeError("reference_handoff.missing_assets must be a list")

    return ReferenceHandoff(
        composition=_optional_string(data.get("composition"), "reference_handoff.composition"),
        prefabs=list(prefabs),
        missing_assets=[normalize_asset_role(item) for item in missing_assets],
        notes=_optional_string(data.get("notes"), "reference_handoff.notes"),
    )


def screen_schema_from_dict(data: dict) -> ScreenSchema:
    """Build a validated ScreenSchema from a plain dictionary.

    Raises:
        TypeError: If a field has the wrong type.
        ValueError: If a bbox, viewport, asset role or strategy is invalid.
    """

    source = _require_mapping(data, "screen schema")
    regions = source.get("regions", [])
    if not isinstance(regions, list):
        raise TypeError("regions must be a list")

    reference_handoff = None
    if source.get("reference_handoff") is not None:
        reference_handoff = _parse_reference_handoff(source.get("reference_handoff"))

    return ScreenSchema(
        page_type=_require_string(source.get("page_type"), "page_type"),
        viewport=_parse_viewport(source.get("viewport")),
        theme_hint=_optional_string(source.get("theme_hint"), "theme_hint"),
        template_hint=_optional_string(source.get("template_hint"), "template_hint"),
        regions=[_parse_region(item) for item in regions],
        reference_handoff=reference_handoff,
    )


def _asset_role_to_dict(binding: AssetRoleBinding) -> dict[str, Any]:
    """Serialize one binding, re-validating role, strategy and bbox.

    Optional fields are omitted from the result when they are ``None``.
    """
    result: dict[str, Any] = {
        "role": normalize_asset_role(binding.role),
        "strategy": binding.strategy,
    }
    if result["strategy"] not in ALLOWED_ASSET_STRATEGIES:
        raise ValueError(f"unknown asset strategy: {result['strategy']!r}")
    if binding.crop_id is not None:
        result["crop_id"] = binding.crop_id
    if binding.source_bbox is not None:
        result["source_bbox"] = _parse_bbox(binding.source_bbox, "asset_roles[].source_bbox")
    if binding.notes is not None:
        result["notes"] = binding.notes
    return result


def screen_schema_to_dict(schema: ScreenSchema) -> dict:
    """Convert a ScreenSchema into a JSON-serializable dictionary.

    Bboxes, roles and strategies are validated again on the way out, so a
    schema mutated into an invalid state after parsing is rejected here.

    Raises:
        TypeError: If *schema* is not a ``ScreenSchema``.
        ValueError: If a bbox, role or strategy is invalid.
    """

    if not isinstance(schema, ScreenSchema):
        raise TypeError("schema must be a ScreenSchema")

    result: dict[str, Any] = {
        "page_type": schema.page_type,
        "viewport": {
            "width": schema.viewport.width,
            "height": schema.viewport.height,
        },
        "theme_hint": schema.theme_hint,
        "template_hint": schema.template_hint,
        "regions": [
            {
                "id": region.id,
                "label": region.label,
                "prefab": region.prefab,
                "bbox": _parse_bbox(region.bbox, "regions[].bbox"),
                "content_priority": region.content_priority,
                "content": dict(region.content),
                "asset_roles": [_asset_role_to_dict(binding) for binding in region.asset_roles],
            }
            for region in schema.regions
        ],
    }

    # The handoff key is only emitted when a handoff is present.
    if schema.reference_handoff is not None:
        handoff = schema.reference_handoff
        result["reference_handoff"] = {
            "composition": handoff.composition,
            "prefabs": list(handoff.prefabs),
            "missing_assets": [normalize_asset_role(role) for role in handoff.missing_assets],
            "notes": handoff.notes,
        }

    return result


def load_screen_schema(path: str | Path) -> ScreenSchema:
    """Load a screen schema JSON file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        TypeError, ValueError: If the decoded data fails schema validation.
    """

    with Path(path).open("r", encoding="utf-8") as handle:
        data = json.load(handle)
    return screen_schema_from_dict(data)


def save_screen_schema(schema: ScreenSchema, path: str | Path) -> None:
    """Save a screen schema JSON file with stable formatting.

    The schema is converted (and therefore validated) before the target file
    is opened, so a validation failure does not leave a truncated file.
    Parent directories are created as needed.
    """

    data = screen_schema_to_dict(schema)
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
        handle.write("\n")
b/docs/asset-aware-html-generation.md new file mode 100644 index 0000000..5bd69c9 --- /dev/null +++ b/docs/asset-aware-html-generation.md @@ -0,0 +1,314 @@ +# Asset-Aware HTML Generation Design + +本文档定义 ScreenCoder 如何从“截图转 HTML”升级为“资产感知的截图转 HTML”。目标是让 ScreenCoder 能识别复杂游戏 UI 中的旧纸、金属边框、铆钉、朱印、挂签、角色框和插图等装饰资产,并把它们映射到可复用的 Web/Godot UI 组件语义,而不是只生成普通 `div`、Tailwind class 和灰色图片占位。 + +## 背景 + +当前流水线主要覆盖三类能力: + +- `block_parsor.py`:让视觉模型框出大区域,例如 sidebar、header、navigation、main content。 +- `html_generator.py`:针对区域截图生成 HTML/Tailwind 片段。 +- `image_box_detection.py`、`UIED/run_single.py`、`mapping.py`、`image_replacer.py`:找出 HTML 中的灰色图片占位,将其映射到原图 UIED 检测框,并裁剪替换为真实图片。 + +这套流程适合普通网页截图,但对游戏主菜单、HUD、RPG 面板等复杂 UI 会遇到三个问题: + +- 纸张破损、金属边框、铆钉、红绳、朱印等装饰不是普通内容块,不能只靠 `border`、`shadow` 和背景色稳定复现。 +- 现有替换逻辑只识别 `.bg-gray-400` 占位块,无法表达“这个边框应该用 NinePatch/StyleBoxTexture 资产”。 +- 生成 HTML 缺少组件语义,后续无法可靠移交到 `web-ui-reference` 或 Godot。 + +## 目标 + +- 对输入截图输出结构化 UI schema,包含页面、语义区、组件 prefab、资产角色和内容字段。 +- 将截图中的复杂装饰分为三类:可程序化 CSS、可裁剪位图、需引用设计资产库。 +- 让生成 HTML 引用稳定组件语义,例如 `ornate-action-plaque`、`paper-panel`、`brass-plate`、`red-seal`。 +- 保留当前研究型流水线的快速原型能力,同时新增可审计、可回归的资产感知路径。 +- 与 `web-ui-reference` 分工清楚:ScreenCoder 负责识别、裁剪、映射和生成初稿;`web-ui-reference` 负责资产治理、组件实现、视觉回归和 Godot 合同。 + +## 非目标 + +- 不在 ScreenCoder 内维护完整设计系统。 +- 不把外部资产包、许可证、Storybook 或 Godot Theme 合同放到本仓库长期治理。 +- 不要求自动生成达到像素级一致;像素级或艺术级修正应进入 `web-ui-reference` 的组件和截图回归流程。 +- 不把游戏运行时状态、Godot `.tscn` 或业务逻辑复制到 ScreenCoder。 + +## 目标架构 + +```text +input screenshot + -> optional hosted OCR text detection + -> layout detection + -> semantic region classification + -> asset role detection + -> component schema generation + -> optional source crop extraction + -> HTML preview generation + -> visual diff and handoff package +``` + +### Optional Hosted OCR + +ScreenCoder can enrich schema region `content` with OCR text and OCR boxes before writing preview/handoff files. 
The current preferred hosted open-source-model option is PaddleOCR PP-OCRv4 on Replicate: + +- Provider flag: `--ocr-provider replicate_paddleocr` +- Required manual gate: `REPLICATE_API_TOKEN` +- Output shape normalized internally to `{provider, text, blocks[]}` where each block has `text`, `bbox`, optional `confidence`, and optional `polygon`. + +Additional experimental/fallback providers are available through the same abstraction: + +- `--ocr-provider hf_ocr`: Hugging Face hosted open-model OCR, gated by `HF_TOKEN`; useful for plain text but often lacks boxes. +- `--ocr-provider ocr_space`: OCR.space hosted fallback, gated by `OCR_SPACE_API_KEY`; practical but not open-source. + +If a provider is requested but its credential is missing, ScreenCoder keeps generating the non-OCR preview package and writes the manual gate into handoff notes instead of failing the whole run. + +### Stage 1: Layout Detection + +继续复用 `block_parsor.py` 的大区域识别能力,但 prompt 不应固定在普通网页区域。新 prompt 应允许输出游戏 UI 区域,例如: + +- `title-stack` +- `primary-actions` +- `recent-run-panel` +- `quick-links` +- `corner-system-actions` +- `background-ornament` + +输出仍使用归一化 bbox,但需要增加 `confidence`、`reason` 和 `expected_component`。 + +### Stage 2: Semantic Region Classification + +新增语义分类,让每个区域有明确职责。 + +示例输出: + +```json +{ + "id": "primary-actions", + "label": "继续旅程 / 新的旅程", + "bbox": [88, 260, 918, 520], + "component_prefab": "ornate-action-plaque", + "content_priority": "L0-primary-action", + "asset_roles": ["paper-panel", "brass-plate", "ornate-border", "red-seal", "corner-rivet"] +} +``` + +### Stage 3: Asset Role Detection + +资产检测不是简单 OCR 或 UIED 检测。它需要把视觉元素映射到可复用资产角色。 + +初始角色集合应对齐 `web-ui-reference`: + +- `ritual-background` +- `ornate-border` +- `paper-panel` +- `brass-plate` +- `red-seal` +- `corner-rivet` +- `symbolic-icon` +- `portrait-frame` +- `quick-card-illustration` +- `title-logotype` +- `ink-divider` +- `blood-smear` +- `rope-knot` +- `hanging-tag` +- `progress-petal` +- `damage-scratch` + +每个角色输出 
`strategy`: + +| Strategy | Meaning | Example | +| --- | --- | --- | +| `css-procedural` | 可用 CSS 渐变、阴影、伪元素模拟 | 暗角、噪点、普通纸纹 | +| `crop-source` | 从源图裁剪可复用片段 | 角色头像、底部卡片插图 | +| `reference-asset` | 应引用外部/内部资产库 | 金属九宫格边框、铆钉、挂签 | +| `manual-art` | 需要人工或图像生成资产 | 手写题字、复杂插画 | + +### Stage 4: Component Schema Generation + +新增 ScreenCoder 输出,不直接只产 HTML。 + +建议文件: + +```text +data/run/<name>/screen-schema.json +data/run/<name>/asset-crops/ +data/run/<name>/preview.html +data/run/<name>/handoff.md +``` + +`screen-schema.json` 示例: + +```json +{ + "page_type": "game-main-menu", + "viewport": { "width": 820, "height": 1600 }, + "theme_hint": "japanese-horror", + "template_hint": "ornamental-mobile", + "regions": [ + { + "id": "primary-actions", + "prefab": "ornate-action-plaque", + "bbox": [64, 410, 756, 840], + "content": { + "primary_title": "继续旅程", + "secondary_title": "新的旅程", + "subtitle": "返回当前的破宫之旅" + }, + "asset_roles": [ + { "role": "paper-panel", "strategy": "reference-asset" }, + { "role": "brass-plate", "strategy": "reference-asset" }, + { "role": "red-seal", "strategy": "css-procedural" } + ] + } + ] +} +``` + +### Stage 5: HTML Preview Generation + +HTML 预览分两种模式: + +| Mode | Use case | Output | +| --- | --- | --- | +| `standalone` | 快速查看生成效果 | 单文件 `preview.html`,内联 CSS 和裁剪图 | +| `component-reference` | 移交 `web-ui-reference` | 输出组件/slot schema,不复制组件实现 | + +`standalone` 可以继续用 `html_generator.py` 生成,但 prompt 应要求按 prefab 生成,而不是按 `sidebar/header/navigation/main content` 生成。 + +`component-reference` 应生成可导入 `web-ui-reference` 的候选配置,例如: + +```json +{ + "composition": "horror-ornamental", + "prefabs": ["ritual-title-stack", "ornate-action-plaque", "journey-status-panel", "relic-quick-card"], + "missing_assets": ["ornate-border", "brass-plate", "portrait-frame"] +} +``` + +## 与 web-ui-reference 的边界 + +ScreenCoder 输出的资产角色和 prefab 必须对齐 `web-ui-reference` 的命名。推荐边界: + +| Responsibility | ScreenCoder | web-ui-reference | +| --- | --- | --- | +| 从截图识别区域 | Yes | No | +| 从截图裁剪候选素材 | Yes | Optional review only 
| +| 维护资产许可证 | No | Yes | +| 定义 NinePatch/StyleBoxTexture | No | Yes | +| 实现可复用组件 | No | Yes | +| 生成一次性 HTML 预览 | Yes | Optional | +| Storybook/Playwright 回归 | No | Yes | +| Godot handoff contract | Draft only | Source of truth | + +ScreenCoder 的 handoff 文档只应说明“识别到了什么”和“建议映射到什么”,最终合同由 `web-ui-reference` 管理。 + +## 新增模块建议 + +```text +asset_roles.py +component_schema.py +screen_to_schema.py +schema_to_html.py +handoff_writer.py +``` + +### `asset_roles.py` + +维护 ScreenCoder 侧可识别资产角色的轻量枚举。该文件应从 `web-ui-reference` 的文档或导出 JSON 同步,不应自行发明不同命名。 + +### `component_schema.py` + +定义 Pydantic 或 dataclass schema,用于约束 region、component、asset role、crop 和 handoff。 + +### `screen_to_schema.py` + +新主入口,负责把截图转成 `screen-schema.json`。它可以复用当前 `block_parsor.py`、OCR、UIED 和视觉模型。 + +### `schema_to_html.py` + +把 `screen-schema.json` 变成可打开的 `preview.html`。该模块只用于预览,不作为设计系统源代码。 + +### `handoff_writer.py` + +生成 `handoff.md`,包含: + +- 输入截图。 +- 识别到的页面类型。 +- 区域和 prefab 映射。 +- 裁剪资产清单。 +- 缺失资产角色。 +- 建议在 `web-ui-reference` 中实现或补齐的组件。 + +## Prompt 改造 + +旧 prompt 关注“还原容器内 HTML/Tailwind”。新 prompt 应要求模型输出结构化 JSON,再由代码生成 HTML。 + +推荐 prompt 目标: + +- 先识别页面类型和主任务。 +- 再识别语义区域和交互优先级。 +- 最后识别资产角色及生成策略。 +- 不要求模型凭空生成复杂纹理;复杂纹理必须标记为 `reference-asset` 或 `manual-art`。 + +## 评估指标 + +### 结构评估 + +- 主区域 bbox 是否覆盖正确。 +- prefab 分类是否正确。 +- 文案 OCR/视觉识别是否足够支撑内容模型。 +- 主行动、次行动、状态面板和底部入口是否层级清楚。 + +### 资产评估 + +- 资产角色召回率。 +- `css-procedural`、`crop-source`、`reference-asset` 分类是否合理。 +- 裁剪图是否可用,是否包含过多背景或文字。 + +### 视觉评估 + +- Playwright 截图与源图的粗粒度布局差异。 +- 主操作可发现性。 +- 文字可读性。 +- 装饰是否误导为可点击区域。 + +## 实施阶段 + +### Phase 1: Schema First + +- 新增 `screen_to_schema.py`。 +- 将 `block_parsor.py` 的输出转换为区域 schema。 +- 手工配置一组游戏 UI prefab 和 asset roles。 +- 输出 `screen-schema.json` 和 `handoff.md`。 + +### Phase 2: Asset Role Detection + +- 增加资产角色识别 prompt。 +- 识别 `paper-panel`、`brass-plate`、`red-seal`、`ornate-border`、`portrait-frame`。 +- 输出每个资产角色的推荐策略。 + +### Phase 3: Preview HTML + +- 新增 `schema_to_html.py`。 +- 生成一版 standalone HTML 预览。 +- 对 `crop-source` 资产生成 
`asset-crops/`。 + +### Phase 4: Reference Handoff + +- 输出可被 `web-ui-reference` 读取的 JSON。 +- 在 handoff 中列出缺失组件和缺失资产。 +- 与 `web-ui-reference` 的 Storybook/Playwright 验收对接。 + +## 风险 + +- VLM 对装饰资产的语义识别不稳定,需要 schema 校验和人工修正入口。 +- 直接裁剪原图可能包含版权或混合背景,不应默认进入长期资产库。 +- 如果 ScreenCoder 内部实现完整组件系统,会与 `web-ui-reference` 分叉,导致重复维护。 +- 过度追求像素级自动复现会拖慢研究迭代;本项目应优先输出可移交的结构化初稿。 + +## 验收标准 + +- 给定一张复杂游戏主菜单截图,能输出 `screen-schema.json`。 +- `screen-schema.json` 至少包含标题区、主行动区、状态面板、快捷入口四类区域。 +- 每类区域能映射到 prefab 和 asset roles。 +- `handoff.md` 能清楚列出哪些资产应由 `web-ui-reference` 维护。 +- standalone `preview.html` 可打开,并保留主要信息层级。 diff --git a/docs/pogong-ui-image-prompts.md b/docs/pogong-ui-image-prompts.md new file mode 100644 index 0000000..7db084e --- /dev/null +++ b/docs/pogong-ui-image-prompts.md @@ -0,0 +1,28 @@ +# 破宫十重奏风格 UI 图像生成 Prompt Pack + +用途:为 ScreenCoder 生成多种游戏 UI 设计图输入,再由 `screen_to_schema.py` 生成粗略 HTML preview / handoff。 + +## 共通风格约束 + +- Mobile portrait game UI screenshot, 9:16 aspect ratio, clean flat 2D interface, not a mockup on a device. +- Style reference: 《破宫十重奏》现有主菜单 UI:米白宣纸背景、细灰线框、克制红棕点缀、朱印、黄铜/金色小标签、宋体/书法标题感、极简几何图形、淡墨插画、轻微纸纹和柔和阴影。 +- UI must be readable by OCR: use clear Simplified Chinese labels, avoid overdecorated text, avoid tiny unreadable glyphs. +- Keep layout structured for later HTML reconstruction: distinct panels, visible bounding boxes, consistent spacing, no photorealism. +- Avoid gore, explicit violence, or real-world logos. Use abstract ritual / palace / music / card-battle motifs. +- Palette: warm ivory `#F4EFE3`, ink brown `#2E211A`, muted red `#A5352C`, brass gold `#B78A42`, pale gray `#B7B1A7`. + +## Prompt 1 — 主菜单 / Main Menu + +Create a mobile portrait game main menu UI screenshot for a Chinese narrative card roguelite called “破宫十重奏”. Off-white textured paper background, thin gray borders, muted red seal accents, brass number plaques, elegant Chinese typography. 
Layout: top left square button “设置” with gear icon, top right square button “成就”; centered large title “破宫之十重奏” and subtitle “一桌无人知晓的局,一场无法回头的演奏。”; two large horizontal action cards: “继续旅程 / 返回当前的破宫之旅” with number badge “10” and red circular door icon, “新的旅程 / 开启一场新的破宫之旅” with badge “01”; a bordered “最近旅程” panel with character name “沈伶音”, floor “07/10”, round “03/05”, progress petals; bottom three cards “规则说明”, “卡牌图鉴”, “结局回顾”. Minimal ink illustrations, clean OCR-readable Simplified Chinese text, flat UI screenshot, no device frame. + +## Prompt 2 — 设置 / Settings + +Create a mobile portrait game settings screen UI screenshot in the same “破宫十重奏” paper-and-ink style. Off-white parchment background, thin gray dividers, muted red circular seals, brass toggle knobs, elegant Chinese typography. Top bar: back arrow “返回”, centered title “设置”, small red seal “静”. Main sections: “声音” with sliders “主音量 72”, “乐音 58”, “音效 64”; “画面” with segmented options “低 / 中 / 高” and selected “高”, toggle “纸纹效果 开”, toggle “动态朱印 开”; “游戏” with toggle “自动存档 开”, toggle “战斗提示 开”, language row “语言 简体中文”; bottom brass outlined buttons “恢复默认” and red primary button “保存设置”. Layout must be clean and structured, distinct panels and rows, OCR-readable Chinese, subtle geometric ink icons, no photorealism. + +## Prompt 3 — 档案 / Archive / Codex + +Create a mobile portrait game archive / character dossier UI screenshot for “破宫十重奏”. Same off-white paper, ink-brown linework, muted red seals, brass labels. Top bar: “返回” left, title “档案馆”, right small tab “筛选”. Layout: left vertical navigation rail with tabs “角色”, “卡牌”, “事件”, “结局”, selected “角色” with red mark. Main panel titled “角色档案” with search field “搜索姓名或乐章”; large dossier card for “沈伶音” with pale ink portrait frame, tags “第七楼”, “琴弦”, “未完成”; stats rows “记忆 42”, “执念 68”, “共鸣 31”; notes panel titled “残页记录” containing short readable lines “她在第三次演奏后失去了名字。”; bottom grid of three smaller locked/available cards: “陆简”, “白鹿”, “???”. 
Strong panel boundaries for HTML reconstruction, legible Simplified Chinese, minimalist ink illustrations. + +## Prompt 4 — 战斗界面 / Combat HUD + +Create a mobile portrait tactical card battle UI screenshot in the same “破宫十重奏” style. Off-white parchment battlefield with thin gray grid and ink shadows, muted red enemy seals, brass resource counters. Top status bar: left player “沈伶音 HP 32/45”, right enemy “宫中影 HP 28/40”, center round seal “第 03 轮”. Middle: abstract palace-room battlefield with three enemy intent cards labeled “屏”, “聚”, “裂”, small damage markers, timeline ribbon “先手 → 敌意 → 演奏”. Lower panel: player hand of four cards with readable names “断弦”, “回声”, “朱印”, “退步”, each in bordered paper card style with cost circles. Bottom action bar: resource “气 4/6”, red primary button “结束回合”, small buttons “弃牌”, “查看规则”. Clean flat UI screenshot, structured panels, OCR-readable Chinese, no blood or gore, no device frame. diff --git a/handoff_writer.py b/handoff_writer.py new file mode 100644 index 0000000..43b489b --- /dev/null +++ b/handoff_writer.py @@ -0,0 +1,314 @@ +"""Write asset-aware handoff documents for web-ui-reference.""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import asdict, is_dataclass +from pathlib import Path +from typing import Any, Mapping + + +WEB_UI_REFERENCE_NOTE = ( + "这些资产和组件的长期维护、许可证、Storybook/Playwright 回归与 Godot 合同" + "应由 web-ui-reference 维护;ScreenCoder 只输出识别和映射初稿。" +) + + +def _schema_to_dict(schema: Any) -> dict[str, Any]: + if isinstance(schema, Mapping): + return dict(schema) + + try: + from component_schema import screen_schema_to_dict + except ImportError: + screen_schema_to_dict = None + + if screen_schema_to_dict is not None: + return screen_schema_to_dict(schema) + if is_dataclass(schema): + return asdict(schema) + return { + "page_type": getattr(schema, "page_type", "unknown"), + "viewport": getattr(schema, "viewport", {}), + "theme_hint": getattr(schema, "theme_hint", ""), + 
"template_hint": getattr(schema, "template_hint", ""), + "regions": getattr(schema, "regions", []), + "reference_handoff": getattr(schema, "reference_handoff", None), + } + + +def _load_schema(path: str | Path) -> Any: + try: + from component_schema import load_screen_schema + except ImportError: + with Path(path).open("r", encoding="utf-8") as handle: + return json.load(handle) + return load_screen_schema(path) + + +def _normalise_region(region: Any) -> dict[str, Any]: + if is_dataclass(region): + return asdict(region) + if isinstance(region, Mapping): + return dict(region) + return { + "id": getattr(region, "id", ""), + "label": getattr(region, "label", ""), + "bbox": getattr(region, "bbox", [0, 0, 0, 0]), + "prefab": getattr(region, "prefab", ""), + "content_priority": getattr(region, "content_priority", ""), + "content": getattr(region, "content", {}), + "asset_roles": getattr(region, "asset_roles", []), + } + + +def _normalise_role(role: Any) -> dict[str, Any]: + if is_dataclass(role): + return asdict(role) + if isinstance(role, Mapping): + return dict(role) + return { + "role": getattr(role, "role", ""), + "strategy": getattr(role, "strategy", ""), + "crop_id": getattr(role, "crop_id", None), + "source_bbox": getattr(role, "source_bbox", None), + "notes": getattr(role, "notes", ""), + } + + +def _unique(values: list[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for value in values: + if value and value not in seen: + seen.add(value) + result.append(value) + return result + + +def _regions(schema_dict: Mapping[str, Any]) -> list[dict[str, Any]]: + return [_normalise_region(region) for region in schema_dict.get("regions") or []] + + +def build_reference_handoff(schema: Any) -> dict[str, Any]: + """Build a compact JSON handoff for web-ui-reference intake.""" + schema_dict = _schema_to_dict(schema) + regions = _regions(schema_dict) + region_entries: list[dict[str, Any]] = [] + prefabs: list[str] = [] + missing_assets: list[str] = [] + 
missing_components: list[str] = [] + + for region in regions: + prefab = region.get("prefab") or "" + if prefab: + prefabs.append(str(prefab)) + else: + missing_components.append(str(region.get("id") or "unmapped-region")) + + role_entries: list[dict[str, Any]] = [] + for raw_role in region.get("asset_roles") or []: + role = _normalise_role(raw_role) + role_name = str(role.get("role") or "unknown-role") + strategy = str(role.get("strategy") or "unknown-strategy") + role_entries.append( + { + "role": role_name, + "strategy": strategy, + "crop_id": role.get("crop_id"), + "source_bbox": role.get("source_bbox"), + "notes": role.get("notes") or "", + "maintainer": "web-ui-reference", + } + ) + if strategy in {"reference-asset", "manual-art"}: + missing_assets.append(role_name) + elif strategy == "crop-source" and not role.get("crop_id"): + missing_assets.append(role_name) + + region_entries.append( + { + "id": region.get("id") or "", + "label": region.get("label") or "", + "bbox": region.get("bbox") or [], + "prefab": prefab, + "content_priority": region.get("content_priority") or "", + "asset_roles": role_entries, + } + ) + + composition = { + "page_type": schema_dict.get("page_type") or "unknown-page", + "theme_hint": schema_dict.get("theme_hint") or "", + "template_hint": schema_dict.get("template_hint") or "", + "viewport": schema_dict.get("viewport") or {}, + "target_owner": "web-ui-reference", + } + + return { + "composition": composition, + "prefabs": _unique(prefabs), + "missing_assets": _unique(missing_assets), + "missing_components": _unique(missing_components), + "regions": region_entries, + "maintenance_boundary": WEB_UI_REFERENCE_NOTE, + } + + +def _fmt(value: Any) -> str: + if value is None or value == "": + return "-" + if isinstance(value, (dict, list)): + return json.dumps(value, ensure_ascii=False) + return str(value) + + +def render_handoff_markdown(schema: Any, source_image: str | None = None) -> str: + """Render a human-readable handoff document.""" 
+ schema_dict = _schema_to_dict(schema) + reference = build_reference_handoff(schema) + regions = reference["regions"] + lines: list[str] = [ + "# Asset-Aware UI Handoff", + "", + "## 页面概览", + "", + f"- 页面类型:{_fmt(schema_dict.get('page_type'))}", + f"- 视口:{_fmt(schema_dict.get('viewport'))}", + f"- 主题提示:{_fmt(schema_dict.get('theme_hint'))}", + f"- 模板提示:{_fmt(schema_dict.get('template_hint'))}", + ] + if source_image: + lines.append(f"- 输入截图:{source_image}") + reference_notes = None + reference_handoff = schema_dict.get("reference_handoff") or {} + if isinstance(reference_handoff, Mapping): + reference_notes = reference_handoff.get("notes") + if reference_notes: + lines.append(f"- 备注:{_fmt(reference_notes)}") + lines.extend(["", f"> {WEB_UI_REFERENCE_NOTE}", ""]) + + lines.extend( + [ + "## 区域与 Prefab 映射", + "", + "| Region | Label | BBox | Prefab | Priority |", + "| --- | --- | --- | --- | --- |", + ] + ) + if regions: + for region in regions: + lines.append( + "| " + + " | ".join( + [ + _fmt(region.get("id")), + _fmt(region.get("label")), + _fmt(region.get("bbox")), + _fmt(region.get("prefab")), + _fmt(region.get("content_priority")), + ] + ) + + " |" + ) + else: + lines.append("| - | - | - | - | - |") + + lines.extend( + [ + "", + "## 资产角色策略", + "", + "| Region | Role | Strategy | Crop | Source BBox | Notes | Maintainer |", + "| --- | --- | --- | --- | --- | --- | --- |", + ] + ) + has_roles = False + for region in regions: + for role in region.get("asset_roles") or []: + has_roles = True + lines.append( + "| " + + " | ".join( + [ + _fmt(region.get("id")), + _fmt(role.get("role")), + _fmt(role.get("strategy")), + _fmt(role.get("crop_id")), + _fmt(role.get("source_bbox")), + _fmt(role.get("notes")), + "web-ui-reference", + ] + ) + + " |" + ) + if not has_roles: + lines.append("| - | - | - | - | - | - | web-ui-reference |") + + lines.extend(["", "## 缺失资产", ""]) + missing_assets = reference.get("missing_assets") or [] + if missing_assets: + for asset in 
missing_assets: + lines.append(f"- {asset}:应由 web-ui-reference 补齐或维护。") + else: + lines.append("- 未检测到需要补齐的 reference-asset、manual-art 或缺 crop 的 crop-source 资产。") + + lines.extend(["", "## 缺失组件", ""]) + missing_components = reference.get("missing_components") or [] + if missing_components: + for component in missing_components: + lines.append(f"- {component}:该区域缺少 prefab 映射,应在 web-ui-reference 中确定组件归属。") + else: + lines.append("- 未检测到缺少 prefab 映射的区域。") + + lines.extend(["", "## Reference JSON 摘要", ""]) + lines.append("```json") + lines.append( + json.dumps( + { + "composition": reference["composition"], + "prefabs": reference["prefabs"], + "missing_assets": reference["missing_assets"], + }, + ensure_ascii=False, + indent=2, + ) + ) + lines.append("```") + lines.append("") + return "\n".join(lines) + + +def write_handoff(schema: Any, output_path: str | Path, source_image: str | None = None) -> None: + """Write handoff.md.""" + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(render_handoff_markdown(schema, source_image=source_image), encoding="utf-8") + + +def write_reference_json(schema: Any, output_path: str | Path) -> None: + """Write reference handoff JSON.""" + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + json.dumps(build_reference_handoff(schema), ensure_ascii=False, indent=2) + "\n", + encoding="utf-8", + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Write handoff.md and reference handoff JSON.") + parser.add_argument("--schema", required=True, help="Path to screen-schema.json.") + parser.add_argument("--output", required=True, help="Path to write handoff.md.") + parser.add_argument("--reference-json", required=True, help="Path to write reference handoff JSON.") + parser.add_argument("--source-image", default=None, help="Optional source screenshot path to mention in handoff.md.") + args = parser.parse_args() + + schema = 
_load_schema(args.schema) + write_handoff(schema, args.output, source_image=args.source_image) + write_reference_json(schema, args.reference_json) + + +if __name__ == "__main__": + main() diff --git a/html_generator.py b/html_generator.py index 3a14e5f..6bd3872 100644 --- a/html_generator.py +++ b/html_generator.py @@ -1,5 +1,6 @@ from utils import encode_image, Doubao, Qwen, GPT, Gemini from PIL import Image +import argparse import bs4 from threading import Thread import time @@ -346,17 +347,7 @@ def code_substitution(html_file, code_dict): # except Exception as e: # print(f"An error occurred during HTML refinement: {e}") -# Main -if __name__ == "__main__": - import json - import time - from PIL import Image - - # Load bboxes from block_parsing.py output - boxes_data = json.load(open("data/tmp/test1_bboxes.json")) - - - img_path = "data/input/test1.png" +def build_layout_from_bboxes(boxes_data, img_path): with Image.open(img_path) as img: width, height = img.size @@ -382,7 +373,6 @@ def code_substitution(html_file, code_dict): } root["children"].append(child) - # Assign IDs to all nodes def assign_id(node, id): node["id"] = id for child in node.get("children", []): @@ -390,14 +380,44 @@ def assign_id(node, id): return id assign_id(root, 0) + return root + + +def create_bot(provider, api_key, model): + if provider == "doubao": + return Doubao(api_key, model=model) + if provider == "qwen": + return Qwen(api_key, model=model) + if provider == "gpt": + return GPT(api_key, model=model) + if provider == "gemini": + return Gemini(api_key, model=model) + raise ValueError(f"Unsupported provider: {provider}") + + +# Main +if __name__ == "__main__": + import json + + parser = argparse.ArgumentParser(description="Generate HTML for detected layout regions.") + parser.add_argument("--image", default="data/input/test1.png", help="Input screenshot path.") + parser.add_argument("--bboxes", default="data/tmp/test1_bboxes.json", help="Region bbox JSON from block_parsor.py.") + 
parser.add_argument("--output", default="data/tmp/test1_layout.html", help="Output gray placeholder HTML path.") + parser.add_argument("--api-key", default="doubao_api.txt", help="API key file for the selected model.") + parser.add_argument("--provider", default="doubao", choices=["doubao", "qwen", "gpt", "gemini"], help="Vision model provider.") + parser.add_argument("--model", default="doubao-1.5-thinking-vision-pro-250428", help="Vision model name.") + args = parser.parse_args() + + boxes_data = json.load(open(args.bboxes)) + img_path = args.image + root = build_layout_from_bboxes(boxes_data, img_path) - # print(root) # Generate initial HTML layout - generate_html(root, 'data/tmp/test1_layout.html') + generate_html(root, args.output) # Initialize the bot # Change your model & API ket path according to your needs - bot = Doubao("doubao_api.txt", model = "doubao-1.5-thinking-vision-pro-250428") + bot = create_bot(args.provider, args.api_key, args.model) # bot = Qwen("qwen_api.txt", model="qwen2.5-vl-72b-instruct") # bot = GPT("gpt_api.txt", model="gpt-4o") # bot = Gemini("gemini_api.txt", model="gemini-1.5-flash-latest") @@ -408,7 +428,7 @@ def assign_id(node, id): code_dict = generate_code_parallel(root, img_path, bot) # Substitute the generated code into the HTML - code_substitution('data/tmp/test1_layout.html', code_dict) + code_substitution(args.output, code_dict) # Refine the html file # html_refinement('data/tmp/test1_layout.html', 'data/tmp/test1_layout_refined.html', img_path, bot) diff --git a/ocr_client.py b/ocr_client.py new file mode 100644 index 0000000..7eb92cb --- /dev/null +++ b/ocr_client.py @@ -0,0 +1,308 @@ +"""Optional hosted OCR API clients for ScreenCoder. + +The clients normalize different OCR API responses to one lightweight shape so +ScreenCoder can enrich screen schemas without depending on local OCR runtimes. 
+""" + +from __future__ import annotations + +import base64 +import json +import mimetypes +import os +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Mapping + +import requests + + +@dataclass +class OCRBlock: + text: str + bbox: list[float] | None = None + confidence: float | None = None + polygon: list[list[float]] | None = None + + +@dataclass +class OCRResult: + provider: str + text: str = "" + blocks: list[OCRBlock] = field(default_factory=list) + raw: dict[str, Any] | None = None + + +@dataclass +class OCRClient: + provider: str + is_configured: bool + manual_gate: str = "" + + def recognize(self, image_path: Path) -> OCRResult: + raise NotImplementedError + + +def _compact_text(lines: list[str]) -> str: + return "\n".join(line.strip() for line in lines if line and line.strip()) + + +def _as_float(value: Any) -> float | None: + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _bbox_from_polygon(points: Any) -> list[float] | None: + if not isinstance(points, list) or not points: + return None + xs: list[float] = [] + ys: list[float] = [] + for point in points: + if not isinstance(point, (list, tuple)) or len(point) < 2: + continue + x = _as_float(point[0]) + y = _as_float(point[1]) + if x is None or y is None: + continue + xs.append(x) + ys.append(y) + if not xs or not ys: + return None + return [min(xs), min(ys), max(xs), max(ys)] + + +def _normalise_polygon(points: Any) -> list[list[float]] | None: + if not isinstance(points, list): + return None + result: list[list[float]] = [] + for point in points: + if not isinstance(point, (list, tuple)) or len(point) < 2: + continue + x = _as_float(point[0]) + y = _as_float(point[1]) + if x is not None and y is not None: + result.append([x, y]) + return result or None + + +def _bbox_from_xywh(item: Mapping[str, Any]) -> list[float] | None: + left = _as_float(item.get("Left", item.get("left"))) + top = _as_float(item.get("Top", 
item.get("top"))) + width = _as_float(item.get("Width", item.get("width"))) + height = _as_float(item.get("Height", item.get("height"))) + if left is None or top is None or width is None or height is None: + return None + return [left, top, left + width, top + height] + + +def _data_uri_for_image(image_path: Path) -> str: + mime = mimetypes.guess_type(str(image_path))[0] or "image/png" + encoded = base64.b64encode(Path(image_path).read_bytes()).decode("ascii") + return f"data:{mime};base64,{encoded}" + + +def normalize_replicate_paddleocr_response(payload: Mapping[str, Any]) -> OCRResult: + """Normalize PaddleOCR-like Replicate prediction output.""" + + output = payload.get("output", payload) + if isinstance(output, str): + return OCRResult(provider="replicate_paddleocr", text=output.strip(), raw=dict(payload)) + + candidates: Any = [] + if isinstance(output, Mapping): + candidates = output.get("results", output.get("ocr", output.get("blocks", output.get("text", [])))) + elif isinstance(output, list): + candidates = output + + blocks: list[OCRBlock] = [] + text_lines: list[str] = [] + if isinstance(candidates, str): + text_lines.append(candidates) + elif isinstance(candidates, list): + for item in candidates: + if isinstance(item, str): + text_lines.append(item) + blocks.append(OCRBlock(text=item)) + continue + if not isinstance(item, Mapping): + continue + text = str(item.get("text", item.get("transcription", item.get("label", "")))).strip() + if not text: + continue + polygon = _normalise_polygon(item.get("box", item.get("polygon", item.get("points")))) + bbox = None + raw_bbox = item.get("bbox", item.get("bounding_box")) + if isinstance(raw_bbox, list) and len(raw_bbox) == 4: + bbox = [float(value) for value in raw_bbox] + if bbox is None and polygon: + bbox = _bbox_from_polygon(polygon) + confidence = _as_float(item.get("score", item.get("confidence", item.get("probability")))) + text_lines.append(text) + blocks.append(OCRBlock(text=text, bbox=bbox, 
confidence=confidence, polygon=polygon)) + + if not text_lines and isinstance(output, Mapping): + generated = output.get("text", output.get("generated_text")) + if isinstance(generated, str): + text_lines.append(generated) + + return OCRResult( + provider="replicate_paddleocr", + text=_compact_text(text_lines), + blocks=blocks, + raw=dict(payload), + ) + + +def normalize_ocr_space_response(payload: Mapping[str, Any]) -> OCRResult: + """Normalize OCR.space response as a non-open-source hosted fallback.""" + + results = payload.get("ParsedResults") or [] + text_lines: list[str] = [] + blocks: list[OCRBlock] = [] + if isinstance(results, list): + for parsed in results: + if not isinstance(parsed, Mapping): + continue + parsed_text = parsed.get("ParsedText") + if isinstance(parsed_text, str) and parsed_text.strip(): + text_lines.append(parsed_text.strip()) + overlay = parsed.get("TextOverlay") or {} + lines = overlay.get("Lines") if isinstance(overlay, Mapping) else [] + if isinstance(lines, list): + for line in lines: + words = line.get("Words") if isinstance(line, Mapping) else [] + if not isinstance(words, list): + continue + for word in words: + if not isinstance(word, Mapping): + continue + text = str(word.get("WordText", "")).strip() + if not text: + continue + blocks.append(OCRBlock(text=text, bbox=_bbox_from_xywh(word))) + + return OCRResult(provider="ocr_space", text=_compact_text(text_lines), blocks=blocks, raw=dict(payload)) + + +class ManualGateOCRClient(OCRClient): + def __init__(self, provider: str, manual_gate: str) -> None: + super().__init__(provider=provider, is_configured=False, manual_gate=manual_gate) + + def recognize(self, image_path: Path) -> OCRResult: + raise RuntimeError(self.manual_gate) + + +class ReplicatePaddleOCRClient(OCRClient): + DEFAULT_VERSION = "084b779cb09bc2462335a5768fabaeaaba53bb3f70afd0d2fe48fad71fdc4d5a" + + def __init__(self, token: str, version: str | None = None, timeout: int = 120) -> None: + 
super().__init__(provider="replicate_paddleocr", is_configured=True) + self.token = token + self.version = version or os.getenv("SCREENCODER_REPLICATE_PADDLEOCR_VERSION", self.DEFAULT_VERSION) + self.timeout = timeout + + def recognize(self, image_path: Path) -> OCRResult: + response = requests.post( + "https://api.replicate.com/v1/predictions", + headers={ + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json", + "Prefer": "wait", + }, + json={"version": self.version, "input": {"image": _data_uri_for_image(image_path), "lang": "ch"}}, + timeout=self.timeout, + ) + response.raise_for_status() + payload = response.json() + if payload.get("status") in {"starting", "processing"} and payload.get("urls", {}).get("get"): + payload = self._poll(payload["urls"]["get"]) + if payload.get("status") == "failed": + raise RuntimeError(f"Replicate OCR failed: {payload.get('error')}") + return normalize_replicate_paddleocr_response(payload) + + def _poll(self, url: str) -> dict[str, Any]: + deadline = time.monotonic() + self.timeout + while time.monotonic() < deadline: + response = requests.get(url, headers={"Authorization": f"Bearer {self.token}"}, timeout=30) + response.raise_for_status() + payload = response.json() + if payload.get("status") in {"succeeded", "successful"}: + return payload + if payload.get("status") == "failed": + raise RuntimeError(f"Replicate OCR failed: {payload.get('error')}") + time.sleep(2) + raise TimeoutError("Timed out waiting for Replicate OCR prediction") + + +class OCRSpaceClient(OCRClient): + def __init__(self, api_key: str, timeout: int = 120) -> None: + super().__init__(provider="ocr_space", is_configured=True) + self.api_key = api_key + self.timeout = timeout + + def recognize(self, image_path: Path) -> OCRResult: + with Path(image_path).open("rb") as handle: + response = requests.post( + "https://api.ocr.space/parse/image", + headers={"apikey": self.api_key}, + data={"language": "chs", "isOverlayRequired": "true", 
"OCREngine": "2"}, + files={"file": (Path(image_path).name, handle)}, + timeout=self.timeout, + ) + response.raise_for_status() + payload = response.json() + if payload.get("IsErroredOnProcessing"): + raise RuntimeError(f"OCR.space failed: {payload.get('ErrorMessage')}") + return normalize_ocr_space_response(payload) + + +class HuggingFaceOCRClient(OCRClient): + def __init__(self, token: str, model: str | None = None, timeout: int = 120) -> None: + super().__init__(provider="hf_ocr", is_configured=True) + self.token = token + self.model = model or os.getenv("SCREENCODER_HF_OCR_MODEL", "stepfun-ai/GOT-OCR-2.0-hf") + self.timeout = timeout + + def recognize(self, image_path: Path) -> OCRResult: + mime = mimetypes.guess_type(str(image_path))[0] or "image/png" + response = requests.post( + f"https://api-inference.huggingface.co/models/{self.model}", + headers={"Authorization": f"Bearer {self.token}", "Content-Type": mime}, + data=Path(image_path).read_bytes(), + timeout=self.timeout, + ) + response.raise_for_status() + payload = response.json() + text = "" + if isinstance(payload, list) and payload and isinstance(payload[0], Mapping): + text = str(payload[0].get("generated_text", payload[0].get("text", ""))).strip() + elif isinstance(payload, Mapping): + text = str(payload.get("generated_text", payload.get("text", json.dumps(payload, ensure_ascii=False)))).strip() + elif isinstance(payload, str): + text = payload.strip() + return OCRResult(provider="hf_ocr", text=text, blocks=[], raw={"response": payload}) + + +def build_ocr_client(provider: str | None) -> OCRClient | None: + provider = (provider or "none").strip().lower().replace("-", "_") + if provider in {"", "none", "off", "disabled"}: + return None + if provider == "replicate_paddleocr": + token = os.getenv("REPLICATE_API_TOKEN") + if not token: + return ManualGateOCRClient(provider, "Set REPLICATE_API_TOKEN to enable hosted PaddleOCR via Replicate.") + return ReplicatePaddleOCRClient(token) + if provider == 
"ocr_space": + api_key = os.getenv("OCR_SPACE_API_KEY") + if not api_key: + return ManualGateOCRClient(provider, "Set OCR_SPACE_API_KEY to enable OCR.space fallback OCR.") + return OCRSpaceClient(api_key) + if provider in {"hf", "hf_ocr", "huggingface"}: + token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") + if not token: + return ManualGateOCRClient("hf_ocr", "Set HF_TOKEN to enable Hugging Face hosted open-model OCR.") + return HuggingFaceOCRClient(token) + raise ValueError(f"Unsupported OCR provider: {provider}") diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..1021641 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +testpaths = tests +norecursedirs = post-training .git .venv venv __pycache__ .hypothesis .pytest_cache data tmp +python_files = test_*.py +addopts = -q diff --git a/schema_to_html.py b/schema_to_html.py new file mode 100644 index 0000000..8460657 --- /dev/null +++ b/schema_to_html.py @@ -0,0 +1,451 @@ +"""Render asset-aware screen schemas as standalone HTML previews.""" + +from __future__ import annotations + +import argparse +import json +import re +from dataclasses import asdict, is_dataclass +from html import escape +from pathlib import Path +from typing import Any, Mapping, Sequence + + +def _schema_to_dict(schema: Any) -> dict[str, Any]: + if isinstance(schema, Mapping): + return dict(schema) + + try: + from component_schema import screen_schema_to_dict + except ImportError: + screen_schema_to_dict = None + + if screen_schema_to_dict is not None: + return screen_schema_to_dict(schema) + if is_dataclass(schema): + return asdict(schema) + return { + "page_type": getattr(schema, "page_type", "unknown"), + "viewport": getattr(schema, "viewport", {}), + "theme_hint": getattr(schema, "theme_hint", ""), + "template_hint": getattr(schema, "template_hint", ""), + "regions": getattr(schema, "regions", []), + "reference_handoff": getattr(schema, "reference_handoff", None), + } + + +def 
_load_schema(path: str | Path) -> Any: + try: + from component_schema import load_screen_schema + except ImportError: + with Path(path).open("r", encoding="utf-8") as handle: + return json.load(handle) + return load_screen_schema(path) + + +def _slug(value: Any) -> str: + text = str(value or "").strip().lower() + text = re.sub(r"[^a-z0-9_-]+", "-", text) + return text.strip("-") or "unknown" + + +def _attr(value: Any) -> str: + return escape(str(value), quote=True) + + +def _region_bbox(region: Mapping[str, Any]) -> tuple[float, float, float, float]: + bbox = region.get("bbox") or [0, 0, 0, 0] + if len(bbox) != 4: + return 0.0, 0.0, 0.0, 0.0 + x1, y1, x2, y2 = (float(item or 0) for item in bbox) + return x1, y1, max(0.0, x2 - x1), max(0.0, y2 - y1) + + +def _viewport(schema_dict: Mapping[str, Any]) -> tuple[int, int]: + viewport = schema_dict.get("viewport") or {} + if is_dataclass(viewport): + viewport = asdict(viewport) + width = int(viewport.get("width") or 820) + height = int(viewport.get("height") or 1600) + return max(1, width), max(1, height) + + +def _normalise_region(region: Any) -> dict[str, Any]: + if is_dataclass(region): + return asdict(region) + if isinstance(region, Mapping): + return dict(region) + return { + "id": getattr(region, "id", ""), + "label": getattr(region, "label", ""), + "bbox": getattr(region, "bbox", [0, 0, 0, 0]), + "prefab": getattr(region, "prefab", ""), + "content_priority": getattr(region, "content_priority", ""), + "content": getattr(region, "content", {}), + "asset_roles": getattr(region, "asset_roles", []), + } + + +def _normalise_role(role: Any) -> dict[str, Any]: + if is_dataclass(role): + return asdict(role) + if isinstance(role, Mapping): + return dict(role) + return { + "role": getattr(role, "role", ""), + "strategy": getattr(role, "strategy", ""), + "crop_id": getattr(role, "crop_id", None), + "source_bbox": getattr(role, "source_bbox", None), + "notes": getattr(role, "notes", ""), + } + + +def _render_content(content: 
Any) -> str: + if not content: + return "" + if not isinstance(content, Mapping): + return f'

{escape(str(content))}

' + + parts: list[str] = [] + for key, value in content.items(): + if value is None or value == "": + continue + class_name = f"content-{_slug(key)}" + label = str(key).replace("_", " ") + if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)): + items = "".join(f"
  • {escape(str(item))}
  • " for item in value) + parts.append( + f'
    ' + f'{escape(label)}
      {items}
    ' + ) + elif isinstance(value, Mapping): + nested = ", ".join(f"{k}: {v}" for k, v in value.items()) + parts.append( + f'
    ' + f'{escape(label)}' + f'{escape(nested)}
    ' + ) + else: + parts.append( + f'
    ' + f'{escape(label)}' + f'{escape(str(value))}
    ' + ) + return "\n".join(parts) + + +def _render_role_layers(roles: list[dict[str, Any]]) -> str: + layers: list[str] = [] + for role in roles: + role_name = role.get("role") or "unknown-role" + strategy = role.get("strategy") or "unknown-strategy" + style = "" + if strategy == "crop-source" and role.get("crop_id"): + style = f' style="background-image:url({_attr(role.get("crop_id"))})"' + layers.append( + f'' + ) + return "\n".join(layers) + + +def _optional_attr(name: str, value: Any) -> str: + if value is None or value == "": + return "" + return f' {name}="{_attr(value)}"' + + +def _render_region(region: Mapping[str, Any], index: int) -> str: + x, y, width, height = _region_bbox(region) + region_id = region.get("id") or f"region-{index + 1}" + label = region.get("label") or region_id + prefab = region.get("prefab") or "unmapped-prefab" + priority = region.get("content_priority") or "unprioritized" + roles = [_normalise_role(role) for role in region.get("asset_roles") or []] + role_names = [role.get("role") or "unknown-role" for role in roles] + strategy_names = [role.get("strategy") or "unknown-strategy" for role in roles] + classes = [ + "region", + f"region-{_slug(region_id)}", + f"prefab-{_slug(prefab)}", + f"priority-{_slug(priority)}", + *(f"has-{_slug(role)}" for role in role_names), + ] + style = f"left:{x:.2f}px;top:{y:.2f}px;width:{width:.2f}px;height:{height:.2f}px;" + role_chips = "".join( + f'' + f"{escape(role_names[i])}" + for i in range(len(role_names)) + ) + + return f""" +
    + {_render_role_layers(roles)} +
    +
    + {escape(str(prefab))} + {escape(str(label))} +
    +
    + {_render_content(region.get("content") or {})} +
    +
    {role_chips}
    +
    +
    """ + + +def _base_css(width: int, height: int) -> str: + return f""" + :root {{ + --paper: #d7b980; + --paper-light: #f0d59b; + --ink: #2c1a14; + --brass: #a97825; + --brass-light: #e2b557; + --seal: #9f1f18; + --shadow: rgba(28, 14, 8, 0.36); + }} + * {{ box-sizing: border-box; }} + body {{ + margin: 0; + min-height: 100vh; + display: grid; + place-items: start center; + padding: 24px; + color: var(--ink); + background: + radial-gradient(circle at 20% 10%, rgba(178, 40, 32, 0.20), transparent 22rem), + radial-gradient(circle at 80% 80%, rgba(166, 120, 38, 0.24), transparent 26rem), + linear-gradient(135deg, #18110f, #352017 58%, #140d0b); + font-family: Georgia, "Times New Roman", serif; + }} + .screen-preview {{ + position: relative; + width: {width}px; + height: {height}px; + overflow: hidden; + background: + radial-gradient(circle at 50% -10%, rgba(226, 181, 87, 0.22), transparent 30%), + linear-gradient(180deg, #332018 0%, #201411 100%); + border: 1px solid rgba(226, 181, 87, 0.34); + box-shadow: 0 24px 80px rgba(0, 0, 0, 0.45); + transform-origin: top center; + }} + .screen-preview::after {{ + content: ""; + position: absolute; + inset: 0; + pointer-events: none; + background-image: + linear-gradient(rgba(255,255,255,0.035) 1px, transparent 1px), + linear-gradient(90deg, rgba(255,255,255,0.025) 1px, transparent 1px); + background-size: 18px 18px; + mix-blend-mode: overlay; + }} + .region {{ + position: absolute; + display: flex; + padding: 12px; + border: 1px solid rgba(239, 205, 135, 0.36); + background: rgba(86, 53, 32, 0.58); + box-shadow: 0 10px 28px var(--shadow), inset 0 0 0 1px rgba(255,255,255,0.08); + overflow: hidden; + }} + .region-body {{ + position: relative; + z-index: 2; + width: 100%; + display: flex; + flex-direction: column; + gap: 8px; + justify-content: center; + }} + .region-meta {{ + display: flex; + flex-wrap: wrap; + gap: 6px; + align-items: center; + text-transform: uppercase; + letter-spacing: 0.08em; + font-size: 11px; + 
opacity: 0.88; + }} + .prefab-label, .region-label, .role-chip {{ + padding: 3px 7px; + border-radius: 999px; + background: rgba(35, 20, 15, 0.62); + color: #f2dca9; + }} + .region-label {{ background: rgba(159, 31, 24, 0.42); }} + .region-content {{ + display: grid; + gap: 5px; + font-size: clamp(12px, 2.2vw, 20px); + line-height: 1.18; + text-shadow: 0 1px 0 rgba(255,255,255,0.14); + }} + .content-key {{ + display: block; + font-size: 0.58em; + letter-spacing: 0.08em; + text-transform: uppercase; + opacity: 0.62; + }} + .content-value {{ font-weight: 700; }} + .content-primary-title .content-value, + .content-title .content-value {{ + font-size: 1.5em; + color: #2d160f; + }} + .asset-roles {{ + display: flex; + flex-wrap: wrap; + gap: 5px; + margin-top: auto; + font-size: 10px; + }} + .strategy-reference-asset {{ border: 1px solid rgba(226, 181, 87, 0.55); }} + .strategy-crop-source {{ border: 1px solid rgba(132, 174, 116, 0.65); }} + .strategy-manual-art {{ border: 1px dashed rgba(203, 72, 56, 0.75); }} + .has-paper-panel, .prefab-paper-panel {{ + background: + linear-gradient(90deg, rgba(96, 53, 28, 0.15), transparent 10%, transparent 90%, rgba(96, 53, 28, 0.16)), + radial-gradient(circle at 24% 20%, rgba(255,255,255,0.20), transparent 22%), + linear-gradient(135deg, var(--paper-light), var(--paper)); + border-color: rgba(91, 47, 25, 0.48); + }} + .has-brass-plate, .prefab-brass-plate {{ + background: + linear-gradient(160deg, rgba(255,255,255,0.30), transparent 20%), + linear-gradient(180deg, var(--brass-light), var(--brass) 52%, #6a4318); + border: 2px solid rgba(66, 39, 14, 0.68); + color: #24130b; + }} + .has-ornate-border, .prefab-ornate-action-plaque {{ + border: 4px double rgba(226, 181, 87, 0.82); + border-radius: 18px; + }} + .has-ornate-border::before, .prefab-ornate-action-plaque::before {{ + content: ""; + position: absolute; + inset: 7px; + border: 1px solid rgba(255, 231, 172, 0.38); + border-radius: 12px; + pointer-events: none; + }} + 
.has-red-seal::after, .prefab-red-seal::after {{ + content: ""; + position: absolute; + right: 12px; + bottom: 12px; + width: min(54px, 22%); + aspect-ratio: 1; + border-radius: 50%; + background: + radial-gradient(circle, transparent 42%, rgba(255,255,255,0.16) 43% 49%, transparent 50%), + radial-gradient(circle at 45% 42%, #b72b22, var(--seal)); + box-shadow: 0 2px 8px rgba(77, 10, 8, 0.38); + opacity: 0.82; + }} + .has-corner-rivet::before {{ + box-shadow: + 10px 10px 0 0 rgba(99, 57, 18, 0.75), + calc(100% - 10px) 10px 0 0 rgba(99, 57, 18, 0.75), + 10px calc(100% - 10px) 0 0 rgba(99, 57, 18, 0.75), + calc(100% - 10px) calc(100% - 10px) 0 0 rgba(99, 57, 18, 0.75); + }} + .has-portrait-frame, .prefab-portrait-frame {{ + border-radius: 22px; + background: radial-gradient(ellipse at 50% 35%, rgba(240, 213, 155, 0.35), rgba(39, 22, 17, 0.7)); + }} + .has-ink-divider .region-content {{ + border-top: 1px solid rgba(44, 26, 20, 0.42); + border-bottom: 1px solid rgba(44, 26, 20, 0.24); + padding-block: 8px; + }} + .asset-layer {{ + position: absolute; + inset: 0; + pointer-events: none; + z-index: 1; + }} + .strategy-crop-source {{ + background-size: cover; + background-position: center; + opacity: 0.34; + mix-blend-mode: multiply; + }} + .asset-crop-source {{ + background: repeating-linear-gradient(135deg, rgba(118, 157, 103, 0.15) 0 8px, transparent 8px 16px); + }} + .asset-manual-art {{ + background: repeating-linear-gradient(45deg, rgba(159, 31, 24, 0.12) 0 7px, transparent 7px 15px); + }} + @media (max-width: {width + 48}px) {{ + body {{ padding: 0; place-items: start; }} + .screen-preview {{ + transform: scale(calc(100vw / {width})); + }} + }} + """ + + +def render_preview_html(schema: Any) -> str: + """Return a standalone preview HTML document for a ScreenSchema-like object.""" + schema_dict = _schema_to_dict(schema) + width, height = _viewport(schema_dict) + regions = [_normalise_region(region) for region in schema_dict.get("regions") or []] + page_type = 
schema_dict.get("page_type") or "unknown-page" + theme_hint = schema_dict.get("theme_hint") or "" + template_hint = schema_dict.get("template_hint") or "" + rendered_regions = "\n".join(_render_region(region, i) for i, region in enumerate(regions)) + + return f""" + + + + + {escape(str(page_type))} preview + + + +
    + {rendered_regions} +
    + + +""" + + +def write_preview_html(schema: Any, output_path: str | Path) -> None: + """Write a standalone preview HTML document.""" + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(render_preview_html(schema), encoding="utf-8") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Render screen-schema.json to standalone preview.html.") + parser.add_argument("--schema", required=True, help="Path to screen-schema.json.") + parser.add_argument("--output", required=True, help="Path to write preview.html.") + args = parser.parse_args() + + schema = _load_schema(args.schema) + write_preview_html(schema, args.output) + + +if __name__ == "__main__": + main() diff --git a/screen_to_html.py b/screen_to_html.py new file mode 100644 index 0000000..2ad58d4 --- /dev/null +++ b/screen_to_html.py @@ -0,0 +1,172 @@ +import argparse +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class PipelineConfig: + image: Path + work_dir: Path = Path("data/run") + api_key: Path = Path("doubao_api.txt") + model: str = "doubao-1.5-thinking-vision-pro-250428" + + +@dataclass(frozen=True) +class PipelineStep: + name: str + command: list[str] + + +def _stem(path: Path) -> str: + return path.stem + + +def build_steps(config: PipelineConfig) -> list[PipelineStep]: + image = Path(config.image) + work_dir = Path(config.work_dir) + api_key = Path(config.api_key) + stem = _stem(image) + + region_bboxes = work_dir / f"{stem}_bboxes.json" + region_debug = work_dir / f"{stem}_with_bboxes.png" + gray_html = work_dir / f"{stem}_layout.html" + gray_bboxes = work_dir / f"{stem}_gray_bboxes.json" + uied_json = work_dir / "ip" / f"{stem}.json" + mapping_json = work_dir / f"mapping_full_{stem}.json" + mapping_debug = work_dir / f"overlay_{stem}.png" + final_html = work_dir / f"{stem}_layout_final.html" + + return [ + PipelineStep( + "detect layout regions", + [ + 
sys.executable, + "block_parsor.py", + "--image", + str(image), + "--api-key", + str(api_key), + "--json", + str(region_bboxes), + "--debug", + str(region_debug), + ], + ), + PipelineStep( + "generate gray html", + [ + sys.executable, + "html_generator.py", + "--model", + config.model, + "--image", + str(image), + "--bboxes", + str(region_bboxes), + "--output", + str(gray_html), + "--api-key", + str(api_key), + ], + ), + PipelineStep( + "detect gray placeholders", + [ + sys.executable, + "image_box_detection.py", + "--html", + str(gray_html), + "--screenshot", + str(image), + "--out", + str(work_dir), + "--json", + str(gray_bboxes), + ], + ), + PipelineStep( + "detect source UI elements", + [ + sys.executable, + "UIED/run_single.py", + "--image", + str(image), + "--output-root", + str(work_dir), + "--output-json", + str(uied_json), + ], + ), + PipelineStep( + "map placeholders to source elements", + [ + sys.executable, + "mapping.py", + "--gray", + str(gray_bboxes), + "--uied", + str(uied_json), + "--out", + str(mapping_json), + "--debug", + str(mapping_debug), + "--debug-src", + str(image), + ], + ), + PipelineStep( + "replace placeholders", + [ + sys.executable, + "image_replacer.py", + "--mapping", + str(mapping_json), + "--uied", + str(uied_json), + "--original-image", + str(image), + "--gray-html", + str(gray_html), + "--output-html", + str(final_html), + ], + ), + ] + + +def run_pipeline(config: PipelineConfig, dry_run: bool = False) -> None: + config.work_dir.mkdir(parents=True, exist_ok=True) + for step in build_steps(config): + print(f"\n=== {step.name} ===") + print(" ".join(step.command)) + if not dry_run: + subprocess.run(step.command, check=True) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a UI screenshot to HTML with the ScreenCoder pipeline.") + parser.add_argument("--image", required=True, type=Path, help="Source UI screenshot.") + parser.add_argument("--work-dir", default=Path("data/run"), 
type=Path, help="Directory for intermediate files and final HTML.") + parser.add_argument("--api-key", default=Path("doubao_api.txt"), type=Path, help="Doubao API key file.") + parser.add_argument("--model", default="doubao-1.5-thinking-vision-pro-250428", help="Doubao vision model name.") + parser.add_argument("--dry-run", action="store_true", help="Print commands without executing them.") + return parser.parse_args() + + +def main() -> None: + args = parse_args() + run_pipeline( + PipelineConfig( + image=args.image, + work_dir=args.work_dir, + api_key=args.api_key, + model=args.model, + ), + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/screen_to_schema.py b/screen_to_schema.py new file mode 100644 index 0000000..768c1bb --- /dev/null +++ b/screen_to_schema.py @@ -0,0 +1,384 @@ +import argparse +from pathlib import Path +import json +import re +import sys + +from PIL import Image + +from asset_roles import default_roles_for_prefab, default_strategy_for_role +from component_schema import ( + AssetRoleBinding, + ReferenceHandoff, + RegionSchema, + ScreenSchema, + Viewport, + save_screen_schema, +) + + +DEFAULT_REGION_LAYOUTS = { + "title-stack": (0.10, 0.05, 0.90, 0.22), + "primary-actions": (0.08, 0.26, 0.92, 0.52), + "recent-run-panel": (0.10, 0.55, 0.90, 0.72), + "quick-links": (0.06, 0.76, 0.94, 0.94), +} + +LEGACY_REGION_TO_PREFAB = { + "header": "ritual-title-stack", + "title-stack": "ritual-title-stack", + "sidebar": "ornate-action-plaque", + "primary-actions": "ornate-action-plaque", + "main content": "journey-status-panel", + "main-content": "journey-status-panel", + "recent-run-panel": "journey-status-panel", + "navigation": "relic-quick-card", + "quick-links": "relic-quick-card", + "corner-system-actions": "relic-quick-card", + "background-ornament": "ritual-background", +} + +REGION_CONTENT_PRIORITIES = { + "title-stack": "L0-branding", + "primary-actions": "L0-primary-action", + "recent-run-panel": "L1-status", + 
"quick-links": "L2-shortcuts", + "corner-system-actions": "L2-system", + "background-ornament": "L3-decoration", +} + +REGION_LABELS = { + "title-stack": "Title area", + "primary-actions": "Primary actions", + "recent-run-panel": "Run status panel", + "quick-links": "Quick links", + "corner-system-actions": "System actions", + "background-ornament": "Background ornament", +} + + +def _slugify(value: str) -> str: + return value.strip().lower().replace("_", "-").replace(" ", "-") + + +def _bbox_from_fraction(fraction_bbox: tuple[float, float, float, float], width: int, height: int) -> list[int]: + x1, y1, x2, y2 = fraction_bbox + return [ + round(x1 * width), + round(y1 * height), + round(x2 * width), + round(y2 * height), + ] + + +def _bbox_to_pixels( + bbox: list[int] | tuple[int, int, int, int], + viewport: Viewport, + bbox_format: str, +) -> list[int]: + if len(bbox) != 4: + raise ValueError(f"Expected bbox with 4 coordinates, got {bbox!r}") + + coords = [int(value) for value in bbox] + if bbox_format == "pixel": + return coords + if bbox_format == "normalized": + return [ + round(coords[0] * viewport.width / 1000), + round(coords[1] * viewport.height / 1000), + round(coords[2] * viewport.width / 1000), + round(coords[3] * viewport.height / 1000), + ] + if bbox_format != "auto": + raise ValueError(f"Unsupported bbox format: {bbox_format}") + + x1, y1, x2, y2 = coords + if x2 > viewport.width or y2 > viewport.height: + return _bbox_to_pixels(coords, viewport, "normalized") + return coords + + +def _prefab_for_region(region_id: str) -> str: + return LEGACY_REGION_TO_PREFAB.get(region_id, region_id) + + +def _asset_bindings_for_prefab(prefab: str) -> list[AssetRoleBinding]: + return [ + AssetRoleBinding(role=role, strategy=default_strategy_for_role(role)) + for role in default_roles_for_prefab(prefab) + ] + + +def _safe_filename(value: str) -> str: + return re.sub(r"[^a-z0-9_.-]+", "-", value.lower()).strip("-") or "asset" + + +def _make_region(region_id: str, bbox: 
list[int], prefab: str | None = None, source: str = "heuristic") -> RegionSchema: + semantic_id = _slugify(region_id) + component_prefab = prefab or _prefab_for_region(semantic_id) + content = { + "label": REGION_LABELS.get(semantic_id, region_id), + "source": source, + } + return RegionSchema( + id=semantic_id, + label=REGION_LABELS.get(semantic_id, region_id), + bbox=bbox, + prefab=component_prefab, + content_priority=REGION_CONTENT_PRIORITIES.get(semantic_id, "L2-supporting"), + content=content, + asset_roles=_asset_bindings_for_prefab(component_prefab), + ) + + +def _clamp_bbox(bbox: list[float], width: int, height: int) -> tuple[int, int, int, int]: + x1, y1, x2, y2 = [round(float(value)) for value in bbox] + if x2 <= 0 or y2 <= 0 or x1 >= width or y1 >= height: + return 0, 0, max(1, min(width, 1)), max(1, min(height, 1)) + + x1 = max(0, min(width, x1)) + x2 = max(0, min(width, x2)) + y1 = max(0, min(height, y1)) + y2 = max(0, min(height, y2)) + if x2 <= x1: + x2 = min(width, x1 + 1) + if y2 <= y1: + y2 = min(height, y1 + 1) + return x1, y1, x2, y2 + + +def _bbox_intersection_area(a: list[float], b: list[float]) -> float: + ax1, ay1, ax2, ay2 = [float(value) for value in a] + bx1, by1, bx2, by2 = [float(value) for value in b] + width = max(0.0, min(ax2, bx2) - max(ax1, bx1)) + height = max(0.0, min(ay2, by2) - max(ay1, by1)) + return width * height + + +def apply_ocr_to_schema(schema: ScreenSchema, ocr_result) -> ScreenSchema: + """Attach normalized OCR text blocks to intersecting schema regions.""" + + for region in schema.regions: + matched_blocks = [] + matched_text: list[str] = [] + for block in getattr(ocr_result, "blocks", []) or []: + bbox = getattr(block, "bbox", None) + text = str(getattr(block, "text", "")).strip() + if not bbox or not text: + continue + if _bbox_intersection_area(region.bbox, bbox) <= 0: + continue + matched_text.append(text) + matched_blocks.append( + { + "text": text, + "bbox": [float(value) for value in bbox], + "confidence": 
getattr(block, "confidence", None), + } + ) + if matched_text: + region.content["ocr_text"] = matched_text + region.content["ocr_blocks"] = matched_blocks + region.content["ocr_provider"] = getattr(ocr_result, "provider", "unknown") + return schema + + +def extract_crop_source_assets( + schema: ScreenSchema, + image_path: Path, + output_dir: Path, + crop_id_prefix: str = "asset-crops", +) -> list[Path]: + """Crop source-image regions for bindings that request the crop-source strategy.""" + + output_dir.mkdir(parents=True, exist_ok=True) + written: list[Path] = [] + with Image.open(image_path) as image: + width, height = image.size + for region in schema.regions: + for index, binding in enumerate(region.asset_roles): + if binding.strategy != "crop-source": + continue + x1, y1, x2, y2 = _clamp_bbox(binding.source_bbox or region.bbox, width, height) + filename = f"{_safe_filename(region.id)}-{_safe_filename(binding.role)}-{index}.png" + crop_path = output_dir / filename + image.crop((x1, y1, x2, y2)).save(crop_path) + binding.crop_id = str(Path(crop_id_prefix) / filename) + binding.source_bbox = [x1, y1, x2, y2] + written.append(crop_path) + return written + + +def _load_bbox_regions(bboxes_path: Path, viewport: Viewport, bbox_format: str) -> list[RegionSchema]: + with bboxes_path.open("r", encoding="utf-8") as file: + raw_bboxes = json.load(file) + + if not isinstance(raw_bboxes, dict): + raise ValueError(f"Expected a bbox mapping in {bboxes_path}, got {type(raw_bboxes).__name__}") + + regions = [] + for name, bbox in raw_bboxes.items(): + semantic_id = _slugify(name) + regions.append(_make_region(semantic_id, _bbox_to_pixels(bbox, viewport, bbox_format), source="layout-detection")) + return regions + + +def _heuristic_regions(viewport: Viewport) -> list[RegionSchema]: + return [ + _make_region(region_id, _bbox_from_fraction(fraction_bbox, viewport.width, viewport.height)) + for region_id, fraction_bbox in DEFAULT_REGION_LAYOUTS.items() + ] + + +def 
_ensure_required_regions(regions: list[RegionSchema], viewport: Viewport) -> list[RegionSchema]: + present = {region.id for region in regions} + required = ["title-stack", "primary-actions", "recent-run-panel", "quick-links"] + completed = list(regions) + for region_id in required: + if region_id not in present: + completed.append( + _make_region( + region_id, + _bbox_from_fraction(DEFAULT_REGION_LAYOUTS[region_id], viewport.width, viewport.height), + ) + ) + return completed + + +def _reference_handoff_for(regions: list[RegionSchema]) -> ReferenceHandoff: + prefabs = sorted({region.prefab for region in regions}) + missing_assets = sorted( + { + binding.role + for region in regions + for binding in region.asset_roles + if binding.strategy in {"reference-asset", "manual-art"} + } + ) + return ReferenceHandoff( + composition="horror-ornamental", + prefabs=prefabs, + missing_assets=missing_assets, + ) + + +def build_screen_schema( + image_path: Path, + bboxes_path: Path | None = None, + bbox_format: str = "auto", + page_type: str = "game-main-menu", + theme_hint: str = "japanese-horror", + template_hint: str = "ornamental-mobile", +) -> ScreenSchema: + with Image.open(image_path) as image: + width, height = image.size + + viewport = Viewport(width=width, height=height) + if bboxes_path: + regions = _load_bbox_regions(bboxes_path, viewport, bbox_format) + regions = _ensure_required_regions(regions, viewport) + else: + regions = _heuristic_regions(viewport) + + return ScreenSchema( + page_type=page_type, + viewport=viewport, + theme_hint=theme_hint, + template_hint=template_hint, + regions=regions, + reference_handoff=_reference_handoff_for(regions), + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Build an asset-aware ScreenCoder schema for a UI screenshot.") + parser.add_argument("--image", required=True, type=Path, help="Source screenshot path.") + parser.add_argument("--bboxes", type=Path, default=None, help="Optional 
region bbox JSON from block_parsor.py.") + parser.add_argument( + "--bbox-format", + choices=["auto", "normalized", "pixel"], + default="auto", + help="Coordinate format for --bboxes. Auto treats boxes outside the viewport as normalized 0-1000 boxes.", + ) + parser.add_argument("--output", type=Path, default=None, help="Output screen-schema.json path.") + parser.add_argument("--preview", type=Path, default=None, help="Optional standalone preview.html output path.") + parser.add_argument("--handoff", type=Path, default=None, help="Optional handoff.md output path.") + parser.add_argument("--reference-json", type=Path, default=None, help="Optional web-ui-reference handoff JSON path.") + parser.add_argument("--asset-crops", type=Path, default=None, help="Directory for crop-source asset snippets.") + parser.add_argument("--no-crops", action="store_true", help="Do not extract crop-source assets.") + parser.add_argument( + "--ocr-provider", + default="none", + choices=["none", "replicate_paddleocr", "hf_ocr", "ocr_space"], + help="Optional hosted OCR provider. 
replicate_paddleocr uses an open-source PaddleOCR model on Replicate and requires REPLICATE_API_TOKEN.", + ) + parser.add_argument("--ocr-output", type=Path, default=None, help="Optional normalized OCR JSON output path.") + parser.add_argument("--page-type", default="game-main-menu", help="Schema page_type.") + parser.add_argument("--theme-hint", default="japanese-horror", help="Schema theme_hint.") + parser.add_argument("--template-hint", default="ornamental-mobile", help="Schema template_hint.") + return parser.parse_args() + + +def main() -> None: + args = parse_args() + output_path = args.output or Path("data/run") / args.image.stem / "screen-schema.json" + + schema = build_screen_schema( + image_path=args.image, + bboxes_path=args.bboxes, + bbox_format=args.bbox_format, + page_type=args.page_type, + theme_hint=args.theme_hint, + template_hint=args.template_hint, + ) + + if args.ocr_provider != "none": + from ocr_client import build_ocr_client + + ocr_client = build_ocr_client(args.ocr_provider) + if ocr_client is not None and ocr_client.is_configured: + ocr_result = ocr_client.recognize(args.image) + apply_ocr_to_schema(schema, ocr_result) + if args.ocr_output: + args.ocr_output.parent.mkdir(parents=True, exist_ok=True) + args.ocr_output.write_text( + json.dumps( + { + "provider": ocr_result.provider, + "text": ocr_result.text, + "blocks": [block.__dict__ for block in ocr_result.blocks], + }, + ensure_ascii=False, + indent=2, + ) + + "\n", + encoding="utf-8", + ) + elif ocr_client is not None: + note = f"OCR manual gate: {ocr_client.manual_gate}" + if schema.reference_handoff is not None: + schema.reference_handoff.notes = note + print(note, file=sys.stderr) + + if not args.no_crops: + crop_dir = args.asset_crops or output_path.parent / "asset-crops" + extract_crop_source_assets(schema, args.image, crop_dir) + + save_screen_schema(schema, output_path) + + if args.preview: + from schema_to_html import write_preview_html + + write_preview_html(schema, 
args.preview) + if args.handoff: + from handoff_writer import write_handoff + + write_handoff(schema, args.handoff, source_image=args.image) + if args.reference_json: + from handoff_writer import write_reference_json + + write_reference_json(schema, args.reference_json) + + +if __name__ == "__main__": + main() diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..86a1a5a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,7 @@ +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) diff --git a/tests/test_asset_aware_schema.py b/tests/test_asset_aware_schema.py new file mode 100644 index 0000000..b600718 --- /dev/null +++ b/tests/test_asset_aware_schema.py @@ -0,0 +1,162 @@ +import json +from pathlib import Path + +from PIL import Image + +from asset_roles import default_roles_for_prefab, default_strategy_for_role, normalize_asset_role +from component_schema import load_screen_schema, screen_schema_from_dict, screen_schema_to_dict +from handoff_writer import build_reference_handoff, render_handoff_markdown +from schema_to_html import render_preview_html +from screen_to_schema import build_screen_schema, extract_crop_source_assets + + +def test_asset_role_defaults_cover_core_prefabs(): + assert normalize_asset_role("paper_panel") == "paper-panel" + assert "paper-panel" in default_roles_for_prefab("ornate-action-plaque") + assert default_strategy_for_role("quick-card-illustration") == "crop-source" + + +def test_screen_schema_round_trips_asset_roles(tmp_path): + schema_path = tmp_path / "screen-schema.json" + data = { + "page_type": "game-main-menu", + "viewport": {"width": 820, "height": 1600}, + "theme_hint": "japanese-horror", + "template_hint": "ornamental-mobile", + "regions": [ + { + "id": "primary-actions", + "label": "Primary actions", + "prefab": "ornate-action-plaque", + "bbox": [64, 410, 756, 840], + "content_priority": 
"L0-primary-action", + "content": {"primary_title": "继续旅程"}, + "asset_roles": [ + {"role": "paper-panel", "strategy": "reference-asset"}, + {"role": "red-seal", "strategy": "css-procedural"}, + ], + } + ], + } + + schema = screen_schema_from_dict(data) + schema_path.write_text(json.dumps(screen_schema_to_dict(schema), ensure_ascii=False), encoding="utf-8") + loaded = load_screen_schema(schema_path) + + assert loaded.regions[0].label == "Primary actions" + assert loaded.regions[0].content_priority == "L0-primary-action" + assert screen_schema_to_dict(loaded)["regions"][0]["asset_roles"][0]["role"] == "paper-panel" + + +def test_build_screen_schema_from_legacy_bboxes_adds_required_regions(tmp_path): + image_path = tmp_path / "menu.png" + bbox_path = tmp_path / "bboxes.json" + Image.new("RGB", (820, 1600), color=(20, 12, 10)).save(image_path) + bbox_path.write_text(json.dumps({"header": [100, 50, 900, 200]}), encoding="utf-8") + + schema = build_screen_schema(image_path, bbox_path, bbox_format="normalized") + region_ids = {region.id for region in schema.regions} + + assert schema.viewport.width == 820 + assert schema.regions[0].prefab == "ritual-title-stack" + assert schema.regions[0].content["source"] == "layout-detection" + assert {"title-stack", "primary-actions", "recent-run-panel", "quick-links"}.issubset(region_ids) + + +def test_build_screen_schema_marks_default_regions_as_heuristic(tmp_path): + image_path = tmp_path / "menu.png" + Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path) + + schema = build_screen_schema(image_path) + + assert {region.content["source"] for region in schema.regions} == {"heuristic"} + + +def test_preview_and_handoff_render_semantic_assets(tmp_path): + image_path = tmp_path / "menu.png" + Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path) + schema = build_screen_schema(image_path) + crop_paths = extract_crop_source_assets(schema, image_path, tmp_path / "asset-crops") + + html = render_preview_html(schema) + 
handoff = render_handoff_markdown(schema, source_image="data/input/menu.png") + reference = build_reference_handoff(schema) + + assert crop_paths + assert crop_paths[0].exists() + assert 'data-prefab="ornate-action-plaque"' in html + assert "asset-crops/quick-links-quick-card-illustration" in html + assert "paper-panel" in html + assert "web-ui-reference" in handoff + assert "ornate-action-plaque" in reference["prefabs"] + assert "paper-panel" in reference["missing_assets"] + + +def test_preview_and_handoff_accept_plain_dict_schema(): + data = { + "page_type": "game-main-menu", + "viewport": {"width": 320, "height": 640}, + "regions": [ + { + "id": "primary-actions", + "prefab": "ornate-action-plaque", + "bbox": [10, 20, 300, 180], + "asset_roles": [{"role": "paper-panel", "strategy": "reference-asset"}], + } + ], + } + + assert 'data-page-type="game-main-menu"' in render_preview_html(data) + assert build_reference_handoff(data)["prefabs"] == ["ornate-action-plaque"] + + +def test_crop_source_assets_handle_fully_out_of_bounds_bbox(tmp_path): + image_path = tmp_path / "menu.png" + Image.new("RGB", (64, 64), color=(20, 12, 10)).save(image_path) + schema = build_screen_schema(image_path) + crop_binding = schema.regions[-1].asset_roles[2] + crop_binding.source_bbox = [1000, 1000, 1200, 1200] + + crop_paths = extract_crop_source_assets(schema, image_path, tmp_path / "asset-crops") + + assert crop_paths[0].exists() + assert crop_binding.source_bbox == [0, 0, 1, 1] + + +def test_screen_to_schema_cli_writes_complete_preview_package(tmp_path): + import subprocess + import sys + + image_path = tmp_path / "menu.png" + output_dir = tmp_path / "run" / "menu" + Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path) + + result = subprocess.run( + [ + sys.executable, + "screen_to_schema.py", + "--image", + str(image_path), + "--output", + str(output_dir / "screen-schema.json"), + "--preview", + str(output_dir / "index.html"), + "--handoff", + str(output_dir / 
"handoff.md"), + "--reference-json", + str(output_dir / "reference-handoff.json"), + ], + check=True, + cwd=Path(__file__).resolve().parents[1], + capture_output=True, + text=True, + ) + + assert result.stderr == "" + assert (output_dir / "screen-schema.json").exists() + assert (output_dir / "index.html").exists() + assert (output_dir / "handoff.md").exists() + assert (output_dir / "reference-handoff.json").exists() + assert list((output_dir / "asset-crops").glob("*.png")) + assert 'data-page-type="game-main-menu"' in (output_dir / "index.html").read_text(encoding="utf-8") + assert "web-ui-reference" in (output_dir / "handoff.md").read_text(encoding="utf-8") diff --git a/tests/test_ocr_integration.py b/tests/test_ocr_integration.py new file mode 100644 index 0000000..e9822cb --- /dev/null +++ b/tests/test_ocr_integration.py @@ -0,0 +1,112 @@ +from pathlib import Path + +from PIL import Image + +from handoff_writer import render_handoff_markdown +from ocr_client import ( + OCRBlock, + OCRResult, + build_ocr_client, + normalize_ocr_space_response, + normalize_replicate_paddleocr_response, +) +from screen_to_schema import apply_ocr_to_schema, build_screen_schema + + +def test_normalize_replicate_paddleocr_response_returns_text_and_boxes(): + payload = { + "status": "successful", + "output": { + "results": [ + { + "text": "继续旅程", + "score": 0.98, + "box": [[80, 180], [220, 180], [220, 215], [80, 215]], + }, + { + "text": "新的旅程", + "confidence": 0.87, + "bbox": [90, 260, 210, 295], + }, + ] + }, + } + + result = normalize_replicate_paddleocr_response(payload) + + assert result.provider == "replicate_paddleocr" + assert result.text == "继续旅程\n新的旅程" + assert result.blocks[0].bbox == [80, 180, 220, 215] + assert result.blocks[0].confidence == 0.98 + + +def test_normalize_ocr_space_response_keeps_overlay_word_boxes(): + payload = { + "ParsedResults": [ + { + "ParsedText": "设置\n成就\n", + "TextOverlay": { + "Lines": [ + { + "Words": [ + {"WordText": "设置", "Left": 10, "Top": 20, 
"Width": 30, "Height": 12}, + {"WordText": "成就", "Left": 300, "Top": 20, "Width": 30, "Height": 12}, + ] + } + ] + }, + } + ], + "IsErroredOnProcessing": False, + } + + result = normalize_ocr_space_response(payload) + + assert result.provider == "ocr_space" + assert result.text == "设置\n成就" + assert [block.text for block in result.blocks] == ["设置", "成就"] + assert result.blocks[1].bbox == [300, 20, 330, 32] + + +def test_apply_ocr_to_schema_attaches_text_to_intersecting_regions(tmp_path): + image_path = tmp_path / "menu.png" + Image.new("RGB", (400, 800), color=(20, 12, 10)).save(image_path) + schema = build_screen_schema(image_path) + ocr = OCRResult( + provider="test", + text="破宫之十重奏\n继续旅程\n设置", + blocks=[ + OCRBlock(text="破宫之十重奏", bbox=[50, 40, 350, 90], confidence=0.99), + OCRBlock(text="继续旅程", bbox=[80, 250, 260, 290], confidence=0.95), + OCRBlock(text="设置", bbox=[5, 5, 40, 25], confidence=0.7), + ], + ) + + apply_ocr_to_schema(schema, ocr) + by_id = {region.id: region for region in schema.regions} + + assert by_id["title-stack"].content["ocr_text"] == ["破宫之十重奏"] + assert by_id["primary-actions"].content["ocr_text"] == ["继续旅程"] + assert by_id["title-stack"].content["ocr_provider"] == "test" + assert "设置" not in by_id["title-stack"].content["ocr_text"] + + +def test_build_ocr_client_preserves_manual_gate_when_token_missing(monkeypatch): + monkeypatch.delenv("REPLICATE_API_TOKEN", raising=False) + + client = build_ocr_client("replicate_paddleocr") + + assert client.is_configured is False + assert "REPLICATE_API_TOKEN" in client.manual_gate + + +def test_handoff_renders_ocr_manual_gate_note(tmp_path): + image_path = tmp_path / "menu.png" + Image.new("RGB", (360, 640), color=(20, 12, 10)).save(image_path) + schema = build_screen_schema(image_path) + schema.reference_handoff.notes = "OCR manual gate: Set REPLICATE_API_TOKEN to enable hosted PaddleOCR." 
+ + handoff = render_handoff_markdown(schema) + + assert "OCR manual gate" in handoff + assert "REPLICATE_API_TOKEN" in handoff diff --git a/tests/test_optional_provider_imports.py b/tests/test_optional_provider_imports.py new file mode 100644 index 0000000..6eac5e5 --- /dev/null +++ b/tests/test_optional_provider_imports.py @@ -0,0 +1,3 @@ +def test_core_scripts_import_without_all_provider_sdks(): + import block_parsor # noqa: F401 + import html_generator # noqa: F401 diff --git a/tests/test_screen_to_html.py b/tests/test_screen_to_html.py new file mode 100644 index 0000000..b18bb17 --- /dev/null +++ b/tests/test_screen_to_html.py @@ -0,0 +1,50 @@ +import sys +from pathlib import Path + +from screen_to_html import PipelineConfig, build_steps + + +def test_build_steps_uses_consistent_paths_for_input_image(): + config = PipelineConfig( + image=Path("examples/menu.png"), + work_dir=Path("out/menu"), + api_key=Path("keys/doubao.txt"), + model="demo-model", + ) + + steps = build_steps(config) + + assert [step.name for step in steps] == [ + "detect layout regions", + "generate gray html", + "detect gray placeholders", + "detect source UI elements", + "map placeholders to source elements", + "replace placeholders", + ] + assert steps[0].command == [ + sys.executable, + "block_parsor.py", + "--image", + "examples/menu.png", + "--api-key", + "keys/doubao.txt", + "--json", + "out/menu/menu_bboxes.json", + "--debug", + "out/menu/menu_with_bboxes.png", + ] + assert steps[1].command[-8:] == [ + "--image", + "examples/menu.png", + "--bboxes", + "out/menu/menu_bboxes.json", + "--output", + "out/menu/menu_layout.html", + "--api-key", + "keys/doubao.txt", + ] + assert steps[1].command[-2:] == ["--api-key", "keys/doubao.txt"] + assert steps[2].command[-2:] == ["--json", "out/menu/menu_gray_bboxes.json"] + assert "out/menu/ip/menu.json" in steps[3].command + assert steps[-1].command[-2:] == ["--output-html", "out/menu/menu_layout_final.html"] diff --git a/utils.py b/utils.py index 
a7ab403..89898fb 100644 --- a/utils.py +++ b/utils.py @@ -1,8 +1,5 @@ import os import time -from openai import OpenAI -import google.generativeai as genai -from volcenginesdkarkruntime import Ark import base64 import io from PIL import Image, ImageDraw @@ -10,6 +7,14 @@ import numpy as np +def _missing_dependency(package_name, install_name=None): + install_name = install_name or package_name + raise ModuleNotFoundError( + f"Missing optional provider dependency '{package_name}'. " + f"Install it with `pip install {install_name}` before using this provider." + ) + + def encode_image(image): if type(image) == str: try: @@ -227,6 +232,10 @@ def try_ask(self, question, image_encoding=None, verbose=False): class Doubao(Bot): def __init__(self, key_path, patience=3, model="doubao-1.5-thinking-vision-pro-250428") -> None: super().__init__(key_path, patience) + try: + from volcenginesdkarkruntime import Ark + except ModuleNotFoundError: + _missing_dependency("volcenginesdkarkruntime", "volcengine-python-sdk[ark]") self.client = Ark(api_key=self.key) self.model = model @@ -270,6 +279,10 @@ def ask(self, question, image_encoding=None, verbose=False): class Qwen(Bot): def __init__(self, key_path, patience=3, model="qwen2.5-vl-32b-instruct") -> None: super().__init__(key_path, patience) + try: + from openai import OpenAI + except ModuleNotFoundError: + _missing_dependency("openai") self.client = OpenAI(api_key=self.key, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1") self.name = model @@ -309,6 +322,10 @@ def ask(self, question, image_encoding=None, verbose=False): class GPT(Bot): def __init__(self, key_path, patience=3, model="gpt-4o") -> None: super().__init__(key_path, patience) + try: + from openai import OpenAI + except ModuleNotFoundError: + _missing_dependency("openai") self.client = OpenAI(api_key=self.key) self.name="gpt4" self.model = model @@ -354,6 +371,10 @@ def ask(self, question, image_encoding=None, verbose=False): class Gemini(Bot): def 
__init__(self, key_path, patience=3, model="gemini-1.5-flash-latest") -> None: super().__init__(key_path, patience) + try: + import google.generativeai as genai + except ModuleNotFoundError: + _missing_dependency("google.generativeai", "google-generativeai") GOOGLE_API_KEY= self.key genai.configure(api_key=GOOGLE_API_KEY) self.name = "Gemini"