diff --git a/openvibe/agent/agent.py b/openvibe/agent/agent.py index 78713be..e787960 100644 --- a/openvibe/agent/agent.py +++ b/openvibe/agent/agent.py @@ -82,6 +82,47 @@ class AgentInfo: You do not write or modify files. """ +_COMPUTER_SYSTEM_PROMPT = """\ +You are openvibe in computer-use mode. You can see and control the desktop. + +TOOL PRIORITY — always follow this order: + +1. ui tool (FIRST CHOICE — no coordinates needed, most reliable) + • Use `ui get_tree` to list clickable elements in an app by name. + • Use `ui click` with the element title — never guess coordinates. + • Use `ui click_menu` to trigger menu items (File → Save, etc.). + • Use `ui type` to enter text — handles Unicode and clipboard correctly. + • Use `ui press_key` for keys/chords (return, escape, cmd+s, etc.). + • ui is auto-allowed — no permission prompt. + +2. app tool — open, close, focus, list applications. + +3. screenshot tool — take a screenshot to observe the current screen state. + Always take one after opening an app to confirm it appeared. + The output includes the image dimensions — note them for step 4. + +4. mouse tool (LAST RESORT — only for unlabelled canvas areas) + • Only use when `ui get_tree` shows no accessible elements for the target. + • ALWAYS provide image_width and image_height from the screenshot output. + This is mandatory — without them, Retina scaling causes wrong coordinates. + • Example: mouse click x=450 y=300 image_width=1920 image_height=1200 + +5. keyboard tool — raw keystroke fallback when `ui type` / `ui press_key` + cannot be used (rare). + +WORKFLOW: + app open → screenshot → ui get_tree → ui click/type → screenshot → verify + +VERIFICATION: + Every screenshot compares automatically to the previous one and reports + what percentage of the screen changed. If you see "No visible change + detected" after an action, the action failed — do NOT repeat it blindly. 
+ Instead: try ui get_tree to find the element by name, or take a fresh + screenshot and reassess coordinates. + +Never move the mouse to (0, 0) — that triggers pyautogui's failsafe abort. +""" + # --------------------------------------------------------------------------- # Built-in permission rulesets @@ -125,6 +166,23 @@ class AgentInfo: Rule(tool="bash", action=_A.DENY), ] +# Computer-use: screenshot + ui (accessibility) are always allowed; +# raw mouse/keyboard/app require consent (they affect the running system). +_COMPUTER_RULES: list[Rule] = [ + Rule(tool="screenshot", action=_A.ALLOW), + Rule(tool="ui", action=_A.ALLOW), # AppleScript accessibility — preferred over mouse + Rule(tool="mouse", action=_A.ASK), + Rule(tool="keyboard", action=_A.ASK), + Rule(tool="app", action=_A.ASK), + # Standard tools remain available + Rule(tool="read", action=_A.ALLOW), + Rule(tool="glob", action=_A.ALLOW), + Rule(tool="grep", action=_A.ALLOW), + Rule(tool="bash", action=_A.ASK), + Rule(tool="write", action=_A.ASK), + Rule(tool="edit", action=_A.ASK), +] + # --------------------------------------------------------------------------- # Built-in agent definitions @@ -154,6 +212,16 @@ class AgentInfo: permission_rules=_GENERAL_RULES, disabled_tools=["bash", "write", "edit", "todo_write"], ), + "computer": AgentInfo( + name="computer", + description=( + "Computer-use agent: sees the screen and controls mouse/keyboard. " + "Requires the computer-use extras (mss, pillow, pyautogui)." 
def _try_skill(self, text: str) -> str | None:
    """If *text* is a skill invocation (``/name args``), return the expanded prompt.

    The first character (the slash) is dropped, the first
    whitespace-delimited token is lower-cased and looked up in the skill
    registry, and the remainder of the line is passed to the skill as its
    argument string.

    Returns ``None`` when *text* is not a slash command, when nothing
    follows the slash (e.g. a bare ``"/"``), or when no skill is registered
    under that name, so that the caller can fall through to the normal LLM
    path.
    """
    from openvibe.commands import is_command
    from openvibe.skill.registry import get_registry

    if not is_command(text):
        return None

    parts = text[1:].split(None, 1)
    if not parts:
        # Only whitespace (or nothing) follows the slash: there is no name
        # to look up, and indexing parts[0] would raise IndexError.
        return None

    name = parts[0].lower()
    args = parts[1] if len(parts) > 1 else ""
    skill = get_registry().get(name)
    if skill is None:
        return None
    return skill.get_prompt(args)
+ expanded = self._try_skill(text) + if expanded is not None: + text = expanded + with self._lock: if self._state not in (SessionState.IDLE, SessionState.ERROR): raise InvalidStateError( @@ -606,6 +658,8 @@ def start(self) -> "OpenVibe": from openvibe.config import load_config from openvibe.db import create_database from openvibe.project import project as _project_module + from openvibe.skill.bundled import init_bundled_skills + from openvibe.skill.loader import load_skills_dir from openvibe.tool.base import create_default_registry if self._config is None: @@ -616,6 +670,9 @@ def start(self) -> "OpenVibe": self._registry = create_default_registry() self._project = _project_module.get_or_create(self._db, self._project_dir) + init_bundled_skills() + load_skills_dir(self._project_dir / "skills") + if self._config.mcp: self._init_mcp() @@ -648,6 +705,8 @@ async def start_async(self) -> "OpenVibe": from openvibe.permission.permission import PermissionService from openvibe.project import project as _project_module from openvibe.session.processor import SessionProcessor + from openvibe.skill.bundled import init_bundled_skills + from openvibe.skill.loader import load_skills_dir from openvibe.tool.base import create_default_registry if self._config is None: @@ -655,6 +714,9 @@ async def start_async(self) -> "OpenVibe": if self._db is None: self._db = create_database() + init_bundled_skills() + load_skills_dir(self._project_dir / "skills") + llm = self._llm or create_default_backend() self._bus = EventBus() self._registry = create_default_registry() diff --git a/openvibe/commands.py b/openvibe/commands.py index 0ef73fa..bf3d218 100644 --- a/openvibe/commands.py +++ b/openvibe/commands.py @@ -157,7 +157,7 @@ def _config(ctx: CommandContext): # --------------------------------------------------------------------------- -@command("help", "Show available commands") +@command("help", "Show available commands and skills") def cmd_help(ctx: CommandContext) -> CommandResult: lines = 
["[bold]Available commands:[/bold]\n"] for name in sorted(_COMMANDS): @@ -166,9 +166,63 @@ def cmd_help(ctx: CommandContext) -> CommandResult: f" [bold cyan]/{name}[/bold cyan] [dim]{entry.description}[/dim]" ) for sub_name, (_, sub_desc) in sorted(entry.subcommands.items()): + lines.append(f" [bold cyan]/{name} {sub_name}[/bold cyan] [dim]{sub_desc}[/dim]") + + # Append skills section + try: + from rich.markup import escape + + from openvibe.skill.registry import get_registry + skills = get_registry().user_invocable() + if skills: + lines.append("\n[bold]Skills[/bold] [dim](route through the LLM):[/dim]\n") + for skill in skills: + aliases = ( + f" [dim]alias: {', '.join(f'/{a}' for a in skill.aliases)}[/dim]" + if skill.aliases + else "" + ) + hint = f" [dim]{escape(skill.argument_hint)}[/dim]" if skill.argument_hint else "" + lines.append( + f" [bold cyan]/{escape(skill.name)}[/bold cyan]{hint}" + f" [dim]{escape(skill.description)}[/dim]{aliases}" + ) + except Exception: + pass + + return CommandResult(output="\n".join(lines)) + + +@command("skills", "List available skills") +def cmd_skills(ctx: CommandContext) -> CommandResult: + """Show all user-invocable skills with metadata.""" + try: + from openvibe.skill.registry import get_registry + except ImportError: + return CommandResult(output="[dim]Skills system not available.[/dim]") + + skills = get_registry().user_invocable() + if not skills: + return CommandResult(output="[dim]No skills registered.[/dim]") + + from rich.markup import escape + + lines = ["[bold]Available skills:[/bold]\n"] + for skill in skills: + lines.append(f"[bold cyan]/{escape(skill.name)}[/bold cyan]") + if skill.aliases: + lines[-1] += f" [dim](aliases: {', '.join(f'/{a}' for a in skill.aliases)})[/dim]" + lines.append(f" [dim]{escape(skill.description)}[/dim]") + if skill.when_to_use: + lines.append(f" [yellow]When to use:[/yellow] [dim]{escape(skill.when_to_use)}[/dim]") + if skill.argument_hint: lines.append( - f" [bold cyan]/{name} 
@command("screenshot", "Take a screenshot and display info about the current screen")
def cmd_screenshot(ctx: CommandContext) -> CommandResult:
    """Report the primary monitor's pixel dimensions.

    Only ``screen_size()`` is queried; no image is captured or embedded in
    the output. If the capture module cannot be imported, or querying the
    screen size fails, a red error message is returned instead of raising.
    """
    try:
        # Only screen_size is needed here; nothing is actually captured.
        from openvibe.computer.capture import screen_size
    except ImportError:
        return CommandResult(
            output="[red]Computer-use extras not installed.[/red]\n"
            "[dim]Run: pip install mss pillow[/dim]"
        )

    from rich.markup import escape

    try:
        w, h = screen_size()
    except Exception as exc:  # command boundary: report instead of crashing
        # Escape the message so square brackets in the exception text are
        # shown literally rather than being parsed as Rich markup tags.
        return CommandResult(
            output=f"[red]Screenshot failed:[/red] {escape(str(exc))}"
        )

    lines = [
        "[bold]Screen info[/bold]\n",
        f" [dim]Primary monitor:[/dim] [bold]{w}×{h}[/bold] pixels",
        "\n[dim]Use the 'screenshot' tool inside a computer-use session to "
        "capture the screen and pass the image to the model.[/dim]",
    ]
    return CommandResult(output="\n".join(lines))
@subcommand("computer", "reset", "Clear the computer-use audit log for this session")
def cmd_computer_reset(ctx: CommandContext) -> CommandResult:
    """Discard this session's computer-use sandbox and report its audit-log size.

    The number of audit entries is read before the sandbox is discarded so
    it can be included in the confirmation message. Returns an error
    message instead when the sandbox module cannot be imported.
    """
    try:
        from openvibe.computer.sandbox import clear_sandbox, get_sandbox
    except ImportError:
        unavailable = "[red]Computer-use module not available.[/red]"
        return CommandResult(output=unavailable)

    session_id = ctx.session.info.id
    cleared = len(get_sandbox(session_id).audit_log)
    clear_sandbox(session_id)

    message = f"[green]Cleared {cleared} computer-use audit entries.[/green]"
    return CommandResult(output=message)
+ +Public API +---------- +- :class:`ComputerSandbox` — session-scoped sandbox with audit log +- :func:`get_sandbox` — retrieve (or create) the sandbox for a session +- :func:`clear_sandbox` — discard a session's sandbox +- :func:`create_computer_use_registry` — build a ToolRegistry with all CU tools +""" + +from openvibe.computer.sandbox import ( + ActionType, + AuditEntry, + ComputerSandbox, + clear_sandbox, + get_sandbox, +) + +__all__ = [ + "ActionType", + "AuditEntry", + "ComputerSandbox", + "get_sandbox", + "clear_sandbox", +] diff --git a/openvibe/computer/capture.py b/openvibe/computer/capture.py new file mode 100644 index 0000000..92d0715 --- /dev/null +++ b/openvibe/computer/capture.py @@ -0,0 +1,215 @@ +"""Screen-capture helpers using *mss* and *Pillow*. + +All functions are synchronous; run them in a thread pool when called from +async code (see :mod:`openvibe.tool.computer_screenshot`). + +Example:: + + png_bytes, w, h = capture_screen() + png_bytes, w, h = capture_screen(region=(100, 100, 800, 600)) +""" + +from __future__ import annotations + +import base64 +import io + + +# Maximum width before the image is downscaled. +# 4K / Retina screens produce enormous PNGs that push against API size limits. +_MAX_WIDTH = 1920 + + +def capture_screen( + region: tuple[int, int, int, int] | None = None, +) -> tuple[bytes, int, int]: + """Capture a screenshot and return ``(png_bytes, width, height)``. + + Parameters + ---------- + region: + Optional ``(x, y, width, height)`` in *logical* screen coordinates. + ``None`` captures the entire primary monitor. + + Raises + ------ + ImportError + When ``mss`` or ``Pillow`` are not installed. + RuntimeError + When the captured data length doesn't match expectations — this + most commonly means screen recording permission has not been + granted (macOS: System Settings → Privacy & Security → Screen + Recording). 
+ """ + try: + import mss # type: ignore[import-not-found] + except ImportError as exc: + raise ImportError( + "mss is required for computer use: pip install mss pillow" + ) from exc + + try: + from PIL import Image # type: ignore[import-not-found] + except ImportError as exc: + raise ImportError( + "Pillow is required for computer use: pip install mss pillow" + ) from exc + + with mss.mss() as sct: + if region is not None: + x, y, w, h = region + monitor: dict[str, int] = {"left": x, "top": y, "width": w, "height": h} + else: + # monitors[1] is the primary display. + # monitors[0] is the virtual all-monitors union. + monitor = dict(sct.monitors[1] if len(sct.monitors) > 1 else sct.monitors[0]) + + sct_img = sct.grab(monitor) + + # ── Critical fix ──────────────────────────────────────────────────── + # sct_img.bgra is a memoryview (or custom buffer object) returned by + # mss. PIL's raw decoder requires a plain bytes object; passing a + # memoryview directly causes PIL to silently read the wrong memory + # region, producing a corrupt image. + # + # bytes() materialises the buffer into an actual bytes object before + # handing it to PIL. + # + # The "BGRX" raw decoder reads 4 bytes per pixel (B, G, R, ignored) + # and writes them into an RGB image as (R, G, B) — the correct channel + # reordering for BGRA→RGB conversion. + # ───────────────────────────────────────────────────────────────────── + raw_bgra = bytes(sct_img.bgra) + + expected = sct_img.width * sct_img.height * 4 + if len(raw_bgra) != expected: + raise RuntimeError( + f"Screen capture data size mismatch: received {len(raw_bgra)} bytes, " + f"expected {expected} ({sct_img.width}×{sct_img.height}×4 BGRA). " + "On macOS this usually means Screen Recording permission has not been " + "granted — go to System Settings → Privacy & Security → Screen " + "Recording and add your terminal application." 
def diff_screenshots(
    before_png: bytes,
    after_png: bytes,
    threshold: int = 10,
) -> dict[str, object]:
    """Compare two PNG screenshots and return a change report.

    Parameters
    ----------
    before_png, after_png:
        Raw PNG bytes from :func:`capture_screen`.
    threshold:
        Per-channel intensity delta (0–255). A pixel counts as changed when
        the delta of at least one of its R, G or B channels exceeds this
        value. The default of 10 filters out small noise.

    Returns
    -------
    dict with keys:
        ``changed``         — bool; False when fewer than 0.1% of the pixels
                              changed above threshold, True otherwise (also
                              True when the two images differ in size)
        ``change_fraction`` — float 0.0–1.0, fraction of pixels that changed
        ``changed_region``  — [x, y, w, h] bounding box of the changed area,
                              or None
        ``summary``         — human-readable string for the LLM

    Raises
    ------
    ImportError
        When Pillow is not installed.
    """
    try:
        from PIL import Image, ImageChops  # type: ignore[import-not-found]
    except ImportError as exc:
        raise ImportError(
            "Pillow is required for screenshot diffing: pip install pillow"
        ) from exc

    before = Image.open(io.BytesIO(before_png)).convert("RGB")
    after = Image.open(io.BytesIO(after_png)).convert("RGB")

    if before.size != after.size:
        return {
            "changed": True,
            "change_fraction": 1.0,
            "changed_region": None,
            "summary": (
                f"Screen resolution changed from {before.size} to {after.size}."
            ),
        }

    # Reduce the RGB difference to one band holding, per pixel, the largest
    # of the three channel deltas. Converting the difference to luminance
    # would weight the channels unequally and could hide a large change that
    # is confined to a single channel, contradicting the per-channel
    # threshold documented above.
    red, green, blue = ImageChops.difference(before, after).split()
    max_delta = ImageChops.lighter(ImageChops.lighter(red, green), blue)

    # Build a binary mask: 255 where change exceeds threshold, 0 elsewhere.
    mask = max_delta.point(lambda p: 255 if p > threshold else 0)
    # The mask only holds 0 and 255, so the 255 bucket is the changed count.
    changed_pixels = mask.histogram()[255]
    total = before.width * before.height
    fraction = changed_pixels / total if total > 0 else 0.0

    if fraction < 0.001:
        return {
            "changed": False,
            "change_fraction": fraction,
            "changed_region": None,
            "summary": (
                "No visible change detected — the action may not have had any effect."
            ),
        }

    bbox = mask.getbbox()  # (left, top, right, bottom) of non-zero region
    changed_region = None
    region_str = ""
    if bbox:
        x, y, x2, y2 = bbox
        changed_region = [x, y, x2 - x, y2 - y]
        region_str = f" in region [x={x}, y={y}, {x2-x}×{y2-y}px]"

    summary = f"{fraction:.1%} of pixels changed{region_str}."
    return {
        "changed": True,
        "change_fraction": fraction,
        "changed_region": changed_region,
        "summary": summary,
    }
def ensure_import(import_name: str, pip_name: str | None = None) -> Any:
    """Import *import_name*, installing via pip first if absent.

    Parameters
    ----------
    import_name:
        The Python import name (``import <import_name>``). May be dotted.
    pip_name:
        The PyPI package name to install. Defaults to the built-in mapping
        for *import_name* if one exists, otherwise to *import_name* itself.

    Returns
    -------
    The imported module.

    Raises
    ------
    RuntimeError
        If the package cannot be installed automatically, or if the module
        still cannot be imported afterwards.
    """
    resolved_pip = pip_name or _IMPORT_TO_PIP.get(import_name, import_name)

    installed_now = False
    # A module already present in sys.modules is importable as-is; probing it
    # with find_spec would raise ValueError if its __spec__ is unset.
    if import_name not in sys.modules:
        try:
            missing = importlib.util.find_spec(import_name) is None
        except (ImportError, ValueError):
            # For a dotted name find_spec imports the parent package first
            # and raises ModuleNotFoundError when that parent is absent.
            # Treat any lookup failure as "not installed".
            missing = True
        if missing:
            _pip_install(resolved_pip)
            # Invalidate the import caches so the freshly installed package is found
            importlib.invalidate_caches()
            installed_now = True

    try:
        return importlib.import_module(import_name)
    except ImportError as exc:
        if installed_now:
            message = (
                f"Package '{resolved_pip}' was installed but '{import_name}' still cannot "
                f"be imported. You may need to restart openvibe.\n"
                f"Original error: {exc}"
            )
        else:
            # Nothing was installed by this call, so do not claim otherwise.
            message = (
                f"Package '{resolved_pip}' appears to be present but '{import_name}' "
                f"could not be imported.\n"
                f"Original error: {exc}"
            )
        raise RuntimeError(message) from exc
@dataclass
class ComputerSandbox:
    """Per-session sandbox that gates and records computer-use actions.

    Attributes
    ----------
    session_id:
        Owning session — used to correlate audit entries.
    allowed_apps:
        If non-empty, only these application names pass
        :meth:`is_app_allowed`. Comparisons are case-insensitive substring
        matches.
    screen_region:
        Optional ``(x, y, width, height)`` bounding box checked by
        :meth:`is_coordinate_allowed`. ``None`` means the full screen.
    audit_log:
        Ordered list of every action recorded this session.
    last_screenshot:
        PNG bytes of the most recent screenshot, or ``None``.
    """

    session_id: str
    allowed_apps: list[str] = field(default_factory=list)
    # (x, y, width, height) or None for "full screen"
    screen_region: tuple[int, int, int, int] | None = None
    audit_log: list[AuditEntry] = field(default_factory=list)
    # Last captured screenshot PNG bytes — used for automatic change detection.
    last_screenshot: bytes | None = field(default=None, repr=False)
    # Serialises appends to audit_log made through record_action().
    _lock: asyncio.Lock = field(default_factory=asyncio.Lock, repr=False)

    # ------------------------------------------------------------------
    # Validation helpers
    # ------------------------------------------------------------------

    def is_app_allowed(self, app_name: str) -> bool:
        """Return True when *app_name* passes the allow-list check.

        An empty allow-list means no restriction. Otherwise *app_name* is
        accepted when any allow-list entry occurs in it as a
        case-insensitive substring.
        """
        if not self.allowed_apps:
            return True  # no restrictions
        name_lower = app_name.lower()
        return any(allowed.lower() in name_lower for allowed in self.allowed_apps)

    def is_coordinate_allowed(self, x: int, y: int) -> bool:
        """Return True when *(x, y)* is within the permitted screen region.

        The region ``(rx, ry, rw, rh)`` covers the half-open ranges
        ``rx <= x < rx + rw`` and ``ry <= y < ry + rh``. With no region
        configured every coordinate is accepted.
        """
        if self.screen_region is None:
            return True
        rx, ry, rw, rh = self.screen_region
        # The upper bounds are exclusive: a region of width rw starting at
        # rx ends at pixel rx + rw - 1, so rx + rw is already outside.
        return rx <= x < rx + rw and ry <= y < ry + rh

    # ------------------------------------------------------------------
    # Audit log
    # ------------------------------------------------------------------

    async def record_action(
        self,
        action_type: ActionType,
        params: dict[str, Any],
        result: str | None = None,
        error: str | None = None,
    ) -> AuditEntry:
        """Append an entry to the audit log and return it.

        The entry gets a fresh UUID and the current wall-clock time; the
        append itself happens while holding the sandbox lock.
        """
        entry = AuditEntry(
            id=str(uuid.uuid4()),
            timestamp=time.time(),
            action_type=action_type,
            params=params,
            session_id=self.session_id,
            result=result,
            error=error,
        )
        async with self._lock:
            self.audit_log.append(entry)
        return entry

    def export_audit_log(self) -> list[dict[str, Any]]:
        """Return the full audit log as a list of plain dicts."""
        return [e.to_dict() for e in self.audit_log]

    def summary(self) -> str:
        """Return a human-readable summary line with per-action counts."""
        counts: dict[str, int] = {}
        for e in self.audit_log:
            counts[e.action_type.value] = counts.get(e.action_type.value, 0) + 1
        parts = ", ".join(f"{v}× {k}" for k, v in sorted(counts.items()))
        return f"session={self.session_id[:12]}… | {len(self.audit_log)} actions: {parts or 'none'}"
+ region: + Optional ``(x, y, width, height)`` capture region. + stop_event: + Set this :class:`asyncio.Event` to terminate the generator cleanly. + deduplicate: + When ``True`` (default), identical consecutive frames are skipped + to reduce bandwidth. + """ + from openvibe.computer.capture import capture_screen_b64 + + interval = 1.0 / max(fps, 0.1) + loop = asyncio.get_event_loop() + last_b64: str | None = None + + while True: + if stop_event is not None and stop_event.is_set(): + return + + b64, w, h = await loop.run_in_executor(None, capture_screen_b64, region) + + if deduplicate and b64 == last_b64: + await asyncio.sleep(interval) + continue + + last_b64 = b64 + yield b64, w, h + + await asyncio.sleep(interval) diff --git a/openvibe/llm.py b/openvibe/llm.py index b795eb0..926cb6e 100644 --- a/openvibe/llm.py +++ b/openvibe/llm.py @@ -164,6 +164,32 @@ def _to_litellm_messages(messages: list[Message]) -> list[dict[str, Any]]: if isinstance(msg.content, str): d: dict[str, Any] = {"role": msg.role, "content": msg.content} else: + # role="tool" with image content needs special handling so that + # litellm can translate it correctly for both Anthropic (which + # embeds tool results inside a user message with type="tool_result") + # and OpenAI-compatible providers. + if msg.role == "tool" and msg.tool_call_id: + # Build the inner content list that litellm accepts for a + # multimodal tool result (image + caption text). 
+ inner: list[dict[str, Any]] = [] + for block in msg.content: + match block.type: + case "image_url": + inner.append( + {"type": "image_url", "image_url": block.image_url} + ) + case "text": + inner.append({"type": "text", "text": block.text or ""}) + case _: + pass + d = { + "role": "tool", + "tool_call_id": msg.tool_call_id, + "content": inner, + } + out.append(d) + continue + parts: list[dict[str, Any]] = [] for block in msg.content: match block.type: diff --git a/openvibe/session/processor.py b/openvibe/session/processor.py index a4bffc0..6238110 100644 --- a/openvibe/session/processor.py +++ b/openvibe/session/processor.py @@ -25,6 +25,7 @@ from __future__ import annotations import asyncio +import base64 import json import time from dataclasses import dataclass @@ -336,6 +337,19 @@ async def _announce() -> None: tool_part.state.time_end = time.monotonic() - t0 if result.error: tool_part.state.error = result.output + + # Persist the first image attachment (e.g. screenshot) + # so it can be forwarded to the LLM on the next turn. + for att in result.attachments: + if att.media_type.startswith("image/"): + tool_part.state.metadata["image_b64"] = ( + base64.b64encode(att.content).decode("ascii") + ) + tool_part.state.metadata["image_media_type"] = ( + att.media_type + ) + break + session_store.upsert_part( self._db, assistant_msg.id, part_idx, tool_part ) @@ -670,19 +684,44 @@ def _to_llm_messages(history: list[MessageInfo], agent: "AgentInfo") -> list[Mes # output is None — emit a synthetic result so the LLM message # sequence remains valid (every tool_call needs a matching result). for part in tool_parts: - content = ( + output_text = ( part.state.output if part.state.output is not None else "Tool execution was interrupted (session was closed before the tool completed)." 
) - messages.append( - Message( - role="tool", - content=content, - tool_call_id=part.state.call_id, - ) + image_b64: str | None = part.state.metadata.get("image_b64") + image_media_type: str = part.state.metadata.get( + "image_media_type", "image/png" ) + if image_b64: + # Build a vision-capable tool result: image first so the + # model sees it before the caption, then the text caption. + content_blocks = [ + ContentBlock( + type="image_url", + image_url={ + "url": f"data:{image_media_type};base64,{image_b64}" + }, + ), + ContentBlock(type="text", text=output_text), + ] + messages.append( + Message( + role="tool", + content=content_blocks, + tool_call_id=part.state.call_id, + ) + ) + else: + messages.append( + Message( + role="tool", + content=output_text, + tool_call_id=part.state.call_id, + ) + ) + return messages diff --git a/openvibe/skill/__init__.py b/openvibe/skill/__init__.py new file mode 100644 index 0000000..5b3ee7a --- /dev/null +++ b/openvibe/skill/__init__.py @@ -0,0 +1,73 @@ +"""openvibe skills — discoverable prompt-templates that route through the LLM. 
+ +Public surface:: + + from openvibe.skill import ( + SkillDefinition, + SkillResult, + SkillStatus, + CostTier, + SkillValidator, + ValidationResult, + SkillExample, + get_registry, + register_skill, + SkillExecutor, + ExecutionContext, + get_skill_log, + init_skill_log, + load_skills_dir, + SkillLoader, + ) +""" + +from openvibe.skill.base import ( + CostTier, + SkillDefinition, + SkillExample, + SkillResult, + SkillStatus, + SkillValidator, + ValidationResult, +) +from openvibe.skill.executor import ExecutionContext, SkillExecutor +from openvibe.skill.loader import FileSkill, SkillLoader, load_skills_dir +from openvibe.skill.log import get_skill_log, init_skill_log +from openvibe.skill.registry import get_registry, register_skill +from openvibe.skill.verifier import ( + KeywordValidator, + MinLengthValidator, + NoErrorValidator, + NonEmptyValidator, + SkillVerifier, +) + +__all__ = [ + # base + "CostTier", + "SkillDefinition", + "SkillExample", + "SkillResult", + "SkillStatus", + "SkillValidator", + "ValidationResult", + # registry + "get_registry", + "register_skill", + # executor + "ExecutionContext", + "SkillExecutor", + # log + "get_skill_log", + "init_skill_log", + # loader + "FileSkill", + "SkillLoader", + "load_skills_dir", + # validators + "KeywordValidator", + "MinLengthValidator", + "NoErrorValidator", + "NonEmptyValidator", + "SkillVerifier", +] diff --git a/openvibe/skill/base.py b/openvibe/skill/base.py new file mode 100644 index 0000000..fc27bf3 --- /dev/null +++ b/openvibe/skill/base.py @@ -0,0 +1,171 @@ +"""Core skill abstractions. + +A ``SkillDefinition`` is a named, discoverable prompt-template that routes +through the LLM (unlike slash commands which execute locally). Skills carry +rich metadata used for: + +* **Discovery** — capability/tag-based search and ranking. +* **Execution** — retry policy, fallback skill, validator chain. +* **Observability** — cost tier, reliability score updated from the log. 
+ +Bundled skills live in ``openvibe.skill.bundled``. +Custom skills can be registered via :func:`register_skill`. +""" + +from __future__ import annotations + +import abc +from dataclasses import dataclass, field +from enum import StrEnum +from typing import Any + + +# --------------------------------------------------------------------------- +# Enumerations +# --------------------------------------------------------------------------- + + +class CostTier(StrEnum): + """Expected relative cost of running this skill one time.""" + + LOW = "low" # < 5 tool calls + MEDIUM = "medium" # 5–20 tool calls + HIGH = "high" # 20+ tool calls or expensive tools (web, long bash) + + +class SkillStatus(StrEnum): + SUCCESS = "success" + PARTIAL = "partial" # completed with caveats / validation warnings + RETRIED = "retried" # succeeded after ≥1 retry + FALLBACK = "fallback" # succeeded via fallback_skill + FAILED = "failed" + + +# --------------------------------------------------------------------------- +# Data objects +# --------------------------------------------------------------------------- + + +@dataclass +class SkillExample: + """A concrete example that helps users and the ranking algorithm understand the skill.""" + + input: str # e.g. "the tests are failing after refactor" + description: str # e.g. 
"diagnose a regression introduced by a refactor" + + +@dataclass +class SkillResult: + """Output of a single skill execution attempt.""" + + skill_name: str + status: SkillStatus + output: str + attempt: int = 1 + elapsed: float = 0.0 + metadata: dict[str, Any] = field(default_factory=dict) + error: str | None = None + + +# --------------------------------------------------------------------------- +# Validators +# --------------------------------------------------------------------------- + + +@dataclass +class ValidationResult: + """Outcome of running a :class:`SkillValidator` against a :class:`SkillResult`.""" + + passed: bool + reason: str = "" + details: dict[str, Any] = field(default_factory=dict) + can_retry: bool = True + retry_hint: str = "" # appended to the retry prompt to guide the next attempt + + +class SkillValidator(abc.ABC): + """Abstract base for validators that inspect a :class:`SkillResult`.""" + + name: str = "validator" + + @abc.abstractmethod + def validate( + self, result: "SkillResult", context: dict[str, Any] + ) -> ValidationResult: + """Return a :class:`ValidationResult`. ``passed=True`` means OK.""" + ... + + +# --------------------------------------------------------------------------- +# SkillDefinition +# --------------------------------------------------------------------------- + + +class SkillDefinition(abc.ABC): + """Abstract base for all skills. + + Subclass this and implement :meth:`get_prompt`. Class-level attributes + provide all metadata; no ``__init__`` override is needed for simple skills. + + Example:: + + class MySkill(SkillDefinition): + name = "myskill" + description = "Does X to Y." + tags = ["x", "y"] + cost_estimate = CostTier.LOW + + def get_prompt(self, args: str) -> str: + return f"Do X to {args or 'this code'}." + """ + + # --- Identity --- + name: str = "" + description: str = "" + aliases: list[str] = [] + + # --- Discovery metadata --- + capabilities: list[str] = [] # e.g. 
["code_review", "refactoring"] + input_types: list[str] = [] # e.g. ["code", "file_path", "error_message"] + output_types: list[str] = [] # e.g. ["code_diff", "report", "commit_message"] + constraints: list[str] = [] # e.g. ["requires_git", "read_only"] + tags: list[str] = [] # free-form search tokens + cost_estimate: CostTier = CostTier.MEDIUM + reliability: float = 1.0 # 0.0–1.0; updated in-place by the feedback loop + examples: list[SkillExample] = [] + + # --- UX --- + user_invocable: bool = True # show in /skills list + when_to_use: str = "" # one-line hint shown in /skills + argument_hint: str = "" # e.g. "[focus area]" + + # --- Execution control --- + max_retries: int = 0 # extra attempts beyond the first (0 = no retry) + fallback_skill: str | None = None # skill name to try after all retries exhausted + + # --- Validators --- + validators: list[SkillValidator] = [] + + # ------------------------------------------------------------------ + # Abstract interface + # ------------------------------------------------------------------ + + @abc.abstractmethod + def get_prompt(self, args: str) -> str: + """Return the LLM prompt for this invocation.""" + ... + + def get_retry_prompt(self, args: str, attempt: int, hint: str) -> str: + """Return a modified prompt for retry attempt *attempt* (1-indexed). + + Default: same as :meth:`get_prompt` with an appended hint block. + Override for skill-specific retry strategies. 
+ """ + base = self.get_prompt(args) + if hint: + return ( + f"{base}\n\n" + f"**Previous attempt {attempt - 1} did not satisfy requirements.**\n" + f"Hint for this attempt: {hint}" + ) + return base diff --git a/openvibe/skill/bundled/__init__.py b/openvibe/skill/bundled/__init__.py new file mode 100644 index 0000000..64c41f8 --- /dev/null +++ b/openvibe/skill/bundled/__init__.py @@ -0,0 +1,23 @@ +"""Bundled skills — registered once via :func:`init_bundled_skills`.""" + +from __future__ import annotations + +from openvibe.skill.registry import get_registry + + +def init_bundled_skills() -> None: + """Register all bundled skills in the global registry. + + Called from :meth:`~openvibe.api.OpenVibe.start` and + :meth:`~openvibe.api.OpenVibe.start_async`. + """ + from openvibe.skill.bundled.brainstorm import BrainstormSkill + from openvibe.skill.bundled.draft import DraftSkill + from openvibe.skill.bundled.explain import ExplainSkill + from openvibe.skill.bundled.summarize import SummarizeSkill + + registry = get_registry() + registry.register(SummarizeSkill()) + registry.register(ExplainSkill()) + registry.register(BrainstormSkill()) + registry.register(DraftSkill()) diff --git a/openvibe/skill/bundled/brainstorm.py b/openvibe/skill/bundled/brainstorm.py new file mode 100644 index 0000000..2b63cdc --- /dev/null +++ b/openvibe/skill/bundled/brainstorm.py @@ -0,0 +1,38 @@ +"""``/brainstorm`` — generate a diverse set of ideas around a topic.""" + +from __future__ import annotations + +from openvibe.skill.base import CostTier, SkillDefinition, SkillExample +from openvibe.skill.verifier import KeywordValidator, MinLengthValidator + + +class BrainstormSkill(SkillDefinition): + name = "brainstorm" + description = "Generate a wide set of ideas, options, or approaches for any topic or problem." 
+ aliases = ["ideas", "bs"] + capabilities = ["ideation", "exploration", "creative_thinking"] + input_types = ["topic", "problem", "question", "goal"] + output_types = ["ideas", "list", "options"] + tags = ["ideas", "brainstorm", "options", "explore", "creative"] + cost_estimate = CostTier.LOW + user_invocable = True + when_to_use = "When you need a broad set of options or are stuck on where to start." + argument_hint = "[topic or problem]" + examples = [ + SkillExample("names for a productivity app", "generate product name ideas"), + SkillExample("ways to reduce meeting time", "brainstorm process improvements"), + ] + validators = [MinLengthValidator(100), KeywordValidator(required=["1."])] + + def get_prompt(self, args: str) -> str: + topic = args.strip() or "the topic raised in the current conversation" + return ( + f"Brainstorm ideas for: {topic}\n\n" + "Generate at least 8 distinct ideas. For each idea:\n" + "- Number it (1. 2. 3. …)\n" + "- Give it a short name or headline\n" + "- Add one sentence explaining the core concept\n\n" + "Prioritise variety — include obvious options alongside unconventional ones.\n" + "Do not evaluate or rank the ideas; just list them.\n" + "After the list, briefly note any interesting tensions or trade-offs you see." + ) diff --git a/openvibe/skill/bundled/draft.py b/openvibe/skill/bundled/draft.py new file mode 100644 index 0000000..5500d5d --- /dev/null +++ b/openvibe/skill/bundled/draft.py @@ -0,0 +1,38 @@ +"""``/draft`` — produce a first draft of any written content.""" + +from __future__ import annotations + +from openvibe.skill.base import CostTier, SkillDefinition, SkillExample +from openvibe.skill.verifier import MinLengthValidator, NonEmptyValidator + + +class DraftSkill(SkillDefinition): + name = "draft" + description = "Write a first draft of any content: email, report, proposal, message, or document." 
+ aliases = ["write", "compose"] + capabilities = ["writing", "drafting", "composition"] + input_types = ["topic", "description", "outline", "instructions"] + output_types = ["draft", "document", "email", "report"] + tags = ["write", "draft", "compose", "email", "document", "report"] + cost_estimate = CostTier.LOW + user_invocable = True + when_to_use = "When you need a first version of any written content to work from." + argument_hint = "[what to write and any context]" + examples = [ + SkillExample("an email declining a meeting politely", "draft a short professional email"), + SkillExample("a one-page project proposal for a team dashboard", "draft a proposal"), + ] + validators = [NonEmptyValidator(), MinLengthValidator(100)] + + def get_prompt(self, args: str) -> str: + request = args.strip() or "the content described in the current conversation" + return ( + f"Write a first draft of: {request}\n\n" + "Guidelines:\n" + "- Match the appropriate tone and format for the content type\n" + "- Be clear and direct — avoid filler phrases\n" + "- Use structure (headings, bullets, paragraphs) only when it aids readability\n" + "- Keep the draft appropriately concise; do not pad it\n\n" + "Produce the draft directly, without preamble.\n" + "After the draft, add a brief note on any assumptions you made." + ) diff --git a/openvibe/skill/bundled/explain.py b/openvibe/skill/bundled/explain.py new file mode 100644 index 0000000..aadcd67 --- /dev/null +++ b/openvibe/skill/bundled/explain.py @@ -0,0 +1,38 @@ +"""``/explain`` — explain a concept, decision, or piece of content clearly.""" + +from __future__ import annotations + +from openvibe.skill.base import CostTier, SkillDefinition, SkillExample +from openvibe.skill.verifier import MinLengthValidator, NonEmptyValidator + + +class ExplainSkill(SkillDefinition): + name = "explain" + description = "Explain a concept, decision, document, or anything else in plain language." 
+ aliases = ["eli5", "howdoes"] + capabilities = ["explanation", "clarification", "teaching"] + input_types = ["concept", "text", "question", "document"] + output_types = ["explanation", "analogy", "walkthrough"] + tags = ["explain", "understand", "clarify", "teach", "how", "why"] + cost_estimate = CostTier.LOW + user_invocable = True + when_to_use = "When something is unclear or you need it broken down into simpler terms." + argument_hint = "[concept or question]" + examples = [ + SkillExample("how does OAuth2 work", "explain a technical protocol simply"), + SkillExample("this contract clause", "explain a document section in plain language"), + ] + validators = [NonEmptyValidator(), MinLengthValidator(80)] + + def get_prompt(self, args: str) -> str: + subject = args.strip() or "the most recent topic in the conversation" + return ( + f"Explain: {subject}\n\n" + "Your explanation should:\n" + "- Start with a one-sentence plain-language answer\n" + "- Use a concrete analogy or example where it helps\n" + "- Break down complex parts step by step\n" + "- Avoid unnecessary jargon; define any term that must be used\n" + "- Stay focused — do not over-explain tangential details\n\n" + "Aim for clarity over completeness." + ) diff --git a/openvibe/skill/bundled/summarize.py b/openvibe/skill/bundled/summarize.py new file mode 100644 index 0000000..87d197a --- /dev/null +++ b/openvibe/skill/bundled/summarize.py @@ -0,0 +1,37 @@ +"""``/summarize`` — distil any content into a concise summary.""" + +from __future__ import annotations + +from openvibe.skill.base import CostTier, SkillDefinition, SkillExample +from openvibe.skill.verifier import MinLengthValidator, NonEmptyValidator + + +class SummarizeSkill(SkillDefinition): + name = "summarize" + description = "Distil content, a document, or a topic into a clear, concise summary." 
+    aliases = ["sum", "tldr"]
+    capabilities = ["summarization", "condensing", "overview"]
+    input_types = ["text", "document", "topic", "url"]
+    output_types = ["summary", "bullets", "report"]
+    tags = ["summary", "overview", "condense", "brief", "tldr"]
+    cost_estimate = CostTier.LOW
+    user_invocable = True
+    when_to_use = "When you need a quick overview of something lengthy or complex."
+    argument_hint = "[topic, text, or URL to summarize]"
+    examples = [
+        SkillExample("the conversation so far", "recap the current conversation"),
+        SkillExample("https://example.com/article", "summarize a web page"),
+    ]
+    validators = [NonEmptyValidator(), MinLengthValidator(50)]
+
+    def get_prompt(self, args: str) -> str:
+        subject = args.strip() or "the content or conversation so far"
+        return (
+            f"Summarize: {subject}\n\n"
+            "Provide a concise summary that:\n"
+            "- Captures the key points in plain language\n"
+            "- Uses bullet points when listing multiple items\n"
+            "- Is no longer than needed — omit filler and repetition\n"
+            "- Ends with a one-sentence takeaway if helpful\n\n"
+            "Do not add opinions or information not present in the source."
+        )
diff --git a/openvibe/skill/executor.py b/openvibe/skill/executor.py
new file mode 100644
index 0000000..cd5f7d5
--- /dev/null
+++ b/openvibe/skill/executor.py
@@ -0,0 +1,224 @@
+"""SkillExecutor — agent-skill loop with retry, fallback, and observability.
+
+The executor drives one complete skill invocation:
+
+1. **Select** — caller already resolved the skill via :class:`~openvibe.skill.registry.SkillRegistry`.
+2. **Expand** — ``skill.get_prompt(args)`` builds the LLM prompt.
+3. **Execute** — ``send_fn(prompt)`` runs the full agent loop and returns text.
+4. **Verify** — :class:`~openvibe.skill.verifier.SkillVerifier` checks validators.
+5. **Retry** — if verification failed and retries remain, rebuild prompt with hint.
+6. **Fallback** — if all retries exhausted and a fallback skill is named, delegate.
+7. **Log** — record outcome in :func:`~openvibe.skill.log.get_skill_log`.
+8. **Update reliability** — write back ``skill.reliability`` from log.
+
+The ``send_fn`` callable is intentionally simple: ``(prompt: str) -> str``.
+In :class:`~openvibe.api.Session` this is wired to a thin wrapper around the
+internal worker so that skills can trigger the full agent + tool loop.
+
+Example (programmatic use)::
+
+    from openvibe.skill.executor import SkillExecutor, ExecutionContext
+    from openvibe.skill.registry import get_registry
+
+    registry = get_registry()
+    executor = SkillExecutor(registry)
+
+    ctx = ExecutionContext(
+        session_id=session.id,
+        working_dir="/path/to/project",
+        send_fn=lambda prompt: session._send_raw(prompt),
+    )
+
+    result = executor.run(registry.get("debug"), "TypeError in auth.py", ctx)
+    print(result.status, result.output)
+"""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable
+
+from openvibe.skill.base import SkillResult, SkillStatus, ValidationResult
+from openvibe.skill.log import SkillLogEntry, get_skill_log
+from openvibe.skill.verifier import SkillVerifier
+
+if TYPE_CHECKING:
+    from openvibe.skill.base import SkillDefinition
+    from openvibe.skill.registry import SkillRegistry
+
+
+# ---------------------------------------------------------------------------
+# ExecutionContext
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ExecutionContext:
+    """Everything the executor needs to send prompts and identify the caller.
+
+    ``send_fn`` is the only required callable. It must accept a prompt string
+    and return the assistant's complete text response. It is allowed to block;
+    the executor is synchronous.
+    """
+
+    session_id: str
+    working_dir: str
+    send_fn: Callable[[str], str]
+
+
+# ---------------------------------------------------------------------------
+# SkillExecutor
+# ---------------------------------------------------------------------------
+
+
+class SkillExecutor:
+    """Runs one skill invocation through the full agent-skill loop."""
+
+    def __init__(self, registry: "SkillRegistry") -> None:
+        self._registry = registry
+        self._verifier = SkillVerifier()
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def run(
+        self,
+        skill: "SkillDefinition",
+        args: str,
+        ctx: ExecutionContext,
+        *,
+        _chain: frozenset[str] = frozenset(),
+    ) -> SkillResult:
+        """Execute *skill* synchronously and return a :class:`SkillResult`.
+
+        Retry and fallback logic is transparent to the caller.
+
+        ``_chain`` is internal: it carries the names of the skills already
+        tried along the current fallback chain so that a cycle such as
+        ``a -> b -> a`` ends in a failure instead of unbounded recursion.
+        Callers should not pass it.
+        """
+        start = time.monotonic()
+        last_result: SkillResult | None = None
+        last_validation: ValidationResult | None = None
+
+        max_attempts = max(1, 1 + skill.max_retries)
+
+        for attempt in range(1, max_attempts + 1):
+            hint = (
+                last_validation.retry_hint
+                if (last_validation and not last_validation.passed)
+                else ""
+            )
+            prompt = (
+                skill.get_prompt(args)
+                if attempt == 1
+                else skill.get_retry_prompt(args, attempt, hint)
+            )
+
+            result = self._call_llm(skill, prompt, attempt, start, ctx)
+            validation = self._verifier.verify(skill.validators, result)
+
+            # A result that carries an error (``send_fn`` raised) is never a
+            # success, even when the validator list is empty or lenient.
+            if validation.passed and result.error is None:
+                if attempt > 1:
+                    result.status = SkillStatus.RETRIED
+                self._finish(skill, args, result)
+                return result
+
+            last_result = result
+            last_validation = validation
+
+            if not validation.can_retry or attempt >= max_attempts:
+                break
+
+        # All attempts exhausted → try fallback
+        if skill.fallback_skill:
+            tried = _chain | {skill.name}
+            fallback = self._registry.get(skill.fallback_skill)
+            if fallback and fallback.name not in tried:
+                fb_result = self.run(fallback, args, ctx, _chain=tried)
+                # Only relabel a fallback that actually succeeded; a failed
+                # fallback must stay FAILED for the caller and the log.
+                if fb_result.status != SkillStatus.FAILED:
+                    fb_result.status = SkillStatus.FALLBACK
+                fb_result.metadata["original_skill"] = skill.name
+                fb_result.elapsed = time.monotonic() - start
+                self._log(skill.name, args, fb_result)
+                self._update_reliability(skill, success=False)
+                return fb_result
+
+        # Hard failure
+        final = last_result or SkillResult(
+            skill_name=skill.name,
+            status=SkillStatus.FAILED,
+            output="",
+            error="No result produced.",
+            elapsed=time.monotonic() - start,
+        )
+        final.status = SkillStatus.FAILED
+        self._finish(skill, args, final, success=False)
+        return final
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _call_llm(
+        self,
+        skill: "SkillDefinition",
+        prompt: str,
+        attempt: int,
+        start: float,
+        ctx: ExecutionContext,
+    ) -> SkillResult:
+        try:
+            output = ctx.send_fn(prompt)
+            return SkillResult(
+                skill_name=skill.name,
+                status=SkillStatus.SUCCESS,
+                output=output,
+                attempt=attempt,
+                elapsed=time.monotonic() - start,
+            )
+        except Exception as exc:
+            return SkillResult(
+                skill_name=skill.name,
+                status=SkillStatus.FAILED,
+                output="",
+                attempt=attempt,
+                elapsed=time.monotonic() - start,
+                error=str(exc),
+            )
+
+    def _finish(
+        self,
+        skill: "SkillDefinition",
+        args: str,
+        result: SkillResult,
+        success: bool = True,
+    ) -> None:
+        self._log(skill.name, args, result)
+        self._update_reliability(skill, success=success)
+
+    def _log(self, skill_name: str, args: str, result: SkillResult) -> None:
+        get_skill_log().record(
+            SkillLogEntry(
+                skill_name=skill_name,
+                args=args,
+                status=result.status,
+                attempt=result.attempt,
+                elapsed=result.elapsed,
+                error=result.error,
+                metadata=result.metadata,
+            )
+        )
+
+    def _update_reliability(
+        self, skill: "SkillDefinition", success: bool  # noqa: ARG002 (used implicitly via log)
+    ) -> None:
+        """Recompute and persist reliability on the live SkillDefinition object."""
+        skill.reliability = get_skill_log().compute_reliability(skill.name)
diff --git a/openvibe/skill/loader.py
b/openvibe/skill/loader.py
new file mode 100644
index 0000000..1ea9dc8
--- /dev/null
+++ b/openvibe/skill/loader.py
@@ -0,0 +1,293 @@
+"""Load skills from a ``skills/`` directory tree.
+
+Expected layout::
+
+    skills/
+      summarize/
+        SKILL.md
+      my-custom-skill/
+        SKILL.md
+
+Each ``SKILL.md`` file contains optional YAML front-matter followed by the
+prompt template. ``{{args}}`` in the template is replaced with the user's
+arguments at invocation time::
+
+    ---
+    description: Summarize any content concisely.
+    aliases: [sum, tldr]
+    tags: [summary, overview]
+    cost: low
+    when_to_use: When you need a quick overview of something lengthy.
+    argument_hint: "[topic or text]"
+    ---
+
+    Summarize: {{args}}
+
+    Capture the key points in bullet form. End with a one-sentence takeaway.
+
+Front-matter keys (all optional)
+---------------------------------
+* ``description`` — shown in ``/skills``
+* ``aliases`` — list of shorthand names
+* ``tags`` — list of search tokens
+* ``capabilities`` — list of capability labels
+* ``input_types`` / ``output_types`` — list of type labels
+* ``cost`` — ``low`` | ``medium`` | ``high`` (default ``medium``)
+* ``when_to_use`` — one-line hint shown in ``/skills``
+* ``argument_hint`` — e.g. ``[topic]``
+* ``max_retries`` — int (default ``0``)
+* ``fallback`` — name of another skill to try on failure
+"""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import Any
+
+from openvibe.skill.base import CostTier, SkillDefinition
+from openvibe.skill.registry import SkillRegistry, get_registry
+
+_FRONTMATTER_RE = re.compile(r"^\s*---\s*\n(.*?)\n---\s*\n", re.DOTALL)
+
+
+# ---------------------------------------------------------------------------
+# FileSkill — a SkillDefinition backed by a SKILL.md file
+# ---------------------------------------------------------------------------
+
+
+class FileSkill(SkillDefinition):
+    """A skill loaded from a ``SKILL.md`` file."""
+
+    def __init__(
+        self,
+        name: str,
+        prompt_template: str,
+        meta: dict[str, Any],
+    ) -> None:
+        """Populate the skill metadata from the parsed front-matter *meta*.
+
+        Raises ``TypeError``/``ValueError`` when a value cannot be coerced
+        (for example a non-numeric ``max_retries``).
+        """
+        self.name = name
+        self.description = str(meta.get("description") or "")
+        self.aliases = _as_list(meta.get("aliases"))
+        self.tags = _as_list(meta.get("tags"))
+        self.capabilities = _as_list(meta.get("capabilities"))
+        self.input_types = _as_list(meta.get("input_types"))
+        self.output_types = _as_list(meta.get("output_types"))
+        self.when_to_use = str(meta.get("when_to_use") or "")
+        self.argument_hint = str(meta.get("argument_hint") or "")
+        self.max_retries = int(meta.get("max_retries") or 0)
+        self.fallback_skill = meta.get("fallback") or None
+        self.user_invocable = True
+        self._template = prompt_template.strip()
+
+        # cost — unknown tiers fall back to MEDIUM
+        cost_raw = str(meta.get("cost") or "medium").lower()
+        try:
+            self.cost_estimate = CostTier(cost_raw)
+        except ValueError:
+            self.cost_estimate = CostTier.MEDIUM
+
+    def get_prompt(self, args: str) -> str:
+        return self._template.replace("{{args}}", args).strip()
+
+
+# ---------------------------------------------------------------------------
+# Loader
+# ---------------------------------------------------------------------------
+
+
+class SkillLoader:
+    """Discovers and loads skills from a directory tree.
+
+    Only the ``skill-name/SKILL.md`` layout is supported. Directories
+    without a ``SKILL.md`` are silently skipped.
+    """
+
+    SKILL_FILE = "SKILL.md"
+
+    def __init__(self, registry: SkillRegistry | None = None) -> None:
+        self._registry = registry or get_registry()
+
+    def load(self, skills_dir: Path | str) -> list[FileSkill]:
+        """Load all skills found under *skills_dir* and register them.
+
+        Returns the list of successfully loaded :class:`FileSkill` instances.
+        Errors in individual files are logged but do not abort the load.
+        """
+        skills_dir = Path(skills_dir)
+        if not skills_dir.is_dir():
+            return []
+
+        loaded: list[FileSkill] = []
+        for entry in sorted(skills_dir.iterdir()):
+            if not entry.is_dir():
+                continue
+            skill_file = entry / self.SKILL_FILE
+            if not skill_file.is_file():
+                continue
+            skill = self._load_file(entry.name, skill_file)
+            if skill is not None:
+                self._registry.register(skill)
+                loaded.append(skill)
+
+        return loaded
+
+    # ------------------------------------------------------------------
+    # Internal
+    # ------------------------------------------------------------------
+
+    def _load_file(self, dir_name: str, skill_file: Path) -> "FileSkill | None":
+        """Parse one ``SKILL.md``; log a warning and return ``None`` if it is unusable."""
+        import logging
+
+        log = logging.getLogger(__name__)
+
+        try:
+            raw = skill_file.read_text(encoding="utf-8")
+        except (OSError, UnicodeDecodeError) as exc:
+            log.warning("Could not read %s: %s", skill_file, exc)
+            return None
+
+        try:
+            meta, template = _parse_skill_md(raw)
+        except ValueError as exc:
+            log.warning("Skipping %s: invalid front-matter: %s", skill_file, exc)
+            return None
+
+        # Derive skill name: front-matter ``name`` overrides directory name
+        name = str(meta.pop("name", None) or dir_name).lower().replace(" ", "-")
+
+        if not template.strip():
+            log.warning(
+                "Skipping %s: no prompt template found (body is empty)", skill_file
+            )
+            return None
+
+        try:
+            return FileSkill(name=name, prompt_template=template, meta=meta)
+        except (TypeError, ValueError) as exc:
+            # e.g. ``max_retries: many`` — one bad file must not abort the load.
+            log.warning("Skipping %s: invalid metadata: %s", skill_file, exc)
+            return None
+
+
+# ---------------------------------------------------------------------------
+# Parsing helpers
+# ---------------------------------------------------------------------------
+
+
+def _as_list(value: Any) -> list[str]:
+    """Coerce a front-matter value to a list of strings (a scalar becomes one item)."""
+    if not value:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(item) for item in value]
+    return [str(value)]
+
+
+def _parse_skill_md(text: str) -> tuple[dict[str, Any], str]:
+    """Split a SKILL.md into ``(front_matter_dict, prompt_template)``.
+
+    Raises ``ValueError`` if the front-matter is not a valid mapping.
+    """
+    m = _FRONTMATTER_RE.match(text)
+    if not m:
+        return {}, text
+
+    meta = _parse_yaml_lite(m.group(1))
+    template = text[m.end():]
+    return meta, template
+
+
+def _parse_yaml_lite(text: str) -> dict[str, Any]:
+    """Minimal YAML parser that handles the subset used in SKILL.md front-matter.
+
+    Supports:
+    * ``key: scalar value``
+    * ``key: [item1, item2]`` (inline list)
+    * Block lists::
+
+        key:
+          - item1
+          - item2
+
+    The ``yaml`` package is used instead when it is importable, for full
+    YAML support. Raises ``ValueError`` when that parser rejects the text
+    or the document is not a mapping.
+    """
+    try:
+        import yaml  # type: ignore[import]
+    except ImportError:
+        yaml = None  # type: ignore[assignment]
+
+    if yaml is not None:
+        try:
+            data = yaml.safe_load(text) or {}
+        except yaml.YAMLError as exc:
+            raise ValueError(f"malformed YAML: {exc}") from exc
+        if not isinstance(data, dict):
+            raise ValueError(
+                f"front-matter must be a mapping, got {type(data).__name__}"
+            )
+        return data
+
+    result: dict[str, Any] = {}
+    lines = text.splitlines()
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if not line.strip() or line.lstrip().startswith("#"):
+            i += 1
+            continue
+
+        m = re.match(r"^(\w[\w_-]*):\s*(.*)", line)
+        if not m:
+            i += 1
+            continue
+
+        key = m.group(1)
+        value_str = m.group(2).strip()
+
+        if value_str.startswith("[") and value_str.endswith("]"):
+            # Inline list: [a, b, c]
+            inner = value_str[1:-1]
+            result[key] = [v.strip().strip("\"'") for v in inner.split(",") if v.strip()]
+        elif not value_str:
+            # Potential block list
+            items: list[str] = []
+            i += 1
+            while i < len(lines) and lines[i].lstrip().startswith("-"):
+                items.append(lines[i].lstrip().lstrip("-").strip().strip("\"'"))
+                i += 1
+            result[key] = items
+            continue
+        else:
+            # Scalar — strip surrounding quotes
+            result[key] = value_str.strip("\"'")
+
+        i += 1
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Convenience function
+# ---------------------------------------------------------------------------
+
+
+def load_skills_dir(
+    skills_dir: Path | str,
+    registry: SkillRegistry | None = None,
+) -> list[FileSkill]:
+    """Load skills from *skills_dir* into *registry* (defaults to global registry).
+
+    This is the main entry point for external callers::
+
+        from openvibe.skill.loader import load_skills_dir
+        load_skills_dir(Path("./skills"))
+    """
+    return SkillLoader(registry).load(skills_dir)
diff --git a/openvibe/skill/log.py b/openvibe/skill/log.py
new file mode 100644
index 0000000..b7ceedd
--- /dev/null
+++ b/openvibe/skill/log.py
@@ -0,0 +1,168 @@
+"""Skill execution logging — observability and feedback loop.
+
+Every skill execution is appended to an in-memory ring buffer and optionally
+to a JSONL file on disk. :meth:`SkillLog.compute_reliability` reads the
+recent window to produce the 0–1 score that is written back to
+``SkillDefinition.reliability``, closing the feedback loop.
+
+Usage::
+
+    from openvibe.skill.log import get_skill_log
+
+    log = get_skill_log()
+    print(log.stats("simplify"))
+    # {'total': 12, 'success_rate': 0.916, 'failure_rate': 0.083, ...}
+"""
+
+from __future__ import annotations
+
+import json
+import time
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+
+from openvibe.skill.base import SkillStatus
+
+
+# ---------------------------------------------------------------------------
+# Log entry
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class SkillLogEntry:
+    skill_name: str
+    args: str
+    status: str  # SkillStatus value
+    attempt: int  # which attempt succeeded / finally failed
+    elapsed: float  # total wall-clock seconds for all attempts
+    timestamp: float = field(default_factory=time.time)
+    error: str | None = None
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+
+# ---------------------------------------------------------------------------
+# SkillLog
+# ---------------------------------------------------------------------------
+
+
+class SkillLog:
+    """Thread-safe in-memory skill log with optional JSONL
persistence. + + The ring buffer caps at :attr:`MAX_ENTRIES_IN_MEMORY` so long-running + processes don't grow unbounded. + """ + + MAX_ENTRIES_IN_MEMORY: int = 500 + # How many recent entries to consider for reliability scoring. + RELIABILITY_WINDOW: int = 100 + + def __init__(self, log_path: Path | None = None) -> None: + self._path = log_path + self._entries: list[SkillLogEntry] = [] + + # ------------------------------------------------------------------ + # Write + # ------------------------------------------------------------------ + + def record(self, entry: SkillLogEntry) -> None: + """Append *entry* to the log.""" + self._entries.append(entry) + if len(self._entries) > self.MAX_ENTRIES_IN_MEMORY: + self._entries = self._entries[-self.MAX_ENTRIES_IN_MEMORY :] + if self._path: + self._append_to_file(entry) + + # ------------------------------------------------------------------ + # Read / aggregate + # ------------------------------------------------------------------ + + def entries(self, skill_name: str | None = None) -> list[SkillLogEntry]: + """Return entries, optionally filtered to a single skill.""" + if skill_name is None: + return list(self._entries) + return [e for e in self._entries if e.skill_name == skill_name] + + def stats(self, skill_name: str | None = None) -> dict[str, Any]: + """Return aggregate statistics over logged entries. + + Keys: ``total``, ``success_rate``, ``failure_rate``, + ``partial_rate``, ``retry_rate``, ``avg_elapsed_s``. 
+ """ + entries = self.entries(skill_name) + if not entries: + return {} + + total = len(entries) + successes = sum( + 1 + for e in entries + if e.status in (SkillStatus.SUCCESS, SkillStatus.RETRIED, SkillStatus.FALLBACK) + ) + partials = sum(1 for e in entries if e.status == SkillStatus.PARTIAL) + failures = sum(1 for e in entries if e.status == SkillStatus.FAILED) + retried = sum(1 for e in entries if e.attempt > 1) + avg_elapsed = sum(e.elapsed for e in entries) / total + + return { + "total": total, + "success_rate": round(successes / total, 4), + "partial_rate": round(partials / total, 4), + "failure_rate": round(failures / total, 4), + "retry_rate": round(retried / total, 4), + "avg_elapsed_s": round(avg_elapsed, 3), + } + + def compute_reliability(self, skill_name: str) -> float: + """Return a 0–1 reliability score from the most recent executions. + + Counts SUCCESS, RETRIED, PARTIAL, and FALLBACK as "good" outcomes. + Returns ``1.0`` when there is no history (benefit of the doubt). 
+ """ + window = [ + e + for e in self._entries[-self.RELIABILITY_WINDOW :] + if e.skill_name == skill_name + ] + if not window: + return 1.0 + good_statuses = { + SkillStatus.SUCCESS, + SkillStatus.RETRIED, + SkillStatus.PARTIAL, + SkillStatus.FALLBACK, + } + good = sum(1 for e in window if e.status in good_statuses) + return round(good / len(window), 4) + + # ------------------------------------------------------------------ + # Persistence + # ------------------------------------------------------------------ + + def _append_to_file(self, entry: SkillLogEntry) -> None: + try: + with self._path.open("a", encoding="utf-8") as f: # type: ignore[union-attr] + f.write(json.dumps(asdict(entry)) + "\n") + except OSError: + pass # log failures are non-fatal + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + + +_global_log: SkillLog = SkillLog() + + +def get_skill_log() -> SkillLog: + """Return the global :class:`SkillLog` singleton.""" + return _global_log + + +def init_skill_log(log_path: Path | None = None) -> SkillLog: + """(Re-)initialise the global log, optionally attaching a JSONL file.""" + global _global_log + _global_log = SkillLog(log_path=log_path) + return _global_log diff --git a/openvibe/skill/registry.py b/openvibe/skill/registry.py new file mode 100644 index 0000000..55766ec --- /dev/null +++ b/openvibe/skill/registry.py @@ -0,0 +1,141 @@ +"""SkillRegistry — registration, indexing, search, and reliability-weighted ranking. + +Skills are indexed by name and alias. :meth:`SkillRegistry.search` supports +keyword queries over name, description, tags, capabilities, and +input/output types, with a reliability multiplier that gradually demotes +consistently-failing skills. 
class SkillRegistry:
    """Global registry of all :class:`~openvibe.skill.base.SkillDefinition` instances."""

    def __init__(self) -> None:
        self._by_name: dict[str, "SkillDefinition"] = {}
        self._by_alias: dict[str, str] = {}  # lowercased alias → canonical name
        self._by_lower_name: dict[str, str] = {}  # lowercased name → canonical name

    # ------------------------------------------------------------------
    # Registration
    # ------------------------------------------------------------------

    def register(self, skill: "SkillDefinition") -> None:
        """Register *skill*.  Overwrites any existing skill with the same name."""
        # Forget aliases of a previous registration under this name, otherwise
        # an alias the new definition no longer declares would keep resolving.
        for stale in [a for a, n in self._by_alias.items() if n == skill.name]:
            del self._by_alias[stale]

        self._by_name[skill.name] = skill
        self._by_lower_name[skill.name.lower()] = skill.name
        for alias in skill.aliases:
            self._by_alias[alias.lower()] = skill.name

    # ------------------------------------------------------------------
    # Retrieval
    # ------------------------------------------------------------------

    def get(self, name: str) -> "SkillDefinition | None":
        """Look up a skill by name or alias (case-insensitive).

        An exact-case name match wins, then a case-insensitive name match,
        then an alias.  Returns ``None`` when nothing matches.
        """
        skill = self._by_name.get(name)
        if skill is not None:
            return skill
        key = name.lower()
        canonical = self._by_lower_name.get(key) or self._by_alias.get(key)
        if canonical is None:
            return None
        return self._by_name.get(canonical)

    def all(self) -> list["SkillDefinition"]:
        """Return all registered skills in registration order."""
        return list(self._by_name.values())

    def user_invocable(self) -> list["SkillDefinition"]:
        """Return skills that should appear in ``/skills`` output."""
        return [s for s in self._by_name.values() if s.user_invocable]

    # ------------------------------------------------------------------
    # Search & ranking
    # ------------------------------------------------------------------

    def search(self, query: str, top_k: int = 5) -> list[tuple["SkillDefinition", float]]:
        """Return up to *top_k* skills ranked by relevance to *query*.

        Scoring:
        * Exact name / alias match               → +10 / +8
        * Keyword overlap with description       → +1 per word
        * Tag / capability / type match          → +2 per token
        * Keyword overlap with ``when_to_use``   → +1.5 per word
        * Reliability multiplier: ``0.5 + 0.5 * skill.reliability``
          (a skill with reliability=0 is scored at 50 % of its raw score)

        Skills scoring 0 are omitted; a non-positive *top_k* yields ``[]``.
        """
        query_tokens = _tokenise(query)
        results: list[tuple["SkillDefinition", float]] = []

        for skill in self._by_name.values():
            score = self._score(skill, query_tokens)
            if score > 0:
                results.append((skill, score))

        results.sort(key=lambda x: x[1], reverse=True)
        return results[: max(top_k, 0)]

    def find_best(self, query: str) -> "SkillDefinition | None":
        """Return the single highest-ranked skill for *query*, or ``None``."""
        results = self.search(query, top_k=1)
        return results[0][0] if results else None

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    def _score(self, skill: "SkillDefinition", query_tokens: set[str]) -> float:
        """Return the reliability-weighted relevance of *skill* for the query tokens."""
        score = 0.0

        # Exact name / alias hit.  Names are tokenised the same way as the
        # query, so "code-review" or "todo_write" can match; comparing the raw
        # name against the token set could never succeed for such names.
        if _covers(query_tokens, skill.name):
            score += 10.0
        for alias in skill.aliases:
            if _covers(query_tokens, alias):
                score += 8.0

        # Description overlap
        desc_tokens = _tokenise(skill.description)
        score += len(query_tokens & desc_tokens) * 1.0

        # Tags + capabilities + input/output types
        meta_tokens = _tokenise(
            " ".join(skill.tags + skill.capabilities + skill.input_types + skill.output_types)
        )
        score += len(query_tokens & meta_tokens) * 2.0

        # when_to_use overlap
        use_tokens = _tokenise(skill.when_to_use)
        score += len(query_tokens & use_tokens) * 1.5

        # Reliability weight: unreliable skills are demoted but not excluded
        score *= 0.5 + 0.5 * max(0.0, min(1.0, skill.reliability))

        return score


# ---------------------------------------------------------------------------
# Module-level singleton + helpers
# ---------------------------------------------------------------------------


_registry: SkillRegistry = SkillRegistry()


def get_registry() -> SkillRegistry:
    """Return the global :class:`SkillRegistry` singleton."""
    return _registry


def register_skill(skill: "SkillDefinition") -> None:
    """Register *skill* in the global registry."""
    _registry.register(skill)


def _tokenise(text: str) -> set[str]:
    """Return the set of lowercase alphanumeric runs in *text*."""
    return set(re.findall(r"[a-z0-9]+", text.lower()))


def _covers(query_tokens: set[str], label: str) -> bool:
    """Return True when every token of *label* occurs in *query_tokens*."""
    label_tokens = _tokenise(label)
    return bool(label_tokens) and label_tokens <= query_tokens
# ---------------------------------------------------------------------------
# Built-in validators
# ---------------------------------------------------------------------------


def _retryable_failure(reason: str, retry_hint: str) -> ValidationResult:
    """Build the failed, retryable result shared by all built-in validators."""
    return ValidationResult(
        passed=False,
        reason=reason,
        can_retry=True,
        retry_hint=retry_hint,
    )


class NonEmptyValidator(SkillValidator):
    """Rejects output that is empty or consists only of whitespace."""

    name = "non_empty"

    def validate(self, result: SkillResult, context: dict[str, Any]) -> ValidationResult:
        if result.output.strip():
            return ValidationResult(passed=True)
        return _retryable_failure(
            "Skill produced no output.",
            "Produce a complete, non-empty response.",
        )


class NoErrorValidator(SkillValidator):
    """Rejects results whose ``error`` field is set."""

    name = "no_error"

    def validate(self, result: SkillResult, context: dict[str, Any]) -> ValidationResult:
        if not result.error:
            return ValidationResult(passed=True)
        return _retryable_failure(
            f"Skill returned an error: {result.error}",
            "The previous attempt raised an error. Try a different approach.",
        )


class KeywordValidator(SkillValidator):
    """Checks that required phrases are present and forbidden ones absent.

    Matching is a case-insensitive substring test.  Required phrases are
    checked first, in the order given; then forbidden phrases.
    """

    name = "keyword"

    def __init__(
        self,
        required: list[str] | None = None,
        forbidden: list[str] | None = None,
    ) -> None:
        self._required = [phrase.lower() for phrase in (required or [])]
        self._forbidden = [phrase.lower() for phrase in (forbidden or [])]

    def validate(self, result: SkillResult, context: dict[str, Any]) -> ValidationResult:
        haystack = result.output.lower()

        missing = next((kw for kw in self._required if kw not in haystack), None)
        if missing is not None:
            return _retryable_failure(
                f"Required phrase '{missing}' not found in output.",
                f"Make sure the response includes '{missing}'.",
            )

        present = next((kw for kw in self._forbidden if kw in haystack), None)
        if present is not None:
            return _retryable_failure(
                f"Forbidden phrase '{present}' found in output.",
                f"Do not include '{present}' in the response.",
            )

        return ValidationResult(passed=True)


class MinLengthValidator(SkillValidator):
    """Rejects output whose stripped length is below *min_chars*."""

    name = "min_length"

    def __init__(self, min_chars: int = 50) -> None:
        self._min = min_chars

    def validate(self, result: SkillResult, context: dict[str, Any]) -> ValidationResult:
        length = len(result.output.strip())
        if length >= self._min:
            return ValidationResult(passed=True)
        return _retryable_failure(
            f"Output too short ({length} chars, minimum {self._min}).",
            f"Provide a more complete response (at least {self._min} characters).",
        )


# ---------------------------------------------------------------------------
# SkillVerifier
# ---------------------------------------------------------------------------


class SkillVerifier:
    """Applies a validator chain and reports the first failure, or a pass."""

    def verify(
        self,
        validators: list[SkillValidator],
        result: SkillResult,
        context: dict[str, Any] | None = None,
    ) -> ValidationResult:
        """Run *validators* in order, stopping at the first failure.

        Returns the failing validator's
        :class:`~openvibe.skill.base.ValidationResult`, or a result with
        ``passed=True`` when every validator succeeds (or the list is empty).
        """
        ctx = context or {}
        # Lazy generator: validators after the first failure are never run.
        outcomes = (validator.validate(result, ctx) for validator in validators)
        first_failure = next((vr for vr in outcomes if not vr.passed), None)
        if first_failure is not None:
            return first_failure
        return ValidationResult(passed=True, reason="All validators passed.")
+ """ from openvibe.tool.bash import BashTool from openvibe.tool.edit import EditTool from openvibe.tool.glob_tool import GlobTool @@ -218,6 +223,11 @@ def create_default_registry() -> ToolRegistry: from openvibe.tool.web_fetch import WebFetchTool from openvibe.tool.web_search import WebSearchTool from openvibe.tool.write import WriteTool + from openvibe.tool.computer_app import AppTool + from openvibe.tool.computer_keyboard import KeyboardTool + from openvibe.tool.computer_mouse import MouseTool + from openvibe.tool.computer_screenshot import ScreenshotTool + from openvibe.tool.computer_ui import UITool registry = ToolRegistry() for tool in [ @@ -232,6 +242,16 @@ def create_default_registry() -> ToolRegistry: WebFetchTool(), TodoWriteTool(), TodoReadTool(), + ScreenshotTool(), + UITool(), + MouseTool(), + KeyboardTool(), + AppTool(), ]: registry.register(tool) return registry + + +def create_computer_use_registry() -> ToolRegistry: + """Alias for create_default_registry — computer-use tools are now built-in.""" + return create_default_registry() diff --git a/openvibe/tool/computer_app.py b/openvibe/tool/computer_app.py new file mode 100644 index 0000000..8216964 --- /dev/null +++ b/openvibe/tool/computer_app.py @@ -0,0 +1,305 @@ +"""AppTool — open, close, focus, and list running applications. + +Platform support +---------------- +macOS — uses ``open -a`` / AppleScript via ``osascript`` +Linux — uses ``xdg-open`` / ``wmctrl`` / ``xdotool`` +Windows — uses ``start`` shell command / ``pygetwindow`` + +All actions are gated by the session sandbox allow-list. 
_PLATFORM = platform.system()  # "Darwin", "Linux", "Windows"


class AppTool(Tool):
    """Open, close, focus, or list desktop applications."""

    name = "app"
    description = (
        "Interact with desktop applications: open an app by name, close it, "
        "bring it to the foreground, or list all currently running windows. "
        "Useful for launching IDEs, browsers, terminals, and other tools."
    )

    class Params(Tool.Params):
        action: Literal["open", "close", "focus", "list"] = Field(
            description=(
                "Application action:\n"
                " open — launch an application by name or path\n"
                " close — quit a running application by name\n"
                " focus — bring a window to the foreground by name\n"
                " list — list currently open windows / running applications"
            )
        )
        name: str | None = Field(
            default=None,
            description=(
                "Application name (e.g. 'Terminal', 'Google Chrome', 'VS Code') "
                "or full path to executable. Required for open/close/focus."
            ),
        )

    async def execute(self, ctx: ToolContext, params: "AppTool.Params") -> ToolResult:  # type: ignore[override]
        """Ask for permission, enforce the allow-list, run the action, and audit it.

        Every failure of the platform helper is reported as an error
        ``ToolResult`` and recorded in the session sandbox; nothing is raised
        to the caller from the action itself.
        """
        from openvibe.computer.sandbox import ActionType, get_sandbox

        target = params.name or "(list)"
        await ctx.check_permission(
            tool="app",
            argument=f"{params.action} {target}",
            description=f"App control: {params.action} '{target}'",
        )

        sandbox = get_sandbox(ctx.session_id)

        # Only actions that address a named application are subject to the
        # allow-list; "list" and nameless calls skip the check.
        addresses_app = params.action in ("open", "close", "focus") and bool(params.name)
        if addresses_app and not sandbox.is_app_allowed(params.name):
            return ToolResult(
                title="App action denied",
                output=(
                    f"Application '{params.name}' is not in the allow-list for this session. "
                    f"Allowed: {sandbox.allowed_apps or ['(all)']}"
                ),
                error=True,
            )

        action_type = {
            "open": ActionType.APP_OPEN,
            "close": ActionType.APP_CLOSE,
            "focus": ActionType.APP_FOCUS,
            "list": ActionType.APP_LIST,
        }.get(params.action, ActionType.APP_OPEN)
        audit_params = {"action": params.action, "name": params.name}

        try:
            # The helpers block on subprocesses and sleeps — keep them off the loop.
            loop = asyncio.get_event_loop()
            message = await loop.run_in_executor(None, self._do_action, params)
        except Exception as exc:
            await sandbox.record_action(action_type, params=audit_params, error=str(exc))
            return ToolResult(
                title="App error",
                output=f"App action '{params.action}' failed: {exc}",
                error=True,
            )

        await sandbox.record_action(action_type, params=audit_params, result=message[:200])
        return ToolResult(
            title=f"App: {params.action} '{params.name or ''}'",
            output=message,
        )

    # ------------------------------------------------------------------
    # Implementation — dispatches to platform-specific helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _do_action(params: "AppTool.Params") -> str:
        """Synchronously run the requested action and return its status message.

        Raises ``ValueError`` when a name is required but missing, or when the
        action is unknown.
        """
        if params.action == "list":
            return _list_windows()

        if not params.name:
            raise ValueError(f"name is required for action='{params.action}'.")

        handlers = {"open": _open_app, "close": _close_app, "focus": _focus_app}
        handler = handlers.get(params.action)
        if handler is None:
            raise ValueError(f"Unknown app action: {params.action!r}")
        return handler(params.name)


# ---------------------------------------------------------------------------
# Platform helpers
# ---------------------------------------------------------------------------


def _run(cmd: list[str], **kwargs) -> subprocess.CompletedProcess:  # type: ignore[type-arg]
    """Run *cmd* with captured text output and a 15-second timeout."""
    options = dict(capture_output=True, text=True, timeout=15)
    return subprocess.run(cmd, **options, **kwargs)
def _applescript_literal(value: str) -> str:
    """Return *value* as a double-quoted AppleScript string literal.

    Backslashes and double quotes are escaped so an application name can
    never terminate the literal and inject additional script statements.
    """
    return '"' + value.replace("\\", "\\\\").replace('"', '\\"') + '"'


def _run_optional(cmd: list[str]) -> "subprocess.CompletedProcess[str] | None":
    """Run *cmd* via ``_run``; return ``None`` when the executable is missing."""
    try:
        return _run(cmd)
    except FileNotFoundError:
        return None


def _failure_text(proc: "subprocess.CompletedProcess[str] | None") -> str:
    """Describe why an optional command did not succeed."""
    return "not installed" if proc is None else proc.stderr.strip()


# ---- open ------------------------------------------------------------------

def _open_app(name: str) -> str:
    """Launch the application *name* and wait briefly for its window.

    Raises ``RuntimeError`` when macOS ``open`` fails or the platform is
    unsupported.
    """
    import time

    if _PLATFORM == "Darwin":
        # Launch the app first so it is running before AppleScript activates it.
        r = _run(["open", "-a", name])
        if r.returncode != 0:
            r2 = _run(["open", name])
            if r2.returncode != 0:
                raise RuntimeError(r.stderr.strip() or r2.stderr.strip())
        # Wait for the process to start before AppleScript can address it.
        time.sleep(1.5)

        # Activate and create a new document if the app is document-based.
        # The `try` block is intentional: `make new document` fails silently
        # for apps that don't support it (browsers, media players, etc.).
        script = (
            f'tell application {_applescript_literal(name)}\n'
            f'  activate\n'
            f'  try\n'
            f'    if (count of documents) = 0 then make new document\n'
            f'  end try\n'
            f'end tell'
        )
        _run(["osascript", "-e", script])
        time.sleep(0.5)  # let window settle after document creation
        return f"Opened '{name}' on macOS."

    if _PLATFORM == "Linux":
        # Try launching as a command first, then xdg-open as fallback
        # (also when the path exists but is not executable, e.g. a document).
        try:
            subprocess.Popen([name], start_new_session=True)
        except (FileNotFoundError, PermissionError):
            subprocess.Popen(["xdg-open", name], start_new_session=True)
        time.sleep(2.0)
        return f"Opened '{name}' on Linux."

    if _PLATFORM == "Windows":
        subprocess.Popen(["start", "", name], shell=True, start_new_session=True)
        time.sleep(2.0)
        return f"Opened '{name}' on Windows."

    raise RuntimeError(f"Unsupported platform: {_PLATFORM}")


# ---- close -----------------------------------------------------------------

def _close_app(name: str) -> str:
    """Quit the application *name*; raises ``RuntimeError`` on failure."""
    if _PLATFORM == "Darwin":
        script = f'tell application {_applescript_literal(name)} to quit'
        r = _run(["osascript", "-e", script])
        if r.returncode != 0:
            raise RuntimeError(r.stderr.strip())
        return f"Quit '{name}' via AppleScript."

    if _PLATFORM == "Linux":
        # NOTE(review): `pkill -f` treats *name* as a regular expression over
        # the full command line, so it can match more than the intended app.
        r = _run(["pkill", "-f", name])
        if r.returncode not in (0, 1):
            raise RuntimeError(r.stderr.strip())
        if r.returncode == 1:
            # pkill exits 1 when nothing matched — no signal was sent.
            return f"No running process matched '{name}'."
        return f"Sent SIGTERM to processes matching '{name}'."

    if _PLATFORM == "Windows":
        r = _run(["taskkill", "/IM", name, "/F"])
        if r.returncode != 0:
            raise RuntimeError(r.stderr.strip())
        return f"Terminated '{name}' on Windows."

    raise RuntimeError(f"Unsupported platform: {_PLATFORM}")


# ---- focus -----------------------------------------------------------------

def _focus_app(name: str) -> str:
    """Bring a window of *name* to the foreground; raises ``RuntimeError`` on failure."""
    if _PLATFORM == "Darwin":
        script = f'tell application {_applescript_literal(name)} to activate'
        r = _run(["osascript", "-e", script])
        if r.returncode != 0:
            raise RuntimeError(r.stderr.strip())
        return f"Focused '{name}' via AppleScript."

    if _PLATFORM == "Linux":
        # wmctrl is a common tool for X11 window management; xdotool is the
        # fallback both when wmctrl fails and when it is not installed.
        wm = _run_optional(["wmctrl", "-a", name])
        if wm is None or wm.returncode != 0:
            xdo = _run_optional(["xdotool", "search", "--name", name, "windowactivate"])
            if xdo is None or xdo.returncode != 0:
                raise RuntimeError(
                    f"wmctrl: {_failure_text(wm)} | xdotool: {_failure_text(xdo)}"
                )
        return f"Focused window matching '{name}'."

    if _PLATFORM == "Windows":
        try:
            import pygetwindow as gw  # type: ignore[import-not-found]
        except ImportError as exc:
            raise RuntimeError(
                "pygetwindow is required on Windows: pip install pygetwindow"
            ) from exc

        wins = gw.getWindowsWithTitle(name)
        if not wins:
            raise RuntimeError(f"No window found with title '{name}'.")
        wins[0].activate()
        return f"Focused '{wins[0].title}'."

    raise RuntimeError(f"Unsupported platform: {_PLATFORM}")


# ---- list ------------------------------------------------------------------

def _list_windows() -> str:
    """Return a bullet list of open windows / running applications."""
    if _PLATFORM == "Darwin":
        script = (
            'tell application "System Events" to get the name of every process '
            'whose background only is false'
        )
        r = _run(["osascript", "-e", script])
        if r.returncode != 0:
            raise RuntimeError(r.stderr.strip())
        names = [n.strip() for n in r.stdout.strip().split(",") if n.strip()]
        return "Running applications:\n" + "\n".join(f" • {n}" for n in names)

    if _PLATFORM == "Linux":
        # Try wmctrl first (X11); fall back to the process list when it fails
        # or is not installed.
        wm = _run_optional(["wmctrl", "-l"])
        if wm is not None and wm.returncode == 0:
            lines = [ln.strip() for ln in wm.stdout.strip().splitlines() if ln.strip()]
            return f"Open windows ({len(lines)}):\n" + "\n".join(
                f" • {ln}" for ln in lines
            )
        ps = _run_optional(["ps", "-eo", "comm="])
        if ps is not None and ps.returncode == 0:
            procs = sorted(set(ps.stdout.strip().splitlines()))
            return "Running processes:\n" + "\n".join(f" • {p}" for p in procs[:50])
        raise RuntimeError("Could not list windows: wmctrl and ps both failed.")

    if _PLATFORM == "Windows":
        try:
            import pygetwindow as gw  # type: ignore[import-not-found]
        except ImportError:
            # Fallback to tasklist
            r = _run(["tasklist", "/FO", "CSV", "/NH"])
            if r.returncode == 0:
                rows = r.stdout.strip().splitlines()[:30]
                return "Running processes:\n" + "\n".join(f" • {row}" for row in rows)
            raise RuntimeError("Could not list windows on Windows.")

        titles = [w.title for w in gw.getAllWindows() if w.title.strip()]
        return f"Open windows ({len(titles)}):\n" + "\n".join(
            f" • {t}" for t in titles
        )

    raise RuntimeError(f"Unsupported platform: {_PLATFORM}")
def _pyautogui():  # type: ignore[return]
    """Import and return ``pyautogui``, with an actionable error if missing."""
    try:
        import pyautogui  # type: ignore[import-not-found]
    except ImportError as exc:
        raise ImportError(
            "pyautogui is required for computer use: pip install pyautogui"
        ) from exc
    return pyautogui


def _check_accessibility() -> None:
    """Raise ``RuntimeError`` if macOS Accessibility permission is denied.

    Probes System Events through ``osascript``; a no-op on other platforms.
    """
    import platform
    if platform.system() != "Darwin":
        return
    import subprocess
    r = subprocess.run(
        ["osascript", "-e", 'tell application "System Events" to get name of first process'],
        capture_output=True, text=True, timeout=5,
    )
    if r.returncode != 0 and "not allowed" in (r.stderr + r.stdout).lower():
        raise RuntimeError(
            "macOS Accessibility permission is required for keyboard control. "
            "Go to System Settings → Privacy & Security → Accessibility and add your "
            "terminal application (e.g. iTerm, Terminal, VS Code)."
        )


def _type_text(pag: object, text: str, interval: float) -> None:
    """Type *text* robustly, pasting through the clipboard for Unicode.

    *pag* is the ``pyautogui`` module.  On macOS, Linux and Windows the text
    is copied to the system clipboard (overwriting its contents) and pasted
    with the platform shortcut; *interval* is used only by the per-keystroke
    fallbacks (``xdotool`` on Linux, ``typewrite`` on unknown platforms).

    Raises ``RuntimeError`` on Linux when no clipboard tool or xdotool works.
    """
    import platform
    import time
    sys_name = platform.system()

    if sys_name == "Darwin":
        # pbcopy + cmd+v — handles full Unicode including CJK, emoji
        import subprocess
        subprocess.run(["pbcopy"], input=text.encode("utf-8"), check=True)
        time.sleep(0.1)
        pag.hotkey("command", "v")  # type: ignore[attr-defined]

    elif sys_name == "Linux":
        # Try xclip first, then xsel, then fall back to xdotool type
        import subprocess
        pasted = False
        for clip_cmd in (
            ["xclip", "-selection", "clipboard"],
            ["xsel", "--clipboard", "--input"],
        ):
            try:
                # Output goes to DEVNULL rather than a pipe: the clipboard
                # tool forks a background owner that would hold a captured
                # pipe open and stall run() until the timeout expires.
                subprocess.run(
                    clip_cmd, input=text.encode("utf-8"), check=True,
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=5,
                )
            except (FileNotFoundError, subprocess.CalledProcessError,
                    subprocess.TimeoutExpired):
                continue
            time.sleep(0.05)
            pag.hotkey("ctrl", "v")  # type: ignore[attr-defined]
            pasted = True
            break

        if not pasted:
            # xdotool type as last resort (handles most Unicode via XSendEvent)
            try:
                subprocess.run(
                    ["xdotool", "type", "--clearmodifiers", "--delay",
                     str(int(interval * 1000)), "--", text],
                    check=True, timeout=30,
                )
            except (FileNotFoundError, subprocess.CalledProcessError) as exc:
                raise RuntimeError(
                    "Cannot type text on Linux: install xclip (recommended) or xdotool.\n"
                    " sudo apt install xclip # Debian/Ubuntu\n"
                    " sudo dnf install xclip # Fedora"
                ) from exc

    elif sys_name == "Windows":
        # Use pyperclip for clipboard + Ctrl+V (auto-installed if absent)
        from openvibe.computer.deps import ensure_import
        pyperclip = ensure_import("pyperclip")
        pyperclip.copy(text)
        time.sleep(0.05)
        pag.hotkey("ctrl", "v")  # type: ignore[attr-defined]

    else:
        # Unknown platform — best-effort ASCII via pyautogui
        pag.typewrite(text, interval=interval)  # type: ignore[attr-defined]


class KeyboardTool(Tool):
    """Type text or press keyboard keys and shortcuts."""

    name = "keyboard"
    description = (
        "Simulate keyboard input: type a string of text, press a single key, "
        "or send a key combination (hotkey). Use after clicking into a text "
        "field or interactive element to enter input."
    )

    class Params(Tool.Params):
        action: Literal["type", "press", "hotkey"] = Field(
            description=(
                "Keyboard action:\n"
                " type — type a string of text (use for entering text into fields)\n"
                " press — press a single named key, e.g. 'enter', 'escape', 'tab'\n"
                " hotkey — send a key combination, e.g. keys=['ctrl','c'] for copy"
            )
        )
        text: str | None = Field(
            default=None,
            description="Text to type. Required for action='type'.",
        )
        key: str | None = Field(
            default=None,
            description="Key name to press. Required for action='press'. E.g. 'enter', 'escape', 'tab', 'f5'.",
        )
        keys: list[str] | None = Field(
            default=None,
            description=(
                "Key names for a hotkey combination. Required for action='hotkey'. "
                "E.g. ['ctrl', 'c'] to copy, ['ctrl', 'shift', 'i'] to open DevTools."
            ),
        )
        interval: float = Field(
            default=0.02,
            description="Seconds between keystrokes when typing (action='type').",
        )
        settle_ms: int = Field(
            default=300,
            description=(
                "Milliseconds to wait after the action for the UI to settle before returning. "
                "Increase for slow apps. Default 300."
            ),
        )

    async def execute(self, ctx: ToolContext, params: "KeyboardTool.Params") -> ToolResult:  # type: ignore[override]
        """Ask for permission, perform the keyboard action, and audit it.

        Missing dependencies and invalid parameters are returned as error
        results without an audit record; any other failure is recorded in the
        session sandbox under the action's own type.
        """
        from openvibe.computer.sandbox import ActionType, get_sandbox

        # Build a human-readable argument for the permission check
        if params.action == "type":
            arg_desc = f"type text: {(params.text or '')[:60]!r}"
        elif params.action == "press":
            arg_desc = f"press key: {params.key}"
        else:
            arg_desc = f"hotkey: {'+'.join(params.keys or [])}"

        await ctx.check_permission(
            tool="keyboard",
            argument=arg_desc,
            description=f"Keyboard action — {arg_desc}",
        )

        sandbox = get_sandbox(ctx.session_id)
        action_params: dict = {"action": params.action}
        # Resolved up front so success and failure are audited under the same
        # type (failures were previously always logged as KEYBOARD_TYPE).
        action_type = {
            "type": ActionType.KEYBOARD_TYPE,
            "press": ActionType.KEYBOARD_PRESS,
            "hotkey": ActionType.KEYBOARD_HOTKEY,
        }.get(params.action, ActionType.KEYBOARD_TYPE)

        try:
            # pyautogui calls and settle sleeps block — run them off the loop.
            loop = asyncio.get_running_loop()
            result_msg = await loop.run_in_executor(None, self._do_action, params)
        except ImportError as exc:
            return ToolResult(title="Keyboard error", output=str(exc), error=True)
        except ValueError as exc:
            return ToolResult(title="Keyboard error", output=str(exc), error=True)
        except Exception as exc:
            await sandbox.record_action(action_type, params=action_params, error=str(exc))
            return ToolResult(
                title="Keyboard error",
                output=f"Keyboard action failed: {exc}",
                error=True,
            )

        await sandbox.record_action(action_type, params=action_params, result=result_msg)
        return ToolResult(title=f"Keyboard: {params.action}", output=result_msg)

    @staticmethod
    def _do_action(params: "KeyboardTool.Params") -> str:
        """Synchronous pyautogui calls — runs in thread pool.

        Returns a short status message.  Raises ``ValueError`` when the
        parameter required by the action is missing or the action is unknown.
        """
        import time
        _check_accessibility()
        pag = _pyautogui()
        settle = params.settle_ms / 1000.0

        if params.action == "type":
            if not params.text:
                raise ValueError("text is required for action='type'.")
            _type_text(pag, params.text, params.interval)
            time.sleep(settle)
            preview = params.text[:40] + ("…" if len(params.text) > 40 else "")
            return f"Typed {len(params.text)} characters: {preview!r}"

        if params.action == "press":
            if not params.key:
                raise ValueError("key is required for action='press'.")
            pag.press(params.key)
            time.sleep(settle)
            return f"Pressed key: {params.key!r}"

        if params.action == "hotkey":
            if not params.keys:
                raise ValueError("keys list is required for action='hotkey'.")
            pag.hotkey(*params.keys)
            time.sleep(settle)
            combo = "+".join(params.keys)
            return f"Pressed hotkey: {combo}"

        raise ValueError(f"Unknown keyboard action: {params.action!r}")
+ """ + import platform + if platform.system() != "Darwin": + return + import subprocess + r = subprocess.run( + ["osascript", "-e", 'tell application "System Events" to get name of first process'], + capture_output=True, text=True, timeout=5, + ) + if r.returncode != 0 and "not allowed" in (r.stderr + r.stdout).lower(): + raise RuntimeError( + "macOS Accessibility permission is required for mouse/keyboard control. " + "Go to System Settings → Privacy & Security → Accessibility and add your " + "terminal application (e.g. iTerm, Terminal, VS Code)." + ) + + +class MouseTool(Tool): + """Move, click, scroll, or drag the mouse pointer.""" + + name = "mouse" + description = ( + "Control the mouse: move to a position, click (left/right/middle), " + "double-click, scroll, or drag from one point to another. " + "Coordinates are in screen pixels with (0,0) at the top-left corner." + ) + + class Params(Tool.Params): + action: Literal["move", "click", "double_click", "right_click", "scroll", "drag"] = Field( + description=( + "Mouse action to perform:\n" + " move — move pointer to (x, y) without clicking\n" + " click — left-click at (x, y)\n" + " double_click — double left-click at (x, y)\n" + " right_click — right-click at (x, y)\n" + " scroll — scroll wheel at (x, y) by 'amount' ticks\n" + " drag — drag from (x, y) to (end_x, end_y)" + ) + ) + x: int = Field(description="X coordinate as it appears in the screenshot image.") + y: int = Field(description="Y coordinate as it appears in the screenshot image.") + end_x: int | None = Field( + default=None, + description="Target X coordinate for drag action (image pixels).", + ) + end_y: int | None = Field( + default=None, + description="Target Y coordinate for drag action (image pixels).", + ) + image_width: int | None = Field( + default=None, + description=( + "Width of the screenshot image the coordinates were read from. 
" + "ALWAYS provide this — it is used to translate image pixels to " + "logical screen coordinates, fixing Retina/HiDPI scaling. " + "Use the width reported by the screenshot tool output." + ), + ) + image_height: int | None = Field( + default=None, + description=( + "Height of the screenshot image. Provide alongside image_width." + ), + ) + amount: int = Field( + default=3, + description="Scroll ticks (positive = up/right, negative = down/left). Used only for scroll.", + ) + duration: float = Field( + default=0.25, + description="Movement duration in seconds (for smooth animation).", + ) + settle_ms: int = Field( + default=500, + description=( + "Milliseconds to wait after the action for the UI to settle before returning. " + "Increase to 1000–2000 for slow apps or animations. Default 500." + ), + ) + + async def execute(self, ctx: ToolContext, params: "MouseTool.Params") -> ToolResult: # type: ignore[override] + from openvibe.computer.sandbox import ActionType, get_sandbox + + # ── Retina / HiDPI coordinate scaling ─────────────────────────────── + # Screenshots are downscaled to ≤1920px wide, but pyautogui works in + # logical screen pixels (e.g. 1440×900 on a Retina MacBook). + # If the caller supplies the image dimensions we read from the + # screenshot tool output, we translate automatically — no guessing. 
+ scaled_params = params + scale_note = "" + if params.image_width and params.image_height: + try: + pag = _pyautogui() + screen_w, screen_h = pag.size() + sx = screen_w / params.image_width + sy = screen_h / params.image_height + if abs(sx - 1.0) > 0.02 or abs(sy - 1.0) > 0.02: + # Rebuild params with scaled coordinates + scaled = params.model_copy(update={ + "x": round(params.x * sx), + "y": round(params.y * sy), + "end_x": round(params.end_x * sx) if params.end_x is not None else None, + "end_y": round(params.end_y * sy) if params.end_y is not None else None, + }) + scaled_params = scaled + scale_note = ( + f" [image ({params.x},{params.y}) → logical ({scaled.x},{scaled.y})" + f" scale {sx:.3f}×{sy:.3f}]" + ) + except Exception: + pass # scaling is best-effort; never block the action + # ───────────────────────────────────────────────────────────────────── + + action_label = f"mouse {params.action} at ({params.x}, {params.y})" + await ctx.check_permission( + tool="mouse", + argument=action_label, + description=f"Perform mouse action: {action_label}", + ) + + sandbox = get_sandbox(ctx.session_id) + if not sandbox.is_coordinate_allowed(scaled_params.x, scaled_params.y): + return ToolResult( + title="Mouse action denied", + output=f"Coordinate ({scaled_params.x}, {scaled_params.y}) is outside the permitted screen region.", + error=True, + ) + + action_params: dict = { + "action": params.action, + "x": params.x, + "y": params.y, + "amount": params.amount, + "duration": params.duration, + } + if params.action == "drag": + action_params["end_x"] = params.end_x + action_params["end_y"] = params.end_y + + try: + loop = asyncio.get_event_loop() + result_msg = await loop.run_in_executor( + None, self._do_action, scaled_params + ) + except ImportError as exc: + return ToolResult(title="Mouse error", output=str(exc), error=True) + except Exception as exc: + await sandbox.record_action( + ActionType.MOUSE_CLICK, params=action_params, error=str(exc) + ) + return ToolResult( + 
title="Mouse error", + output=f"Mouse action failed: {exc}", + error=True, + ) + + action_type_map = { + "move": ActionType.MOUSE_MOVE, + "click": ActionType.MOUSE_CLICK, + "double_click": ActionType.MOUSE_CLICK, + "right_click": ActionType.MOUSE_CLICK, + "scroll": ActionType.MOUSE_SCROLL, + "drag": ActionType.MOUSE_DRAG, + } + await sandbox.record_action( + action_type_map.get(params.action, ActionType.MOUSE_CLICK), + params=action_params, + result=result_msg, + ) + + return ToolResult( + title=f"Mouse: {params.action}", + output=result_msg + scale_note, + ) + + @staticmethod + def _do_action(params: "MouseTool.Params") -> str: + """Synchronous pyautogui calls — runs in thread pool.""" + import time + _check_accessibility() + pag = _pyautogui() + pag.FAILSAFE = True # move to corner to abort + + settle = params.settle_ms / 1000.0 + + if params.action == "move": + pag.moveTo(params.x, params.y, duration=params.duration) + time.sleep(settle) + return f"Moved mouse to ({params.x}, {params.y})." + + if params.action == "click": + pag.click(params.x, params.y, duration=params.duration) + time.sleep(settle) + return f"Left-clicked at ({params.x}, {params.y})." + + if params.action == "double_click": + pag.doubleClick(params.x, params.y, duration=params.duration) + time.sleep(settle) + return f"Double-clicked at ({params.x}, {params.y})." + + if params.action == "right_click": + pag.rightClick(params.x, params.y, duration=params.duration) + time.sleep(settle) + return f"Right-clicked at ({params.x}, {params.y})." + + if params.action == "scroll": + pag.scroll(params.amount, x=params.x, y=params.y) + time.sleep(settle) + direction = "up" if params.amount > 0 else "down" + return f"Scrolled {direction} by {abs(params.amount)} ticks at ({params.x}, {params.y})." 
+ + if params.action == "drag": + if params.end_x is None or params.end_y is None: + raise ValueError("end_x and end_y are required for drag action.") + pag.moveTo(params.x, params.y, duration=0.1) + pag.dragTo(params.end_x, params.end_y, duration=params.duration, mouseDownUp=True) + time.sleep(params.settle_ms / 1000.0) + return ( + f"Dragged from ({params.x}, {params.y}) " + f"to ({params.end_x}, {params.end_y})." + ) + + raise ValueError(f"Unknown mouse action: {params.action!r}") diff --git a/openvibe/tool/computer_screenshot.py b/openvibe/tool/computer_screenshot.py new file mode 100644 index 0000000..9f90342 --- /dev/null +++ b/openvibe/tool/computer_screenshot.py @@ -0,0 +1,171 @@ +"""ScreenshotTool — capture the current screen. + +The LLM receives a base-64 PNG attachment so it can *see* the current state +of the desktop before deciding which action to take next. +""" + +from __future__ import annotations + +import asyncio +import os + +from pydantic import Field + +from openvibe.tool.base import Attachment, Tool, ToolContext, ToolResult + + +class ScreenshotTool(Tool): + """Capture a screenshot of the full screen or a specific region.""" + + name = "screenshot" + description = ( + "Capture a screenshot of the current screen or a sub-region. " + "Returns the image so you can observe the current UI state before " + "deciding which action to take next. Call this frequently to verify " + "that previous actions had the intended effect. " + "Use save_path to write the PNG directly to disk (e.g. '/Users/you/Desktop/shot.png')." + ) + + class Params(Tool.Params): + region: list[int] | None = Field( + default=None, + description=( + "Optional screen region to capture as [x, y, width, height] in pixels. " + "Omit (or pass null) to capture the entire primary screen." + ), + ) + save_path: str | None = Field( + default=None, + description=( + "Optional absolute path where the PNG should be saved on disk. " + "Parent directories are created automatically. 
" + "Example: '/Users/alice/Documents/screenshot.png'" + ), + ) + + async def execute(self, ctx: ToolContext, params: "ScreenshotTool.Params") -> ToolResult: # type: ignore[override] + from openvibe.computer.capture import capture_screen + from openvibe.computer.sandbox import ActionType, get_sandbox + + await ctx.check_permission( + tool="screenshot", + argument="capture screen", + description="Take a screenshot of the current screen", + ) + + region: tuple[int, int, int, int] | None = None + if params.region: + if len(params.region) != 4: + return ToolResult( + title="Screenshot error", + output="region must have exactly 4 elements: [x, y, width, height]", + error=True, + ) + region = (params.region[0], params.region[1], params.region[2], params.region[3]) + + sandbox = get_sandbox(ctx.session_id) + if region and not sandbox.is_coordinate_allowed(region[0], region[1]): + return ToolResult( + title="Screenshot denied", + output="The requested region is outside the permitted screen area.", + error=True, + ) + + try: + loop = asyncio.get_event_loop() + png_bytes, width, height = await loop.run_in_executor( + None, capture_screen, region + ) + except ImportError as exc: + return ToolResult( + title="Screenshot error", + output=str(exc), + error=True, + ) + except Exception as exc: + await sandbox.record_action( + ActionType.SCREENSHOT, + params={"region": params.region}, + error=str(exc), + ) + return ToolResult( + title="Screenshot error", + output=f"Failed to capture screenshot: {exc}", + error=True, + ) + + # Compute diff against the previous screenshot stored in the sandbox. + # This gives the LLM concrete change-detection feedback rather than + # requiring it to visually compare two images in its head. 
+ diff_summary: str | None = None + if sandbox.last_screenshot is not None: + try: + from openvibe.computer.capture import diff_screenshots + loop = asyncio.get_event_loop() + diff = await loop.run_in_executor( + None, diff_screenshots, sandbox.last_screenshot, png_bytes + ) + diff_summary = diff["summary"] # type: ignore[index] + except Exception: + pass # diff is best-effort; never block the screenshot + + # Update the stored baseline for the next comparison. + sandbox.last_screenshot = png_bytes + + await sandbox.record_action( + ActionType.SCREENSHOT, + params={"region": params.region}, + result=f"{width}x{height}" + (f" | {diff_summary}" if diff_summary else ""), + ) + + # Save to disk if a path was requested. + saved_path: str | None = None + if params.save_path: + try: + dest = os.path.expanduser(params.save_path) + os.makedirs(os.path.dirname(dest) if os.path.dirname(dest) else ".", exist_ok=True) + with open(dest, "wb") as fh: + fh.write(png_bytes) + saved_path = dest + except Exception as exc: + return ToolResult( + title="Screenshot save error", + output=f"Screenshot captured ({width}×{height}) but could not be saved to {params.save_path!r}: {exc}", + attachments=[ + Attachment(filename="screenshot.png", content=png_bytes, media_type="image/png") + ], + metadata={"width": width, "height": height}, + error=True, + ) + + region_desc = f" (region {params.region})" if params.region else "" + save_desc = f" → saved to {saved_path}" if saved_path else "" + + # Include logical screen size so the mouse tool can scale correctly. 
+ logical_note = "" + try: + import pyautogui # type: ignore[import-not-found] + lw, lh = pyautogui.size() + logical_note = f" (logical screen: {lw}×{lh})" + except Exception: + pass + + output_lines = [ + f"Captured {width}×{height} screenshot{region_desc}{save_desc}.{logical_note}", + f"Mouse coordinates: pass image_width={width}, image_height={height} to the mouse tool for correct Retina scaling.", + ] + if diff_summary: + output_lines.append(f"Change detection vs previous screenshot: {diff_summary}") + + return ToolResult( + title=f"Screenshot {width}×{height}{region_desc}", + output="\n".join(output_lines), + attachments=[ + Attachment( + filename="screenshot.png", + content=png_bytes, + media_type="image/png", + ) + ], + metadata={"width": width, "height": height, "truncated": True}, + ) diff --git a/openvibe/tool/computer_ui.py b/openvibe/tool/computer_ui.py new file mode 100644 index 0000000..f1aeca0 --- /dev/null +++ b/openvibe/tool/computer_ui.py @@ -0,0 +1,917 @@ +"""UITool — accessibility-tree based UI interaction, cross-platform. + +Clicks buttons, types text, navigates menus, and reads element values through +each platform's native accessibility API. No pixel coordinates needed — +elements are found by their title, role, or position in the tree. + +Platform backends +----------------- +macOS — AppleScript / System Events (built-in, no extra deps) +Linux — AT-SPI2 via ``pyatspi`` (preferred) with ``xdotool`` fallbacks +Windows — UI Automation via ``pywinauto`` (preferred) with Win32 fallbacks + +Why this beats coordinate-based mouse clicks +-------------------------------------------- +* No Retina / HiDPI scaling translation needed. +* Works at any window position or screen resolution. +* Errors are descriptive: "button 'Save' not found" vs a silent mis-click. +* Uses the same accessibility layer as screen readers. 
+ +Graceful degradation +-------------------- +If the preferred library (pyatspi / pywinauto) is missing, the tool falls +back to subprocess tools (xdotool on Linux, basic Win32 commands on Windows) +and surfaces a clear install hint in any error message. + +For apps that expose no accessibility elements (Electron games, custom +renderers) the tool returns a clear message and the caller should fall back +to screenshot + mouse with image_width/image_height for coordinate scaling. +""" + +from __future__ import annotations + +import platform +import subprocess +import time +from typing import Any, Literal + +from pydantic import Field + +from openvibe.tool.base import Tool, ToolContext, ToolResult + +_PLATFORM = platform.system() # "Darwin" | "Linux" | "Windows" + + +# --------------------------------------------------------------------------- +# Tool definition (platform-agnostic interface) +# --------------------------------------------------------------------------- + + +class UITool(Tool): + """Interact with desktop UI elements by name — no pixel coordinates needed. + + Prefer this over the mouse tool whenever the target element has a visible + label or title. Use mouse clicks only for unlabelled canvas areas + (games, drawing tools, video players) where no accessible elements exist. + """ + + name = "ui" + description = ( + "Interact with UI elements by name on any OS — no coordinates needed. " + "ALWAYS try this before the mouse tool. Actions:\n" + " get_tree — list all accessible elements in the app window\n" + " click — click a button or element by its title\n" + " click_menu — click a menu item, e.g. 
File → Save\n" + " type — type text (clipboard-paste, Unicode-safe)\n" + " press_key — press a key or chord: key='return', modifiers=['command']\n" + " get_value — read the current text value of a named element" + ) + + class Params(Tool.Params): + action: Literal["get_tree", "click", "click_menu", "type", "press_key", "get_value"] = Field( + description="UI action to perform." + ) + app: str = Field( + description=( + "Application name / process name. " + "macOS: process name as in Activity Monitor, e.g. 'TextEdit', 'Safari'. " + "Linux: process name or window title substring, e.g. 'gedit', 'firefox'. " + "Windows: executable name or window title, e.g. 'Notepad', 'notepad.exe'." + ) + ) + title: str | None = Field( + default=None, + description="Title or label of the UI element (for click, get_value).", + ) + role: str | None = Field( + default=None, + description=( + "Element role to narrow the search. " + "macOS: 'button', 'text field', 'text area', 'checkbox', 'pop up button'. " + "Linux: 'push button', 'entry', 'text', 'check box', 'combo box'. " + "Windows: 'Button', 'Edit', 'Text', 'CheckBox', 'ComboBox', 'ListItem'. " + "Optional — omit to search all roles." + ), + ) + text: str | None = Field( + default=None, + description="Text to type. Required for action='type'.", + ) + menu: str | None = Field( + default=None, + description="Menu bar menu name for click_menu, e.g. 'File', 'Edit', 'View'.", + ) + menu_item: str | None = Field( + default=None, + description="Menu item name for click_menu, e.g. 'Save', 'Copy', 'Undo'.", + ) + key: str | None = Field( + default=None, + description=( + "Key for press_key: 'return', 'escape', 'tab', 'space', 'delete', " + "'up', 'down', 'left', 'right', 'home', 'end', 'pageup', 'pagedown', " + "or a single character like 's', 'z'." + ), + ) + modifiers: list[str] = Field( + default_factory=list, + description=( + "Modifier keys for press_key: 'command' (macOS), 'ctrl'/'control', " + "'shift', 'alt'/'option'. E.g. 
async def execute(self, ctx: ToolContext, params: "UITool.Params") -> ToolResult:  # type: ignore[override]
    """Ask for permission, run the accessibility action, and audit the outcome.

    The platform backend is invoked through ``_do_action`` on the default
    executor because it shells out / sleeps synchronously.

    Returns:
        A ``ToolResult`` with the backend's message, or one flagged
        ``error=True`` carrying the exception text when the action fails.
    """
    import asyncio
    from openvibe.computer.sandbox import ActionType, get_sandbox

    await ctx.check_permission(
        tool="ui",
        argument=f"{params.action} in {params.app}",
        description=f"UI accessibility action: {params.action} in '{params.app}'",
    )

    sandbox = get_sandbox(ctx.session_id)
    # get_running_loop() is the supported call inside a coroutine.
    loop = asyncio.get_running_loop()
    audit_params = {"action": params.action, "app": params.app, "title": params.title}

    try:
        result_msg = await loop.run_in_executor(None, self._do_action, params)
    except Exception as exc:
        # Record every failure. Previously only RuntimeError/ValueError/
        # ImportError reached the audit log, so timeouts and missing
        # binaries left no trace.
        await sandbox.record_action(
            ActionType.MOUSE_CLICK,
            params=audit_params,
            error=str(exc),
        )
        if isinstance(exc, (RuntimeError, ValueError, ImportError)):
            error_title = f"UI error: {params.action} in {params.app}"
        else:
            error_title = f"UI error: {params.action}"
        return ToolResult(
            title=error_title,
            output=str(exc),
            error=True,
        )

    await sandbox.record_action(
        ActionType.MOUSE_CLICK,
        params=audit_params,
        result=result_msg[:200],  # keep audit entries compact
    )
    label = params.title or params.menu_item or params.key or ""
    return ToolResult(
        title=f"UI: {params.action} '{label}' in {params.app}",
        output=result_msg,
    )
def _osascript(script: str, timeout: int = 15) -> str:
    """Run an AppleScript and return stdout. Raises RuntimeError on failure.

    Args:
        script: AppleScript source passed to ``osascript -e``.
        timeout: Seconds to wait before giving up.

    Returns:
        The script's standard output with surrounding whitespace stripped.

    Raises:
        RuntimeError: if osascript exits non-zero (message is its stderr, or
            stdout when stderr is empty) or does not finish within ``timeout``.
    """
    try:
        r = subprocess.run(
            ["osascript", "-e", script],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Honour the documented contract: callers only expect RuntimeError.
        raise RuntimeError(f"AppleScript timed out after {timeout} seconds.") from exc
    if r.returncode != 0:
        raise RuntimeError((r.stderr or r.stdout).strip())
    return r.stdout.strip()
output to output & " [" & r & "] " & t & "\\n" + end if + end try + end repeat + return output + end tell +end tell +""" + result = _osascript(script) + if not result.strip(): + return ( + f"No accessible elements found in {app} window {window_index}. " + "App may use custom rendering (Electron, game engines). " + "Fall back to mouse tool with image_width/image_height." + ) + return result + + +def _macos_click(app: str, title: str | None, role: str | None, window_index: int = 1) -> str: + if not title and not role: + raise ValueError("Provide at least 'title' or 'role' for click.") + if title: + escaped = title.replace('"', '\\"') + script = f""" +tell application "System Events" + tell process "{app}" + set allElems to entire contents of window {window_index} + repeat with elem in allElems + try + set matched to false + try + if title of elem is "{escaped}" then set matched to true + end try + if not matched then + try + if description of elem is "{escaped}" then set matched to true + end try + end if + if matched then + click elem + return "Clicked [" & (role of elem) & "] \\"{escaped}\\" in {app}." + end if + end try + end repeat + error "No element titled \\"{escaped}\\" in {app} window {window_index}. Run get_tree to list elements." + end tell +end tell +""" + else: + script = f""" +tell application "System Events" + tell process "{app}" + click first {role} of window {window_index} + return "Clicked first [{role}] in {app}." + end tell +end tell +""" + return _osascript(script) + + +def _macos_click_menu(app: str, menu: str | None, menu_item: str | None) -> str: + if not menu or not menu_item: + raise ValueError("Both 'menu' and 'menu_item' are required for click_menu.") + em = menu.replace('"', '\\"') + ei = menu_item.replace('"', '\\"') + script = f""" +tell application "System Events" + tell process "{app}" + click menu item "{ei}" of menu "{em}" of menu bar item "{em}" of menu bar 1 + return "Clicked {menu} → {menu_item} in {app}." 
+ end tell +end tell +""" + return _osascript(script) + + +def _macos_type(app: str, text: str | None) -> str: + if not text: + raise ValueError("'text' is required for action='type'.") + _osascript(f'tell application "{app}" to activate') + time.sleep(0.3) + subprocess.run(["pbcopy"], input=text.encode("utf-8"), check=True) + time.sleep(0.1) + _osascript('tell application "System Events" to keystroke "v" using {command down}') + time.sleep(0.2) + preview = text[:60] + ("…" if len(text) > 60 else "") + return f"Typed {len(text)} chars into {app}: {preview!r}" + + +_MACOS_KEY_MAP: dict[str, tuple[str, bool]] = { + "return": ("return", False), "enter": ("return", False), + "escape": ("escape", False), "esc": ("escape", False), + "tab": ("tab", False), "space": ("space", False), + "delete": ("delete", False), "backspace": ("delete", False), + "up": ("126", True), "down": ("125", True), + "left": ("123", True), "right": ("124", True), + "home": ("115", True), "end": ("119", True), + "pageup": ("116", True), "pagedown": ("121", True), + "f1": ("122", True), "f2": ("120", True), "f3": ("99", True), + "f4": ("118", True), "f5": ("96", True), "f6": ("97", True), + "f7": ("98", True), "f8": ("100", True), "f9": ("101", True), + "f10": ("109", True), "f11": ("103", True), "f12": ("111", True), +} +_MACOS_MOD_MAP = { + "command": "command down", "cmd": "command down", + "shift": "shift down", "option": "option down", + "alt": "option down", "control": "control down", "ctrl": "control down", +} + + +def _macos_press_key(app: str, key: str | None, modifiers: list[str]) -> str: + if not key: + raise ValueError("'key' is required for action='press_key'.") + _osascript(f'tell application "{app}" to activate') + time.sleep(0.15) + mod_parts = [_MACOS_MOD_MAP.get(m.lower(), f"{m} down") for m in modifiers] + mod_str = "{" + ", ".join(mod_parts) + "}" if mod_parts else "" + kl = key.lower() + if kl in _MACOS_KEY_MAP: + val, is_code = _MACOS_KEY_MAP[kl] + verb = "key code" if is_code 
def _linux_dispatch(params: "UITool.Params") -> str:
    """Route a UI action to the AT-SPI backend, or to xdotool when pyatspi is unavailable.

    Only the availability probe is guarded. Errors raised by the AT-SPI
    backend itself (app not found, element not found, ...) propagate
    unchanged, so the caller sees the real cause rather than a misleading
    "install pyatspi" message from the fallback path.
    """
    try:
        from openvibe.computer.deps import ensure_import
        ensure_import("pyatspi")
    except (ImportError, RuntimeError):
        # pyatspi cannot be provided — use the limited xdotool backend.
        return _linux_xdotool_dispatch(params)
    return _linux_atspi_dispatch(params)
def _linux_atspi_get_tree(app_name: str, window_index: int = 1) -> str:
    """List the accessible elements of an app as ``[role] name`` lines.

    Args:
        app_name: Application name resolved through ``_atspi_find_app``.
        window_index: Accepted for parity with the other backends; this
            implementation walks the whole application and does not use it.

    Returns:
        A header line followed by up to 150 element lines (walk depth is
        capped at 6), or a hint to fall back to the mouse tool when the app
        exposes no elements.
    """
    app = _atspi_find_app(app_name)
    lines: list[str] = [f"App: {app.name}"]

    nodes = _atspi_walk(app, max_depth=6)
    # The walk yields the application node first. It is already shown in the
    # header; counting it as an element made the "no accessible elements"
    # fallback below unreachable.
    next(nodes, None)

    seen = 0
    for node in nodes:
        if seen >= 150:
            lines.append("  … (truncated — app has many elements)")
            break
        try:
            name = node.name or ""
            role = node.getLocalizedRoleName() or ""
            if name or role:
                lines.append(f"  [{role}] {name}")
                seen += 1
        except Exception:
            pass  # stale/defunct nodes are skipped
    if len(lines) == 1:
        return (
            f"No accessible elements found in '{app_name}'. "
            "App may use custom rendering. Fall back to mouse tool."
        )
    return "\n".join(lines)
+ # Any action + if action.nActions > 0: + action.doAction(0) + return f"Activated [{node.getLocalizedRoleName()}] '{node.name}' in {app_name}." + except Exception: + pass + # Fall back: move mouse to element centre and click + try: + bbox = node.queryComponent().getExtents(pyatspi.DESKTOP_COORDS) + cx, cy = bbox.x + bbox.width // 2, bbox.y + bbox.height // 2 + _xdotool("mousemove", "--sync", str(cx), str(cy)) + _xdotool("click", "1") + return f"Clicked at ({cx},{cy}) [{node.getLocalizedRoleName()}] '{node.name}' in {app_name}." + except Exception as exc: + raise RuntimeError(f"Could not click element '{title}': {exc}") from exc + + +def _linux_atspi_click_menu(app_name: str, menu: str | None, menu_item: str | None) -> str: + import pyatspi + if not menu or not menu_item: + raise ValueError("Both 'menu' and 'menu_item' are required.") + app = _atspi_find_app(app_name) + # Find menu bar → menu → item + menu_bar = _atspi_find(app, None, "menu bar") + if menu_bar is None: + menu_bar = _atspi_find(app, None, "menubar") + if menu_bar is None: + raise RuntimeError(f"No menu bar found in '{app_name}'.") + menu_node = _atspi_find(menu_bar, menu, "menu") + if menu_node is None: + raise RuntimeError(f"Menu '{menu}' not found in '{app_name}'.") + # Open the menu + try: + act = menu_node.queryAction() + act.doAction(0) + time.sleep(0.2) + except Exception: + pass + item_node = _atspi_find(menu_node, menu_item, "menu item") + if item_node is None: + raise RuntimeError(f"Menu item '{menu_item}' not found under '{menu}' in '{app_name}'.") + try: + act = item_node.queryAction() + act.doAction(0) + except Exception: + bbox = item_node.queryComponent().getExtents(pyatspi.DESKTOP_COORDS) + cx, cy = bbox.x + bbox.width // 2, bbox.y + bbox.height // 2 + _xdotool("mousemove", "--sync", str(cx), str(cy)) + _xdotool("click", "1") + return f"Clicked {menu} → {menu_item} in {app_name}." 
+ + +def _linux_atspi_get_value(app_name: str, title: str | None, role: str | None) -> str: + app = _atspi_find_app(app_name) + node = _atspi_find(app, title, role) + if node is None: + raise RuntimeError(f"No element title='{title}' role='{role}' in '{app_name}'.") + try: + val = node.queryValue() + return str(val.currentValue) + except Exception: + pass + try: + text = node.queryText() + return text.getText(0, -1) + except Exception: + pass + return node.name or "(no value)" + + +# ---- xdotool-only fallback for Linux ---- + +def _linux_xdotool_dispatch(params: "UITool.Params") -> str: + """Fallback when pyatspi is not installed — limited but usable.""" + if params.action == "type": + return _linux_type(params.app, params.text) + if params.action == "press_key": + return _linux_press_key(params.app, params.key, params.modifiers) + if params.action in ("get_tree", "click", "click_menu", "get_value"): + if not _has_xdotool(): + raise ImportError( + "pyatspi is required for accessibility-based UI on Linux: " + "pip install pyatspi (or: sudo apt install python3-pyatspi)\n" + "xdotool also not found. Install it for basic input: " + "sudo apt install xdotool" + ) + raise ImportError( + f"action='{params.action}' requires pyatspi for element discovery. " + "Install it: pip install pyatspi (or: sudo apt install python3-pyatspi)\n" + "For 'type' and 'press_key', xdotool is sufficient and already available." 
+ ) + raise ValueError(f"Unknown action: {params.action!r}") + + +# ---- Shared Linux helpers ---- + +def _linux_focus_window(app_name: str) -> None: + """Best-effort: focus the window matching app_name.""" + r = _xdotool("search", "--name", app_name, "windowactivate", "--sync") + if r.returncode != 0: + _xdotool("search", "--class", app_name, "windowactivate", "--sync") + time.sleep(0.2) + + +def _linux_type(app_name: str, text: str | None) -> str: + if not text: + raise ValueError("'text' is required for action='type'.") + _linux_focus_window(app_name) + # Try clipboard paste (Unicode-safe) + for clipboard_tool in [ + (["xclip", "-selection", "clipboard"], ["xdotool", "key", "--clearmodifiers", "ctrl+v"]), + (["xsel", "--clipboard", "--input"], ["xdotool", "key", "--clearmodifiers", "ctrl+v"]), + ]: + copy_cmd, paste_cmd = clipboard_tool + r = subprocess.run(copy_cmd, input=text.encode("utf-8"), + capture_output=True, timeout=5) + if r.returncode == 0: + time.sleep(0.1) + subprocess.run(paste_cmd, check=True, timeout=5) + time.sleep(0.1) + preview = text[:60] + ("…" if len(text) > 60 else "") + return f"Typed {len(text)} chars into {app_name}: {preview!r}" + # Last resort: xdotool type (ASCII only, slow) + if _has_xdotool(): + _xdotool("type", "--clearmodifiers", "--delay", "20", text, check=True) + return f"Typed (xdotool, ASCII mode) {len(text)} chars into {app_name}." 
+ raise RuntimeError( + "Could not type text: install xclip or xsel for Unicode support: " + "sudo apt install xclip" + ) + + +_LINUX_KEY_MAP = { + "return": "Return", "enter": "Return", + "escape": "Escape", "esc": "Escape", + "tab": "Tab", "space": "space", + "delete": "Delete", "backspace": "BackSpace", + "up": "Up", "down": "Down", "left": "Left", "right": "Right", + "home": "Home", "end": "End", + "pageup": "Prior", "pagedown": "Next", + **{f"f{i}": f"F{i}" for i in range(1, 13)}, +} +_LINUX_MOD_MAP = { + "command": "ctrl", # remap macOS cmd → Ctrl on Linux + "cmd": "ctrl", + "ctrl": "ctrl", "control": "ctrl", + "shift": "shift", + "alt": "alt", "option": "alt", +} + + +def _linux_press_key(app_name: str, key: str | None, modifiers: list[str]) -> str: + if not key: + raise ValueError("'key' is required for action='press_key'.") + if not _has_xdotool(): + raise RuntimeError("xdotool is required for press_key on Linux: sudo apt install xdotool") + _linux_focus_window(app_name) + mapped_key = _LINUX_KEY_MAP.get(key.lower(), key) + mapped_mods = [_LINUX_MOD_MAP.get(m.lower(), m) for m in modifiers] + combo = "+".join(mapped_mods + [mapped_key]) if mapped_mods else mapped_key + _xdotool("key", "--clearmodifiers", combo, check=True) + display = ("+".join(modifiers) + "+" if modifiers else "") + key + return f"Pressed {display} in {app_name}." 
+ + +# =========================================================================== +# Windows backend — pywinauto (UI Automation) preferred, fallbacks where needed +# =========================================================================== + +# Role name translation: user-friendly → pywinauto control_type +_WIN_ROLE_MAP = { + "button": "Button", "btn": "Button", + "edit": "Edit", "text field": "Edit", "input": "Edit", + "text": "Text", "label": "Text", "static": "Text", + "checkbox": "CheckBox", "check box": "CheckBox", + "combobox": "ComboBox", "combo box": "ComboBox", "dropdown": "ComboBox", + "listitem": "ListItem", "list item": "ListItem", + "listbox": "List", "list": "List", + "tab": "TabItem", "tab item": "TabItem", + "tree item": "TreeItem", "treeitem": "TreeItem", + "menu item": "MenuItem", "menuitem": "MenuItem", + "toolbar": "ToolBar", "tool bar": "ToolBar", +} + + +def _win_connect(app_name: str): + """Connect to a running Windows application via pywinauto.""" + from openvibe.computer.deps import ensure_import + _pywinauto = ensure_import("pywinauto") + Application = _pywinauto.Application # type: ignore[attr-defined] + + errors: list[str] = [] + # Try executable name + for backend in ("uia", "win32"): + for kwargs in ( + {"path": app_name}, + {"title_re": f".*{app_name}.*"}, + {"title": app_name}, + ): + try: + return Application(backend=backend).connect(**kwargs, timeout=3) + except Exception as e: + errors.append(str(e)) + + raise RuntimeError( + f"Could not connect to '{app_name}' on Windows. " + f"Make sure the app is running. 
Tried: {'; '.join(errors[:3])}" + ) + + +def _windows_dispatch(params: "UITool.Params") -> str: + if params.action == "get_tree": + return _windows_get_tree(params.app, params.window_index) + if params.action == "click": + return _windows_click(params.app, params.title, params.role, params.window_index) + if params.action == "click_menu": + return _windows_click_menu(params.app, params.menu, params.menu_item) + if params.action == "type": + return _windows_type(params.app, params.text) + if params.action == "press_key": + return _windows_press_key(params.app, params.key, params.modifiers) + if params.action == "get_value": + return _windows_get_value(params.app, params.title, params.role, params.window_index) + raise ValueError(f"Unknown action: {params.action!r}") + + +def _windows_get_tree(app_name: str, window_index: int = 1) -> str: + app = _win_connect(app_name) + dlg = app.top_window() + lines = [f"Window: {dlg.window_text()}"] + seen = 0 + try: + for ctrl in dlg.descendants(): + if seen >= 150: + lines.append(" … (truncated)") + break + try: + title = ctrl.window_text().strip() + role = ctrl.friendly_class_name() + if title: + lines.append(f" [{role}] {title}") + seen += 1 + except Exception: + pass + except Exception: + pass + if len(lines) == 1: + return ( + f"No accessible elements in '{app_name}'. " + "Fall back to mouse tool with coordinates." + ) + return "\n".join(lines) + + +def _windows_click(app_name: str, title: str | None, role: str | None, window_index: int = 1) -> str: + app = _win_connect(app_name) + dlg = app.top_window() + win_role = _WIN_ROLE_MAP.get((role or "").lower()) + try: + if title and win_role: + ctrl = dlg.child_window(title=title, control_type=win_role) + elif title: + ctrl = dlg.child_window(title=title) + elif win_role: + ctrl = dlg.child_window(control_type=win_role) + else: + raise ValueError("Provide 'title' or 'role' for click.") + ctrl.click_input() + return f"Clicked '{title or role}' in {app_name}." 
+ except Exception as exc: + raise RuntimeError( + f"Could not click '{title or role}' in '{app_name}'. " + f"Run get_tree to see available elements. Error: {exc}" + ) from exc + + +def _windows_click_menu(app_name: str, menu: str | None, menu_item: str | None) -> str: + if not menu or not menu_item: + raise ValueError("Both 'menu' and 'menu_item' are required for click_menu.") + app = _win_connect(app_name) + dlg = app.top_window() + try: + dlg.menu_select(f"{menu}->{menu_item}") + return f"Clicked {menu} → {menu_item} in {app_name}." + except Exception as exc: + raise RuntimeError( + f"Could not click menu '{menu}→{menu_item}' in '{app_name}': {exc}" + ) from exc + + +def _windows_type(app_name: str, text: str | None) -> str: + if not text: + raise ValueError("'text' is required for action='type'.") + # Use pyperclip for Unicode-safe clipboard paste (auto-installed if absent) + from openvibe.computer.deps import ensure_import + pyperclip = ensure_import("pyperclip") + pyperclip.copy(text) + + app = _win_connect(app_name) + app.top_window().type_keys("^v") # Ctrl+V + time.sleep(0.15) + preview = text[:60] + ("…" if len(text) > 60 else "") + return f"Typed {len(text)} chars into {app_name}: {preview!r}" + + +_WIN_KEY_MAP = { + "return": "{ENTER}", "enter": "{ENTER}", + "escape": "{ESC}", "esc": "{ESC}", + "tab": "{TAB}", "space": " ", + "delete": "{DELETE}", "backspace": "{BACKSPACE}", + "up": "{UP}", "down": "{DOWN}", "left": "{LEFT}", "right": "{RIGHT}", + "home": "{HOME}", "end": "{END}", + "pageup": "{PGUP}", "pagedown": "{PGDN}", + **{f"f{i}": f"{{F{i}}}" for i in range(1, 13)}, +} +_WIN_MOD_PREFIX = { + "command": "^", "cmd": "^", # remap macOS cmd → Ctrl on Windows + "ctrl": "^", "control": "^", + "shift": "+", + "alt": "%", "option": "%", +} + + +def _windows_press_key(app_name: str, key: str | None, modifiers: list[str]) -> str: + if not key: + raise ValueError("'key' is required for action='press_key'.") + app = _win_connect(app_name) + prefix = 
"".join(_WIN_MOD_PREFIX.get(m.lower(), "") for m in modifiers) + key_str = _WIN_KEY_MAP.get(key.lower(), key if len(key) == 1 else f"{{{key.upper()}}}") + app.top_window().type_keys(f"{prefix}{key_str}") + display = ("+".join(modifiers) + "+" if modifiers else "") + key + return f"Pressed {display} in {app_name}." + + +def _windows_get_value(app_name: str, title: str | None, role: str | None, window_index: int = 1) -> str: + app = _win_connect(app_name) + dlg = app.top_window() + win_role = _WIN_ROLE_MAP.get((role or "").lower()) + try: + if title and win_role: + ctrl = dlg.child_window(title=title, control_type=win_role) + elif title: + ctrl = dlg.child_window(title=title) + elif win_role: + ctrl = dlg.child_window(control_type=win_role) + else: + raise ValueError("Provide 'title' or 'role' for get_value.") + return ctrl.window_text() + except Exception as exc: + raise RuntimeError( + f"Could not get value of '{title or role}' in '{app_name}': {exc}" + ) from exc diff --git a/openvibe/tool/web_fetch.py b/openvibe/tool/web_fetch.py index f43e9d0..bfd35c3 100644 --- a/openvibe/tool/web_fetch.py +++ b/openvibe/tool/web_fetch.py @@ -64,7 +64,7 @@ async def execute( async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client: response = await client.get( params.url, - headers={"User-Agent: openvibe/0.1 (AI coding agent))"}, + headers={"User-Agent": "openvibe/0.1 (AI coding agent)"}, ) response.raise_for_status() except httpx.TimeoutException: diff --git a/openvibe/tui/screens/session.py b/openvibe/tui/screens/session.py index c63c092..e24b8bf 100644 --- a/openvibe/tui/screens/session.py +++ b/openvibe/tui/screens/session.py @@ -221,11 +221,17 @@ async def handle_submitted(self, event: InputBar.Submitted) -> None: # Slash commands — handled at the API level, but we intercept the # result here to avoid freezing the UI / showing a spinner. 
- from openvibe.commands import is_command + # Skill invocations (/skillname args) look like commands but go to + # the LLM; route them through _start_turn instead. + from openvibe.commands import _COMMANDS, is_command # noqa: PLC2701 if is_command(event.text): - await self._handle_command(event.text) - return + parts = event.text[1:].split(None, 1) + name = parts[0].lower() if parts else "" + if name in _COMMANDS: + await self._handle_command(event.text) + return + # Not a registered command — fall through to LLM path (skill). input_bar = self.query_one(InputBar) input_bar.record_submission(event.text) @@ -295,7 +301,7 @@ async def _handle_command(self, text: str) -> None: reply_msg.id, str(MessageRole.ASSISTANT), ) - result_widget.append_text(result.output) + result_widget.set_markup(result.output) # ------------------------------------------------------------------ # Permission handling diff --git a/openvibe/tui/widgets/messages.py b/openvibe/tui/widgets/messages.py index 82c0eec..5a5f3bb 100644 --- a/openvibe/tui/widgets/messages.py +++ b/openvibe/tui/widgets/messages.py @@ -234,6 +234,13 @@ def append_text(self, content: str) -> None: self._text += safe widget.update(self._text) + def set_markup(self, content: str) -> None: + """Display pre-rendered Rich markup directly, bypassing markdown processing.""" + widget = self._simple_widget() + widget.remove_class("hidden-text") + self._text = content + widget.update(content) + def replace_text(self, content: str) -> None: self._text = content if self._role == "assistant": diff --git a/pyproject.toml b/pyproject.toml index 5ad70d6..9234f79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,9 @@ dependencies = [ "selenium>=4.0.0", "webdriver-manager>=4.0.0", "beautifulsoup4>=4.12.0", + "mss>=9.0.0", + "Pillow>=10.0.0", + "pyautogui>=0.9.54", ] [project.optional-dependencies] diff --git a/tests/test_computer_use.py b/tests/test_computer_use.py new file mode 100644 index 0000000..8273769 --- /dev/null +++ 
b/tests/test_computer_use.py @@ -0,0 +1,848 @@ +"""Tests for the computer-use subsystem. + +These tests cover the sandbox, tool parameter validation, and permission +gating without requiring the optional mss/pyautogui/pillow packages to be +installed (all screen/input calls are mocked out). +""" + +from __future__ import annotations + +import asyncio +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from openvibe.computer.sandbox import ( + ActionType, + AuditEntry, + ComputerSandbox, + clear_sandbox, + get_sandbox, +) +from openvibe.tool.base import ToolContext, ToolResult + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _ctx(session_id: str = "test-session") -> ToolContext: + """Build a minimal ToolContext with no permission service (all allowed).""" + return ToolContext( + session_id=session_id, + message_id="msg-1", + agent_name="computer", + project_id="proj-1", + working_dir="/tmp", + abort=asyncio.Event(), + call_id="call-1", + _permissions=None, + ) + + +# --------------------------------------------------------------------------- +# Sandbox tests +# --------------------------------------------------------------------------- + + +class TestComputerSandbox: + def setup_method(self): + clear_sandbox("s1") + + def test_get_sandbox_creates_new(self): + sb = get_sandbox("s1") + assert sb.session_id == "s1" + assert sb.audit_log == [] + + def test_get_sandbox_returns_same_instance(self): + assert get_sandbox("s1") is get_sandbox("s1") + + def test_clear_sandbox(self): + a = get_sandbox("s1") + clear_sandbox("s1") + b = get_sandbox("s1") + assert a is not b + + @pytest.mark.asyncio + async def test_record_action_appends_entry(self): + sb = ComputerSandbox(session_id="s1") + entry = await sb.record_action( + ActionType.SCREENSHOT, params={"region": None}, result="800x600" + ) + assert 
len(sb.audit_log) == 1 + assert sb.audit_log[0] is entry + assert entry.action_type == ActionType.SCREENSHOT + assert entry.result == "800x600" + assert entry.error is None + + @pytest.mark.asyncio + async def test_record_action_with_error(self): + sb = ComputerSandbox(session_id="s1") + await sb.record_action( + ActionType.MOUSE_CLICK, + params={"x": 100, "y": 200}, + error="boom", + ) + assert sb.audit_log[0].error == "boom" + + def test_export_audit_log(self): + sb = ComputerSandbox(session_id="s1") + asyncio.run( + sb.record_action(ActionType.APP_OPEN, params={"name": "Terminal"}, result="ok") + ) + log = sb.export_audit_log() + assert len(log) == 1 + assert log[0]["action"] == "app_open" + assert log[0]["result"] == "ok" + + def test_is_app_allowed_no_list(self): + sb = ComputerSandbox(session_id="s1") + assert sb.is_app_allowed("Anything") is True + + def test_is_app_allowed_with_list(self): + sb = ComputerSandbox(session_id="s1", allowed_apps=["Terminal", "Chrome"]) + assert sb.is_app_allowed("terminal") is True # case-insensitive + assert sb.is_app_allowed("Google Chrome") is True # substring + assert sb.is_app_allowed("Slack") is False + + def test_is_coordinate_allowed_no_region(self): + sb = ComputerSandbox(session_id="s1") + assert sb.is_coordinate_allowed(9999, 9999) is True + + def test_is_coordinate_allowed_with_region(self): + sb = ComputerSandbox(session_id="s1", screen_region=(100, 100, 500, 400)) + assert sb.is_coordinate_allowed(150, 200) is True + assert sb.is_coordinate_allowed(50, 50) is False + assert sb.is_coordinate_allowed(700, 600) is False + + def test_summary(self): + sb = ComputerSandbox(session_id="abcdef123456") + asyncio.run(sb.record_action(ActionType.SCREENSHOT, params={})) + asyncio.run(sb.record_action(ActionType.MOUSE_CLICK, params={})) + s = sb.summary() + assert "2 actions" in s + assert "mouse_click" in s + assert "screenshot" in s + + +# --------------------------------------------------------------------------- +# 
ScreenshotTool tests +# --------------------------------------------------------------------------- + + +class TestScreenshotTool: + @pytest.mark.asyncio + async def test_invalid_region_length(self): + from openvibe.tool.computer_screenshot import ScreenshotTool + + tool = ScreenshotTool() + ctx = _ctx() + result = await tool.execute(ctx, ScreenshotTool.Params(region=[100, 200])) + assert result.error is True + assert "4 elements" in result.output + + @pytest.mark.asyncio + async def test_coordinate_outside_sandbox_region(self): + from openvibe.tool.computer_screenshot import ScreenshotTool + + clear_sandbox("s-ss") + sb = get_sandbox("s-ss") + sb.screen_region = (0, 0, 200, 200) + + tool = ScreenshotTool() + ctx = _ctx("s-ss") + result = await tool.execute(ctx, ScreenshotTool.Params(region=[500, 500, 100, 100])) + assert result.error is True + assert "outside" in result.output.lower() + + @pytest.mark.asyncio + async def test_capture_full_screen(self): + from openvibe.tool.computer_screenshot import ScreenshotTool + + clear_sandbox("s-full") + tool = ScreenshotTool() + ctx = _ctx("s-full") + + fake_png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100 + with patch( + "openvibe.computer.capture.capture_screen", + return_value=(fake_png, 1920, 1080), + ): + result = await tool.execute(ctx, ScreenshotTool.Params()) + + assert result.error is False + assert "1920" in result.title + assert "1080" in result.title + # Attachment carries the raw PNG bytes + assert len(result.attachments) == 1 + assert result.attachments[0].filename == "screenshot.png" + assert result.attachments[0].content == fake_png + assert result.attachments[0].media_type == "image/png" + + @pytest.mark.asyncio + async def test_processor_stores_image_in_metadata(self): + """image_b64 must be stored in ToolState.metadata so the LLM can see it.""" + import base64 + + from openvibe.tool.base import Attachment, ToolResult + + fake_png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 50 + fake_result = ToolResult( + 
title="Screenshot 800x600", + output="Captured 800x600 screenshot.", + attachments=[ + Attachment(filename="screenshot.png", content=fake_png, media_type="image/png") + ], + ) + + # Simulate what the processor does when it receives a ToolResult with an image + from openvibe.session.models import ToolState + from openvibe.config import ToolStateStatus + + state = ToolState( + status=ToolStateStatus.COMPLETED, + call_id="call-1", + tool_name="screenshot", + input={}, + output=fake_result.output, + ) + + # Apply the same logic as the processor + for att in fake_result.attachments: + if att.media_type.startswith("image/"): + state.metadata["image_b64"] = base64.b64encode(att.content).decode("ascii") + state.metadata["image_media_type"] = att.media_type + break + + assert "image_b64" in state.metadata + assert state.metadata["image_media_type"] == "image/png" + assert base64.b64decode(state.metadata["image_b64"]) == fake_png + + @pytest.mark.asyncio + async def test_capture_records_audit_entry(self): + from openvibe.tool.computer_screenshot import ScreenshotTool + + clear_sandbox("s-audit") + tool = ScreenshotTool() + ctx = _ctx("s-audit") + + fake_png = b"\x89PNG" + b"\x00" * 20 + with patch( + "openvibe.computer.capture.capture_screen", + return_value=(fake_png, 800, 600), + ): + await tool.execute(ctx, ScreenshotTool.Params()) + + sb = get_sandbox("s-audit") + assert len(sb.audit_log) == 1 + assert sb.audit_log[0].action_type == ActionType.SCREENSHOT + + +# --------------------------------------------------------------------------- +# MouseTool tests +# --------------------------------------------------------------------------- + + +class TestMouseTool: + @pytest.mark.asyncio + async def test_click_outside_sandbox_region(self): + from openvibe.tool.computer_mouse import MouseTool + + clear_sandbox("s-mouse") + sb = get_sandbox("s-mouse") + sb.screen_region = (0, 0, 500, 500) + + tool = MouseTool() + ctx = _ctx("s-mouse") + result = await tool.execute( + ctx, 
MouseTool.Params(action="click", x=999, y=999) + ) + assert result.error is True + assert "outside" in result.output.lower() + + @pytest.mark.asyncio + async def test_click_within_region(self): + from openvibe.tool.computer_mouse import MouseTool + + clear_sandbox("s-click") + sb = get_sandbox("s-click") + sb.screen_region = (0, 0, 1920, 1080) + + tool = MouseTool() + ctx = _ctx("s-click") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_mouse._pyautogui", return_value=mock_pag): + result = await tool.execute( + ctx, MouseTool.Params(action="click", x=100, y=200) + ) + + assert result.error is False + mock_pag.click.assert_called_once_with(100, 200, duration=0.25) + assert "100" in result.output + + @pytest.mark.asyncio + async def test_scroll(self): + from openvibe.tool.computer_mouse import MouseTool + + clear_sandbox("s-scroll") + tool = MouseTool() + ctx = _ctx("s-scroll") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_mouse._pyautogui", return_value=mock_pag): + result = await tool.execute( + ctx, MouseTool.Params(action="scroll", x=400, y=400, amount=-3) + ) + + assert result.error is False + mock_pag.scroll.assert_called_once_with(-3, x=400, y=400) + assert "down" in result.output.lower() + + @pytest.mark.asyncio + async def test_drag_missing_end(self): + from openvibe.tool.computer_mouse import MouseTool + + clear_sandbox("s-drag") + tool = MouseTool() + ctx = _ctx("s-drag") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_mouse._pyautogui", return_value=mock_pag): + result = await tool.execute( + ctx, MouseTool.Params(action="drag", x=100, y=100) + ) + + assert result.error is True + assert "end_x" in result.output.lower() or "end" in result.output.lower() + + @pytest.mark.asyncio + async def test_drag_records_audit(self): + from openvibe.tool.computer_mouse import MouseTool + + clear_sandbox("s-drag2") + tool = MouseTool() + ctx = _ctx("s-drag2") + + mock_pag = MagicMock() + with 
patch("openvibe.tool.computer_mouse._pyautogui", return_value=mock_pag): + await tool.execute( + ctx, MouseTool.Params(action="drag", x=10, y=10, end_x=300, end_y=300) + ) + + sb = get_sandbox("s-drag2") + assert any(e.action_type == ActionType.MOUSE_DRAG for e in sb.audit_log) + + +# --------------------------------------------------------------------------- +# KeyboardTool tests +# --------------------------------------------------------------------------- + + +class TestKeyboardTool: + @pytest.mark.asyncio + async def test_type_text(self): + from openvibe.tool.computer_keyboard import KeyboardTool + + clear_sandbox("s-kbd") + tool = KeyboardTool() + ctx = _ctx("s-kbd") + + mock_pag = MagicMock() + with ( + patch("openvibe.tool.computer_keyboard._pyautogui", return_value=mock_pag), + patch("openvibe.tool.computer_keyboard._type_text") as mock_type, + ): + result = await tool.execute( + ctx, KeyboardTool.Params(action="type", text="hello world") + ) + + assert result.error is False + mock_type.assert_called_once_with(mock_pag, "hello world", 0.02) + assert "11" in result.output # 11 characters + + @pytest.mark.asyncio + async def test_type_missing_text(self): + from openvibe.tool.computer_keyboard import KeyboardTool + + clear_sandbox("s-kbd2") + tool = KeyboardTool() + ctx = _ctx("s-kbd2") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_keyboard._pyautogui", return_value=mock_pag): + result = await tool.execute( + ctx, KeyboardTool.Params(action="type") + ) + + assert result.error is True + + @pytest.mark.asyncio + async def test_press_key(self): + from openvibe.tool.computer_keyboard import KeyboardTool + + clear_sandbox("s-press") + tool = KeyboardTool() + ctx = _ctx("s-press") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_keyboard._pyautogui", return_value=mock_pag): + result = await tool.execute( + ctx, KeyboardTool.Params(action="press", key="enter") + ) + + assert result.error is False + 
mock_pag.press.assert_called_once_with("enter") + + @pytest.mark.asyncio + async def test_hotkey(self): + from openvibe.tool.computer_keyboard import KeyboardTool + + clear_sandbox("s-hotkey") + tool = KeyboardTool() + ctx = _ctx("s-hotkey") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_keyboard._pyautogui", return_value=mock_pag): + result = await tool.execute( + ctx, KeyboardTool.Params(action="hotkey", keys=["ctrl", "c"]) + ) + + assert result.error is False + mock_pag.hotkey.assert_called_once_with("ctrl", "c") + assert "ctrl+c" in result.output.lower() + + @pytest.mark.asyncio + async def test_keyboard_records_audit(self): + from openvibe.tool.computer_keyboard import KeyboardTool + + clear_sandbox("s-kdaud") + tool = KeyboardTool() + ctx = _ctx("s-kdaud") + + mock_pag = MagicMock() + with patch("openvibe.tool.computer_keyboard._pyautogui", return_value=mock_pag): + await tool.execute( + ctx, KeyboardTool.Params(action="press", key="escape") + ) + + sb = get_sandbox("s-kdaud") + assert any(e.action_type == ActionType.KEYBOARD_PRESS for e in sb.audit_log) + + +# --------------------------------------------------------------------------- +# AppTool tests +# --------------------------------------------------------------------------- + + +class TestAppTool: + @pytest.mark.asyncio + async def test_open_denied_by_allowlist(self): + from openvibe.tool.computer_app import AppTool + + clear_sandbox("s-app") + sb = get_sandbox("s-app") + sb.allowed_apps = ["Terminal"] + + tool = AppTool() + ctx = _ctx("s-app") + result = await tool.execute(ctx, AppTool.Params(action="open", name="Slack")) + assert result.error is True + assert "allow-list" in result.output.lower() or "allowed" in result.output.lower() + + @pytest.mark.asyncio + async def test_list_action_bypasses_allowlist(self): + """list action does not require an app name and should not be blocked.""" + from openvibe.tool.computer_app import AppTool, _list_windows + + clear_sandbox("s-list") + sb = 
get_sandbox("s-list") + sb.allowed_apps = ["Terminal"] + + tool = AppTool() + ctx = _ctx("s-list") + + with patch("openvibe.tool.computer_app._list_windows", return_value="• App1\n• App2"): + result = await tool.execute(ctx, AppTool.Params(action="list")) + + assert result.error is False + + @pytest.mark.asyncio + async def test_open_missing_name(self): + from openvibe.tool.computer_app import AppTool + + clear_sandbox("s-appname") + tool = AppTool() + ctx = _ctx("s-appname") + + # No app name provided — should fail gracefully + with patch("openvibe.tool.computer_app._open_app", side_effect=ValueError("name is required")): + result = await tool.execute(ctx, AppTool.Params(action="open")) + + assert result.error is True + + @pytest.mark.asyncio + async def test_open_records_audit(self): + from openvibe.tool.computer_app import AppTool + + clear_sandbox("s-appaud") + tool = AppTool() + ctx = _ctx("s-appaud") + + with patch("openvibe.tool.computer_app._open_app", return_value="Opened 'Terminal'."): + result = await tool.execute(ctx, AppTool.Params(action="open", name="Terminal")) + + assert result.error is False + sb = get_sandbox("s-appaud") + assert any(e.action_type == ActionType.APP_OPEN for e in sb.audit_log) + + +# --------------------------------------------------------------------------- +# LLM message builder — vision content block tests +# --------------------------------------------------------------------------- + + +class TestLLMVisionMessages: + """Verify that _to_llm_messages emits image ContentBlocks for screenshot results.""" + + def _make_tool_part_with_image(self, b64: str, media_type: str = "image/png"): + from openvibe.config import ToolStateStatus + from openvibe.session.models import ToolPart, ToolState + + state = ToolState( + status=ToolStateStatus.COMPLETED, + call_id="call-img-1", + tool_name="screenshot", + input={}, + output="Captured 1920x1080 screenshot.", + metadata={"image_b64": b64, "image_media_type": media_type}, + ) + return 
ToolPart(state=state) + + def _make_assistant_msg(self, tool_part): + from openvibe.config import MessageRole + from openvibe.session.models import MessageInfo, TextPart + + return MessageInfo( + id="msg-1", + session_id="s1", + role=MessageRole.ASSISTANT, + position=0, + created_at="2026-01-01T00:00:00", + parts=[TextPart(content="Let me take a screenshot."), tool_part], + ) + + def test_image_tool_result_produces_content_block_list(self): + import base64 + + from openvibe.session.processor import _to_llm_messages + from openvibe.llm import ContentBlock + + fake_b64 = base64.b64encode(b"\x89PNG fake").decode("ascii") + tool_part = self._make_tool_part_with_image(fake_b64) + assistant_msg = self._make_assistant_msg(tool_part) + + # Build a dummy agent with no disabled tools + from openvibe.agent.agent import _BUILTIN_AGENTS + agent = _BUILTIN_AGENTS["computer"] + + messages = _to_llm_messages([assistant_msg], agent) + + # Should produce: one assistant message + one tool message + tool_msgs = [m for m in messages if m.role == "tool"] + assert len(tool_msgs) == 1 + + tool_msg = tool_msgs[0] + assert tool_msg.tool_call_id == "call-img-1" + # Content must be a list of ContentBlocks (not a plain string) + assert isinstance(tool_msg.content, list) + assert len(tool_msg.content) == 2 + + image_block = tool_msg.content[0] + text_block = tool_msg.content[1] + + assert isinstance(image_block, ContentBlock) + assert image_block.type == "image_url" + assert image_block.image_url is not None + assert image_block.image_url["url"].startswith("data:image/png;base64,") + assert fake_b64 in image_block.image_url["url"] + + assert isinstance(text_block, ContentBlock) + assert text_block.type == "text" + assert "1920" in (text_block.text or "") + + def test_text_only_tool_result_produces_plain_string(self): + from openvibe.config import MessageRole, ToolStateStatus + from openvibe.session.models import MessageInfo, TextPart, ToolPart, ToolState + from openvibe.session.processor import 
_to_llm_messages + from openvibe.agent.agent import _BUILTIN_AGENTS + + state = ToolState( + status=ToolStateStatus.COMPLETED, + call_id="call-text-1", + tool_name="bash", + input={"command": "ls"}, + output="file1.py\nfile2.py", + ) + tool_part = ToolPart(state=state) + assistant_msg = MessageInfo( + id="msg-2", + session_id="s1", + role=MessageRole.ASSISTANT, + position=0, + created_at="2026-01-01T00:00:00", + parts=[TextPart(content="Running ls."), tool_part], + ) + + agent = _BUILTIN_AGENTS["build"] + messages = _to_llm_messages([assistant_msg], agent) + + tool_msgs = [m for m in messages if m.role == "tool"] + assert len(tool_msgs) == 1 + # Plain text tool result — content must be a string, not a list + assert isinstance(tool_msgs[0].content, str) + assert "file1.py" in tool_msgs[0].content + + def test_litellm_serialisation_of_image_tool_result(self): + """_to_litellm_messages must produce valid dict for image tool results.""" + import base64 + + from openvibe.llm import ContentBlock, Message, _to_litellm_messages + + fake_b64 = base64.b64encode(b"PNG-DATA").decode("ascii") + msg = Message( + role="tool", + content=[ + ContentBlock( + type="image_url", + image_url={"url": f"data:image/png;base64,{fake_b64}"}, + ), + ContentBlock(type="text", text="Screenshot captured."), + ], + tool_call_id="call-123", + ) + + result = _to_litellm_messages([msg]) + assert len(result) == 1 + d = result[0] + assert d["role"] == "tool" + assert d["tool_call_id"] == "call-123" + assert isinstance(d["content"], list) + assert d["content"][0]["type"] == "image_url" + assert d["content"][1]["type"] == "text" + assert d["content"][1]["text"] == "Screenshot captured." 
+ + +# --------------------------------------------------------------------------- +# Registry tests +# --------------------------------------------------------------------------- + + +class TestComputerUseRegistry: + def test_create_computer_use_registry_contains_cu_tools(self): + from openvibe.tool.base import create_computer_use_registry + + registry = create_computer_use_registry() + assert "screenshot" in registry + assert "mouse" in registry + assert "keyboard" in registry + assert "app" in registry + + def test_create_computer_use_registry_contains_default_tools(self): + from openvibe.tool.base import create_computer_use_registry + + registry = create_computer_use_registry() + assert "bash" in registry + assert "read" in registry + assert "write" in registry + assert "glob" in registry + + +# --------------------------------------------------------------------------- +# Agent tests +# --------------------------------------------------------------------------- + + +class TestComputerAgent: + def test_computer_agent_exists(self): + from openvibe.agent.agent import _BUILTIN_AGENTS + + assert "computer" in _BUILTIN_AGENTS + + def test_computer_agent_rules(self): + from openvibe.agent.agent import _BUILTIN_AGENTS + from openvibe.config import PermissionAction + + rules = {r.tool: r.action for r in _BUILTIN_AGENTS["computer"].permission_rules} + assert rules.get("screenshot") == PermissionAction.ALLOW + assert rules.get("mouse") == PermissionAction.ASK + assert rules.get("keyboard") == PermissionAction.ASK + assert rules.get("app") == PermissionAction.ASK + + +# --------------------------------------------------------------------------- +# Verification loop tests +# --------------------------------------------------------------------------- + + +class TestVerificationLoop: + """Tests for the automatic change-detection verification loop.""" + + def _make_solid_png(self, width: int, height: int, color: tuple) -> bytes: + """Create a solid-colour PNG for diffing 
tests.""" + from PIL import Image + import io + img = Image.new("RGB", (width, height), color) + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + def test_diff_unchanged(self): + from openvibe.computer.capture import diff_screenshots + png = self._make_solid_png(100, 100, (128, 128, 128)) + report = diff_screenshots(png, png) + assert report["changed"] is False + assert report["change_fraction"] < 0.001 + assert "no visible change" in str(report["summary"]).lower() + + def test_diff_full_change(self): + from openvibe.computer.capture import diff_screenshots + before = self._make_solid_png(100, 100, (0, 0, 0)) + after = self._make_solid_png(100, 100, (255, 255, 255)) + report = diff_screenshots(before, after) + assert report["changed"] is True + assert report["change_fraction"] > 0.99 + assert report["changed_region"] is not None + + def test_diff_partial_change(self): + """Only the right half changes — bounding box should cover that region.""" + from PIL import Image + import io + from openvibe.computer.capture import diff_screenshots + + before = self._make_solid_png(200, 100, (0, 0, 0)) + # Paint right half white + img = Image.new("RGB", (200, 100), (0, 0, 0)) + for x in range(100, 200): + for y in range(100): + img.putpixel((x, y), (255, 255, 255)) + buf = io.BytesIO() + img.save(buf, format="PNG") + after = buf.getvalue() + + report = diff_screenshots(before, after) + assert report["changed"] is True + region = report["changed_region"] + assert region is not None + x, _y, w, _h = region + assert x >= 99 # change starts around x=100 + + def test_diff_size_mismatch(self): + from openvibe.computer.capture import diff_screenshots + small = self._make_solid_png(100, 100, (0, 0, 0)) + large = self._make_solid_png(200, 200, (0, 0, 0)) + report = diff_screenshots(small, large) + assert report["changed"] is True + assert "resolution" in str(report["summary"]).lower() + + @pytest.mark.asyncio + async def 
test_screenshot_tool_includes_diff_in_output(self): + """Second screenshot should report change vs the first.""" + from openvibe.tool.computer_screenshot import ScreenshotTool + from openvibe.computer.sandbox import clear_sandbox, get_sandbox + + clear_sandbox("s-diff") + tool = ScreenshotTool() + ctx = _ctx("s-diff") + + from PIL import Image + import io + + def _png(color): + img = Image.new("RGB", (100, 100), color) + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + first_png = _png((0, 0, 0)) + second_png = _png((255, 255, 255)) + + with patch("openvibe.computer.capture.capture_screen", return_value=(first_png, 100, 100)): + await tool.execute(ctx, ScreenshotTool.Params()) + + with patch("openvibe.computer.capture.capture_screen", return_value=(second_png, 100, 100)): + result = await tool.execute(ctx, ScreenshotTool.Params()) + + assert "change detection" in result.output.lower() + assert result.error is False + + @pytest.mark.asyncio + async def test_screenshot_tool_no_diff_on_first_capture(self): + """First screenshot has nothing to compare against — no diff line.""" + from openvibe.tool.computer_screenshot import ScreenshotTool + from openvibe.computer.sandbox import clear_sandbox + + clear_sandbox("s-nodiff") + tool = ScreenshotTool() + ctx = _ctx("s-nodiff") + + from PIL import Image + import io + img = Image.new("RGB", (100, 100), (0, 0, 0)) + buf = io.BytesIO() + img.save(buf, format="PNG") + png = buf.getvalue() + + with patch("openvibe.computer.capture.capture_screen", return_value=(png, 100, 100)): + result = await tool.execute(ctx, ScreenshotTool.Params()) + + assert "change detection" not in result.output.lower() + assert result.error is False + + @pytest.mark.asyncio + async def test_sandbox_stores_last_screenshot(self): + """sandbox.last_screenshot is updated after each capture.""" + from openvibe.tool.computer_screenshot import ScreenshotTool + from openvibe.computer.sandbox import clear_sandbox, get_sandbox + + 
clear_sandbox("s-store") + tool = ScreenshotTool() + ctx = _ctx("s-store") + + from PIL import Image + import io + img = Image.new("RGB", (50, 50), (1, 2, 3)) + buf = io.BytesIO() + img.save(buf, format="PNG") + png = buf.getvalue() + + assert get_sandbox("s-store").last_screenshot is None + + with patch("openvibe.computer.capture.capture_screen", return_value=(png, 50, 50)): + await tool.execute(ctx, ScreenshotTool.Params()) + + assert get_sandbox("s-store").last_screenshot == png + + @pytest.mark.asyncio + async def test_mouse_settle_ms_respected(self): + """Mouse tool passes settle_ms through to time.sleep.""" + from openvibe.tool.computer_mouse import MouseTool + from openvibe.computer.sandbox import clear_sandbox + + clear_sandbox("s-settle") + tool = MouseTool() + ctx = _ctx("s-settle") + + mock_pag = MagicMock() + with ( + patch("openvibe.tool.computer_mouse._pyautogui", return_value=mock_pag), + patch("openvibe.tool.computer_mouse._check_accessibility"), + patch("openvibe.tool.computer_mouse.MouseTool._do_action", wraps=lambda p: "ok") as _, + ): + # Use settle_ms=0 so the test doesn't actually sleep + result = await tool.execute( + ctx, MouseTool.Params(action="click", x=100, y=200, settle_ms=0) + ) + assert result.error is False