Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 21 additions & 35 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,43 +16,29 @@ jobs:
- name: Install deps
run: |
python -m pip install --upgrade pip
pip install pytest fastmcp httpx requests pynput pydantic ruff
# Runtime deps the full suite needs to import codec_dashboard /
# routes/* / codec_voice without ModuleNotFoundError on the Ubuntu
# runner. pynput stays in the list, but the conftest installs a
# stub when its real import fails (headless Linux has no X11).
pip install pytest fastmcp httpx requests pynput pydantic ruff \
fastapi numpy
- name: Lint (ruff) — F-4 gate, config in ruff.toml
run: ruff check .
- name: Skill import smoke test
- name: Skill import smoke (standalone script, not a pytest module)
run: python tests/test_skill_imports.py
- name: Skill contract tests
run: python -m pytest tests/test_skill_contracts.py -v
- name: Trusted-skill manifest is current (D-1 gate)
run: python tools/generate_skill_manifest.py --check
- name: Skill registry load-time AST gate tests (D-1)
run: python -m pytest tests/test_skill_registry.py -v
- name: Keychain helper + secret-migration tests (D-8, D-15)
run: python -m pytest tests/test_keychain.py -v
- name: OAuth provider unit test
run: python -m pytest tests/test_oauth_provider.py -v
- name: Retry helper unit test
run: python -m pytest tests/test_retry.py -v
- name: Readiness doc-guard tests (F-4 coverage expansion)
run: >
python -m pytest -v
tests/test_repo_health.py
tests/test_privacy_doc.py
tests/test_readme_investor.py
tests/test_one_pager.py
tests/test_apple_packaging.py
tests/test_dependabot.py
tests/test_versioning.py
tests/test_pyproject.py
tests/test_strict_ast_gate.py
- name: Apple packaging tests (W5 — bundle, launchd, python, models, sign, notarize, uninstall, release)
run: >
python -m pytest -v
tests/test_app_bundle.py
tests/test_launchd.py
tests/test_python_bundle.py
tests/test_model_fetch.py
tests/test_uninstaller.py
tests/test_signing.py
tests/test_release.py
tests/test_first_run.py
- name: Run full pytest suite
# Closes audit F-3: CI used to run 23 of 134 test files, leaving
# the Wave-1+2 hardening tests (D-7/D-9/D-12/D-13/D-18/D-19/D-21/D-22)
# unprotected on PRs. `pytest tests/` runs every test file at once.
# tests/test_ci_coverage_invariant.py guards this configuration
# against future drift.
run: |
python -m pytest tests/ \
--tb=short \
--strict-markers \
--strict-config \
-ra
env:
CI: "true"
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,28 @@ Then in Claude Desktop: *"Use CODEC to check my calendar for tomorrow."*

Skills opt in to MCP exposure with `SKILL_MCP_EXPOSE = True`. Input validation enforces 5KB task / 10KB context limits with type checking on every call.

### Configuring which skills CODEC exposes over MCP

CODEC defaults to **opt-in** — only skills you explicitly allow reach the MCP surface. Three keys in `~/.codec/config.json` control the policy:

| Option | Default | Effect |
|---|---|---|
| `mcp_default_allow` | `false` | When `true`, every skill with `SKILL_MCP_EXPOSE = True` is exposed (opt-out via `mcp_blocked_tools`). When `false` (recommended), nothing is exposed unless listed in `mcp_allowed_tools`. |
| `mcp_allowed_tools` | `[]` | Explicit allowlist of skill names exposed over MCP when `mcp_default_allow` is `false`. Example: `["calculator", "weather", "memory_search"]`. |
| `mcp_blocked_tools` | `["terminal", "process_manager", "pm2_control"]` | Hard blocklist applied on every MCP transport regardless of the above. The HTTP transport adds a stricter built-in blocklist (`python_exec`, `ax_control`) that cannot be overridden. |

Example config snippet:

```jsonc
{
"mcp_default_allow": false,
"mcp_allowed_tools": ["calculator", "weather", "memory_search", "google_calendar"],
"mcp_blocked_tools": ["terminal", "process_manager", "pm2_control"]
}
```

Restart `codec-mcp-http` (HTTP transport) or the host MCP client (stdio transport) after changes.

### What this unlocks (that Claude alone can't do)

Claude Desktop/Code/Cursor gain — through this one MCP bridge — everything CODEC already owns on *your* machine:
Expand Down
22 changes: 12 additions & 10 deletions codec_voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import httpx
import numpy as np

log = logging.getLogger("codec_voice")

from codec_audit import log_event as _voice_log_event
from codec_hooks import (
HookVeto,
Expand Down Expand Up @@ -219,7 +221,7 @@ def _clear_voice_session_marker() -> None:
GEMINI_API_KEY = get_gemini_api_key()
VISION_PROVIDER = _cfg.get("vision_provider", "gemini" if GEMINI_API_KEY else "local")
except Exception:
pass
log.debug("voice: vision provider/Keychain bootstrap failed", exc_info=True)

# Screen-related trigger phrases
_SCREEN_TRIGGERS = re.compile(
Expand Down Expand Up @@ -323,7 +325,7 @@ def _build_system_prompt() -> str:
f" {f['key']} = {f['value']}" for f in facts
) + "\n[/FACTS]"
except Exception:
pass
log.debug("voice: facts injection into system prompt skipped", exc_info=True)

return f"""You are {_aname} — CODEC Voice, a JARVIS-class local AI running on a Mac Studio M1 Ultra.{_boot}
{f'The user is {_uname}. ' if _uname else ''}Fully local. No cloud. No external logs.
Expand Down Expand Up @@ -435,7 +437,7 @@ def _prune_resumable(cls, now=None):
cls._resumable_sessions.pop(sid, None)
cls._resume_timestamps.pop(sid, None)
except Exception:
pass
log.debug("voice: stale-resume-session cleanup pass swallowed", exc_info=True)

def _save_for_resume(self):
"""Stash conversation state so a reconnecting client can resume."""
Expand Down Expand Up @@ -686,7 +688,7 @@ async def generate_response(self, user_text: str):
"content": _build_system_prompt() + f"\n\n[MEMORY — RELEVANT CONTEXT]\n{targeted}\n[END MEMORY]"
}
except Exception:
pass
log.debug("voice: targeted-memory injection skipped", exc_info=True)
# Phase 2 Step 5 — Observer summary injection (gated per §X).
# Voice always uses local Qwen by default (transport="local"); if
# the user has cloud-routed voice configured (vision_provider=
Expand Down Expand Up @@ -882,7 +884,7 @@ async def _handle_voice_ask_user_answer(self, qid: str,
try:
await self._speak("Something went wrong recording your answer.")
except Exception:
pass
log.debug("voice: fallback TTS for ask_user error path failed", exc_info=True)

async def dispatch_skill(self, skill: dict, user_text: str) -> Optional[str]:
try:
Expand Down Expand Up @@ -1384,7 +1386,7 @@ async def run(self):
"resume_id": self.session_id if is_resumed else None},
correlation_id=cid)
except Exception:
pass
log.debug("voice: voice_session_start audit emit failed", exc_info=True)
# Phase 1 Step 2: fire on_operation_start hooks (per-plugin, not the
# voice_session_start audit event above — that's Step 1 vocabulary
# and intentionally unchanged). Hook layer never raises.
Expand All @@ -1393,7 +1395,7 @@ async def run(self):
transport="voice",
correlation_id=cid)
except Exception:
pass
log.debug("voice: on_operation_start hook emit failed", exc_info=True)

# Send session ID so client can reconnect to this session
await self.ws.send_json({"type": "session", "session_id": self.session_id})
Expand Down Expand Up @@ -1458,7 +1460,7 @@ async def run(self):
error=run_error,
correlation_id=cid)
except Exception:
pass
log.debug("voice: voice_session_end audit emit failed", exc_info=True)
# Phase 1 Step 2: fire on_operation_end hooks. Same caveat as
# the start emit above — voice_session_end audit event is Step 1
# vocabulary and unchanged; on_operation_end is the hook-layer
Expand All @@ -1470,11 +1472,11 @@ async def run(self):
duration_ms=duration_ms,
outcome=run_outcome)
except Exception:
pass
log.debug("voice: on_operation_end hook emit failed", exc_info=True)
try:
_voice_correlation_id_var.reset(cid_token)
except Exception:
pass
log.debug("voice: correlation_id contextvar reset failed", exc_info=True)
# Phase 1 Step 3 §5.3 — clear the active-session marker so
# codec_ask_user falls back to PWA-only for any subsequent
# questions. Best-effort; failures don't break shutdown.
Expand Down
6 changes: 4 additions & 2 deletions ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ ignore = [
[lint.per-file-ignores]
# Tests legitimately keep unused locals (fixtures, captured-but-unasserted) and redefine helpers.
"tests/*" = ["F811", "F841"]
# Smoke test + feature-audit script import symbols purely to probe availability (F401 is the point).
"tests/test_smoke.py" = ["F811", "F841", "F401"]
# Smoke + feature-audit scripts import symbols purely to probe availability (F401 is the point).
# Note: smoke.py moved tests/ → scripts/ when CI flipped to full-suite pytest
# (it was always a hand-rolled sys.exit(1) script, never a pytest module).
"scripts/smoke.py" = ["F811", "F841", "F401"]
"scripts/feature_audit.py" = ["F401", "F841"]
# A handful of benign unused locals in working skills — left as-is rather than risk-edit live code.
"skills/memory_entities.py" = ["F841"]
Expand Down
9 changes: 6 additions & 3 deletions tests/test_smoke.py → scripts/smoke.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#!/usr/bin/env python3
"""CODEC Smoke Test — catches import errors, NameErrors, and path mismatches.

Run after ANY change to codec.py, codec_overlays.py, codec_watcher.py, or codec_dashboard.py:
python3.13 tests/test_smoke.py
Manual sanity-check script, not a pytest module. Run by hand after ANY change
to codec.py, codec_overlays.py, codec_watcher.py, or codec_dashboard.py:
python3.13 scripts/smoke.py

Every test here exists because a real bug shipped without it.
Lives in scripts/ rather than tests/ because it uses sys.exit(1) on failure
and prints to stdout — incompatible with pytest collection. Full pytest
coverage handles regression protection.
"""
import sys
import os
Expand Down
54 changes: 54 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,19 @@
gets imported, not the main checkout. Without this, a worktree's Step 3
codec_audit (with new ASKUSER_EVENT_* constants) gets shadowed by main's
older copy and tests that rely on the new constants fail.

pynput stub: on Linux CI runners without an X display, `from pynput import
keyboard` raises ImportError at module-load time (pynput tries to acquire
an X11 connection). codec.py imports pynput unconditionally; many tests
do `import codec` at module scope. We install a minimal stub in sys.modules
BEFORE any test collection so those imports succeed. macOS dev machines
have a real pynput and skip the stub entirely. Per PR-4F design doc
(`docs/PR4F-STATE-LOCK-DESIGN.md`) — same pattern, lifted into conftest
so it applies session-wide instead of per-test.
"""
import sys
import os
import types
from pathlib import Path

# Worktree's repo root = parent of `tests/`. When running from main repo
Expand All @@ -18,3 +28,47 @@
sys.path.insert(0, os.path.expanduser("~/codec-repo"))
sys.path.insert(0, os.path.expanduser("~/.codec/skills"))
sys.path.insert(0, str(_WORKTREE_REPO))


def _install_pynput_stub_if_needed() -> None:
    """Register stub `pynput` / `pynput.keyboard` modules when the real ones can't import.

    If `pynput` is already present in `sys.modules`, or the real package
    imports successfully, nothing is changed. Otherwise two placeholder
    modules are placed in `sys.modules` exposing `Listener`, `KeyCode`,
    `Key`, `Controller` and `HotKey`, so code that imports pynput at module
    load time (the enclosing conftest docstring names codec.py) can still be
    imported on a headless runner. Tests that actually exercise keyboard
    behavior are expected to bring their own mocks; this stub only unblocks
    module import.
    """
    if "pynput" in sys.modules:
        return
    try:
        import pynput  # noqa: F401
        return
    except Exception:
        # Deliberate best-effort probe: any failure to import the real
        # package (missing install, no display backend, ...) means we fall
        # through and install the stub below.
        pass

    pynput_mod = types.ModuleType("pynput")
    keyboard_mod = types.ModuleType("pynput.keyboard")

    class _Stub:
        """Placeholder that accepts any construction, call or attribute access."""

        def __init__(self, *a, **kw):
            pass

        def __call__(self, *a, **kw):
            return self

        def __getattr__(self, name):
            # Dunder probes (`__wrapped__`, `__deepcopy__`, ...) must report
            # "missing": answering them with another stub makes hasattr()-based
            # introspection believe the protocol is implemented and, for
            # `__wrapped__`, follow an endless chain of fresh stubs.
            if name.startswith("__") and name.endswith("__"):
                raise AttributeError(name)
            return _Stub()

        def start(self):
            pass

        def stop(self):
            pass

        def join(self, *a, **kw):
            pass

    # `Key` is accessed as `Key.f5`, `Key.f13`, `Key.cmd`, etc. — the set of
    # attribute names is open-ended (callers also use `f16`, `f17`). Binding
    # it to a _Stub() *instance* lets __getattr__ hand back a fresh non-None
    # _Stub for any ordinary attribute, which is close enough to pynput's
    # real keyboard.Key namespace for codec_config._resolve_key() to return
    # non-None.
    keyboard_mod.Listener = _Stub
    keyboard_mod.KeyCode = _Stub
    keyboard_mod.Key = _Stub()
    keyboard_mod.Controller = _Stub
    keyboard_mod.HotKey = _Stub
    pynput_mod.keyboard = keyboard_mod
    sys.modules["pynput"] = pynput_mod
    sys.modules["pynput.keyboard"] = keyboard_mod


_install_pynput_stub_if_needed()
67 changes: 67 additions & 0 deletions tests/test_ci_coverage_invariant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""Guard against future CI-coverage drift.

The 2026-05 audit (F-3) found that .github/workflows/ci.yml ran 23 of the
134 test files in tests/ — leaving the Wave-1+2 hardening tests
(D-7 / D-9 / D-12 / D-13 / D-18 / D-19 / D-21 / D-22) without regression
protection on PRs. This file's tests fail loudly if anyone re-introduces
that pattern by enumerating individual test files in ci.yml instead of
running the full tests/ tree.

Removing this file or weakening its assertions REGRESSES F-3 closure.
"""
from pathlib import Path
import re

REPO_ROOT = Path(__file__).resolve().parent.parent
CI_WORKFLOW = REPO_ROOT / ".github" / "workflows" / "ci.yml"


def test_ci_workflow_exists() -> None:
    """The workflow file the other guards in this module read must be present."""
    workflow_present = CI_WORKFLOW.exists()
    assert workflow_present, f"ci.yml not found at {CI_WORKFLOW}"


def test_ci_runs_full_pytest_suite() -> None:
    """ci.yml must invoke `pytest tests/` on the whole tree, not a subset.

    Only non-comment workflow lines are considered, so a YAML comment that
    merely mentions `pytest tests/` cannot satisfy the guard. The `tests`
    argument must be followed by whitespace or end of line, so an
    enumerated invocation such as `pytest tests/test_foo.py` does not count
    as a full-suite run.
    """
    # ci.yml contains non-ASCII characters (em dashes in step names); read it
    # as UTF-8 explicitly rather than relying on the platform locale.
    content = CI_WORKFLOW.read_text(encoding="utf-8")

    # Filter on the whole source line: a regex match that begins at the word
    # "pytest" has already lost the leading "#" and can never be recognised
    # as a comment afterwards.
    invocations = [
        line.strip()
        for line in content.splitlines()
        if "pytest" in line and not line.lstrip().startswith("#")
    ]
    assert invocations, "No non-comment pytest invocation found in ci.yml"

    full_suite_patterns = [
        # `pytest tests` / `pytest tests/` / `python -m pytest tests/ \`.
        # The lookahead rejects `tests/<anything>`; a plain `\b` would accept
        # `tests/test_foo.py` because it matches between `tests` and `/`.
        r"\bpytest\s+tests/?(?=\s|$)",
    ]
    matches_full_suite = any(
        re.search(pattern, inv)
        for inv in invocations
        for pattern in full_suite_patterns
    )
    assert matches_full_suite, (
        "ci.yml does not run the full tests/ suite. "
        f"Found pytest invocations: {invocations}. "
        f"Expected one to match one of {full_suite_patterns}."
    )


def test_ci_does_not_enumerate_individual_test_files() -> None:
    """Fail when ci.yml names five or more distinct `test_*.py` files.

    A handful of explicit references is tolerated (e.g. a hand-rolled smoke
    script invoked via `python` rather than `pytest`). A long list means CI
    has gone back to enumerating files one by one — the F-3 pattern this
    module exists to catch.
    """
    workflow_text = CI_WORKFLOW.read_text()
    # Scan the whole file for `test_<name>.py`, whether prefixed with
    # `tests/`, quoted, or bare; the set collapses names repeated across steps.
    unique_files = {
        match.group(0)
        for match in re.finditer(r"test_[a-zA-Z0-9_]+\.py", workflow_text)
    }
    file_count = len(unique_files)
    assert file_count < 5, (
        f"ci.yml references {file_count} distinct test files "
        f"explicitly: {sorted(unique_files)}. This regresses F-3 closure. "
        "Switch to `pytest tests/` to run the full suite."
    )
Loading