From 7dd7e989acce8ddf3f1c6c88a54298424336c40f Mon Sep 17 00:00:00 2001 From: Dmitry Voropaev Date: Sun, 21 Jun 2026 14:38:34 +0300 Subject: [PATCH] feat: Docker image + compose + GHCR build/publish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Containerize knowbase: a slim runtime image (no torch) for the kb CLI / MCP stdio server, an optional -embed image with CPU-torch for `kb embed` + search, a docker-compose for local dev/eval, and a CI workflow that build-validates on PRs and publishes to GHCR on master (edge) and tags (semver + latest). - Dockerfile: multi-stage uv build, ARG EXTRAS (slim vs --extra embed), non-root, OCI labels (AGPL). Keeps the source tree at /app with an editable install so kb.store.migrate's parents[3] resolution of migrations/ + db/*.sql works. - .dockerignore, docker-compose.yml (pgvector db + kb; documented migrate/index/ serve flow; host pytest can use the compose Postgres via KB_TEST_DB_URL). - .github/workflows/docker.yml: hadolint + [slim, embed] matrix, buildx/QEMU, metadata-action tags, push only off-PR, multi-arch amd64+arm64 for slim (embed amd64-only), gha cache. - feat(cli): `kb migrate` — apply the Alembic schema to head (--db-url / KB_DB_URL). - docs: README "Run with Docker" + GHCR badge; CHANGELOG. kb migrate verified end-to-end on an ephemeral Postgres; ruff + mypy --strict clean; 52 eval tests pass (+1 skipped). Docker build is validated by the PR's docker workflow (no local daemon here). 
--- .dockerignore | 26 +++++++++++ .github/workflows/docker.yml | 90 ++++++++++++++++++++++++++++++++++++ CHANGELOG.md | 8 ++++ Dockerfile | 71 ++++++++++++++++++++++++++++ README.md | 31 ++++++++++++- docker-compose.yml | 44 ++++++++++++++++++ src/kb/daemon/cli.py | 19 ++++++-- 7 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/docker.yml create mode 100644 Dockerfile create mode 100644 docker-compose.yml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..01f3f59 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,26 @@ +# Keep the build context small and the image clean. The build needs the source layout +# (src/, migrations/, db/, alembic.ini, pyproject.toml, uv.lock) — everything else is excluded. +.git +.github +.venv +**/__pycache__/ +*.pyc +.mypy_cache +.ruff_cache +.pytest_cache +*.egg-info +# local Postgres data / scratch / editor dirs +*.sqlite +*.db +.idea +.vscode +.DS_Store +# docs/config not needed at runtime (README + LICENSE are copied explicitly in the Dockerfile) +CHANGELOG.md +CONTRIBUTING.md +CODE_OF_CONDUCT.md +SECURITY.md +DESIGN.md +docker-compose.yml +Dockerfile +.dockerignore diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000..06237df --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,90 @@ +name: Docker + +# Build-validate the image on PRs; publish to GHCR on master (`edge`) and on `v*` tags +# (semver + `latest`). slim is multi-arch (amd64+arm64); the embed variant is amd64-only (torch). 
+ +on: + pull_request: + push: + branches: [master] + tags: ["v*"] + workflow_dispatch: + +concurrency: + group: docker-${{ github.ref }} + cancel-in-progress: true + +env: + IMAGE: ghcr.io/${{ github.repository }} + +jobs: + hadolint: + name: hadolint (Dockerfile lint) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: hadolint/hadolint-action@v3.1.0 + with: + dockerfile: Dockerfile + + build: + name: build ${{ matrix.variant }} + runs-on: ubuntu-latest + needs: hadolint + permissions: + contents: read + packages: write + strategy: + fail-fast: false + matrix: + include: + - variant: slim + extras: "" + suffix: "" + platforms: linux/amd64,linux/arm64 + - variant: embed + extras: "--extra embed" + suffix: "-embed" + platforms: linux/amd64 + steps: + - uses: actions/checkout@v4 + + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE }} + flavor: | + suffix=${{ matrix.suffix }},onlatest=true + tags: | + type=edge,branch=master + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + + - name: Build${{ github.event_name != 'pull_request' && ' + push' || ' (validate only)' }} + uses: docker/build-push-action@v6 + with: + context: . + # PR: build amd64 only (multi-arch can't `--load`); push only off-PR. 
+ platforms: ${{ github.event_name == 'pull_request' && 'linux/amd64' || matrix.platforms }} + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + build-args: | + EXTRAS=${{ matrix.extras }} + VERSION=${{ steps.meta.outputs.version }} + REVISION=${{ github.sha }} + CREATED=${{ fromJSON(steps.meta.outputs.json).labels['org.opencontainers.image.created'] }} + cache-from: type=gha,scope=${{ matrix.variant }} + cache-to: type=gha,mode=max,scope=${{ matrix.variant }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 5afb98a..6059090 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **Docker** (`Dockerfile`, `docker-compose.yml`, `.github/workflows/docker.yml`): a multi-stage, + uv-based image published to GHCR (`ghcr.io/v0ropaev/knowbase`) — `:edge` from `master`, semver + + `:latest` on `v*` tags; slim is multi-arch amd64+arm64, `-embed` is amd64-only. The **slim** default image carries `index` / + `migrate` / `serve` / `introspect`; the **`-embed`** tag adds CPU-torch for `kb embed` + search. A + `docker compose` brings up a `pgvector` Postgres + the CLI for local dev/eval. CI build-validates on + PRs (hadolint + build, no push) and publishes on master/tags. +- **`kb migrate`** CLI command: applies the Alembic schema to `head` (`--db-url` or `KB_DB_URL`). + - **Deterministic entities extractor** (`kb.extract.deterministic.entities`): a fully static (tree-sitter) extractor that emits one `entity` artifact per domain class — pydantic `BaseModel`, `@dataclass`, and SQLAlchemy declarative model — with its fields, grounded on the class-definition diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8c0f61c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,71 @@ +# syntax=docker/dockerfile:1 +# +# knowbase image. Multi-stage, uv-based. 
+# slim (default): EXTRAS="" -> kb index / migrate / serve / introspect (no torch) +# embed: EXTRAS="--extra embed" -> adds CPU-torch for `kb embed` + search_knowledge +# +# The project is installed in EDITABLE mode and the source tree is kept at /app, because +# kb.store.migrate resolves migrations/ + db/*.sql via Path(__file__).parents[3] (the repo layout). + +# ---- builder: resolve + install deps and the project into /app/.venv ---- +FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder + +ENV UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + UV_PYTHON_DOWNLOADS=0 \ + UV_PROJECT_ENVIRONMENT=/app/.venv +WORKDIR /app + +ARG EXTRAS="" + +# 1) Dependencies only (cached layer keyed on the lockfile; project not installed yet). +COPY pyproject.toml uv.lock ./ +# hadolint ignore=SC2086 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --frozen --no-install-project --no-dev ${EXTRAS} + +# 2) Source layout the runtime needs (src + migrations + db + alembic.ini), then install the project. +COPY src ./src +COPY migrations ./migrations +COPY db ./db +COPY alembic.ini README.md LICENSE ./ +# hadolint ignore=SC2086 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --frozen --no-dev ${EXTRAS} \ + && /app/.venv/bin/kb --help >/dev/null + +# ---- runtime: slim image with just the venv + source tree ---- +FROM python:3.12-slim-bookworm AS runtime + +ARG EXTRAS="" +# torch (the embed extra) needs OpenMP at runtime; slim installs no apt packages. +# hadolint ignore=DL3008 +RUN case "${EXTRAS}" in \ + *embed*) apt-get update \ + && apt-get install -y --no-install-recommends libgomp1 \ + && rm -rf /var/lib/apt/lists/* ;; \ + esac \ + && useradd --create-home --uid 10001 app + +ENV PATH="/app/.venv/bin:${PATH}" \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 +WORKDIR /app + +COPY --from=builder --chown=app:app /app /app + +# OCI image metadata (the build/version/revision args are supplied by CI). 
+ARG VERSION="0.0.0-dev" +ARG REVISION="" +ARG CREATED="" +LABEL org.opencontainers.image.title="knowbase" \ + org.opencontainers.image.description="Versioned, provenance-grounded knowledge layer over a codebase, served via MCP." \ + org.opencontainers.image.source="https://github.com/v0ropaev/knowbase" \ + org.opencontainers.image.licenses="AGPL-3.0-or-later" \ + org.opencontainers.image.version="${VERSION}" \ + org.opencontainers.image.revision="${REVISION}" \ + org.opencontainers.image.created="${CREATED}" + +USER app +ENTRYPOINT ["kb"] +CMD ["--help"] diff --git a/README.md b/README.md index dae5be9..c2af168 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ [![Python 3.12+](https://img.shields.io/badge/python-3.12%2B-blue.svg)](https://www.python.org/downloads/) [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![Checked with mypy](https://img.shields.io/badge/mypy-strict-blue)](https://mypy-lang.org/) +[![GHCR image](https://img.shields.io/badge/ghcr.io-knowbase-2496ED?logo=docker&logoColor=white)](https://github.com/v0ropaev/knowbase/pkgs/container/knowbase) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/v0ropaev/knowbase/pulls) @@ -163,6 +164,34 @@ uv run kb introspect app.main:app --repo # sandboxed app.openapi() orac `kb introspect` runs a FastAPI app in a network-blocked sandbox and emits its `openapi()` as JSON — the ground truth the Tier-1 API gate scores the **static** contract extractor against. It executes user code, so it is eval-only and never runs during indexing. +## Run with Docker + +Prebuilt images are published to **GHCR**: `ghcr.io/v0ropaev/knowbase` (`:edge` from `master`, `:X.Y.Z`/`:latest` on releases; the default slim image is multi-arch amd64+arm64, the `-embed` variant is amd64-only). The default image is **slim** (no torch — `index` / `migrate` / `serve` / `introspect`); the **`-embed`** tag (e.g. 
`:edge-embed`) adds CPU-torch for `kb embed` and semantic search. + +```bash +docker pull ghcr.io/v0ropaev/knowbase:latest +docker run --rm ghcr.io/v0ropaev/knowbase:latest --help +``` + +As an **MCP server** (stdio), pointed at your Postgres — this is the form an MCP client launches: + +```bash +docker run -i --rm ghcr.io/v0ropaev/knowbase:latest serve --db-url postgresql+psycopg://USER:PASSWORD@HOST:5432/DBNAME +``` + +### Local dev/eval with `docker compose` + +The bundled compose brings up a `pgvector` Postgres plus the `kb` CLI built from this checkout: + +```bash +docker compose up -d db # Postgres (pgvector) +docker compose run --rm kb migrate # apply the schema +docker compose run --rm kb index /workspace --sha HEAD # index the mounted repo +docker compose run --rm -i kb serve # MCP over stdio +``` + +The compose Postgres also backs the test suite from the host: `KB_TEST_DB_URL=postgresql+psycopg://postgres:postgres@localhost:5432/postgres uv run pytest src/kb/eval -q`. For embeddings, build the image with the embed extra: `docker compose build --build-arg EXTRAS="--extra embed" kb`. + ## Architecture A Python package `kb` (uv, src-layout). Modules and their responsibilities: @@ -180,7 +209,7 @@ A Python package `kb` (uv, src-layout). Modules and their responsibilities: | `kb.mcp` | Read-only MCP server and its provenance-carrying records: `find_provenance`, `get_knowledge`, `search_knowledge`. | | `kb.embed` | Replaceable embedding adapters (sentence-transformers default, OpenAI optional) + snapshot population. Torch isolated behind the `embed` extra and a lazy import. | | `kb.rag` | The frozen pgvector RAG-over-source baseline — the "other arm" of the knowledge-vs-RAG A/B (no provenance, no grounding). | -| `kb.daemon.cli` | The `kb` CLI: `index`, `embed`, `serve` (MCP), and `introspect` — all functional. | +| `kb.daemon.cli` | The `kb` CLI: `index`, `migrate`, `embed`, `serve` (MCP), and `introspect` — all functional. 
| | `kb.eval` | Eight HARD CI gates (identity reproducibility, adversarial grounding, Tier-1 import oracle, Tier-1 API oracle, Tier-1 entities oracle, Tier-3 knowledge-vs-RAG recall, Tier-4 one-hop invalidation, invariants) plus the supporting MCP / embed / store suite. | Core tables: `commit_ref`, `branch_ref`, `code_span`, `span_occurrence`, `artifact` (now with `embedding vector(384)` + `embedding_model_id`), `artifact_derived_from`, `snapshot_entry`, and `rag_chunk` (the baseline arm). diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..78a0d28 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,44 @@ +# Local dev / eval: a pgvector Postgres + the kb CLI built from this repo. +# +# docker compose up -d db # start Postgres (also usable by host `pytest` via +# # KB_TEST_DB_URL=postgresql+psycopg://postgres:postgres@localhost:5432/postgres) +# docker compose run --rm kb migrate # apply the schema +# docker compose run --rm kb index /workspace --sha HEAD # index the mounted repo (this checkout) +# docker compose run --rm -i kb serve # MCP server over stdio +# +# For embeddings/search, build the kb service with the embed extra: +# docker compose build --build-arg EXTRAS="--extra embed" kb + +services: + db: + image: pgvector/pgvector:pg17 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - "5432:5432" + volumes: + - kbdata:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 10 + + kb: + build: + context: . 
+ args: + EXTRAS: "" # set to "--extra embed" to include CPU-torch for `kb embed` / search + depends_on: + db: + condition: service_healthy + environment: + KB_DB_URL: postgresql+psycopg://postgres:postgres@db:5432/postgres + volumes: + - .:/workspace:ro # the repo to index (read-only); override with your own path + profiles: ["cli"] # not started by `up`; invoke with `docker compose run --rm kb <command>` + +volumes: + kbdata: diff --git a/src/kb/daemon/cli.py b/src/kb/daemon/cli.py index dd8703b..812e301 100644 --- a/src/kb/daemon/cli.py +++ b/src/kb/daemon/cli.py @@ -1,7 +1,8 @@ """The ``kb`` command-line interface (DESIGN.md §11). -``kb index`` runs the spine for one commit. ``serve`` (MCP) and ``introspect`` (the eval-only -FastAPI oracle) are stubs in this push — they belong to the next push (DESIGN.md §8 "Next push"). +``kb index`` runs the spine for one commit; ``migrate`` applies the schema; ``embed`` populates +embeddings; ``serve`` hosts the read-only MCP server over stdio; ``introspect`` is the eval-only +sandboxed FastAPI openapi oracle. 
""" from __future__ import annotations @@ -15,7 +16,7 @@ from kb.extract.deterministic.fastapi_contract import FastAPIExtractor from kb.extract.deterministic.imports import ImportExtractor from kb.introspect import introspect_app -from kb.store.engine import make_engine +from kb.store.engine import make_engine, resolve_db_url app = typer.Typer(no_args_is_help=True, help="knowbase — a provenance-grounded knowledge layer.") @@ -40,6 +41,18 @@ def index( typer.echo(f" gaps (unparseable, recorded): {', '.join(result.gaps)}") +@app.command() +def migrate( + db_url: str | None = typer.Option(None, "--db-url", help="Postgres URL (else KB_DB_URL env)."), +) -> None: + """Apply all Alembic migrations up to head (creates/updates the schema).""" + from kb.store.migrate import upgrade_to_head # local import keeps alembic off other commands + + resolved = resolve_db_url(db_url) + upgrade_to_head(resolved) + typer.echo(f"migrated to head: {resolved}") + + @app.command() def embed( db_url: str | None = typer.Option(None, "--db-url", help="Postgres URL (else KB_DB_URL env)."),