Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Keep the build context small and the image clean. The build needs the source layout
# (src/, migrations/, db/, alembic.ini, pyproject.toml, uv.lock) — everything else is excluded.
#
# NOTE: unlike .gitignore, .dockerignore patterns are anchored at the context root. A bare
# `*.pyc` only matches files in the root directory; use a leading `**/` to match at any depth
# (e.g. inside src/ or db/, which the Dockerfile copies wholesale).
.git
.github
.venv
**/__pycache__/
**/*.pyc
.mypy_cache
.ruff_cache
.pytest_cache
**/*.egg-info
# local database files / scratch / editor dirs
**/*.sqlite
**/*.db
.idea
.vscode
**/.DS_Store
# docs/config not needed at runtime (README + LICENSE are copied explicitly in the Dockerfile)
CHANGELOG.md
CONTRIBUTING.md
CODE_OF_CONDUCT.md
SECURITY.md
DESIGN.md
docker-compose.yml
Dockerfile
.dockerignore
90 changes: 90 additions & 0 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
name: Docker

# Build-validate the image on PRs; publish to GHCR on master (`edge`) and on `v*` tags
# (semver + `latest`). slim is multi-arch (amd64+arm64); the embed variant is amd64-only (torch).

on:
  pull_request:
  push:
    branches: [master]
    tags: ["v*"]
  workflow_dispatch:

# Least privilege by default: jobs get a read-only token unless they widen it themselves
# (only `build` does, to push packages).
permissions:
  contents: read

# One run per ref; a newer push to the same ref supersedes the run in progress.
concurrency:
  group: docker-${{ github.ref }}
  cancel-in-progress: true

env:
  IMAGE: ghcr.io/${{ github.repository }}

jobs:
  # Lint the Dockerfile first; the build job is gated on it via `needs`.
  hadolint:
    name: hadolint (Dockerfile lint)
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: hadolint/hadolint-action@v3.1.0
        with:
          dockerfile: Dockerfile

  # Build each image variant; push only when the event is not a pull request.
  build:
    name: build ${{ matrix.variant }}
    runs-on: ubuntu-latest
    needs: hadolint
    permissions:
      contents: read
      packages: write
    strategy:
      # Let one variant finish even if the other fails.
      fail-fast: false
      matrix:
        include:
          # extras   -> passed to the Dockerfile as the EXTRAS build arg
          # suffix   -> appended to every generated tag
          # platforms -> target platforms for non-PR builds
          - variant: slim
            extras: ""
            suffix: ""
            platforms: linux/amd64,linux/arm64
          - variant: embed
            extras: "--extra embed"
            suffix: "-embed"
            platforms: linux/amd64
    steps:
      - uses: actions/checkout@v4

      - uses: docker/setup-qemu-action@v3
      - uses: docker/setup-buildx-action@v3

      - name: Log in to GHCR
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Docker metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE }}
          # The variant suffix is applied to every tag, including `latest` (onlatest=true).
          flavor: |
            suffix=${{ matrix.suffix }},onlatest=true
          tags: |
            type=edge,branch=master
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}

      - name: Build${{ github.event_name != 'pull_request' && ' + push' || ' (validate only)' }}
        uses: docker/build-push-action@v6
        with:
          context: .
          # PR: build amd64 only (multi-arch can't `--load`); push only off-PR.
          platforms: ${{ github.event_name == 'pull_request' && 'linux/amd64' || matrix.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # VERSION falls back to the Dockerfile's own default when no tag rule matched
          # (e.g. on PRs), so an empty string never overrides it.
          build-args: |
            EXTRAS=${{ matrix.extras }}
            VERSION=${{ steps.meta.outputs.version || '0.0.0-dev' }}
            REVISION=${{ github.sha }}
            CREATED=${{ fromJSON(steps.meta.outputs.json).labels['org.opencontainers.image.created'] }}
          # Separate cache scope per variant so slim and embed do not evict each other.
          cache-from: type=gha,scope=${{ matrix.variant }}
          cache-to: type=gha,mode=max,scope=${{ matrix.variant }}
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- **Docker** (`Dockerfile`, `docker-compose.yml`, `.github/workflows/docker.yml`): a multi-stage,
uv-based image published to GHCR (`ghcr.io/v0ropaev/knowbase`) — `:edge` from `master`, semver +
`:latest` on `v*` tags. The **slim** default image is multi-arch (amd64+arm64) and carries `index` /
`migrate` / `serve` / `introspect`; the **`-embed`** tag adds CPU-torch for `kb embed` + search and
is amd64-only. A `docker compose` file brings up a `pgvector` Postgres + the CLI for local dev/eval.
CI build-validates on PRs (hadolint + build, no push) and publishes on master/tags.
- **`kb migrate`** CLI command: applies the Alembic schema to `head` (`--db-url` or `KB_DB_URL`).

- **Deterministic entities extractor** (`kb.extract.deterministic.entities`): a fully static
(tree-sitter) extractor that emits one `entity` artifact per domain class — pydantic `BaseModel`,
`@dataclass`, and SQLAlchemy declarative model — with its fields, grounded on the class-definition
Expand Down
71 changes: 71 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# syntax=docker/dockerfile:1
#
# knowbase image. Multi-stage, uv-based.
# slim (default): EXTRAS="" -> kb index / migrate / serve / introspect (no torch)
# embed: EXTRAS="--extra embed" -> adds CPU-torch for `kb embed` + search_knowledge
#
# The project is installed in EDITABLE mode and the source tree is kept at /app, because
# kb.store.migrate resolves migrations/ + db/*.sql via Path(__file__).parents[3] (the repo layout).

# ---- builder: resolve + install deps and the project into /app/.venv ----
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder

# uv settings for this stage. UV_PROJECT_ENVIRONMENT places the virtualenv at /app/.venv, the
# same path the runtime stage later puts on PATH.
ENV UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy \
UV_PYTHON_DOWNLOADS=0 \
UV_PROJECT_ENVIRONMENT=/app/.venv
WORKDIR /app

# Extra `uv sync` flags chosen at build time: empty for slim, "--extra embed" for the embed variant.
ARG EXTRAS=""

# 1) Dependencies only (cached layer keyed on the lockfile; project not installed yet).
COPY pyproject.toml uv.lock ./
# ${EXTRAS} is intentionally unquoted: an empty value must add no argument and "--extra embed"
# must split into two words (hence the SC2086 suppression). uv's download cache lives in a
# BuildKit cache mount rather than in the layer.
# hadolint ignore=SC2086
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project --no-dev ${EXTRAS}

# 2) Source layout the runtime needs (src + migrations + db + alembic.ini), then install the project.
COPY src ./src
COPY migrations ./migrations
COPY db ./db
COPY alembic.ini README.md LICENSE ./
# Second sync installs the project itself; the trailing `kb --help` is a smoke check, so the
# build fails here if the installed entry point cannot start.
# hadolint ignore=SC2086
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-dev ${EXTRAS} \
&& /app/.venv/bin/kb --help >/dev/null

# ---- runtime: slim image with just the venv + source tree ----
FROM python:3.12-slim-bookworm AS runtime

# Build args are scoped per stage, so EXTRAS is declared again for the `case` below.
ARG EXTRAS=""
# Install libgomp1 only when EXTRAS mentions "embed", then always create the unprivileged
# `app` user (uid 10001) that the container runs as (see USER at the end of this stage).
# torch (the embed extra) needs OpenMP at runtime; slim installs no apt packages.
# hadolint ignore=DL3008
RUN case "${EXTRAS}" in \
*embed*) apt-get update \
&& apt-get install -y --no-install-recommends libgomp1 \
&& rm -rf /var/lib/apt/lists/* ;; \
esac \
&& useradd --create-home --uid 10001 app

# Put the venv's bin directory first on PATH so `kb` (the ENTRYPOINT) resolves to the venv install.
ENV PATH="/app/.venv/bin:${PATH}" \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1
WORKDIR /app

# Bring over the whole /app tree from the builder (venv + source layout), owned by `app`.
COPY --from=builder --chown=app:app /app /app

# OCI image metadata (the build/version/revision args are supplied by CI).
# The defaults below apply to builds that do not pass these args.
ARG VERSION="0.0.0-dev"
ARG REVISION=""
ARG CREATED=""
LABEL org.opencontainers.image.title="knowbase" \
org.opencontainers.image.description="Versioned, provenance-grounded knowledge layer over a codebase, served via MCP." \
org.opencontainers.image.source="https://github.com/v0ropaev/knowbase" \
org.opencontainers.image.licenses="AGPL-3.0-or-later" \
org.opencontainers.image.version="${VERSION}" \
org.opencontainers.image.revision="${REVISION}" \
org.opencontainers.image.created="${CREATED}"

# Run as the unprivileged user. With no arguments the container prints the CLI help; any
# arguments given to `docker run` replace `--help` and are passed to `kb`.
USER app
ENTRYPOINT ["kb"]
CMD ["--help"]
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
[![Python 3.12+](https://img.shields.io/badge/python-3.12%2B-blue.svg)](https://www.python.org/downloads/)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![Checked with mypy](https://img.shields.io/badge/mypy-strict-blue)](https://mypy-lang.org/)
[![GHCR image](https://img.shields.io/badge/ghcr.io-knowbase-2496ED?logo=docker&logoColor=white)](https://github.com/v0ropaev/knowbase/pkgs/container/knowbase)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/v0ropaev/knowbase/pulls)

</div>
Expand Down Expand Up @@ -163,6 +164,34 @@ uv run kb introspect app.main:app --repo <path> # sandboxed app.openapi() orac

`kb introspect` runs a FastAPI app in a network-blocked sandbox and emits its `openapi()` as JSON — the ground truth the Tier-1 API gate scores the **static** contract extractor against. It executes user code, so it is eval-only and never runs during indexing.

## Run with Docker

Prebuilt images are published to **GHCR**: `ghcr.io/v0ropaev/knowbase` (`:edge` from `master`; `:X.Y.Z`, `:X.Y` and `:latest` on releases). The default image is **slim** (no torch — `index` / `migrate` / `serve` / `introspect`) and multi-arch (amd64+arm64); the **`-embed`** tag (e.g. `:edge-embed`) adds CPU-torch for `kb embed` and semantic search and is amd64-only.

```bash
docker pull ghcr.io/v0ropaev/knowbase:latest
docker run --rm ghcr.io/v0ropaev/knowbase:latest --help
```

As an **MCP server** (stdio), pointed at your Postgres — this is the form an MCP client launches:

```bash
docker run -i --rm ghcr.io/v0ropaev/knowbase:latest serve --db-url <postgres-url>
```

### Local dev/eval with `docker compose`

The bundled compose brings up a `pgvector` Postgres plus the `kb` CLI built from this checkout:

```bash
docker compose up -d db # Postgres (pgvector)
docker compose run --rm kb migrate # apply the schema
docker compose run --rm kb index /workspace --sha HEAD # index the mounted repo
docker compose run --rm -i kb serve # MCP over stdio
```

The compose Postgres also backs the test suite from the host: `KB_TEST_DB_URL=postgresql+psycopg://postgres:postgres@localhost:5432/postgres uv run pytest src/kb/eval -q`. For embeddings, build the image with the embed extra: `docker compose build --build-arg EXTRAS="--extra embed" kb`.

## Architecture

A Python package `kb` (uv, src-layout). Modules and their responsibilities:
Expand All @@ -180,7 +209,7 @@ A Python package `kb` (uv, src-layout). Modules and their responsibilities:
| `kb.mcp` | Read-only MCP server and its provenance-carrying records: `find_provenance`, `get_knowledge`, `search_knowledge`. |
| `kb.embed` | Replaceable embedding adapters (sentence-transformers default, OpenAI optional) + snapshot population. Torch isolated behind the `embed` extra and a lazy import. |
| `kb.rag` | The frozen pgvector RAG-over-source baseline — the "other arm" of the knowledge-vs-RAG A/B (no provenance, no grounding). |
| `kb.daemon.cli` | The `kb` CLI: `index`, `embed`, `serve` (MCP), and `introspect` — all functional. |
| `kb.daemon.cli` | The `kb` CLI: `index`, `migrate`, `embed`, `serve` (MCP), and `introspect` — all functional. |
| `kb.eval` | Eight HARD CI gates (identity reproducibility, adversarial grounding, Tier-1 import oracle, Tier-1 API oracle, Tier-1 entities oracle, Tier-3 knowledge-vs-RAG recall, Tier-4 one-hop invalidation, invariants) plus the supporting MCP / embed / store suite. |

Core tables: `commit_ref`, `branch_ref`, `code_span`, `span_occurrence`, `artifact` (now with `embedding vector(384)` + `embedding_model_id`), `artifact_derived_from`, `snapshot_entry`, and `rag_chunk` (the baseline arm).
Expand Down
44 changes: 44 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Local dev / eval: a pgvector Postgres + the kb CLI built from this repo.
#
# docker compose up -d db # start Postgres (also usable by host `pytest` via
# # KB_TEST_DB_URL=postgresql+psycopg://postgres:postgres@localhost:5432/postgres)
# docker compose run --rm kb migrate # apply the schema
# docker compose run --rm kb index /workspace --sha HEAD # index the mounted repo (this checkout)
# docker compose run --rm -i kb serve # MCP server over stdio
#
# For embeddings/search, build the kb service with the embed extra:
# docker compose build --build-arg EXTRAS="--extra embed" kb

services:
  # pgvector-enabled Postgres for local dev/eval. Credentials are throwaway defaults.
  db:
    image: pgvector/pgvector:pg17
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
    ports:
      # Published on loopback only: the database uses default credentials, so it must not be
      # reachable from other machines. Host tools still connect via localhost:5432.
      - "127.0.0.1:5432:5432"
    volumes:
      - kbdata:/var/lib/postgresql/data
    healthcheck:
      # `kb` waits on this (depends_on: service_healthy) before it starts.
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 10

  # The kb CLI, built from this checkout and pointed at the `db` service.
  kb:
    build:
      context: .
      args:
        EXTRAS: ""  # set to "--extra embed" to include CPU-torch for `kb embed` / search
    depends_on:
      db:
        condition: service_healthy
    environment:
      KB_DB_URL: postgresql+psycopg://postgres:postgres@db:5432/postgres
    volumes:
      - .:/workspace:ro  # the repo to index (read-only); override with your own path
    profiles: ["cli"]  # not started by `up`; invoke with `docker compose run --rm kb <command>`

volumes:
  # Named volume holding the Postgres data directory across container restarts.
  kbdata: {}
19 changes: 16 additions & 3 deletions src/kb/daemon/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""The ``kb`` command-line interface (DESIGN.md §11).

``kb index`` runs the spine for one commit. ``serve`` (MCP) and ``introspect`` (the eval-only
FastAPI oracle) are stubs in this push — they belong to the next push (DESIGN.md §8 "Next push").
``kb index`` runs the spine for one commit; ``migrate`` applies the schema; ``embed`` populates
embeddings; ``serve`` hosts the read-only MCP server over stdio; ``introspect`` is the eval-only
sandboxed FastAPI openapi oracle.
"""

from __future__ import annotations
Expand All @@ -15,7 +16,7 @@
from kb.extract.deterministic.fastapi_contract import FastAPIExtractor
from kb.extract.deterministic.imports import ImportExtractor
from kb.introspect import introspect_app
from kb.store.engine import make_engine
from kb.store.engine import make_engine, resolve_db_url

app = typer.Typer(no_args_is_help=True, help="knowbase — a provenance-grounded knowledge layer.")

Expand All @@ -40,6 +41,18 @@ def index(
typer.echo(f" gaps (unparseable, recorded): {', '.join(result.gaps)}")


def _redact_db_url(url: str) -> str:
    """Return ``url`` with the password in its userinfo part replaced by ``***``.

    A URL that carries no password is returned unchanged. A URL that cannot be parsed is
    replaced by a fixed placeholder, so nothing sensitive is echoed by accident.
    """
    from urllib.parse import urlsplit  # local import, matching the lazy-import style of the CLI

    try:
        parts = urlsplit(url)
    except ValueError:
        return "<unparseable database URL>"
    # netloc looks like "user:password@host:port"; split on the LAST "@" so an unescaped "@"
    # inside the password cannot shift the boundary, then on the FIRST ":" of the userinfo.
    userinfo, at, hostport = parts.netloc.rpartition("@")
    user, colon, _password = userinfo.partition(":")
    if not at or not colon:
        return url  # no userinfo, or a user without a password — nothing to hide
    return parts._replace(netloc=f"{user}:***@{hostport}").geturl()


@app.command()
def migrate(
    db_url: str | None = typer.Option(None, "--db-url", help="Postgres URL (else KB_DB_URL env)."),
) -> None:
    """Apply all Alembic migrations up to head (creates/updates the schema).

    The database URL comes from ``--db-url`` when given; otherwise ``resolve_db_url`` decides
    (the option help text names the ``KB_DB_URL`` environment variable as the fallback).
    On success a confirmation line is echoed with the password redacted, so credentials do
    not end up in terminal scrollback or CI logs.
    """
    from kb.store.migrate import upgrade_to_head  # local import keeps alembic off other commands

    resolved = resolve_db_url(db_url)
    upgrade_to_head(resolved)
    # str() is a no-op for a plain string; presumably resolve_db_url returns str — verify.
    typer.echo(f"migrated to head: {_redact_db_url(str(resolved))}")


@app.command()
def embed(
db_url: str | None = typer.Option(None, "--db-url", help="Postgres URL (else KB_DB_URL env)."),
Expand Down
Loading