Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ env:

jobs:
build-model:
name: Build Model Image
name: Build ${{ matrix.model }}
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

strategy:
matrix:
model: [qwen3.5-35b-a3b, hermes-4.3-36b]

steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -36,10 +40,10 @@ jobs:
- name: Build and push model image
uses: docker/build-push-action@v6
with:
context: models/qwen3.5-35b-a3b/
context: models/${{ matrix.model }}/
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ env.REGISTRY }}/qwen3.5-35b-a3b:latest
${{ github.ref_type == 'tag' && format('{0}/qwen3.5-35b-a3b:{1}', env.REGISTRY, github.ref_name) || '' }}
cache-from: type=gha
cache-to: type=gha,mode=max
${{ env.REGISTRY }}/${{ matrix.model }}:latest
${{ github.ref_type == 'tag' && format('{0}/{1}:{2}', env.REGISTRY, matrix.model, github.ref_name) || '' }}
cache-from: type=gha,scope=${{ matrix.model }}
cache-to: type=gha,mode=max,scope=${{ matrix.model }}
23 changes: 23 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Lint workflow: static checks that run on pull requests targeting main.
name: Lint

on:
pull_request:
branches: [main]

jobs:
# Python linting: checks out the repo and runs the ruff action with its
# default inputs (no `with:` overrides are given).
ruff:
name: Python (ruff)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: astral-sh/ruff-action@v3

# Shell linting: ShellCheck is limited to the scripts/ directory.
shellcheck:
name: Shell (shellcheck)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ludeeus/action-shellcheck@2.0.0
with:
scandir: scripts/
# NOTE(review): presumably the minimum severity that fails the job
# (findings below "warning" are ignored) — confirm against the action docs.
severity: warning
11 changes: 8 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# ==============================================================================

REGISTRY ?= ghcr.io/infernet-org/foundry
# Default model (can be overridden: make run MODEL=hermes-4.3-36b)
MODEL ?= qwen3.5-35b-a3b
MODEL_TAG ?= $(REGISTRY)/$(MODEL)
PORT ?= 8080
Expand All @@ -11,23 +12,27 @@ MODELS_DIR ?= $(HOME)/.cache/foundry
# Every target listed here is a command, not a file. Declaring it phony keeps
# a stray file or directory with the same name (e.g. `up`, `test`, `build`)
# from making Make consider the target "up to date" and skipping its recipe.
# `up` was previously missing from this list even though the rule exists.
.PHONY: help build run run-profile up test benchmark monitoring down push push-all clean clean-models download

help: ## Show this help
@echo "Available models: qwen3.5-35b-a3b (default), hermes-4.3-36b"
@echo "Usage: make run MODEL=hermes-4.3-36b"
@echo ""
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \
awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

# --- Build -------------------------------------------------------------------

build: ## Build the model image
@cp scripts/entrypoint.sh models/$(MODEL)/entrypoint.sh
docker build \
-t $(MODEL_TAG):latest \
models/$(MODEL)/

# --- Run ---------------------------------------------------------------------

up: ## Start via docker compose (detached)
docker compose up -d
FOUNDRY_MODEL=$(MODEL) docker compose up -d

monitoring: ## Start via docker compose with full monitoring stack
docker compose --profile monitoring up -d
FOUNDRY_MODEL=$(MODEL) docker compose --profile monitoring up -d

down: ## Stop all docker compose services
docker compose --profile monitoring down
Expand Down Expand Up @@ -102,7 +107,7 @@ test: ## Smoke test: start container, wait for health, send one request
# --- Download ----------------------------------------------------------------

download: ## Download the GGUF model file
./scripts/download-model.sh
./scripts/download-model.sh --model $(MODEL)

# --- Benchmark ---------------------------------------------------------------

Expand Down
49 changes: 39 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ Works with any OpenAI-compatible client: Cursor, Continue, OpenCode, Open WebUI,

*Benchmarked with `Qwen3.5-35B-A3B` using `UD-Q4_K_XL` quantization (Unsloth Dynamic 2.0).*

### Hermes-4.3-36B (Dense)
| GPU | VRAM | Context | Decode | 4-concurrent |
|-----|------|---------|--------|--------------|
| RTX 5090 | 32 GB | 32K | ~64 tok/s | ~170 tok/s |
| Other NVIDIA (24GB+) | 24+ GB | 8K | varies | varies |

*Benchmarked with `NousResearch/Hermes-4.3-36B` using `Q4_K_M` quantization.*

## How It Works

Foundry uses [llama.cpp](https://github.com/ggml-org/llama.cpp) as the inference engine, built on the official [`server-cuda12`](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp) image.
Expand Down Expand Up @@ -70,7 +78,7 @@ docker run --gpus all -p 8080:8080 \
ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest
```

Available profiles: `rtx5090`, `default`
Available profiles (per model): `rtx5090`, `default`

## Configuration

Expand Down Expand Up @@ -128,9 +136,12 @@ docker run --gpus '"device=1"' -p 8081:8080 -v ~/.cache/foundry:/models \
## Docker Compose

```bash
# Basic
# Basic (default: Qwen3.5-35B-A3B)
docker compose up

# Choose a different model
FOUNDRY_MODEL=hermes-4.3-36b docker compose up

# With explicit profile
FOUNDRY_PROFILE=rtx5090 docker compose up

Expand Down Expand Up @@ -209,24 +220,32 @@ This tunes: `vm.swappiness`, `vm.overcommit_memory`, hugepages, TCP buffers, CPU
## Build From Source

```bash
make build # Build the model image
make run # Run with auto-detected GPU
make test # Smoke test: start, wait for health, send one request
make download # Download the GGUF model file to ~/.cache/foundry
make build # Build the default model image (qwen3.5-35b-a3b)
make build MODEL=hermes-4.3-36b # Build a different model
make run # Run with auto-detected GPU
make test # Smoke test: start, wait for health, send one request
make download # Download the GGUF model file to ~/.cache/foundry
```

## Architecture

```
foundry/
├── models/
│ └── qwen3.5-35b-a3b/
│ ├── qwen3.5-35b-a3b/
│ │ ├── Dockerfile # FROM llama.cpp:server-cuda12
│ │ ├── entrypoint.sh # Copied from scripts/entrypoint.sh at build time
│ │ └── profiles/
│ │ ├── rtx5090.sh # 192K ctx, 4 slots, 320 tok/s aggregate
│ │ └── default.sh # 16K ctx, q4_0 KV, conservative
│ └── hermes-4.3-36b/
│ ├── Dockerfile # Multi-stage: builds llama.cpp from source (CUDA 12.9.1), ubuntu:24.04 runtime
│ ├── entrypoint.sh # GPU detect, model download, launch
│ ├── entrypoint.sh # Copied from scripts/entrypoint.sh at build time
│ └── profiles/
│ ├── rtx5090.sh # 192K ctx, 4 slots, 320 tok/s aggregate
│ └── default.sh # 16K ctx, q4_0 KV, conservative
│ ├── rtx5090.sh # 32K ctx, 4 slots, 170 tok/s aggregate
│ └── default.sh # 8K ctx, q8_0 KV, 24GB minimum
├── scripts/
│ ├── entrypoint.sh # Shared entrypoint for all models
│ ├── benchmark.py # Generation speed, prompt processing, throughput
│ ├── optimize_5090.py # Multi-config A/B testing harness
│ ├── download-model.sh # Download GGUF outside Docker
Expand Down Expand Up @@ -270,6 +289,16 @@ python3 scripts/benchmark.py --url http://localhost:8080 --mode all
- **Min VRAM**: 16 GB (with expert offloading)
- **Max context**: 262K native, 192K default on RTX 5090

### Hermes-4.3-36B

- **Architecture**: Dense (36B total, all 36B active per token)
- ByteDance Seed-OSS-36B architecture
- Standard attention (GQA 80:8)
- **Quantization**: Q4_K_M via [bartowski](https://huggingface.co/bartowski/NousResearch_Hermes-4.3-36B-GGUF)
- **Disk size**: ~21.8 GB
- **Min VRAM**: 24 GB (dense models cannot effectively offload experts)
- **Max context**: 512K native, 32K default on RTX 5090

## License

Apache-2.0
63 changes: 43 additions & 20 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# ==============================================================================
# Foundry: Docker Compose
# ==============================================================================
# Quick start:
# Quick start (uses default Qwen3.5 MoE model):
# docker compose up
#
# Choose a different model:
# FOUNDRY_MODEL=hermes-4.3-36b docker compose up
#
# With monitoring (Prometheus + Grafana + GPU metrics):
# docker compose --profile monitoring up
#
Expand All @@ -16,12 +19,28 @@

services:
# ============================================================================
# Inference Server
# eBPF Exporter (Linux Kernel metrics)
# ============================================================================
ebpf-exporter:
image: ghcr.io/cloudflare/ebpf_exporter:latest
profiles:
- monitoring
privileged: true
pid: host
network_mode: host
volumes:
- /sys/kernel/debug:/sys/kernel/debug:ro
- /sys/kernel/tracing:/sys/kernel/tracing:ro
- /lib/modules:/lib/modules:ro
command: ["--config.file=/examples/biolatency.yaml"]

# ============================================================================
# Inference Server
# ============================================================================
qwen3.5-35b-a3b:
image: ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest
inference:
image: ghcr.io/infernet-org/foundry/${FOUNDRY_MODEL:-qwen3.5-35b-a3b}:latest
build:
context: models/qwen3.5-35b-a3b/
context: models/${FOUNDRY_MODEL:-qwen3.5-35b-a3b}/
ports:
- "${FOUNDRY_PORT:-8080}:8080"
volumes:
Expand All @@ -44,13 +63,19 @@ services:
sysctls:
- net.core.somaxconn=4096
- net.ipv4.tcp_keepalive_time=60
- net.ipv4.tcp_congestion_control=bbr
- net.core.busy_read=50
- net.core.busy_poll=50
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
start_period: 120s
retries: 3
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 10s
timeout: 5s
retries: 5
ulimits:
memlock:
soft: -1
hard: -1
networks:
- default
- monitoring
Expand All @@ -60,15 +85,13 @@ services:
# ============================================================================

prometheus:
image: prom/prometheus:v3.4.1
profiles: [monitoring]
container_name: foundry-prometheus
restart: unless-stopped
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
- --storage.tsdb.retention.time=30d
- --web.enable-lifecycle
image: prom/prometheus:latest
profiles:
- monitoring
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
extra_hosts:
- "host.docker.internal:host-gateway"
ports:
- "9090:9090"
volumes:
Expand All @@ -77,7 +100,7 @@ services:
networks:
- monitoring
depends_on:
qwen3.5-35b-a3b:
inference:
condition: service_started

grafana:
Expand Down
87 changes: 87 additions & 0 deletions models/hermes-4.3-36b/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# ==============================================================================
# Foundry Model Image: Hermes-4.3-36B
# ==============================================================================
# Multi-stage build for a minimal CUDA runtime.
# Compiles llama.cpp from source for sm_89 (Ada) and sm_120a (Blackwell),
# then copies only the binary and required libraries to a clean Ubuntu base.
#
# Weights are NOT baked in. They are downloaded on first run or mounted
# from the host at /models.
# ==============================================================================

# ------------------------------------------------------------------------------
# Stage 1: Builder
# ------------------------------------------------------------------------------
FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder

# Build toolchain only. Nothing installed in this stage reaches the final image
# except the contents of build/bin, which the runtime stage copies explicitly.
RUN apt-get update && apt-get install -y git cmake g++ curl
# Shallow clone of a fixed upstream ref (b8183) rather than the moving default
# branch, so rebuilding the image does not silently pick up new llama.cpp code.
RUN git clone --depth 1 -b b8183 https://github.com/ggml-org/llama.cpp.git /llama.cpp
WORKDIR /llama.cpp

# Compile explicitly for Ada (sm_89) and Blackwell (sm_120a).
# GGML_BACKEND_DL=ON builds CUDA as a runtime-loaded plugin (dlopen), which
# avoids the libcuda.so.1 transitive link error during Docker builds where
# no real GPU driver is present. This matches the official llama.cpp Dockerfile.
#
# GGML_NATIVE=OFF keeps the build from being tuned to the build host's CPU;
# GGML_CPU_ALL_VARIANTS=ON instead emits the ggml-cpu-* backend modules that
# the runtime stage copies and loads via dlopen.
# NOTE(review): LLAMA_CURL=OFF presumably disables llama.cpp's built-in
# downloader because weights are fetched with huggingface-hub instead — confirm
# against scripts/entrypoint.sh.
RUN cmake -B build \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DCMAKE_CUDA_ARCHITECTURES="89;120a" \
-DCMAKE_BUILD_TYPE=Release \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-shlib-undefined" && \
cmake --build build --config Release -j$(nproc)

# ------------------------------------------------------------------------------
# Stage 2: Minimal Runtime
# ------------------------------------------------------------------------------
FROM ubuntu:24.04

# Install minimal runtime dependencies (CUDA runtime libs are mounted by nvidia-container-runtime)
# --break-system-packages is required because Ubuntu 24.04 marks the system
# Python as externally managed (PEP 668) and pip refuses to install otherwise.
# NOTE(review): huggingface-hub / hf_transfer are presumably used by the
# entrypoint to download the GGUF on first run — confirm in entrypoint.sh.
RUN apt-get update && apt-get install -y --no-install-recommends \
libgomp1 \
python3 python3-pip curl \
&& pip3 install --break-system-packages --no-cache-dir "huggingface-hub>=0.28,<1" "hf_transfer>=0.1.6" \
&& rm -rf /var/lib/apt/lists/*

# The NVIDIA runtime needs these env vars to mount the CUDA drivers correctly
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Model metadata
# NOTE(review): the README credits the bartowski GGUF repo
# (bartowski/NousResearch_Hermes-4.3-36B-GGUF) for the Q4_K_M quantization,
# while this points at NousResearch/Hermes-4.3-36B-GGUF. Verify that this
# repo/file pair exists and reconcile the two.
ENV FOUNDRY_MODEL_NAME="Hermes-4.3-36B"
ENV FOUNDRY_GGUF_REPO="NousResearch/Hermes-4.3-36B-GGUF"
ENV FOUNDRY_GGUF_FILE="hermes-4_3_36b-Q4_K_M.gguf"
ENV FOUNDRY_ARCH="dense"

# Enable fast downloads
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

# Runtime defaults (can be overridden)
# NOTE(review): the empty values presumably mean "let the selected profile
# decide" — confirm how entrypoint.sh treats unset/empty variables.
ENV FOUNDRY_PROFILE="auto"
ENV FOUNDRY_PORT="8080"
ENV FOUNDRY_CTX_LENGTH=""
ENV FOUNDRY_THREADS=""
ENV FOUNDRY_EXTRA_ARGS=""

# Copy the compiled binary and all shared libraries from the build output.
# With GGML_BACKEND_DL=ON, backends (ggml-cuda, ggml-cpu-*) are .so modules
# loaded at runtime via dlopen. CMake places everything in build/bin/.
COPY --from=builder /llama.cpp/build/bin/ /app/
# /app is listed first so the libraries copied above are found by the loader.
ENV LD_LIBRARY_PATH="/app:/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu"

# Copy profiles and shared entrypoint
# entrypoint.sh must already be present in this build context: `make build`
# copies scripts/entrypoint.sh into models/<model>/ before running docker build.
# NOTE(review): confirm the CI workflow performs the same copy before building.
COPY profiles/ /opt/foundry/profiles/
COPY entrypoint.sh /opt/foundry/entrypoint.sh
RUN chmod +x /opt/foundry/entrypoint.sh

# Model storage
RUN mkdir -p /models
VOLUME /models

EXPOSE 8080

ENTRYPOINT ["/opt/foundry/entrypoint.sh"]
Loading