infernet-org · aWN4Y25pa2EK · Mar 1, 2026 · Mar 1, 2026 · Mar 1, 2026 · Mar 1, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -12,12 +12,16 @@ env:
 
 jobs:
   build-model:
-    name: Build Model Image
+    name: Build ${{ matrix.model }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
       packages: write
 
+    strategy:
+      matrix:
+        model: [qwen3.5-35b-a3b, hermes-4.3-36b]
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -36,10 +40,10 @@ jobs:
       - name: Build and push model image
         uses: docker/build-push-action@v6
         with:
-          context: models/qwen3.5-35b-a3b/
+          context: models/${{ matrix.model }}/
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
-            ${{ env.REGISTRY }}/qwen3.5-35b-a3b:latest
-            ${{ github.ref_type == 'tag' && format('{0}/qwen3.5-35b-a3b:{1}', env.REGISTRY, github.ref_name) || '' }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+            ${{ env.REGISTRY }}/${{ matrix.model }}:latest
+            ${{ github.ref_type == 'tag' && format('{0}/{1}:{2}', env.REGISTRY, matrix.model, github.ref_name) || '' }}
+          cache-from: type=gha,scope=${{ matrix.model }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.model }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,30 @@
+# Lint workflow: static checks for Python and shell sources on pull requests.
+name: Lint
+
+on:
+  pull_request:
+    branches: [main]
+
+# Least privilege: both jobs only need to read the checked-out sources.
+permissions:
+  contents: read
+
+jobs:
+  # Runs the ruff action with its default inputs (no `with:` overrides).
+  ruff:
+    name: Python (ruff)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/ruff-action@v3
+
+  # Runs shellcheck with scandir set to scripts/ and severity set to warning.
+  shellcheck:
+    name: Shell (shellcheck)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ludeeus/action-shellcheck@2.0.0
+        with:
+          scandir: scripts/
+          severity: warning
diff --git a/Makefile b/Makefile
@@ -3,6 +3,7 @@
 # ==============================================================================
 
 REGISTRY ?= ghcr.io/infernet-org/foundry
+# Default model (can be overridden: make run MODEL=hermes-4.3-36b)
 MODEL ?= qwen3.5-35b-a3b
 MODEL_TAG ?= $(REGISTRY)/$(MODEL)
 PORT ?= 8080
@@ -11,23 +12,27 @@ MODELS_DIR ?= $(HOME)/.cache/foundry
 .PHONY: help build run run-profile test benchmark monitoring down push push-all clean clean-models download
 
 help: ## Show this help
+	@echo "Available models: qwen3.5-35b-a3b (default), hermes-4.3-36b"
+	@echo "Usage: make run MODEL=hermes-4.3-36b"
+	@echo ""
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \
 		awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
 
 # --- Build -------------------------------------------------------------------
 
 build: ## Build the model image
+	@cp scripts/entrypoint.sh models/$(MODEL)/entrypoint.sh
 	docker build \
 		-t $(MODEL_TAG):latest \
 		models/$(MODEL)/
 
 # --- Run ---------------------------------------------------------------------
 
 up: ## Start via docker compose (detatched)
-	docker compose up -d
+	FOUNDRY_MODEL=$(MODEL) docker compose up -d
 
 monitoring: ## Start via docker compose with full monitoring stack
-	docker compose --profile monitoring up -d
+	FOUNDRY_MODEL=$(MODEL) docker compose --profile monitoring up -d
 
 down: ## Stop all docker compose services
 	docker compose --profile monitoring down
@@ -102,7 +107,7 @@ test: ## Smoke test: start container, wait for health, send one request
 # --- Download ----------------------------------------------------------------
 
 download: ## Download the GGUF model file
-	./scripts/download-model.sh
+	./scripts/download-model.sh --model $(MODEL)
 
 # --- Benchmark ---------------------------------------------------------------
 

diff --git a/README.md b/README.md
@@ -42,6 +42,14 @@ Works with any OpenAI-compatible client: Cursor, Continue, OpenCode, Open WebUI,
 
 *Benchmarked with `Qwen3.5-35B-A3B` using `UD-Q4_K_XL` quantization (Unsloth Dynamic 2.0).*
 
+### Hermes-4.3-36B (Dense)
+| GPU | VRAM | Context | Decode | 4-concurrent |
+|-----|------|---------|--------|--------------|
+| RTX 5090 | 32 GB | 32K | ~64 tok/s | ~170 tok/s |
+| Other NVIDIA (24GB+) | 24+ GB | 8K | varies | varies |
+
+*Benchmarked with `NousResearch/Hermes-4.3-36B` using `Q4_K_M` quantization.*
+
 ## How It Works
 
 Foundry uses [llama.cpp](https://github.com/ggml-org/llama.cpp) as the inference engine, built on the official [`server-cuda12`](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp) image.
@@ -70,7 +78,7 @@ docker run --gpus all -p 8080:8080 \
   ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest
 ```
 
-Available profiles: `rtx5090`, `default`
+Available profiles (per model): `rtx5090`, `default`
 
 ## Configuration
 
@@ -128,9 +136,12 @@ docker run --gpus '"device=1"' -p 8081:8080 -v ~/.cache/foundry:/models \
 ## Docker Compose
 
 ```bash
-# Basic
+# Basic (default: Qwen3.5-35B-A3B)
 docker compose up
 
+# Choose a different model
+FOUNDRY_MODEL=hermes-4.3-36b docker compose up
+
 # With explicit profile
 FOUNDRY_PROFILE=rtx5090 docker compose up
 
@@ -209,24 +220,32 @@ This tunes: `vm.swappiness`, `vm.overcommit_memory`, hugepages, TCP buffers, CPU
 ## Build From Source
 
 ```bash
-make build    # Build the model image
-make run      # Run with auto-detected GPU
-make test     # Smoke test: start, wait for health, send one request
-make download # Download the GGUF model file to ~/.cache/foundry
+make build                        # Build the default model image (qwen3.5-35b-a3b)
+make build MODEL=hermes-4.3-36b   # Build a different model
+make run                          # Run with auto-detected GPU
+make test                         # Smoke test: start, wait for health, send one request
+make download                     # Download the GGUF model file to ~/.cache/foundry
 ```
 
 ## Architecture
 
 ```
 foundry/
 ├── models/
-│   └── qwen3.5-35b-a3b/
+│   ├── qwen3.5-35b-a3b/
+│   │   ├── Dockerfile           # FROM llama.cpp:server-cuda12
+│   │   ├── entrypoint.sh        # Copied from scripts/entrypoint.sh at build time
+│   │   └── profiles/
+│   │       ├── rtx5090.sh       # 192K ctx, 4 slots, 320 tok/s aggregate
+│   │       └── default.sh       # 16K ctx, q4_0 KV, conservative
+│   └── hermes-4.3-36b/
 │       ├── Dockerfile           # FROM llama.cpp:server-cuda12
-│       ├── entrypoint.sh        # GPU detect, model download, launch
+│       ├── entrypoint.sh        # Copied from scripts/entrypoint.sh at build time
 │       └── profiles/
-│           ├── rtx5090.sh       # 192K ctx, 4 slots, 320 tok/s aggregate
-│           └── default.sh       # 16K ctx, q4_0 KV, conservative
+│           ├── rtx5090.sh       # 32K ctx, 4 slots, 170 tok/s aggregate
+│           └── default.sh       # 8K ctx, q8_0 KV, 24GB minimum
 ├── scripts/
+│   ├── entrypoint.sh            # Shared entrypoint for all models
 │   ├── benchmark.py             # Generation speed, prompt processing, throughput
 │   ├── optimize_5090.py         # Multi-config A/B testing harness
 │   ├── download-model.sh        # Download GGUF outside Docker
@@ -270,6 +289,16 @@ python3 scripts/benchmark.py --url http://localhost:8080 --mode all
 - **Min VRAM**: 16 GB (with expert offloading)
 - **Max context**: 262K native, 192K default on RTX 5090
 
+### Hermes-4.3-36B
+
+- **Architecture**: Dense (36B total, all 36B active per token)
+  - ByteDance Seed-OSS-36B architecture
+  - Standard attention (GQA 80:8)
+- **Quantization**: Q4_K_M via [bartowski](https://huggingface.co/bartowski/NousResearch_Hermes-4.3-36B-GGUF)
+- **Disk size**: ~21.8 GB
+- **Min VRAM**: 24 GB (dense models cannot effectively offload experts)
+- **Max context**: 512K native, 32K default on RTX 5090
+
 ## License
 
 Apache-2.0
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,9 +1,12 @@
 # ==============================================================================
 # Foundry: Docker Compose
 # ==============================================================================
-# Quick start:
+# Quick start (uses default Qwen3.5 MoE model):
 #   docker compose up
 #
+# Choose a different model:
+#   FOUNDRY_MODEL=hermes-4.3-36b docker compose up
+#
 # With monitoring (Prometheus + Grafana + GPU metrics):
 #   docker compose --profile monitoring up
 #
@@ -16,12 +19,28 @@
 
 services:
   # ============================================================================
-  # Inference Server
+  # eBPF Exporter (Linux Kernel metrics)
+  # ============================================================================
+  ebpf-exporter:
+    image: ghcr.io/cloudflare/ebpf_exporter:latest
+    profiles:
+      - monitoring
+    privileged: true
+    pid: host
+    network_mode: host
+    volumes:
+      - /sys/kernel/debug:/sys/kernel/debug:ro
+      - /sys/kernel/tracing:/sys/kernel/tracing:ro
+      - /lib/modules:/lib/modules:ro
+    command: ["--config.file=/examples/biolatency.yaml"]
+
+  # ============================================================================
+  # Inference Server
   # ============================================================================
-  qwen3.5-35b-a3b:
-    image: ghcr.io/infernet-org/foundry/qwen3.5-35b-a3b:latest
+  inference:
+    image: ghcr.io/infernet-org/foundry/${FOUNDRY_MODEL:-qwen3.5-35b-a3b}:latest
     build:
-      context: models/qwen3.5-35b-a3b/
+      context: models/${FOUNDRY_MODEL:-qwen3.5-35b-a3b}/
     ports:
       - "${FOUNDRY_PORT:-8080}:8080"
     volumes:
@@ -44,13 +63,19 @@ services:
     sysctls:
       - net.core.somaxconn=4096
       - net.ipv4.tcp_keepalive_time=60
+      - net.ipv4.tcp_congestion_control=bbr
+      - net.core.busy_read=50
+      - net.core.busy_poll=50
     restart: unless-stopped
     healthcheck:
-      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
-      interval: 30s
-      timeout: 10s
-      start_period: 120s
-      retries: 3
+      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
     networks:
       - default
       - monitoring
@@ -60,15 +85,12 @@ services:
   # ============================================================================
 
   prometheus:
-    image: prom/prometheus:v3.4.1
-    profiles: [monitoring]
-    container_name: foundry-prometheus
-    restart: unless-stopped
-    command:
-      - --config.file=/etc/prometheus/prometheus.yml
-      - --storage.tsdb.path=/prometheus
-      - --storage.tsdb.retention.time=30d
-      - --web.enable-lifecycle
+    image: prom/prometheus:latest
+    profiles:
+      - monitoring
+    # Map host.docker.internal to the Docker host gateway address.
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
     ports:
       - "9090:9090"
     volumes:
@@ -77,7 +99,7 @@ services:
     networks:
       - monitoring
     depends_on:
-      qwen3.5-35b-a3b:
+      inference:
         condition: service_started
 
   grafana:

diff --git a/models/hermes-4.3-36b/Dockerfile b/models/hermes-4.3-36b/Dockerfile
@@ -0,0 +1,87 @@
+# ==============================================================================
+# Foundry Model Image: Hermes-4.3-36B
+# ==============================================================================
+# Multi-stage build for a minimal CUDA runtime.
+# Compiles llama.cpp from source for sm_89 (Ada) and sm_120a (Blackwell),
+# then copies only the binary and required libraries to a clean Ubuntu base.
+#
+# Weights are NOT baked in. They are downloaded on first run or mounted
+# from the host at /models.
+# ==============================================================================
+
+# ------------------------------------------------------------------------------
+# Stage 1: Builder
+# ------------------------------------------------------------------------------
+FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS builder
+
+RUN apt-get update && apt-get install -y git cmake g++ curl
+RUN git clone --depth 1 -b b8183 https://github.com/ggml-org/llama.cpp.git /llama.cpp
+WORKDIR /llama.cpp
+
+# Compile explicitly for Ada (sm_89) and Blackwell (sm_120a).
+# GGML_BACKEND_DL=ON builds CUDA as a runtime-loaded plugin (dlopen), which
+# avoids the libcuda.so.1 transitive link error during Docker builds where
+# no real GPU driver is present. This matches the official llama.cpp Dockerfile.
+RUN cmake -B build \
+    -DGGML_NATIVE=OFF \
+    -DGGML_CUDA=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON \
+    -DCMAKE_CUDA_ARCHITECTURES="89;120a" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DLLAMA_CURL=OFF \
+    -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
+    -DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-shlib-undefined" && \
+    cmake --build build --config Release -j$(nproc)
+
+# ------------------------------------------------------------------------------
+# Stage 2: Minimal Runtime
+# ------------------------------------------------------------------------------
+FROM ubuntu:24.04
+
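+# Install minimal runtime dependencies. NOTE(review): the NVIDIA container runtime injects driver libraries (e.g. libcuda.so.1), not CUDA toolkit libraries (libcudart, libcublas) - confirm the dlopen'd CUDA backend can resolve them on this plain Ubuntu base.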
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgomp1 \
+    python3 python3-pip curl \
+    && pip3 install --break-system-packages --no-cache-dir "huggingface-hub>=0.28,<1" "hf_transfer>=0.1.6" \
+    && rm -rf /var/lib/apt/lists/*
+
+# The NVIDIA runtime needs these env vars to mount the CUDA drivers correctly
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+# Model metadata
+ENV FOUNDRY_MODEL_NAME="Hermes-4.3-36B"
+ENV FOUNDRY_GGUF_REPO="NousResearch/Hermes-4.3-36B-GGUF"
+ENV FOUNDRY_GGUF_FILE="hermes-4_3_36b-Q4_K_M.gguf"
+ENV FOUNDRY_ARCH="dense"
+
+# Enable fast downloads
+ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+
+# Runtime defaults (can be overridden)
+ENV FOUNDRY_PROFILE="auto"
+ENV FOUNDRY_PORT="8080"
+ENV FOUNDRY_CTX_LENGTH=""
+ENV FOUNDRY_THREADS=""
+ENV FOUNDRY_EXTRA_ARGS=""
+
+# Copy the compiled binary and all shared libraries from the build output.
+# With GGML_BACKEND_DL=ON, backends (ggml-cuda, ggml-cpu-*) are .so modules
+# loaded at runtime via dlopen. CMake places everything in build/bin/.
+COPY --from=builder /llama.cpp/build/bin/ /app/
+ENV LD_LIBRARY_PATH="/app:/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu"
+
+# Copy profiles and shared entrypoint
+COPY profiles/ /opt/foundry/profiles/
+COPY entrypoint.sh /opt/foundry/entrypoint.sh
+RUN chmod +x /opt/foundry/entrypoint.sh
+
+# Model storage
+RUN mkdir -p /models
+VOLUME /models
+
+EXPOSE 8080
+
+ENTRYPOINT ["/opt/foundry/entrypoint.sh"]