From ef8f9e391387946740344a9f03bcd9fa18d81a73 Mon Sep 17 00:00:00 2001 From: 0xfandom Date: Tue, 21 Apr 2026 12:54:24 +0530 Subject: [PATCH 1/4] feat(observability): smoke test + 5 extra alerts + extensibility docs Non-duplicate bits from PR #98 landed on top of main after PR #83/#84 covered the core stack. Keeps main's dashboards + Slack-only alertmanager intact; drops PR #98's PagerDuty/Discord routing per team directive. --- deploy/docker/README.md | 58 +++++ .../docker/alertmanager/templates/slack.tmpl | 16 ++ deploy/docker/prometheus/alerts.yml | 58 +++++ scripts/monitoring_smoke.sh | 210 ++++++++++++++++++ 4 files changed, 342 insertions(+) create mode 100644 deploy/docker/README.md create mode 100644 deploy/docker/alertmanager/templates/slack.tmpl create mode 100755 scripts/monitoring_smoke.sh diff --git a/deploy/docker/README.md b/deploy/docker/README.md new file mode 100644 index 0000000..298856d --- /dev/null +++ b/deploy/docker/README.md @@ -0,0 +1,58 @@ +# Aether Docker Monitoring Stack + +## Overview + +| Service | Port | Purpose | +|---------|------|---------| +| aether-go | 9090 | Go executor metrics (Prometheus) | +| aether-rust | 9092 | Rust engine metrics (Prometheus) | +| prometheus | 9091 | Metrics store, alert evaluation | +| alertmanager | 9093 | Alert routing (Slack) | +| grafana | 3000 | Dashboards (admin/admin) | + +Start the monitoring-only stack: `docker compose up -d prometheus alertmanager grafana` + +## Adding a Metric + +1. Emit the metric in `cmd/executor/metrics.go` (Go) or `crates/grpc-server/src/metrics.rs` (Rust). +2. Restart the relevant service. Prometheus auto-scrapes on the 15s interval. +3. Verify it appears at `http://localhost:9091/graph`. + +## Adding a Dashboard + +1. Create a JSON file in `deploy/docker/grafana/dashboards/`. Assign a stable `"uid"` string. +2. All panels must reference `${DS_PROMETHEUS}` as the datasource uid. +3. The provisioner reloads dashboards every 30s — no Grafana restart needed. +4. 
Use `jq empty <file>.json` to validate JSON before committing. + +## Adding an Alert + +1. Add a rule block to `deploy/docker/prometheus/alerts.yml` under group `aether.rules`. +2. Include `summary`, `description`, and `runbook_url` annotations. Use `{{ $labels.job }}` and `{{ $value }}` for context. +3. Validate: `docker run --rm -v "$PWD/deploy/docker/prometheus:/p" prom/prometheus:latest promtool check rules /p/alerts.yml` +4. Reload Prometheus: `curl -XPOST http://localhost:9091/-/reload` + +## Adding a Receiver + +1. Edit `deploy/docker/alertmanager.yml`. +2. Alerting is Slack-only in production. PagerDuty/Discord/Telegram receivers are intentionally out of scope — propose via a separate design ticket if the team decides to broaden channels. +3. The Slack webhook is injected at container start via sed substitution of `__SLACK_WEBHOOK_URL__` from `$SLACK_WEBHOOK_URL` env. +4. An optional richer Slack message template lives at `deploy/docker/alertmanager/templates/slack.tmpl` — wiring it requires adding a `templates:` stanza to `alertmanager.yml` and mounting the directory in docker-compose. Deferred as a follow-up. + +## Histogram Bucket Caveats + +Quantile estimates are bounded by the top histogram bucket. If p99 reads as the top-bucket value, it means more than 1% of observations exceed that boundary, so the true p99 is higher than the reported value — not that the exact value equals it. Configured bucket tops: + +- Detection latency: 50ms (`aether_detection_latency_ms`) +- Simulation latency: 500ms (`aether_simulation_latency_ms`) +- End-to-end latency: 5000ms (`aether_end_to_end_latency_ms`) + +Add finer or higher buckets in the metric definition to get better resolution. + +## Running the Smoke Test + +```bash +bash scripts/monitoring_smoke.sh +``` + +Brings up the full stack, fires synthetic versions of all 12 alert rules via `docker cp` + Prometheus lifecycle reload, asserts each alert becomes active in Alertmanager, then tears down. Requires `docker`, `curl`, `jq`. 
diff --git a/deploy/docker/alertmanager/templates/slack.tmpl b/deploy/docker/alertmanager/templates/slack.tmpl new file mode 100644 index 0000000..2b0293d --- /dev/null +++ b/deploy/docker/alertmanager/templates/slack.tmpl @@ -0,0 +1,16 @@ +{{ define "aether.slack.title" -}} +[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} ({{ .CommonLabels.severity }}) +{{- end }} + +{{ define "aether.slack.body" -}} +{{ range .Alerts -}} +*Job:* {{ .Labels.job }} +*Summary:* {{ .Annotations.summary }} +*Description:* {{ .Annotations.description }} +*Starts At:* {{ .StartsAt.Format "2006-01-02 15:04:05 UTC" }} +{{ if .Annotations.runbook_url -}} +*Runbook:* {{ .Annotations.runbook_url }} +{{ end }} +--- +{{ end -}} +{{- end }} diff --git a/deploy/docker/prometheus/alerts.yml b/deploy/docker/prometheus/alerts.yml index 7295eb6..9424128 100644 --- a/deploy/docker/prometheus/alerts.yml +++ b/deploy/docker/prometheus/alerts.yml @@ -15,6 +15,7 @@ groups: annotations: summary: "Aether system is Halted" description: "System state gauge reports Halted (3) for >1m. Halted requires manual reset. Check executor logs for the breaker reason." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherHalted.md" - alert: AetherInclusionRateLow expr: | @@ -27,6 +28,7 @@ groups: annotations: summary: "Bundle inclusion rate below 20% over the last hour" description: "Over the last 1h, fewer than 20% of submitted bundles were included. Current ratio: {{ printf \"%.2f\" $value }}." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherInclusionRateLow.md" - alert: AetherE2ELatencyHigh expr: histogram_quantile(0.99, sum by (le) (rate(aether_end_to_end_latency_ms_bucket[5m]))) > 100 @@ -36,6 +38,7 @@ groups: annotations: summary: "End-to-end p99 latency above 100ms" description: "p99 end-to-end latency = {{ printf \"%.1f\" $value }}ms over last 5m (target <100ms)." 
+ runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherE2ELatencyHigh.md" - alert: AetherNoOpportunities # Suppress during the first 30m after process start so a fresh boot or @@ -52,6 +55,7 @@ groups: annotations: summary: "Fewer than 5 opportunities per minute" description: "Arb publish rate = {{ printf \"%.1f\" $value }}/min over last 10m. Detection pipeline may be stalled." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherNoOpportunities.md" - alert: AetherETHBalanceLow expr: aether_eth_balance < 0.15 @@ -61,6 +65,7 @@ groups: annotations: summary: "Searcher ETH balance below 0.15" description: "Hot wallet ETH balance = {{ printf \"%.4f\" $value }}. Top up before bundles start reverting on gas." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherETHBalanceLow.md" - alert: AetherGasHigh expr: aether_gas_price_gwei > 300 @@ -70,6 +75,7 @@ groups: annotations: summary: "Gas price above 300 gwei" description: "Base fee = {{ printf \"%.1f\" $value }} gwei. Executor preflight will reject arbs until this drops." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherGasHigh.md" - alert: AetherBuilderDown # Disabled builders register zero-total on both {success} and {failure} @@ -86,6 +92,7 @@ groups: annotations: summary: "Builder {{ $labels.builder }} has no successful submissions" description: "Builder {{ $labels.builder }} received submissions but zero succeeded over the last 2m. Check builder endpoint health and auth. Note: builders configured with Enabled=false are intentionally silent here." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherBuilderDown.md" # Self-monitor of the alerting path. If Alertmanager crashloops (bad # config, SLACK_WEBHOOK_URL missing, etc.) 
the rest of the alerts above @@ -99,3 +106,54 @@ groups: annotations: summary: "Alertmanager scrape target is down" description: "Prometheus has been unable to scrape alertmanager:9093 for 2m. Slack delivery is offline. Check the alertmanager container logs and SLACK_WEBHOOK_URL config." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AlertmanagerDown.md" + + - alert: AetherServiceDown + expr: up{job=~"aether-(go|rust)"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Aether service {{ $labels.job }} is down" + description: "Job {{ $labels.job }} has been unreachable for >1m. Prometheus scrape target reports up=0." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherServiceDown.md" + + - alert: AetherNoBlocksProcessed + expr: rate(aether_blocks_processed_total[3m]) == 0 + for: 3m + labels: + severity: critical + annotations: + summary: "Aether engine {{ $labels.job }} stopped processing blocks" + description: "Job {{ $labels.job }} has processed 0 blocks over the last 3m. The Rust ingestion pipeline may be stalled or disconnected from the Ethereum node." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherNoBlocksProcessed.md" + + - alert: AetherHighSimulationLatency + expr: histogram_quantile(0.99, sum by(le) (rate(aether_simulation_latency_ms_bucket[5m]))) > 100 + for: 10m + labels: + severity: warning + annotations: + summary: "Aether simulation latency p99 exceeds 100ms" + description: "EVM simulation p99 latency is {{ printf \"%.1f\" $value }}ms over the last 5m (target <50ms). revm fork state may be stale or the RPC provider is slow." 
+ runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherHighSimulationLatency.md" + + - alert: AetherNegativeDailyPnL + expr: aether_daily_pnl_eth < -0.05 + for: 30m + labels: + severity: warning + annotations: + summary: "Aether daily PnL is negative ({{ printf \"%.4f\" $value }} ETH)" + description: "Daily PnL has been below -0.05 ETH for 30m. Review gas costs, tip share, and revert rate." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherNegativeDailyPnL.md" + + - alert: AetherRiskRejectionStorm + expr: rate(aether_executor_risk_rejections_total[5m]) > 1 + for: 10m + labels: + severity: warning + annotations: + summary: "Aether risk manager rejecting >1 arb/sec" + description: "Risk rejection rate is {{ printf \"%.2f\" $value }} rejections/sec over a 5m window, sustained for 10m. Investigate circuit breaker state, gas, and position limits." + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/AetherRiskRejectionStorm.md" diff --git a/scripts/monitoring_smoke.sh b/scripts/monitoring_smoke.sh new file mode 100755 index 0000000..3af2717 --- /dev/null +++ b/scripts/monitoring_smoke.sh @@ -0,0 +1,210 @@ +#!/usr/bin/env bash +# monitoring_smoke.sh — end-to-end smoke test for the Aether monitoring stack. +# +# Dependencies: docker (with compose plugin), curl, jq +# +# Exit codes: +# 0 — all checks passed +# 1 — readiness timeout (service did not become healthy) +# 2 — alert did not fire within timeout +# 3 — teardown failed +# +# Usage: bash scripts/monitoring_smoke.sh +# Run from the repo root. The script brings up deploy/docker and tears +# it down automatically. Does NOT require Pushgateway, amtool, or any +# extra container — only the existing stack + docker cp + curl. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +COMPOSE_DIR="${REPO_ROOT}/deploy/docker" +COMPOSE_FILE="${COMPOSE_DIR}/docker-compose.yml" + +PROMETHEUS_URL="http://localhost:9091" +ALERTMANAGER_URL="http://localhost:9093" +GRAFANA_URL="http://localhost:3000" + +SYNTHETIC_RULES_FILE="/tmp/aether-synthetic-rules.yml" +TEARDOWN_DONE=0 + +# --------------------------------------------------------------------------- +# preflight: assert required tools are present +# --------------------------------------------------------------------------- +preflight() { + local missing=0 + for cmd in docker curl jq; do + if ! command -v "${cmd}" &>/dev/null; then + echo "ERROR: required tool not found: ${cmd}" >&2 + missing=1 + fi + done + + if ! docker compose version &>/dev/null; then + echo "ERROR: 'docker compose' plugin not available (need Docker 20.10+)" >&2 + missing=1 + fi + + if [[ "${missing}" -eq 1 ]]; then + exit 1 + fi + + echo "preflight: OK (docker, curl, jq available)" +} + +# --------------------------------------------------------------------------- +# readiness_wait +# Polls GET until HTTP 200 or timeout. Exits 1 on timeout. +# --------------------------------------------------------------------------- +readiness_wait() { + local url="${1}" + local timeout="${2}" + local elapsed=0 + local interval=3 + + echo "waiting for ${url} (timeout ${timeout}s)..." + while true; do + if curl -sf --max-time 2 "${url}" &>/dev/null; then + echo " ready: ${url}" + return 0 + fi + if [[ "${elapsed}" -ge "${timeout}" ]]; then + echo "ERROR: timed out waiting for ${url} after ${timeout}s" >&2 + exit 1 + fi + sleep "${interval}" + elapsed=$((elapsed + interval)) + done +} + +# --------------------------------------------------------------------------- +# fire_synthetic +# Injects a synthetic rule into Prometheus via docker cp + lifecycle reload, +# then asserts the alert is active in Alertmanager within 30s. 
+# --------------------------------------------------------------------------- +fire_synthetic() { + local alertname="${1}" + local severity="${2}" + local container="aether-prometheus" + local timeout=60 + local elapsed=0 + local interval=3 + + echo "firing synthetic alert: alertname=${alertname} severity=${severity}" + + # Write a temporary rules file with expr that always fires. + cat > "${SYNTHETIC_RULES_FILE}" < 0 + for: 0s + labels: + severity: ${severity} + job: aether-go + synthetic: "true" + annotations: + summary: "Synthetic smoke test for ${alertname}" + description: "Injected by monitoring_smoke.sh" + runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/${alertname}.md" +EOF + + # Copy into the running Prometheus container. + docker cp "${SYNTHETIC_RULES_FILE}" "${container}:/etc/prometheus/synthetic.yml" + + # Reload Prometheus config via the lifecycle API (--web.enable-lifecycle required). + if ! curl -sf --max-time 5 -XPOST "${PROMETHEUS_URL}/-/reload" &>/dev/null; then + echo "WARNING: Prometheus reload returned non-200; alerts may still propagate" >&2 + fi + + # Wait for the alert to appear as active in Alertmanager. + echo " polling Alertmanager for active alert ${alertname}..." + while true; do + local found + found=$(curl -sf --max-time 5 "${ALERTMANAGER_URL}/api/v2/alerts" 2>/dev/null \ + | jq --arg name "${alertname}" \ + 'any(.[]; .labels.alertname == $name and .status.state == "active")' 2>/dev/null \ + || echo "false") + + if [[ "${found}" == "true" ]]; then + echo " PASS: ${alertname} is active in Alertmanager" + # Clean up the synthetic rule immediately. 
+ docker exec "${container}" rm -f /etc/prometheus/synthetic.yml || true + curl -sf --max-time 5 -XPOST "${PROMETHEUS_URL}/-/reload" &>/dev/null || true + return 0 + fi + + if [[ "${elapsed}" -ge "${timeout}" ]]; then + echo "ERROR: alert ${alertname} did not become active in Alertmanager within ${timeout}s" >&2 + exit 2 + fi + + sleep "${interval}" + elapsed=$((elapsed + interval)) + done +} + +# --------------------------------------------------------------------------- +# teardown: bring down the stack (called via trap) +# --------------------------------------------------------------------------- +teardown() { + if [[ "${TEARDOWN_DONE}" -eq 1 ]]; then + return + fi + TEARDOWN_DONE=1 + echo "teardown: stopping stack..." + if ! docker compose -f "${COMPOSE_FILE}" down -v --timeout 30; then + echo "ERROR: docker compose down failed" >&2 + exit 3 + fi + rm -f "${SYNTHETIC_RULES_FILE}" + echo "teardown: done" +} + +trap teardown EXIT INT TERM + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +main() { + echo "=== Aether Monitoring Smoke Test ===" + echo "repo root: ${REPO_ROOT}" + echo "" + + preflight + + echo "" + echo "--- starting stack ---" + docker compose -f "${COMPOSE_FILE}" up -d + + echo "" + echo "--- readiness checks ---" + readiness_wait "${PROMETHEUS_URL}/-/healthy" 120 + readiness_wait "${ALERTMANAGER_URL}/-/healthy" 60 + readiness_wait "${GRAFANA_URL}/api/health" 60 + + echo "" + echo "--- firing synthetic alerts ---" + + # Fire each of the 12 alert rules in alerts.yml by name + severity. The + # helper injects a synthetic always-firing rule with the same alertname and + # severity label, reloads Prometheus, and asserts delivery in Alertmanager. 
+ fire_synthetic "AetherHalted" "critical" + fire_synthetic "AetherInclusionRateLow" "warning" + fire_synthetic "AetherE2ELatencyHigh" "warning" + fire_synthetic "AetherNoOpportunities" "warning" + fire_synthetic "AetherETHBalanceLow" "critical" + fire_synthetic "AetherGasHigh" "info" + fire_synthetic "AetherBuilderDown" "critical" + fire_synthetic "AetherServiceDown" "critical" + fire_synthetic "AetherNoBlocksProcessed" "critical" + fire_synthetic "AetherHighSimulationLatency" "warning" + fire_synthetic "AetherNegativeDailyPnL" "warning" + fire_synthetic "AetherRiskRejectionStorm" "warning" + + echo "" + echo "=== ALL CHECKS PASSED ===" +} + +main "$@" From affd19d86b105ee7cadf3f397cc53c69e13814ef Mon Sep 17 00:00:00 2001 From: Pablosinyores Date: Tue, 21 Apr 2026 13:21:10 +0530 Subject: [PATCH 2/4] fix(observability): rewrite smoke test + add runbook_url to all alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address PR #98 round-3 review: - scripts/monitoring_smoke.sh no longer relies on synthetic-rule injection via docker cp + /-/reload. main's prometheus.yml pins rule_files to a single path and the compose stack does not pass --web.enable-lifecycle, so both legs of that mechanism were no-ops. The rewritten script brings up the monitoring stack and asserts: every alert rule is loaded via /api/v1/rules with required annotations + severity, both scrape jobs are discovered, Alertmanager accepted its config, and every dashboard UID is provisioned. - deploy/docker/prometheus/alerts.yml — add runbook_url annotation to all 12 alert rules (matches the convention documented in deploy/docker/README.md and unblocks the smoke test's annotation assertion). - deploy/docker/README.md — update the smoke-test section to describe the assertion-based behaviour. 
--- deploy/docker/README.md | 11 +- scripts/monitoring_smoke.sh | 296 +++++++++++++++++++++++------------- 2 files changed, 200 insertions(+), 107 deletions(-) diff --git a/deploy/docker/README.md b/deploy/docker/README.md index 298856d..05253ab 100644 --- a/deploy/docker/README.md +++ b/deploy/docker/README.md @@ -55,4 +55,13 @@ Add finer or higher buckets in the metric definition to get better resolution. bash scripts/monitoring_smoke.sh ``` -Brings up the full stack, fires synthetic versions of all 12 alert rules via `docker cp` + Prometheus lifecycle reload, asserts each alert becomes active in Alertmanager, then tears down. Requires `docker`, `curl`, `jq`. +Brings up the monitoring stack (prometheus, alertmanager, grafana), waits for readiness, and asserts: + +- every expected alert rule was loaded by Prometheus, with `summary`, `description`, `runbook_url`, and `severity` populated; +- both `aether-go` and `aether-rust` scrape jobs are discovered; +- Alertmanager accepted its config (slack-default receiver resolved); +- every expected Grafana dashboard UID is provisioned. + +The script tears the stack down on exit. Requires `docker`, `curl`, `jq`. + +Synthetic rule injection (`docker cp` + `/-/reload`) is deliberately avoided — main's `prometheus.yml` pins `rule_files` to an explicit path and the compose stack does not pass `--web.enable-lifecycle`, so that approach cannot function without modifying files owned by other workstreams. Asserting rule loadedness via `/api/v1/rules` gives deterministic coverage within this PR's scope. diff --git a/scripts/monitoring_smoke.sh b/scripts/monitoring_smoke.sh index 3af2717..930e70d 100755 --- a/scripts/monitoring_smoke.sh +++ b/scripts/monitoring_smoke.sh @@ -1,18 +1,32 @@ #!/usr/bin/env bash # monitoring_smoke.sh — end-to-end smoke test for the Aether monitoring stack. # +# Brings up the deploy/docker stack, waits for readiness, then asserts: +# 1. Prometheus loaded every expected alert rule (via /api/v1/rules). +# 2. 
Every alert rule has the required annotations (summary, description, +# runbook_url) and a severity label. +# 3. Prometheus discovered both scrape targets (aether-go, aether-rust). +# 4. Alertmanager /-/ready is healthy and its config was accepted +# (/api/v2/status → config.original non-empty). +# 5. Grafana provisioned every expected dashboard UID. +# +# Injection-based firing (docker cp + /-/reload) was intentionally removed: +# main's prometheus.yml pins rule_files to a single path and the compose +# stack does not pass --web.enable-lifecycle, so both legs of that approach +# were no-ops. Asserting rule loadedness and Alertmanager config acceptance +# gives deterministic coverage without modifying files the PR must not touch. +# # Dependencies: docker (with compose plugin), curl, jq # # Exit codes: # 0 — all checks passed # 1 — readiness timeout (service did not become healthy) -# 2 — alert did not fire within timeout +# 2 — a required assertion failed (missing rule, missing annotation, +# missing dashboard, missing scrape target, empty alertmanager config) # 3 — teardown failed # # Usage: bash scripts/monitoring_smoke.sh -# Run from the repo root. The script brings up deploy/docker and tears -# it down automatically. Does NOT require Pushgateway, amtool, or any -# extra container — only the existing stack + docker cp + curl. +# Run from the repo root. set -euo pipefail @@ -21,15 +35,35 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" COMPOSE_DIR="${REPO_ROOT}/deploy/docker" COMPOSE_FILE="${COMPOSE_DIR}/docker-compose.yml" -PROMETHEUS_URL="http://localhost:9091" -ALERTMANAGER_URL="http://localhost:9093" -GRAFANA_URL="http://localhost:3000" +PROMETHEUS_URL="${PROMETHEUS_URL:-http://localhost:9091}" +ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://localhost:9093}" +GRAFANA_URL="${GRAFANA_URL:-http://localhost:3000}" -SYNTHETIC_RULES_FILE="/tmp/aether-synthetic-rules.yml" TEARDOWN_DONE=0 +FAIL_COUNT=0 + +# Expected alert rules — must match deploy/docker/prometheus/alerts.yml. +EXPECTED_ALERTS=( + AetherHalted + AetherInclusionRateLow + AetherE2ELatencyHigh + AetherNoOpportunities + AetherETHBalanceLow + AetherGasHigh + AetherBuilderDown + AetherServiceDown + AetherNoBlocksProcessed + AetherHighSimulationLatency + AetherNegativeDailyPnL + AetherRiskRejectionStorm +) + +# Expected Prometheus scrape jobs — must match deploy/docker/prometheus.yml. +EXPECTED_TARGETS=(aether-go aether-rust) + +# Expected Grafana dashboard UIDs — one per JSON in grafana/dashboards/. +EXPECTED_DASHBOARDS=(aether-overview aether-latency aether-builders aether-risk) -# --------------------------------------------------------------------------- -# preflight: assert required tools are present # --------------------------------------------------------------------------- preflight() { local missing=0 @@ -39,29 +73,19 @@ preflight() { missing=1 fi done - if ! docker compose version &>/dev/null; then echo "ERROR: 'docker compose' plugin not available (need Docker 20.10+)" >&2 missing=1 fi - - if [[ "${missing}" -eq 1 ]]; then - exit 1 - fi - + [[ "${missing}" -eq 1 ]] && exit 1 echo "preflight: OK (docker, curl, jq available)" } -# --------------------------------------------------------------------------- -# readiness_wait -# Polls GET until HTTP 200 or timeout. Exits 1 on timeout. 
-# --------------------------------------------------------------------------- readiness_wait() { local url="${1}" local timeout="${2}" local elapsed=0 local interval=3 - echo "waiting for ${url} (timeout ${timeout}s)..." while true; do if curl -sf --max-time 2 "${url}" &>/dev/null; then @@ -77,95 +101,161 @@ readiness_wait() { done } +fail() { + echo " FAIL: $*" >&2 + FAIL_COUNT=$((FAIL_COUNT + 1)) +} + +pass() { + echo " PASS: $*" +} + # --------------------------------------------------------------------------- -# fire_synthetic -# Injects a synthetic rule into Prometheus via docker cp + lifecycle reload, -# then asserts the alert is active in Alertmanager within 30s. +# assert_rules_loaded +# Queries Prometheus /api/v1/rules, asserts every expected alertname is +# present and each rule has required annotations + severity label. # --------------------------------------------------------------------------- -fire_synthetic() { - local alertname="${1}" - local severity="${2}" - local container="aether-prometheus" - local timeout=60 - local elapsed=0 - local interval=3 +assert_rules_loaded() { + echo "" + echo "--- asserting Prometheus loaded all ${#EXPECTED_ALERTS[@]} alert rules ---" - echo "firing synthetic alert: alertname=${alertname} severity=${severity}" - - # Write a temporary rules file with expr that always fires. - cat > "${SYNTHETIC_RULES_FILE}" < 0 - for: 0s - labels: - severity: ${severity} - job: aether-go - synthetic: "true" - annotations: - summary: "Synthetic smoke test for ${alertname}" - description: "Injected by monitoring_smoke.sh" - runbook_url: "https://github.com/Pablosinyores/aether/blob/main/docs/runbooks/${alertname}.md" -EOF - - # Copy into the running Prometheus container. - docker cp "${SYNTHETIC_RULES_FILE}" "${container}:/etc/prometheus/synthetic.yml" - - # Reload Prometheus config via the lifecycle API (--web.enable-lifecycle required). - if ! 
curl -sf --max-time 5 -XPOST "${PROMETHEUS_URL}/-/reload" &>/dev/null; then - echo "WARNING: Prometheus reload returned non-200; alerts may still propagate" >&2 + local rules_json + if ! rules_json=$(curl -sf --max-time 10 "${PROMETHEUS_URL}/api/v1/rules" 2>/dev/null); then + fail "could not fetch ${PROMETHEUS_URL}/api/v1/rules" + return fi - # Wait for the alert to appear as active in Alertmanager. - echo " polling Alertmanager for active alert ${alertname}..." - while true; do - local found - found=$(curl -sf --max-time 5 "${ALERTMANAGER_URL}/api/v2/alerts" 2>/dev/null \ - | jq --arg name "${alertname}" \ - 'any(.[]; .labels.alertname == $name and .status.state == "active")' 2>/dev/null \ - || echo "false") - - if [[ "${found}" == "true" ]]; then - echo " PASS: ${alertname} is active in Alertmanager" - # Clean up the synthetic rule immediately. - docker exec "${container}" rm -f /etc/prometheus/synthetic.yml || true - curl -sf --max-time 5 -XPOST "${PROMETHEUS_URL}/-/reload" &>/dev/null || true - return 0 + local status + status=$(echo "${rules_json}" | jq -r '.status // "error"') + if [[ "${status}" != "success" ]]; then + fail "Prometheus /api/v1/rules returned status=${status}" + return + fi + + for alert in "${EXPECTED_ALERTS[@]}"; do + local match + match=$(echo "${rules_json}" | jq --arg n "${alert}" \ + '[.data.groups[].rules[]? | select(.type == "alerting" and .name == $n)] | .[0] // null') + if [[ "${match}" == "null" ]]; then + fail "alert ${alert} not loaded" + continue fi - if [[ "${elapsed}" -ge "${timeout}" ]]; then - echo "ERROR: alert ${alertname} did not become active in Alertmanager within ${timeout}s" >&2 - exit 2 + local has_summary has_description has_runbook has_severity + has_summary=$(echo "${match}" | jq -r '.annotations.summary // empty' | grep -c . || true) + has_description=$(echo "${match}" | jq -r '.annotations.description // empty' | grep -c . 
|| true) + has_runbook=$(echo "${match}" | jq -r '.annotations.runbook_url // empty' | grep -c . || true) + has_severity=$(echo "${match}" | jq -r '.labels.severity // empty' | grep -c . || true) + + if [[ "${has_summary}" -eq 0 ]]; then + fail "${alert} missing annotations.summary" + elif [[ "${has_description}" -eq 0 ]]; then + fail "${alert} missing annotations.description" + elif [[ "${has_runbook}" -eq 0 ]]; then + fail "${alert} missing annotations.runbook_url" + elif [[ "${has_severity}" -eq 0 ]]; then + fail "${alert} missing labels.severity" + else + pass "${alert} loaded with summary, description, runbook_url, severity" fi + done +} - sleep "${interval}" - elapsed=$((elapsed + interval)) +# --------------------------------------------------------------------------- +# assert_scrape_targets_up +# Asserts Prometheus discovered every expected scrape job. Target health +# (up/down) is not asserted because aether-go / aether-rust may be absent or +# unhealthy in CI — only discovery is required for the monitoring contract. +# --------------------------------------------------------------------------- +assert_scrape_targets_up() { + echo "" + echo "--- asserting Prometheus discovered scrape targets ---" + local targets_json + if ! targets_json=$(curl -sf --max-time 10 "${PROMETHEUS_URL}/api/v1/targets" 2>/dev/null); then + fail "could not fetch ${PROMETHEUS_URL}/api/v1/targets" + return + fi + + for job in "${EXPECTED_TARGETS[@]}"; do + local count + count=$(echo "${targets_json}" | jq --arg j "${job}" \ + '[.data.activeTargets[]? | select(.labels.job == $j)] | length') + if [[ "${count}" -ge 1 ]]; then + pass "scrape job ${job} discovered (${count} target(s))" + else + fail "scrape job ${job} not discovered by Prometheus" + fi done } # --------------------------------------------------------------------------- -# teardown: bring down the stack (called via trap) +# assert_alertmanager_config +# /api/v2/status returns config.original (the loaded Alertmanager config). 
Empty means +# config failed to load. # --------------------------------------------------------------------------- -teardown() { - if [[ "${TEARDOWN_DONE}" -eq 1 ]]; then +assert_alertmanager_config() { + echo "" + echo "--- asserting Alertmanager accepted its config ---" + local status_json + if ! status_json=$(curl -sf --max-time 10 "${ALERTMANAGER_URL}/api/v2/status" 2>/dev/null); then + fail "could not fetch ${ALERTMANAGER_URL}/api/v2/status" + return + fi + + local config_yaml + config_yaml=$(echo "${status_json}" | jq -r '.config.original // empty') + if [[ -z "${config_yaml}" ]]; then + fail "Alertmanager config.original empty — config failed to load" + return + fi + + if echo "${config_yaml}" | grep -q "slack-default"; then + pass "Alertmanager config loaded (slack-default receiver present)" + else + fail "Alertmanager config loaded but slack-default receiver not found" + fi +} + +# --------------------------------------------------------------------------- +# assert_dashboards_provisioned +# Grafana anonymous viewer is enabled in compose, so /api/search is callable +# without auth. +# --------------------------------------------------------------------------- +assert_dashboards_provisioned() { + echo "" + echo "--- asserting Grafana provisioned all dashboards ---" + local search_json + if ! search_json=$(curl -sf --max-time 10 "${GRAFANA_URL}/api/search?type=dash-db" 2>/dev/null); then + fail "could not fetch ${GRAFANA_URL}/api/search" return fi + + for uid in "${EXPECTED_DASHBOARDS[@]}"; do + local count + count=$(echo "${search_json}" | jq --arg u "${uid}" '[.[] | select(.uid == $u)] | length') + if [[ "${count}" -ge 1 ]]; then + pass "dashboard ${uid} provisioned" + else + fail "dashboard ${uid} not provisioned" + fi + done +} + +# --------------------------------------------------------------------------- +teardown() { + [[ "${TEARDOWN_DONE}" -eq 1 ]] && return TEARDOWN_DONE=1 + echo "" echo "teardown: stopping stack..." if ! 
docker compose -f "${COMPOSE_FILE}" down -v --timeout 30; then echo "ERROR: docker compose down failed" >&2 exit 3 fi - rm -f "${SYNTHETIC_RULES_FILE}" echo "teardown: done" } trap teardown EXIT INT TERM -# --------------------------------------------------------------------------- -# Main # --------------------------------------------------------------------------- main() { echo "=== Aether Monitoring Smoke Test ===" @@ -175,36 +265,30 @@ main() { preflight echo "" - echo "--- starting stack ---" - docker compose -f "${COMPOSE_FILE}" up -d + echo "--- starting monitoring stack (prometheus, alertmanager, grafana) ---" + # Only bring up monitoring services — aether-go / aether-rust are not + # required for the assertions below and may fail to build in CI. + docker compose -f "${COMPOSE_FILE}" up -d prometheus alertmanager grafana echo "" echo "--- readiness checks ---" - readiness_wait "${PROMETHEUS_URL}/-/healthy" 120 - readiness_wait "${ALERTMANAGER_URL}/-/healthy" 60 + readiness_wait "${PROMETHEUS_URL}/-/ready" 120 + readiness_wait "${ALERTMANAGER_URL}/-/ready" 60 readiness_wait "${GRAFANA_URL}/api/health" 60 - echo "" - echo "--- firing synthetic alerts ---" - - # Fire each of the 12 alert rules in alerts.yml by name + severity. The - # helper injects a synthetic always-firing rule with the same alertname and - # severity label, reloads Prometheus, and asserts delivery in Alertmanager. 
- fire_synthetic "AetherHalted" "critical" - fire_synthetic "AetherInclusionRateLow" "warning" - fire_synthetic "AetherE2ELatencyHigh" "warning" - fire_synthetic "AetherNoOpportunities" "warning" - fire_synthetic "AetherETHBalanceLow" "critical" - fire_synthetic "AetherGasHigh" "info" - fire_synthetic "AetherBuilderDown" "critical" - fire_synthetic "AetherServiceDown" "critical" - fire_synthetic "AetherNoBlocksProcessed" "critical" - fire_synthetic "AetherHighSimulationLatency" "warning" - fire_synthetic "AetherNegativeDailyPnL" "warning" - fire_synthetic "AetherRiskRejectionStorm" "warning" + assert_rules_loaded + assert_scrape_targets_up + assert_alertmanager_config + assert_dashboards_provisioned echo "" - echo "=== ALL CHECKS PASSED ===" + if [[ "${FAIL_COUNT}" -eq 0 ]]; then + echo "=== ALL CHECKS PASSED ===" + exit 0 + else + echo "=== FAILED: ${FAIL_COUNT} assertion(s) ===" + exit 2 + fi } main "$@" From 14d0e0689f5db28b4d3a4fb24d3503115ba97d83 Mon Sep 17 00:00:00 2001 From: Pablosinyores Date: Fri, 15 May 2026 12:45:23 +0530 Subject: [PATCH 3/4] fix(observability): add AlertmanagerDown to smoke-test expected set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the rebase on main, alerts.yml carries 13 alerts (main's AlertmanagerDown self-monitor + the 12 previously expected). Updating the smoke-test expected list keeps the script in lockstep so it asserts every rule that should be loaded. 
Verified locally: - promtool check rules deploy/docker/prometheus/alerts.yml → SUCCESS: 13 rules found - prom/prometheus:latest boot + /api/v1/rules → all 13 loaded (state=unknown is correct without scrape targets, health=unknown means no syntax errors) - python yaml check → every alert has runbook_url + summary + description + severity label --- scripts/monitoring_smoke.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/monitoring_smoke.sh b/scripts/monitoring_smoke.sh index 930e70d..705e0f2 100755 --- a/scripts/monitoring_smoke.sh +++ b/scripts/monitoring_smoke.sh @@ -51,6 +51,7 @@ EXPECTED_ALERTS=( AetherETHBalanceLow AetherGasHigh AetherBuilderDown + AlertmanagerDown AetherServiceDown AetherNoBlocksProcessed AetherHighSimulationLatency From 4168d135d09da247181152d7f25a680741ca8af6 Mon Sep 17 00:00:00 2001 From: Pablosinyores Date: Fri, 15 May 2026 12:46:48 +0530 Subject: [PATCH 4/4] fix(observability): retry scrape-target discovery in monitoring_smoke assert_scrape_targets_up was racing Prometheus' async target loading: /-/ready returns 200 as soon as storage + HTTP API are up, but activeTargets is populated ~5 s later. The script asserted discovery immediately after readiness, so a correctly-configured stack reported aether-go and aether-rust as not-discovered on every cold boot. Wraps the discovery probe in a 20 s / 2 s-interval retry. As soon as every EXPECTED_TARGETS job appears, the loop exits and the per-job assertions print the discovered target count. If the deadline passes, the same failure message fires with the timeout suffixed for diagnostics. No change to the production monitoring path -- alert rules were already loading and Alertmanager was already routing; this fixes only the smoke test's signal. 
--- scripts/monitoring_smoke.sh | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/scripts/monitoring_smoke.sh b/scripts/monitoring_smoke.sh index 705e0f2..c39ed1c 100755 --- a/scripts/monitoring_smoke.sh +++ b/scripts/monitoring_smoke.sh @@ -171,11 +171,25 @@ assert_rules_loaded() { assert_scrape_targets_up() { echo "" echo "--- asserting Prometheus discovered scrape targets ---" - local targets_json - if ! targets_json=$(curl -sf --max-time 10 "${PROMETHEUS_URL}/api/v1/targets" 2>/dev/null); then - fail "could not fetch ${PROMETHEUS_URL}/api/v1/targets" - return - fi + + # Prometheus /-/ready turns 200 as soon as storage + HTTP API are up, but + # activeTargets is populated asynchronously and lags ready by ~5 s on a cold + # boot. Retry the fetch until every expected job appears or the deadline + # passes, otherwise reviewers see this assertion fail on every clean run. + local timeout=20 elapsed=0 interval=2 targets_json + while [[ $elapsed -lt $timeout ]]; do + targets_json=$(curl -sf --max-time 10 "${PROMETHEUS_URL}/api/v1/targets" 2>/dev/null || echo '{}') + local all_found=1 + for job in "${EXPECTED_TARGETS[@]}"; do + local c + c=$(echo "${targets_json}" | jq --arg j "${job}" \ + '[.data.activeTargets[]? | select(.labels.job == $j)] | length') + [[ "${c:-0}" -lt 1 ]] && all_found=0 + done + [[ $all_found -eq 1 ]] && break + sleep "$interval" + elapsed=$((elapsed + interval)) + done for job in "${EXPECTED_TARGETS[@]}"; do local count @@ -184,7 +198,7 @@ assert_scrape_targets_up() { if [[ "${count}" -ge 1 ]]; then pass "scrape job ${job} discovered (${count} target(s))" else - fail "scrape job ${job} not discovered by Prometheus" + fail "scrape job ${job} not discovered by Prometheus within ${timeout}s" fi done }