Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ env:
RESULT_DIR: /workspace/results
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
# SWE-bench scoring runs on Modal (GPU runners have no Docker); creds are
# bootstrapped into ~/.modal.toml by benchmark_lib.sh:_ensure_modal_credentials.
SWEBENCH_USE_MODAL: 'true'
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}

permissions:
contents: read
Expand Down
44 changes: 41 additions & 3 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
eval-config: ${{ steps.get-jobs.outputs.eval-config }}
multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
agentic-eval-config: ${{ steps.get-jobs.outputs.agentic-eval-config }}
multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
steps:
- name: Checkout code (ref)
Expand All @@ -69,13 +70,15 @@
pip install pydantic
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and not x.get('run-eval', False)]))")
AGENTIC_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and x.get('run-eval', False)]))")
MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
echo "agentic-eval-config=$AGENTIC_EVAL" >> $GITHUB_OUTPUT
echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -195,7 +198,42 @@
scenario-type: agentic-coding
ref: ${{ inputs.ref }}

# Eval sweep for agentic-coding configs: starts one benchmark-tmpl.yml run per
# entry of the get-jobs `agentic-eval-config` output.
test-sweep-agentic-evals:
needs: get-jobs
# Skip the job entirely when get-jobs emitted an empty list.
if: ${{ needs.get-jobs.outputs.agentic-eval-config != '[]' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: agentic eval /
strategy:
# Let the remaining matrix entries keep running when one of them fails.
fail-fast: false
matrix:
config: ${{ fromJson(needs.get-jobs.outputs.agentic-eval-config) }}
secrets: inherit
with:
exp-name: ${{ matrix.config.exp-name }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
offloading: ${{ matrix.config.offloading }}
# A non-empty duration-override input takes precedence over the per-config duration.
duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
# The next five inputs are hard-coded here instead of being read from matrix.config.
isl: '0'
osl: '0'
max-model-len: '0'
spec-decoding: 'none'
disagg: 'false'
# scenario-type agentic-coding => run_eval auto-selects swebench.
run-eval: true
# NOTE(review): presumably surfaces to the recipe scripts as EVAL_ONLY=true so
# they skip the agentic replay -- confirm against benchmark-tmpl.yml.
eval-only: true
scenario-type: agentic-coding
ref: ${{ inputs.ref }}

test-sweep-multi-node-agentic:

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
needs: get-jobs
if: ${{ needs.get-jobs.outputs.multi-node-agentic-config != '[]' }}
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
Expand Down Expand Up @@ -305,12 +343,12 @@
result-prefix: "bmk"

collect-evals:
needs: [test-sweep-evals, test-sweep-multi-node-evals]
if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
needs: [test-sweep-evals, test-sweep-multi-node-evals, test-sweep-agentic-evals]
if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped' || needs.test-sweep-agentic-evals.result != 'skipped') }}
uses: ./.github/workflows/collect-evals.yml
secrets: inherit

collect-agentic-results:

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
needs: [test-sweep-agentic, test-sweep-multi-node-agentic]
if: ${{ always() && (needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
runs-on: ubuntu-latest
Expand Down
162 changes: 160 additions & 2 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -966,21 +966,178 @@ META
echo "Moved eval artifacts to: $(pwd)"
}

# ------------------------------
# SWE-bench eval helpers
# ------------------------------

# Run the SWE-bench eval (SWE-bench Lite by default): generate patches with
# lm-eval, then score them with the official swebench harness, either in local
# Docker or on Modal remote sandboxes. lm-eval cannot score SWE-bench itself
# (no repo-level test executor), so we reuse it only for generation and emit an
# lm-eval-shaped results JSON from swebench_score.py so the rest of the
# pipeline (append_lm_eval_summary / collect / validate) is unchanged.
#
# Env knobs:
# SWEBENCH_TASK_NAME (default swebench_lite) selects utils/evals/<name>.yaml
# SWEBENCH_DATASET optional; must equal the YAML's dataset_path (the
# scoring dataset is derived from the YAML so generation
# and scoring never diverge) -- mismatch fails fast
# SWEBENCH_MAX_WORKERS (default 4) harness workers / Modal parallelism
# SWEBENCH_USE_MODAL "true" => score on Modal remote sandboxes instead of
# local Docker (no Docker needed on the node; requires a
# Modal account — credentials from ~/.modal.toml or from
# MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env vars, e.g. a
# GitHub secret; the env vars are bootstrapped into
# ~/.modal.toml automatically if the file is absent)
# SWEBENCH_NAMESPACE local-Docker only: pass "" on arm/Mac to build locally
# SWEBENCH_SKIP_SCORE "true" => generate + stage predictions only, no scoring
# (score elsewhere)
_install_swebench_deps() {
    # Packages the scoring step needs: swebench always, modal only when
    # scoring is routed to Modal (SWEBENCH_USE_MODAL=true).
    local wanted=(swebench)
    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
        wanted+=(modal)
    fi

    # Best-effort: each install failure is swallowed here (same approach as
    # _install_lm_eval_deps); a genuinely missing package shows up at scoring.
    local pkg
    for pkg in "${wanted[@]}"; do
        python3 -m pip install -q --no-cache-dir --break-system-packages "$pkg" || true
    done
}

# swebench's validate_modal_credentials() only checks that ~/.modal.toml
# exists; the modal package itself prefers MODAL_TOKEN_ID/MODAL_TOKEN_SECRET
# env vars (how CI passes the GitHub secret). Bootstrap a minimal file from
# the env so the harness's check passes. Never overwrite an existing file.
_ensure_modal_credentials() {
    # Bootstrap ~/.modal.toml from MODAL_TOKEN_ID / MODAL_TOKEN_SECRET.
    # No-op unless SWEBENCH_USE_MODAL=true; an existing file is never touched.
    # Always returns 0: problems are reported as warnings on stderr and the
    # real failure is left to surface at scoring time.
    if [ "${SWEBENCH_USE_MODAL:-false}" != "true" ]; then return 0; fi

    local modal_cfg="$HOME/.modal.toml"
    if [ -f "$modal_cfg" ]; then return 0; fi

    if [ -n "${MODAL_TOKEN_ID:-}" ] && [ -n "${MODAL_TOKEN_SECRET:-}" ]; then
        # Create the file with a restrictive umask so the secret is never
        # readable by other users, not even for the instant between the write
        # and the chmod. The subshell keeps the caller's umask untouched.
        if (
            umask 077
            printf '[default]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\n' \
                "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > "$modal_cfg"
        ); then
            chmod 600 "$modal_cfg"
            echo "[swebench] wrote ~/.modal.toml from MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env"
        else
            # Do not claim success when the write failed (e.g. read-only HOME).
            echo "WARN: could not write ${modal_cfg}; Modal scoring will fail credential validation" >&2
        fi
    else
        echo "WARN: SWEBENCH_USE_MODAL=true but no ~/.modal.toml and no MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env; Modal scoring will fail credential validation" >&2
    fi
    return 0
}

# Run the configured eval and stage its artifacts when RUN_EVAL is enabled.
# run_eval auto-selects the framework by scenario (agentic -> swebench,
# fixed-seq-len -> lm-eval), so recipes call this without naming a framework.
maybe_run_eval() {
    # $1 (optional): server port; falls back to $PORT, then to 8888.
    local port="${1:-${PORT:-8888}}"
    # Default RUN_EVAL to "false" so an unset variable is a clean no-op even
    # when the calling script runs under `set -u` (matches the ${VAR:-false}
    # pattern used for the other feature flags in this file).
    if [ "${RUN_EVAL:-false}" = "true" ]; then
        run_eval --port "$port"
        append_lm_eval_summary
    fi
}

# Generate SWE-bench patches with lm-eval, then score them with
# utils/evals/swebench_score.py.
#
# Arguments: forwarded verbatim to run_lm_eval (a --results-dir pointing at a
#            private temp dir is appended).
# Exports:   EVAL_RESULT_DIR (directory holding samples + scored results).
# Returns:   0 on success; 1 when the task YAML has no dataset_path or
#            SWEBENCH_DATASET disagrees with it; otherwise the exit status of
#            the failing generation / staging / scoring step.
run_swebench_eval() {
    local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
    local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}"
    local gen_dir
    gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)

    # Keep the scoring dataset in lockstep with the generation YAML: the harness
    # must score against the same instance set lm-eval generated patches for, or
    # the instance IDs won't match. Derive it from the task YAML; if
    # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score).
    local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
    local dataset
    # \042 and \047 are the double and single quote: a quoted YAML scalar
    # (dataset_path: "org/name") must yield the bare value.
    dataset=$(awk '/^dataset_path:[[:space:]]/{v=$2; gsub(/^[\042\047]|[\042\047]$/, "", v); print v; exit}' "$yaml_path" 2>/dev/null)
    if [ -z "$dataset" ]; then
        echo "ERROR: could not read dataset_path from ${yaml_path}" >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return 1
    fi
    if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then
        echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2
        echo "       Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return 1
    fi

    # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
    #    run_lm_eval already passes --log_samples, which is what we consume.
    #    EVAL_TASKS_DIR is overridden only for the duration of the call and then
    #    put back exactly as found (unset stays unset rather than becoming "").
    local had_tasks_dir="${EVAL_TASKS_DIR+set}"
    local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
    export EVAL_TASKS_DIR="$yaml_path"
    local gen_rc=0
    run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
    if [ -n "$had_tasks_dir" ]; then
        export EVAL_TASKS_DIR="$prev_tasks_dir"
    else
        unset EVAL_TASKS_DIR
    fi
    if [ "$gen_rc" -ne 0 ]; then
        echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2
        rm -rf "$gen_dir" 2>/dev/null || true
        return "$gen_rc"
    fi

    # Preserve generations as artifacts alongside the scored results.
    mkdir -p "$out_dir"
    find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true
    export EVAL_RESULT_DIR="$out_dir"

    local lm_eval_version
    lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown)

    if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then
        # Generation-only mode: emit predictions, defer Docker scoring elsewhere.
        # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner).
        local skip_rc=0
        python3 utils/evals/swebench_score.py \
            --samples-dir "$gen_dir" --out-dir "$out_dir" \
            --model-name "${MODEL_NAME:-$MODEL}" --task-name "$task_name" \
            --predictions-only || skip_rc=$?
        if [ "$skip_rc" -eq 0 ]; then
            echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2
        else
            echo "ERROR: swebench prediction staging failed with $skip_rc" >&2
        fi
        rm -rf "$gen_dir" 2>/dev/null || true
        return "$skip_rc"
    fi

    # 2. Score with the official swebench harness (local Docker, or Modal remote
    #    sandboxes when SWEBENCH_USE_MODAL=true) and emit the lm-eval-shaped JSON.
    if [ "${INFERENCEX_SWEBENCH_RUNTIME_READY:-false}" != "true" ]; then
        _install_swebench_deps
        export INFERENCEX_SWEBENCH_RUNTIME_READY=true
    fi
    _ensure_modal_credentials

    local score_args=(
        --samples-dir "$gen_dir"
        --out-dir "$out_dir"
        --model-name "${MODEL_NAME:-$MODEL}"
        --task-name "$task_name"
        --dataset-name "$dataset"
        --max-workers "${SWEBENCH_MAX_WORKERS:-4}"
        --lm-eval-version "$lm_eval_version"
    )
    # Only the literal value "true" enables Modal, matching every other check
    # of this flag; a bare ${VAR:+--modal} would also fire for "false".
    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
        score_args+=(--modal)
    fi
    # Forward the namespace whenever the variable is set, even to the empty
    # string: an empty namespace is a meaningful value (build images locally).
    if [ -n "${SWEBENCH_NAMESPACE+set}" ]; then
        score_args+=(--namespace "$SWEBENCH_NAMESPACE")
    fi

    local score_rc=0
    python3 utils/evals/swebench_score.py "${score_args[@]}" || score_rc=$?
    rm -rf "$gen_dir" 2>/dev/null || true
    if [ "$score_rc" -ne 0 ]; then
        echo "ERROR: swebench scoring failed with $score_rc" >&2
        return "$score_rc"
    fi
    return 0
}

# ------------------------------
# Unified eval entrypoint
# ------------------------------

run_eval() {
local framework="${EVAL_FRAMEWORK:-lm-eval}"
local cli_framework=""
local forwarded=()

while [[ $# -gt 0 ]]; do
case "$1" in
--framework) framework="$2"; shift 2 ;;
--framework) cli_framework="$2"; shift 2 ;;
*) forwarded+=("$1"); shift ;;
esac
done

# Eval framework is chosen by SCENARIO: agentic-coding configs run swebench;
# fixed-seq-len (8k1k/1k1k) run lm-eval/gsm8k -- agentic never runs gsm8k and
# 8k1k/1k1k never runs swebench. An explicit EVAL_FRAMEWORK env or --framework
# arg still overrides the scenario default (e.g. the recipes' `--framework
# lm-eval`, or a forced override).
local scenario_default="lm-eval"
if [ "${IS_AGENTIC:-0}" = "1" ] || [ "${SCENARIO_TYPE:-}" = "agentic-coding" ]; then
scenario_default="swebench"
fi
local framework="${EVAL_FRAMEWORK:-${cli_framework:-$scenario_default}}"

# Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
Expand Down Expand Up @@ -1052,6 +1209,7 @@ run_eval() {
local eval_rc=0
case "$framework" in
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
swebench) run_swebench_eval "${forwarded[@]}" || eval_rc=$? ;;
*) echo "Unknown framework '${framework}'"; eval_rc=1 ;;
esac

Expand Down
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/dsr1_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/dsv4_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
13 changes: 9 additions & 4 deletions benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ echo "Server PID: $SERVER_PID"

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# ---- Run benchmark ----------------------------------------------------------
build_replay_cmd "$RESULT_DIR"

run_agentic_replay_and_write_outputs "$RESULT_DIR"
if [ "${EVAL_ONLY}" = "true" ]; then
# Eval-only: skip the multi-turn agentic replay and run the eval against the
# live server. run_eval auto-selects swebench for agentic-coding scenario.
maybe_run_eval "$PORT"
else
# ---- Run benchmark ------------------------------------------------------
build_replay_cmd "$RESULT_DIR"
run_agentic_replay_and_write_outputs "$RESULT_DIR"
fi
Loading