SemiAnalysisAI · adibarra · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -117,6 +117,11 @@ env:
   RESULT_DIR: /workspace/results
   PYTHONDONTWRITEBYTECODE: '1'
   PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
+  # SWE-bench scoring runs on Modal (GPU runners have no Docker); creds are
+  # bootstrapped into ~/.modal.toml by benchmark_lib.sh:_ensure_modal_credentials.
+  SWEBENCH_USE_MODAL: 'true'
+  MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+  MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
 
 permissions:
   contents: read

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -50,6 +50,7 @@
             eval-config: ${{ steps.get-jobs.outputs.eval-config }}
             multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
             agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
+            agentic-eval-config: ${{ steps.get-jobs.outputs.agentic-eval-config }}
             multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
         steps:
             - name: Checkout code (ref)
@@ -69,13 +70,15 @@
                   pip install pydantic
                   CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
-                  AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
+                  AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and not x.get('run-eval', False)]))")
+                  AGENTIC_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x and x.get('run-eval', False)]))")
                   MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
                   SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
                   MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
                   EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
                   MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
                   echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
+                  echo "agentic-eval-config=$AGENTIC_EVAL" >> $GITHUB_OUTPUT
                   echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
@@ -195,7 +198,42 @@
             scenario-type: agentic-coding
             ref: ${{ inputs.ref }}
 
+    # Eval sweep for agentic-coding configs: calls benchmark-tmpl.yml once per
+    # entry of get-jobs' agentic-eval-config output (agentic-coding entries
+    # without a 'prefill' key that have run-eval set). Skipped when that output
+    # is the empty list '[]'.
+    test-sweep-agentic-evals:
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.agentic-eval-config != '[]' }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: agentic eval /
+        strategy:
+            # One failing config does not cancel the remaining matrix entries.
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.agentic-eval-config) }}
+        # Forward the caller's secrets to the reusable workflow.
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            offloading: ${{ matrix.config.offloading }}
+            # A non-empty duration-override input wins over the per-config duration.
+            duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
+            # The next five inputs are fixed constants, not matrix values.
+            # NOTE(review): presumably placeholders the agentic path ignores --
+            # confirm against benchmark-tmpl.yml.
+            isl: '0'
+            osl: '0'
+            max-model-len: '0'
+            spec-decoding: 'none'
+            disagg: 'false'
+            # scenario-type agentic-coding => run_eval auto-selects swebench.
+            run-eval: true
+            # Always eval-only here, regardless of the matrix entry.
+            eval-only: true
+            scenario-type: agentic-coding
+            ref: ${{ inputs.ref }}
+
     test-sweep-multi-node-agentic:
         needs: get-jobs
         if: ${{ needs.get-jobs.outputs.multi-node-agentic-config != '[]' }}
        uses: ./.github/workflows/benchmark-multinode-tmpl.yml
@@ -305,12 +343,12 @@
             result-prefix: "bmk"
 
     collect-evals:
-        needs: [test-sweep-evals, test-sweep-multi-node-evals]
-        if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped') }}
+        # Runs even when an upstream sweep failed (always()), provided at least
+        # one of the three eval sweeps was not skipped.
+        needs: [test-sweep-evals, test-sweep-multi-node-evals, test-sweep-agentic-evals]
+        if: ${{ always() && (needs.test-sweep-evals.result != 'skipped' || needs.test-sweep-multi-node-evals.result != 'skipped' || needs.test-sweep-agentic-evals.result != 'skipped') }}
         uses: ./.github/workflows/collect-evals.yml
         secrets: inherit
 
    collect-agentic-results:
        needs: [test-sweep-agentic, test-sweep-multi-node-agentic]
        if: ${{ always() && (needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
        runs-on: ubuntu-latest

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -966,21 +966,178 @@ META
     echo "Moved eval artifacts to: $(pwd)"
 }
 
+# ------------------------------
+# SWE-bench eval helpers
+# ------------------------------
+
+# run_swebench_eval (defined below) runs the SWE-bench eval (SWE-bench Lite by
+# default): generate patches with lm-eval, then score them with the official
+# swebench harness (local Docker, or Modal remote sandboxes when
+# SWEBENCH_USE_MODAL=true). lm-eval cannot score SWE-bench itself (no
+# repo-level test executor), so we reuse it only for generation and emit an
+# lm-eval-shaped results JSON from swebench_score.py so the rest of the
+# pipeline (append_lm_eval_summary / collect / validate) is unchanged.
+#
+# Env knobs:
+#   SWEBENCH_TASK_NAME     (default swebench_lite) selects utils/evals/<name>.yaml
+#   SWEBENCH_DATASET       optional; must equal the YAML's dataset_path (the
+#                          scoring dataset is derived from the YAML so generation
+#                          and scoring never diverge) -- mismatch fails fast
+#   SWEBENCH_MAX_WORKERS   (default 4) harness workers / Modal parallelism
+#   SWEBENCH_USE_MODAL     "true" => score on Modal remote sandboxes instead of
+#                          local Docker (no Docker needed on the node; requires a
+#                          Modal account — credentials from ~/.modal.toml or from
+#                          MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env vars, e.g. a
+#                          GitHub secret; the env vars are bootstrapped into
+#                          ~/.modal.toml automatically if the file is absent)
+#   SWEBENCH_NAMESPACE     local-Docker only: pass "" on arm/Mac to build locally
+#   SWEBENCH_SKIP_SCORE    "true" => generate + stage predictions only, no scoring
+#                          (score elsewhere)
+_install_swebench_deps() {
+    # Install the scoring-side Python packages: swebench always, plus the modal
+    # client when SWEBENCH_USE_MODAL is "true". Every install is best-effort
+    # (mirrors _install_lm_eval_deps): a pip failure is ignored here and a real
+    # failure surfaces at scoring.
+    local wanted=(swebench)
+    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
+        wanted+=(modal)
+    fi
+    local pkg
+    for pkg in "${wanted[@]}"; do
+        python3 -m pip install -q --no-cache-dir --break-system-packages "$pkg" || true
+    done
+}
+
+# swebench's validate_modal_credentials() only checks that ~/.modal.toml
+# exists; the modal package itself prefers MODAL_TOKEN_ID/MODAL_TOKEN_SECRET
+# env vars (how CI passes the GitHub secret). Bootstrap a minimal file from
+# the env so the harness's check passes. Never overwrite an existing file.
+_ensure_modal_credentials() {
+    # Bootstrap a minimal ~/.modal.toml from MODAL_TOKEN_ID/MODAL_TOKEN_SECRET.
+    # No-op unless SWEBENCH_USE_MODAL is "true", and never overwrites an existing
+    # file. Best-effort: missing credentials or an unwritable $HOME only produce
+    # a warning on stderr; the real failure surfaces at Modal scoring.
+    if [ "${SWEBENCH_USE_MODAL:-false}" != "true" ]; then return 0; fi
+    if [ -f "$HOME/.modal.toml" ]; then return 0; fi
+    if [ -n "${MODAL_TOKEN_ID:-}" ] && [ -n "${MODAL_TOKEN_SECRET:-}" ]; then
+        # Create the file under umask 077 so the secret is never readable by
+        # other users, not even between creation and chmod. The subshell keeps
+        # the caller's umask untouched.
+        if (
+            umask 077 &&
+            printf '[default]\ntoken_id = "%s"\ntoken_secret = "%s"\nactive = true\n' \
+                "$MODAL_TOKEN_ID" "$MODAL_TOKEN_SECRET" > "$HOME/.modal.toml"
+        ); then
+            chmod 600 "$HOME/.modal.toml"
+            echo "[swebench] wrote ~/.modal.toml from MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env"
+        else
+            # Do not claim success when the write failed (e.g. read-only $HOME).
+            echo "WARN: could not write $HOME/.modal.toml; Modal scoring will fail credential validation" >&2
+        fi
+    else
+        echo "WARN: SWEBENCH_USE_MODAL=true but no ~/.modal.toml and no MODAL_TOKEN_ID/MODAL_TOKEN_SECRET env; Modal scoring will fail credential validation" >&2
+    fi
+}
+
+# Run the configured eval and stage its artifacts when RUN_EVAL is enabled.
+# run_eval auto-selects the framework by scenario (agentic -> swebench,
+# fixed-seq-len -> lm-eval), so recipes call this without naming a framework.
+maybe_run_eval() {
+    # Usage: maybe_run_eval [port]
+    #   port  server port to evaluate against; defaults to $PORT, then 8888.
+    # Runs run_eval and then append_lm_eval_summary only when RUN_EVAL is exactly
+    # "true"; otherwise does nothing. RUN_EVAL is read with a default so an unset
+    # variable means "skip" instead of aborting under `set -u`.
+    local port="${1:-${PORT:-8888}}"
+    if [ "${RUN_EVAL:-false}" = "true" ]; then
+        run_eval --port "$port"
+        append_lm_eval_summary
+    fi
+}
+
+run_swebench_eval() {
+    # Generate SWE-bench patches with lm-eval, then score them with
+    # utils/evals/swebench_score.py.
+    #
+    # Args: forwarded verbatim to run_lm_eval (e.g. --port); --results-dir is
+    #       appended here and points at a private temp dir.
+    # Env:  SWEBENCH_* knobs as listed in the header comment above; MODEL_NAME
+    #       (falls back to MODEL) is the reported model name; EVAL_RESULT_DIR is
+    #       the output dir (a temp dir is created when unset) and is exported on
+    #       the way out; EVAL_TASKS_DIR, when already set, is used as the task
+    #       YAML path instead of utils/evals/<task>.yaml.
+    # Returns: 0 on success; 1 when dataset_path cannot be read or disagrees
+    #       with SWEBENCH_DATASET; otherwise the exit status of the failing
+    #       generation or scoring step.
+    local out_dir="${EVAL_RESULT_DIR:-$(mktemp -d /tmp/eval_out-XXXXXX)}"
+    local task_name="${SWEBENCH_TASK_NAME:-swebench_lite}"
+    local gen_dir
+    gen_dir=$(mktemp -d /tmp/swebench_gen-XXXXXX)
+
+    # Keep the scoring dataset in lockstep with the generation YAML: the harness
+    # must score against the same instance set lm-eval generated patches for, or
+    # the instance IDs won't match. Derive it from the task YAML; if
+    # SWEBENCH_DATASET is set it must agree (fail-fast rather than mis-score).
+    local yaml_path="${EVAL_TASKS_DIR:-utils/evals/${task_name}.yaml}"
+    local dataset
+    dataset=$(awk '/^dataset_path:[[:space:]]/{print $2; exit}' "$yaml_path" 2>/dev/null)
+    if [ -z "$dataset" ]; then
+        echo "ERROR: could not read dataset_path from ${yaml_path}" >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return 1
+    fi
+    if [ -n "${SWEBENCH_DATASET:-}" ] && [ "${SWEBENCH_DATASET}" != "$dataset" ]; then
+        echo "ERROR: SWEBENCH_DATASET='${SWEBENCH_DATASET}' disagrees with ${yaml_path} dataset_path='${dataset}'." >&2
+        echo "       Generation and scoring must use the same dataset; edit the YAML or unset SWEBENCH_DATASET." >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return 1
+    fi
+
+    # 1. Generation via lm-eval (reuses endpoint wiring, _patch_lm_eval, etc.).
+    #    run_lm_eval already passes --log_samples, which is what we consume.
+    #    EVAL_TASKS_DIR is pointed at the task YAML only for this call, then put
+    #    back exactly as it was: a previously unset variable is unset again
+    #    rather than left behind as an exported empty string.
+    local had_tasks_dir="${EVAL_TASKS_DIR+set}"
+    local prev_tasks_dir="${EVAL_TASKS_DIR:-}"
+    export EVAL_TASKS_DIR="$yaml_path"
+    local gen_rc=0
+    run_lm_eval "$@" --results-dir "$gen_dir" || gen_rc=$?
+    if [ -n "$had_tasks_dir" ]; then
+        export EVAL_TASKS_DIR="$prev_tasks_dir"
+    else
+        unset EVAL_TASKS_DIR
+    fi
+    if [ "$gen_rc" -ne 0 ]; then
+        echo "ERROR: swebench generation (lm-eval) failed with $gen_rc" >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return "$gen_rc"
+    fi
+
+    # Preserve generations as artifacts alongside the scored results.
+    mkdir -p "$out_dir"
+    find "$gen_dir" -name 'samples_*.jsonl' -exec cp -f {} "$out_dir"/ \; 2>/dev/null || true
+    export EVAL_RESULT_DIR="$out_dir"
+
+    local lm_eval_version
+    lm_eval_version=$(python3 -c 'import lm_eval; print(lm_eval.__version__)' 2>/dev/null || echo unknown)
+
+    if [ "${SWEBENCH_SKIP_SCORE:-false}" = "true" ]; then
+        # Generation-only mode: emit predictions, defer Docker scoring elsewhere.
+        # TODO(alec): wire the separate scoring job (Modal / sb-cli / CPU runner).
+        local skip_rc=0
+        python3 utils/evals/swebench_score.py \
+            --samples-dir "$gen_dir" --out-dir "$out_dir" \
+            --model-name "${MODEL_NAME:-$MODEL}" --task-name "$task_name" \
+            --predictions-only || skip_rc=$?
+        echo "SWEBENCH_SKIP_SCORE=true: staged predictions only (no resolved-rate)." >&2
+        rm -rf "$gen_dir" 2>/dev/null || true
+        return "$skip_rc"
+    fi
+
+    # 2. Score with the official swebench harness (local Docker, or Modal remote
+    #    sandboxes when SWEBENCH_USE_MODAL=true) and emit the lm-eval-shaped JSON.
+    if [ "${INFERENCEX_SWEBENCH_RUNTIME_READY:-false}" != "true" ]; then
+        _install_swebench_deps
+        export INFERENCEX_SWEBENCH_RUNTIME_READY=true
+    fi
+    _ensure_modal_credentials
+
+    local score_args=(
+        --samples-dir "$gen_dir"
+        --out-dir "$out_dir"
+        --model-name "${MODEL_NAME:-$MODEL}"
+        --task-name "$task_name"
+        --dataset-name "$dataset"
+        --max-workers "${SWEBENCH_MAX_WORKERS:-4}"
+        --lm-eval-version "$lm_eval_version"
+    )
+    # Only an explicit "true" enables Modal, matching _install_swebench_deps and
+    # _ensure_modal_credentials. A bare ${SWEBENCH_USE_MODAL:+--modal} would also
+    # fire for "false" or any other non-empty value.
+    if [ "${SWEBENCH_USE_MODAL:-false}" = "true" ]; then
+        score_args+=(--modal)
+    fi
+    # --namespace is passed whenever SWEBENCH_NAMESPACE is set, even to "" (the
+    # empty value is meaningful: build images locally).
+    if [ -n "${SWEBENCH_NAMESPACE+set}" ]; then
+        score_args+=(--namespace "$SWEBENCH_NAMESPACE")
+    fi
+
+    local score_rc=0
+    python3 utils/evals/swebench_score.py "${score_args[@]}" || score_rc=$?
+    rm -rf "$gen_dir" 2>/dev/null || true
+    if [ "$score_rc" -ne 0 ]; then
+        echo "ERROR: swebench scoring failed with $score_rc" >&2
+        return "$score_rc"
+    fi
+}
+
 # ------------------------------
 # Unified eval entrypoint
 # ------------------------------
 
 run_eval() {
-    local framework="${EVAL_FRAMEWORK:-lm-eval}"
+    local cli_framework=""
     local forwarded=()
 
     while [[ $# -gt 0 ]]; do
         case "$1" in
-            --framework) framework="$2"; shift 2 ;;
+            --framework) cli_framework="$2"; shift 2 ;;
             *)           forwarded+=("$1"); shift ;;
         esac
     done
 
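+    # Eval framework defaults by SCENARIO: agentic-coding configs run swebench;
+    # fixed-seq-len (8k1k/1k1k) configs run lm-eval/gsm8k -- so by default
+    # agentic never runs gsm8k and 8k1k/1k1k never runs swebench. An explicit
+    # override replaces that default, with precedence EVAL_FRAMEWORK env >
+    # --framework arg > scenario default (so e.g. a recipe's `--framework
+    # lm-eval` beats the scenario default but loses to EVAL_FRAMEWORK).
+    # NOTE(review): previously --framework overrode EVAL_FRAMEWORK; confirm the
+    # env-over-CLI order is intended.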
+    local scenario_default="lm-eval"
+    if [ "${IS_AGENTIC:-0}" = "1" ] || [ "${SCENARIO_TYPE:-}" = "agentic-coding" ]; then
+        scenario_default="swebench"
+    fi
+    local framework="${EVAL_FRAMEWORK:-${cli_framework:-$scenario_default}}"
+
     # Compute EVAL_MAX_MODEL_LEN if not already set by the calling script
     if [ -z "${EVAL_MAX_MODEL_LEN:-}" ]; then
         compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
@@ -1052,6 +1209,7 @@ run_eval() {
     local eval_rc=0
     case "$framework" in
         lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
+        swebench)        run_swebench_eval "${forwarded[@]}" || eval_rc=$? ;;
         *)               echo "Unknown framework '${framework}'"; eval_rc=1 ;;
     esac
 

diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -58,7 +58,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -52,7 +52,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -249,7 +249,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -137,7 +137,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -156,7 +156,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -63,7 +63,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -64,7 +64,12 @@ echo "Server PID: $SERVER_PID"
 
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+# EVAL_ONLY defaults to false so invocations that never set it (or that run
+# under `set -u`) keep taking the benchmark path.
+if [ "${EVAL_ONLY:-false}" = "true" ]; then
+    # Eval-only: skip the multi-turn agentic replay and run the eval against the
+    # live server. run_eval auto-selects swebench for agentic-coding scenario.
+    # Note: maybe_run_eval does nothing unless RUN_EVAL is "true".
+    maybe_run_eval "$PORT"
+else
+    # ---- Run benchmark ------------------------------------------------------
+    build_replay_cmd "$RESULT_DIR"
+    run_agentic_replay_and_write_outputs "$RESULT_DIR"
+fi