From d99590f3276bb7fbd94c24c19fc6dc5d6eba0089 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 19 May 2026 13:48:38 -0700 Subject: [PATCH 1/3] chore(ci): add required mirror gate statuses Signed-off-by: Piotr Mlocek --- .github/actions/pr-merge-base/action.yml | 37 +++++ .github/workflows/e2e-label-help.yml | 4 +- .github/workflows/helm-lint.yml | 52 +++++-- .github/workflows/required-ci-gates.yml | 185 +++++++++++++++++++++++ CI.md | 35 +++-- CONTRIBUTING.md | 2 +- architecture/build.md | 13 +- 7 files changed, 299 insertions(+), 29 deletions(-) create mode 100644 .github/actions/pr-merge-base/action.yml create mode 100644 .github/workflows/required-ci-gates.yml diff --git a/.github/actions/pr-merge-base/action.yml b/.github/actions/pr-merge-base/action.yml new file mode 100644 index 000000000..9393f6518 --- /dev/null +++ b/.github/actions/pr-merge-base/action.yml @@ -0,0 +1,37 @@ +name: PR Merge Base +description: Resolve and fetch the merge-base commit needed to diff a copy-pr-bot pull-request/ push against the PR base branch. + +inputs: + gh_token: + description: GitHub token for PR and compare API calls. + required: true + +outputs: + base_sha: + description: Merge-base commit SHA for pull-request/ refs, or empty for other refs. + value: ${{ steps.merge-base.outputs.base_sha }} + +runs: + using: composite + steps: + - id: merge-base + shell: bash + env: + GH_TOKEN: ${{ inputs.gh_token }} + GH_REPO: ${{ github.repository }} + REF_NAME: ${{ github.ref_name }} + GITHUB_SHA_VALUE: ${{ github.sha }} + run: | + set -euo pipefail + + if [[ "$REF_NAME" =~ ^pull-request/([0-9]+)$ ]]; then + pr_number="${BASH_REMATCH[1]}" + base_ref=$(gh pr view "$pr_number" --repo "$GH_REPO" --json baseRefName -q '.baseRefName') + # The mirrored branch is a push ref, so changed-files needs the true + # merge-base to diff the PR head against its base branch. 
+ base_sha=$(gh api "repos/$GH_REPO/compare/$base_ref...$GITHUB_SHA_VALUE" --jq '.merge_base_commit.sha') + git fetch --no-tags --depth=1 origin "$base_sha" + echo "base_sha=$base_sha" >> "$GITHUB_OUTPUT" + else + echo "base_sha=" >> "$GITHUB_OUTPUT" + fi diff --git a/.github/workflows/e2e-label-help.yml b/.github/workflows/e2e-label-help.yml index fde45fc0a..e1a268803 100644 --- a/.github/workflows/e2e-label-help.yml +++ b/.github/workflows/e2e-label-help.yml @@ -1,6 +1,6 @@ name: E2E Label Help -# When a `test:e2e` / `test:e2e-gpu` label is applied, post a PR comment +# When a `test:e2e*` label is applied, post a PR comment # telling the maintainer the next manual step. We don't dispatch the workflow # ourselves: a workflow_dispatch-triggered run does not surface in the PR's # Checks tab, so we'd lose in-progress visibility. Instead we point the @@ -62,7 +62,7 @@ jobs: workflow_link="[$workflow_name](https://github.com/$GH_REPO/actions/workflows/$workflow_file)" instructions="Open $workflow_link, find the run for commit \`$short_pr\`, and click **Re-run all jobs** to execute with the label set." fi - body="Label \`$LABEL_NAME\` applied for \`$short_pr\`. $instructions The \`E2E Gate\` check on this PR will flip green automatically once the run finishes." + body="Label \`$LABEL_NAME\` applied for \`$short_pr\`. $instructions The matching required CI gate status on this PR will flip green automatically once the run finishes." 
fi gh pr comment "$PR_NUMBER" --body "$body" diff --git a/.github/workflows/helm-lint.yml b/.github/workflows/helm-lint.yml index 9f3e2fbcb..cf0666bc0 100644 --- a/.github/workflows/helm-lint.yml +++ b/.github/workflows/helm-lint.yml @@ -7,12 +7,6 @@ on: push: branches: - "pull-request/[0-9]+" - paths: - - "deploy/helm/**" - - "mise.toml" - - "mise.lock" - - "tasks/helm.toml" - - ".github/workflows/helm-lint.yml" workflow_dispatch: env: @@ -36,15 +30,53 @@ jobs: outputs: should_run: ${{ steps.gate.outputs.should_run }} steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - id: gate uses: ./.github/actions/pr-gate - helm-lint: - name: Helm Lint + helm_changes: + name: Detect Helm changes needs: pr_metadata if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.default.outputs.should_run || steps.changes.outputs.any_changed }} + steps: + - id: default + if: github.event_name != 'push' + shell: bash + run: echo "should_run=true" >> "$GITHUB_OUTPUT" + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + if: github.event_name == 'push' + + - id: merge-base + if: github.event_name == 'push' + uses: ./.github/actions/pr-merge-base + with: + gh_token: ${{ secrets.GITHUB_TOKEN }} + + - id: changes + if: github.event_name == 'push' + uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11 # v42.1.0 + with: + base_sha: ${{ steps.merge-base.outputs.base_sha }} + skip_initial_fetch: ${{ steps.merge-base.outputs.base_sha != '' }} + files: | + deploy/helm/** + mise.toml + mise.lock + tasks/helm.toml + .github/workflows/helm-lint.yml + + helm-lint: + name: Helm Lint + needs: [pr_metadata, helm_changes] + if: needs.pr_metadata.outputs.should_run == 'true' && needs.helm_changes.outputs.should_run == 'true' runs-on: linux-amd64-cpu8 container: image: 
ghcr.io/nvidia/openshell/ci:latest @@ -52,7 +84,7 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Install tools run: mise install --locked diff --git a/.github/workflows/required-ci-gates.yml b/.github/workflows/required-ci-gates.yml new file mode 100644 index 000000000..ddde80519 --- /dev/null +++ b/.github/workflows/required-ci-gates.yml @@ -0,0 +1,185 @@ +name: Required CI Gates + +on: + pull_request_target: + types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] + workflow_run: + workflows: + - Branch Checks + - Branch E2E Checks + - GPU Test + - Branch Kubernetes E2E + - Helm Lint + types: [completed] + +permissions: + actions: read + contents: read + pull-requests: read + statuses: write + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.event.workflow_run.head_sha || github.run_id }} + cancel-in-progress: true + +jobs: + publish: + name: Publish required CI gate statuses + runs-on: ubuntu-latest + steps: + - name: Evaluate required CI gates + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + EVENT_NAME: ${{ github.event_name }} + PR_NUMBER_FROM_EVENT: ${{ github.event.pull_request.number }} + PR_HEAD_SHA_FROM_EVENT: ${{ github.event.pull_request.head.sha }} + PR_LABELS_FROM_EVENT: ${{ toJSON(github.event.pull_request.labels.*.name) }} + WORKFLOW_RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }} + WORKFLOW_RUN_EVENT: ${{ github.event.workflow_run.event }} + shell: bash + run: | + set -euo pipefail + + post_status() { + local context="$1" + local state="$2" + local description="$3" + local target_url="${4:-}" + + args=( + --method POST + "repos/$GH_REPO/statuses/$HEAD_SHA" + -f "state=$state" + -f "context=$context" + -f "description=$description" + ) + if [ -n "$target_url" ]; then + args+=(-f 
"target_url=$target_url") + fi + + echo "$context: $state - $description" + gh api "${args[@]}" >/dev/null + } + + has_label() { + local label="$1" + jq -e --arg label "$label" 'index($label) != null' <<< "$LABELS_JSON" >/dev/null + } + + resolve_pull_request_event() { + PR_NUMBER="$PR_NUMBER_FROM_EVENT" + HEAD_SHA="$PR_HEAD_SHA_FROM_EVENT" + LABELS_JSON=$(jq -c . <<< "$PR_LABELS_FROM_EVENT") + } + + resolve_workflow_run_event() { + if [ "$WORKFLOW_RUN_EVENT" != "push" ]; then + echo "Ignoring workflow_run from event '$WORKFLOW_RUN_EVENT'." + exit 0 + fi + + local associated_prs pr + associated_prs=$(gh api "repos/$GH_REPO/commits/$WORKFLOW_RUN_HEAD_SHA/pulls") + pr=$(jq -c 'map(select(.state == "open"))[0] // empty' <<< "$associated_prs") + if [ -z "$pr" ]; then + echo "No open PR associated with $WORKFLOW_RUN_HEAD_SHA; nothing to publish." + exit 0 + fi + + PR_NUMBER=$(jq -r '.number' <<< "$pr") + pr=$(gh api "repos/$GH_REPO/pulls/$PR_NUMBER") + HEAD_SHA=$(jq -r '.head.sha' <<< "$pr") + LABELS_JSON=$(gh api "repos/$GH_REPO/issues/$PR_NUMBER" --jq '[.labels[].name]') + } + + resolve_context() { + if [ "$EVENT_NAME" = "pull_request_target" ]; then + resolve_pull_request_event + elif [ "$EVENT_NAME" = "workflow_run" ]; then + resolve_workflow_run_event + else + echo "Unsupported event '$EVENT_NAME'." 
+ exit 1 + fi + + PR_URL="https://github.com/$GH_REPO/pull/$PR_NUMBER" + MIRROR_REF="pull-request/$PR_NUMBER" + } + + verify_mirror() { + local context="$1" + local mirror_sha + + mirror_sha=$(gh api "repos/$GH_REPO/branches/$MIRROR_REF" --jq '.commit.sha' 2>/dev/null || true) + if [ -z "$mirror_sha" ]; then + post_status "$context" pending "Waiting for copy-pr-bot mirror" "$PR_URL" + return 1 + fi + + if [ "$mirror_sha" != "$HEAD_SHA" ]; then + post_status "$context" failure "copy-pr-bot mirror is stale" "$PR_URL" + return 1 + fi + + return 0 + } + + evaluate_workflow() { + local context="$1" + local workflow_file="$2" + local workflow_name="$3" + local required_label="${4:-}" + local workflow_url="https://github.com/$GH_REPO/actions/workflows/$workflow_file" + + if [ -n "$required_label" ] && ! has_label "$required_label"; then + post_status "$context" success "$required_label not applied" "$PR_URL" + return 0 + fi + + if ! verify_mirror "$context"; then + return 0 + fi + + local runs latest run_id status conclusion run_url real_success + runs=$(gh api "repos/$GH_REPO/actions/workflows/$workflow_file/runs?head_sha=$HEAD_SHA&event=push" --jq '.workflow_runs') + latest=$(jq -c --arg branch "$MIRROR_REF" '[.[] | select(.head_branch == $branch)] | sort_by(.created_at) | reverse | .[0] // empty' <<< "$runs") + + if [ -z "$latest" ]; then + post_status "$context" pending "Waiting for $workflow_name" "$workflow_url" + return 0 + fi + + run_id=$(jq -r '.id' <<< "$latest") + status=$(jq -r '.status' <<< "$latest") + conclusion=$(jq -r '.conclusion' <<< "$latest") + run_url=$(jq -r '.html_url' <<< "$latest") + + if [ "$status" != "completed" ]; then + post_status "$context" pending "$workflow_name is $status" "$run_url" + return 0 + fi + + if [ "$conclusion" != "success" ]; then + post_status "$context" failure "$workflow_name concluded $conclusion" "$run_url" + return 0 + fi + + real_success=$(gh api "repos/$GH_REPO/actions/runs/$run_id/jobs?per_page=100" \ + --jq 
'[.jobs[] | select(.conclusion == "success" and .name != "Resolve PR metadata")] | length') + + if [ "$real_success" -lt 1 ]; then + post_status "$context" failure "No real CI jobs ran" "$run_url" + return 0 + fi + + post_status "$context" success "$workflow_name passed" "$run_url" + } + + resolve_context + + evaluate_workflow "OpenShell / Branch Checks" "branch-checks.yml" "Branch Checks" + evaluate_workflow "OpenShell / E2E" "branch-e2e.yml" "Branch E2E Checks" "test:e2e" + evaluate_workflow "OpenShell / GPU E2E" "test-gpu.yml" "GPU Test" "test:e2e-gpu" + evaluate_workflow "OpenShell / Kubernetes E2E" "branch-kubernetes-e2e.yml" "Branch Kubernetes E2E" "test:e2e-kubernetes" + evaluate_workflow "OpenShell / Helm Lint" "helm-lint.yml" "Helm Lint" diff --git a/CI.md b/CI.md index 57e6627ed..d1b4fd176 100644 --- a/CI.md +++ b/CI.md @@ -8,14 +8,15 @@ For local test commands see [TESTING.md](TESTING.md). For PR conventions see [CO PR CI that runs on NVIDIA self-hosted runners uses NVIDIA's copy-pr-bot. The bot mirrors trusted PR commits to internal `pull-request/<N>` branches in this repository. The gated workflows trigger on pushes to those branches, not on the original PR. -`Branch Checks` run automatically after copy-pr-bot mirrors the PR. E2E suites are opt-in because they are more expensive and publish temporary images. +`Branch Checks` run automatically after copy-pr-bot mirrors the PR. `Required CI Gates` posts PR-head statuses that verify the mirror exists, is current, and ran the expected push-based workflows. E2E suites are opt-in because they are more expensive and publish temporary images. -Two opt-in labels enable the suites: +Three opt-in labels enable the suites: - `test:e2e` runs `Branch E2E Checks` (non-GPU E2E) - `test:e2e-gpu` runs `GPU Test` +- `test:e2e-kubernetes` runs `Branch Kubernetes E2E` -Both are required to merge once the corresponding `E2E Gate` checks are marked required in branch protection.
+The GitHub ruleset should require the `OpenShell / ...` statuses published by `Required CI Gates`, not the push-triggered workflow jobs directly. ## Commit signing @@ -65,11 +66,11 @@ Prerequisites: Flow: 1. Open the PR. copy-pr-bot mirrors it to `pull-request/<N>` automatically. -2. The mirror push runs `Branch Checks` automatically. The first `Branch E2E Checks` / `GPU Test` run only resolves metadata and skips expensive jobs unless the matching label is already set. -3. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. `E2E Label Help` posts a comment with a link to the existing gated workflow run. +2. The mirror push runs `Branch Checks` automatically. `Required CI Gates` keeps the PR blocked until the mirror exists, matches the PR head SHA, and the required push-based workflow succeeds. The first `Branch E2E Checks` / `GPU Test` / `Branch Kubernetes E2E` run only resolves metadata and skips expensive jobs unless the matching label is already set. +3. A maintainer applies `test:e2e`, `test:e2e-gpu`, and/or `test:e2e-kubernetes`. `E2E Label Help` posts a comment with a link to the existing gated workflow run. 4. The maintainer opens that link and clicks **Re-run all jobs**. This time `pr_metadata` sees the label and the build/E2E jobs run. -5. When the run finishes, the `E2E Gate` check on the PR flips to green automatically. -6. New commits push to the mirror automatically and re-trigger `Branch Checks` plus any labeled E2E/GPU workflows. +5. When the run finishes, the matching `OpenShell / ...` gate status flips to green automatically. +6. New commits push to the mirror automatically and re-trigger `Branch Checks` plus any labeled E2E/GPU/Kubernetes workflows. ### Forked PR @@ -82,9 +83,9 @@ Flow: 1. Open the PR. The vouch check confirms you are vouched (otherwise the PR is auto-closed). 2. copy-pr-bot does not mirror forks automatically. A maintainer reviews the diff and comments `/ok to test <SHA>` with your latest commit SHA. -3.
After `/ok to test`, copy-pr-bot mirrors to `pull-request/<N>`. From here the flow is identical to internal PRs: maintainer applies the label, follows the comment from `E2E Label Help`, and re-runs the workflow. +3. After `/ok to test`, copy-pr-bot mirrors to `pull-request/<N>`. From here the flow is identical to internal PRs: `Required CI Gates` verifies the mirror and required push workflows, and maintainers apply E2E labels when the extra suites are needed. -Important: every new commit you push requires another `/ok to test <SHA>` from a maintainer before E2E will run on it. If a label is applied while the mirror is stale, `E2E Label Help` will post a comment explaining what's needed. +Important: every new commit you push requires another `/ok to test <SHA>` from a maintainer before push-based CI will run on it. If a label is applied while the mirror is stale, `E2E Label Help` will post a comment explaining what's needed. ## copy-pr-bot @@ -107,7 +108,23 @@ The bot's full administrator documentation is internal to NVIDIA. The only comma | `.github/workflows/branch-checks.yml` | Required non-E2E PR checks. Triggers on `push: pull-request/[0-9]+`. | | `.github/workflows/branch-e2e.yml` | Non-GPU E2E. Triggers on `push: pull-request/[0-9]+`. | | `.github/workflows/test-gpu.yml` | GPU E2E. Triggers on `push: pull-request/[0-9]+`. | +| `.github/workflows/branch-kubernetes-e2e.yml` | Kubernetes E2E. Triggers on `push: pull-request/[0-9]+`. | +| `.github/workflows/helm-lint.yml` | Helm chart validation. Triggers on `push: pull-request/[0-9]+` and skips lint jobs unless Helm inputs changed. | | `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set. | +| `.github/actions/pr-merge-base/action.yml` | Composite action that resolves and fetches the merge-base commit for `pull-request/<N>` push workflows. | +| `.github/workflows/required-ci-gates.yml` | Posts required PR-head statuses for push-based CI workflows.
This is what branch protection should require. | | `.github/workflows/e2e-gate.yml` | Posts the required `E2E Gate` check on the PR. Re-evaluates after the gated workflow completes. | | `.github/workflows/e2e-gate-check.yml` | Reusable gate logic shared by E2E and GPU E2E. | | `.github/workflows/e2e-label-help.yml` | When a `test:e2e*` label is applied, posts a PR comment telling the maintainer the next manual step (re-run an existing workflow run, or `/ok to test <SHA>` to refresh the mirror). | + +## Required status contexts + +Require these statuses in the branch ruleset for push-based CI: + +- `OpenShell / Branch Checks` +- `OpenShell / E2E` +- `OpenShell / GPU E2E` +- `OpenShell / Kubernetes E2E` +- `OpenShell / Helm Lint` + +Do not require the underlying push workflow jobs directly. Those jobs only appear after copy-pr-bot mirrors trusted code, so they cannot independently prove that an untrusted or stale PR head was tested. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9dddac69a..ef07c4495 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -282,4 +282,4 @@ DCO sign-off is separate from cryptographic commit signing. CI requires signing ## CI -How E2E runs in CI, the `test:e2e` / `test:e2e-gpu` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md). +How PR CI runs, the `test:e2e` / `test:e2e-gpu` / `test:e2e-kubernetes` labels, copy-pr-bot, and commit-signing setup are documented in [CI.md](CI.md). diff --git a/architecture/build.md b/architecture/build.md index 8a4212aa7..be50eeb8d 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -62,16 +62,15 @@ the same staging and tagging assumptions are used locally and in CI. ## CI and E2E -Required checks run on GitHub Actions. E2E and GPU workflows use NVIDIA -self-hosted runners, so trusted PRs are mirrored by copy-pr-bot into -`pull-request/<N>` branches before those workflows run. +Required checks run on GitHub Actions.
Workflows that use NVIDIA self-hosted runners trigger from copy-pr-bot mirror branches, so trusted PRs are mirrored into `pull-request/<N>` branches before those workflows run. The high-level CI model: -1. Standard branch checks run on normal PR activity. -2. Label-gated E2E and GPU checks run from trusted mirror branches. -3. Gate jobs verify that the expected non-gate workflow actually ran. -4. Release workflows rebuild and publish binaries, wheels, images, and docs. +1. PR-context gate jobs publish required statuses for the PR head commit. +2. Standard branch checks run from trusted mirror branches. +3. Label-gated E2E, GPU, and Kubernetes checks run from trusted mirror branches. +4. Gate jobs verify that the mirror branch matches the PR head and that the expected non-gate workflow actually ran. +5. Release workflows rebuild and publish binaries, wheels, images, and docs. See `CI.md` for the contributor workflow and labels. From 178b9d69c1a9e098a8b23337304b2b64357d9754 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 19 May 2026 14:21:00 -0700 Subject: [PATCH 2/3] fix(ci): keep stale mirror gates pending Signed-off-by: Piotr Mlocek --- .github/workflows/required-ci-gates.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/required-ci-gates.yml b/.github/workflows/required-ci-gates.yml index ddde80519..cba7a6b9a 100644 --- a/.github/workflows/required-ci-gates.yml +++ b/.github/workflows/required-ci-gates.yml @@ -113,12 +113,12 @@ jobs: mirror_sha=$(gh api "repos/$GH_REPO/branches/$MIRROR_REF" --jq '.commit.sha' 2>/dev/null || true) if [ -z "$mirror_sha" ]; then - post_status "$context" pending "Waiting for copy-pr-bot mirror" "$PR_URL" + post_status "$context" pending "Waiting for /ok to test mirror" "$PR_URL" return 1 fi if [ "$mirror_sha" != "$HEAD_SHA" ]; then - post_status "$context" failure "copy-pr-bot mirror is stale" "$PR_URL" + post_status "$context" pending "Waiting for /ok to test mirror" "$PR_URL" return 1
fi From 592c25364d44f35fd521fb6904b956e0cec52633 Mon Sep 17 00:00:00 2001 From: Piotr Mlocek Date: Tue, 19 May 2026 14:25:13 -0700 Subject: [PATCH 3/3] chore(ci): clarify merge-base comment Signed-off-by: Piotr Mlocek --- .github/actions/pr-merge-base/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/pr-merge-base/action.yml b/.github/actions/pr-merge-base/action.yml index 9393f6518..a702e9c0b 100644 --- a/.github/actions/pr-merge-base/action.yml +++ b/.github/actions/pr-merge-base/action.yml @@ -27,8 +27,8 @@ runs: if [[ "$REF_NAME" =~ ^pull-request/([0-9]+)$ ]]; then pr_number="${BASH_REMATCH[1]}" base_ref=$(gh pr view "$pr_number" --repo "$GH_REPO" --json baseRefName -q '.baseRefName') - # The mirrored branch is a push ref, so changed-files needs the true - # merge-base to diff the PR head against its base branch. + # The mirrored branch is a push ref, so changed-files needs the true merge-base + # to diff the PR head against its base branch. base_sha=$(gh api "repos/$GH_REPO/compare/$base_ref...$GITHUB_SHA_VALUE" --jq '.merge_base_commit.sha') git fetch --no-tags --depth=1 origin "$base_sha" echo "base_sha=$base_sha" >> "$GITHUB_OUTPUT"