From cd29eab220cb4fa266d69a692de1fbc0358ad205 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 26 Jun 2025 14:50:06 +0100
Subject: [PATCH 01/28] Add NGC release test workflow

---
 .github/workflows/ngc-release-testing.yaml | 82 ++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 .github/workflows/ngc-release-testing.yaml

diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
new file mode 100644
index 000000000..3150a07c2
--- /dev/null
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -0,0 +1,82 @@
+name: ~NGC release testing
+
+on:
+  workflow_dispatch:
+    inputs:
+      JAX_IMAGE:
+        type: string
+        description: "JAX image to run tests on"
+        required: false
+        default: ''
+      MAXTEXT_IMAGE:
+        type: string
+        description: "MaxText image to run tests on"
+        required: false
+        default: ''
+
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+
+permissions:
+  contents: read  # to fetch code
+  actions:  write # to cancel previous workflows
+  packages: write # to upload container
+
+jobs:
+  test-nccl:  # NCCL tests, delegated to the reusable _test_nccl.yaml workflow
+    if: inputs.JAX_IMAGE != ''  # skipped when no JAX image was supplied
+    uses: ./.github/workflows/_test_nccl.yaml
+    with:
+      CONTAINER: ${{ inputs.JAX_IMAGE }}  # the JAX image is the container under test
+    secrets: inherit
+
+  test-jax:  # JAX unit tests (backend-independent and GPU) run inside the JAX image
+    if: inputs.JAX_IMAGE != ''  # skipped when no JAX image was supplied
+    uses: ./.github/workflows/_test_unit.yaml
+    with:
+      TEST_NAME: jax
+      EXECUTE: |
+        docker run -i --shm-size=1g --gpus all \
+        ${{ inputs.JAX_IMAGE }} \
+        bash <<"EOF" |& tee test-backend-independent.log
+          test-jax.sh -b backend-independent
+        EOF
+        docker run -i --shm-size=1g --gpus all \
+        ${{ inputs.JAX_IMAGE }} \
+        bash <<"EOF" |& tee test-gpu.log
+          test-jax.sh -b gpu
+        EOF
+      STATISTICS_SCRIPT: |
+        errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+        failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+        passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
+        total_tests=$((failed_tests + passed_tests))
+        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+      ARTIFACTS: |  # both log files written by the EXECUTE script above
+        test-backend-independent.log
+        test-gpu.log
+    secrets: inherit
+
+  test-maxtext:  # MaxText tests, delegated to the reusable _test_maxtext.yaml workflow
+    if: inputs.MAXTEXT_IMAGE != ''  # skipped when no MaxText image was supplied
+    uses: ./.github/workflows/_test_maxtext.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
+  test-maxtext-eks:  # MaxText tests on Kubernetes, delegated to _test_maxtext_k8s.yaml
+    if: inputs.MAXTEXT_IMAGE != ''  # skipped when no MaxText image was supplied
+    uses: ./.github/workflows/_test_maxtext_k8s.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
+  finalize:  # runs the reusable _finalize.yaml workflow once all four test jobs are done
+    needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks ]
+    if: "!cancelled()"  # runs even if upstream jobs failed or were skipped, but not on cancellation
+    uses: ./.github/workflows/_finalize.yaml
+    secrets: inherit

From 013d4a515629c249bd481ee2614c25efb6741f6d Mon Sep 17 00:00:00 2001
From: Olli Lupton <olupton@nvidia.com>
Date: Thu, 26 Jun 2025 17:15:15 +0200
Subject: [PATCH 02/28] test-jax.sh: fix typo (#1526)

This meant that ~40-80GB GPUs would run >4 parallel jobs.
---
 .github/container/test-jax.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh
index 2671bcb65..3398b72c8 100755
--- a/.github/container/test-jax.sh
+++ b/.github/container/test-jax.sh
@@ -140,7 +140,7 @@ FLAGS+=("--//jaxlib/tools:add_pypi_cuda_wheel_deps=false")
 
 # Default parallelism: at least 10GB per test, no more than 4 tests per GPU.
 DEFAULT_JOBS_PER_GPU=$(( GPU_MEMORIES_MIB[0] / 10000))
-if (( DEFAULT_JOBS_PER_GPU > 8 )); then DEFAULT_JOBS_PER_GPU=4; fi
+if (( DEFAULT_JOBS_PER_GPU > 4 )); then DEFAULT_JOBS_PER_GPU=4; fi
 set_default JOBS_PER_GPU ${DEFAULT_JOBS_PER_GPU}
 FLAGS+=(
     "--cache_test_results=${CACHE_TEST_RESULTS}"

From 25ad36549e0d1081e019164c55c919f046f37178 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Tue, 15 Jul 2025 11:33:43 +0100
Subject: [PATCH 03/28] Add k8s maxtext workflow

---
 .github/workflows/_test_maxtext_k8s.yaml | 107 +++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 .github/workflows/_test_maxtext_k8s.yaml

diff --git a/.github/workflows/_test_maxtext_k8s.yaml b/.github/workflows/_test_maxtext_k8s.yaml
new file mode 100644
index 000000000..7f82d3f42
--- /dev/null
+++ b/.github/workflows/_test_maxtext_k8s.yaml
@@ -0,0 +1,107 @@
+name: ~test MaxText functionality on Kubernetes
+
+on:
+  workflow_call:
+    inputs:
+      MAXTEXT_IMAGE:
+        type: string
+        description: MaxText container to test
+        required: true
+
+permissions:
+  contents: read  # to fetch code
+
+jobs:
+  maxtext:
+    runs-on: eks
+    env:
+      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
+      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to NVIDIA Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
+      - name: Store GitHub Container Registry token as Kubernetes secret
+        run: |
+          # Make this available to later steps
+          TOKEN_NAME="${JOB_NAME}-token"
+          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
+          kubectl create secret generic \
+            ${TOKEN_NAME} \
+            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
+            --type=kubernetes.io/dockerconfigjson
+      - name: Configure Kubernetes job
+        run: |
+          export SERVICE_NAME="${JOB_NAME}-svc"
+          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
+            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+            | select(di == 1).metadata.name = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
+            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
+            .github/eks-workflow-files/maxtext-job.yaml
+          git diff .github/eks-workflow-files/maxtext-job.yaml
+      - name: Submit Kubernetes job
+        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Wait for Kubernetes job to start
+        run: |
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${JOB_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output
+        run: |
+          # Streaming logs will fail if the container/pod is still pending
+          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
+            sleep 1
+          done
+          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
+      - name: Retrieve Kubernetes job status
+        shell: bash -exo pipefail {0}
+        run: |
+          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
+            failure=${status[0]:-0}  # an empty jsonpath field counts as zero
+            success=${status[1]:-0}
+            total=$((failure+success))
+            if (( total < 2 )); then  # arithmetic test; [[ a < b ]] would compare as strings
+              sleep 1
+            elif (( total == 2 )); then
+              break
+            else
+              # FIXME: more than two finished pods is unexpected; bail out with a distinct code
+              exit 255
+            fi
+          done
+          exit ${failure}  # non-zero when any pod failed
+      # Provide more debug output in case of failure; note that some kinds of launch
+      # failure do not produce any log output.
+      - name: Debug failed Kubernetes job
+        if: failure()
+        run: |
+          # Provide better debug in case of launch failures that will not produce log output
+          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
+          if [[ -n "${pods}" ]]; then
+            kubectl describe ${pods}
+          fi
+      # Clean up in case of errors as well as success
+      - name: Delete Kubernetes job
+        if: always()
+        run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Delete GitHub Container Registry token
+        if: always()
+        run: kubectl delete secret ${TOKEN_NAME}

From a598d8d3a63dda7f157b685fa45ca1da0e8d6cbc Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 3 Jul 2025 15:15:00 +0100
Subject: [PATCH 04/28] Add GKE example (#1481)

Add `GKE` `MaxText` train ([example
run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15744603099/job/44379358307))
and `NCCL` test ([example
run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15744603099/job/44378422712))
workflows with reusable composite action for managing `xpk` job
lifecycle (launch, logs streaming, clean up, artifact upload).

Patches on `xpk` address the following identified issues:
- https://github.com/AI-Hypercomputer/xpk/issues/476
- https://github.com/AI-Hypercomputer/xpk/issues/488
- https://github.com/AI-Hypercomputer/xpk/issues/490
- https://github.com/AI-Hypercomputer/xpk/issues/491
- https://github.com/AI-Hypercomputer/xpk/issues/492

Cluster create with `xpk` ([example
run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15591134618/job/43910254644#step:5:1))
- added as a separate
[workflow](https://github.com/NVIDIA/JAX-Toolbox/pull/1481/files#diff-801fc28cafbf1e0fa0ea521355fa8a1c9e6c01dcb8b1083c47f66e2ead4d560a)
for demonstration purposes (will not be operational in the CI)

---------

Co-authored-by: Olli Lupton <olupton@nvidia.com>
---
 .github/actions/gke-xpk/action.yml            | 264 ++++++++++++++++++
 .github/container/Dockerfile.nccl-gke         |  12 +
 .../nccl/scripts/generate_hostfiles.sh        |  24 ++
 .../nccl/scripts/nccl-test-launch.sh          |  39 +++
 .../gke-workflow/nccl/scripts/start_ssh.sh    |  11 +
 .github/gke-workflow/nccl/scripts/test.sh     |  61 ++++
 .github/gke-workflow/nccl/service.yml         |  25 ++
 .github/gke-workflow/xpk/blueprint.patch      |  35 +++
 .../gke-workflow/xpk/docker_resources.patch   |  98 +++++++
 .../gke-workflow/xpk/tcpxo_decorator.patch    |  13 +
 .github/gke-workflow/xpk/workload.patch       |  26 ++
 .github/gke-workflow/xpk/xpk-sa-rbac.yml      |  33 +++
 .github/workflows/_ci.yaml                    |  13 +
 .github/workflows/_create_gke_cluster_xpk.yml |  65 +++++
 .github/workflows/_test_maxtext_gke_xpk.yaml  |  57 ++++
 .github/workflows/_test_nccl.yaml             |   7 +-
 .github/workflows/_test_nccl_gke.yaml         | 109 ++++++++
 17 files changed, 891 insertions(+), 1 deletion(-)
 create mode 100644 .github/actions/gke-xpk/action.yml
 create mode 100644 .github/container/Dockerfile.nccl-gke
 create mode 100644 .github/gke-workflow/nccl/scripts/generate_hostfiles.sh
 create mode 100644 .github/gke-workflow/nccl/scripts/nccl-test-launch.sh
 create mode 100644 .github/gke-workflow/nccl/scripts/start_ssh.sh
 create mode 100644 .github/gke-workflow/nccl/scripts/test.sh
 create mode 100644 .github/gke-workflow/nccl/service.yml
 create mode 100644 .github/gke-workflow/xpk/blueprint.patch
 create mode 100644 .github/gke-workflow/xpk/docker_resources.patch
 create mode 100644 .github/gke-workflow/xpk/tcpxo_decorator.patch
 create mode 100644 .github/gke-workflow/xpk/workload.patch
 create mode 100644 .github/gke-workflow/xpk/xpk-sa-rbac.yml
 create mode 100644 .github/workflows/_create_gke_cluster_xpk.yml
 create mode 100644 .github/workflows/_test_maxtext_gke_xpk.yaml
 create mode 100644 .github/workflows/_test_nccl_gke.yaml

diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml
new file mode 100644
index 000000000..1b30e5ddb
--- /dev/null
+++ b/.github/actions/gke-xpk/action.yml
@@ -0,0 +1,264 @@
+name: Launch workload on GKE with XPK
+
+description: "Launch a JobSet workload on GKE with XPK. Upload artifacts from container to GCS and GitHub Actions."
+
+inputs:
+  GCP_PROJECT:
+    description: 'GCP project ID'
+    default: nv-jaxtoolboxgcp-20240925
+    type: string
+  GKE_CLUSTER:
+    description: 'GKE cluster name'
+    default: jtb-2025-06-12
+    required: false
+    type: string
+  GCP_ZONE:
+    description: 'GCP zone of the cluster'
+    default: us-central1-a
+    required: false
+    type: string
+  CLUSTER_DEVICE: 
+    description: 'GPU device type in the cluster'
+    default: h100-mega-80gb-8
+    required: false
+    type: string
+  NUM_NODES:
+    description: 'Number of nodes to use in JobSet (n.b. each a3-megagpu-8g node has 8xGPU)'
+    default: '2'
+    required: false
+    type: string
+  MAIN_CONTAINER_NAME:
+    description: 'Name of the main container in an XPK JobSet (fixed)'
+    default: gpu-image
+    required: false
+    type: string
+  CONTAINER_OUTPUT_PATH:
+    description: 'Output directory for artifacts'
+    default: /opt/output
+    required: false
+    type: string
+  GCS_BUCKET:
+    description: 'GCS bucket to which CI output artifacts will be uploaded'
+    default: jaxtoolbox-ci
+    required: false
+    type: string
+  IMAGE:
+    description: 'URI of image to use in JobSet'
+    required: false
+    default: ghcr.io/nvidia/jax:latest
+    type: string
+  COMMAND:
+    description: 'Command to run in main container on JobSet start up'
+    required: false
+    default: 'nvidia-smi; free -h;'
+    type: string
+  EXIT_COMMAND:
+    description: 'Command to set exit code'
+    required: false
+    default: 'exit \$EXIT_CODE'
+    type: string
+  WORKLOAD_NAME_PREFIX:
+    description: 'Workload name prefix for XPK, also used to name uploaded artifact'
+    required: false
+    default: 'xpk'
+    type: string
+  XPK_VERSION:
+    description: 'XPK release tag'
+    required: false
+    default: 'v0.8.0'
+    type: string
+  XPK_PYTHON:
+    description: 'Python version for XPK'
+    required: false
+    default: '3.12.10'
+    type: string
+
+runs:
+  using: 'composite'
+  steps:
+
+  - name: Set workload name
+    shell: bash -x -e -u {0}
+    run: |
+      WORKLOAD_NAME="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
+      DATE=$(date +'%Y-%m-%d')
+      GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME}"
+
+      echo "WORKLOAD_NAME=${WORKLOAD_NAME}" >> ${GITHUB_ENV}
+      echo "DATE=${DATE}" >> ${GITHUB_ENV}
+      echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV}
+
+  - name: Setup environment
+    shell: bash -x -e -u {0}
+    run: |
+      mkdir -p ${WORKLOAD_NAME}
+      uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME}
+      source ${WORKLOAD_NAME}/.venv/bin/activate
+
+      # install xpk
+      git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME}/xpk
+
+      sed 's@pip install \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME}/xpk/Makefile
+      cd ${WORKLOAD_NAME}/xpk && sudo make install; cd -
+
+  - name: Show environment
+    shell: bash -x -e -u {0}
+    run: |
+      gcloud version
+  
+      source ${WORKLOAD_NAME}/.venv/bin/activate
+      python --version
+      xpk version
+  
+  - name: Apply XPK workload create patch
+    shell: bash -x -e -u {0}
+    run: |
+      git apply --unsafe-paths .github/gke-workflow/xpk/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk
+      git apply --unsafe-paths .github/gke-workflow/xpk/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk
+      git apply --unsafe-paths .github/gke-workflow/xpk/workload.patch --directory ${WORKLOAD_NAME}/xpk
+  
+  - name: Set workload commands
+    shell: bash -x -e -u {0}
+    run: |
+      PRELUDE="
+          apt install -y ripgrep > /dev/null;
+          curl -LO https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz;
+          tar xf google-cloud-cli-linux-x86_64.tar.gz;
+          ./google-cloud-sdk/install.sh --quiet > /dev/null;
+          ./google-cloud-sdk/bin/gcloud init;
+  
+          mkdir -p /usr/share/workload;
+          mkdir -p ${{ inputs.CONTAINER_OUTPUT_PATH }};
+      "
+  
+      POSTLUDE="
+          ./google-cloud-sdk/bin/gsutil cp -r ${{ inputs.CONTAINER_OUTPUT_PATH }}/ ${GCS_ARTIFACT_PATH}/node-0\$NODE_RANK;
+          ${{ inputs.EXIT_COMMAND }}
+      "
+  
+      CMD="${{ inputs.COMMAND }}"
+  
+      # set container commands in-line
+      PRELUDE=$(echo ${PRELUDE} | sed 's/\n/\ /g')
+      POSTLUDE=$(echo ${POSTLUDE} | sed 's/\n/\ /g')
+      CMD=$(echo ${CMD} | sed 's/\n/\ /g')
+
+      echo "PRELUDE=${PRELUDE}" >> ${GITHUB_ENV}
+      echo "CMD=${CMD}" >> ${GITHUB_ENV}
+      echo "POSTLUDE=${POSTLUDE}" >> ${GITHUB_ENV}
+  
+  - name: Create workload on cluster with XPK
+    shell: bash -x -e -u {0}
+    run: |
+      source ${WORKLOAD_NAME}/.venv/bin/activate
+      cd ${WORKLOAD_NAME}/xpk
+      python xpk.py workload create \
+                    --project ${{ inputs.GCP_PROJECT }} \
+                    --cluster ${{ inputs.GKE_CLUSTER }} \
+                    --zone ${{ inputs.GCP_ZONE }} \
+                    --workload ${WORKLOAD_NAME} \
+                    --docker-image ${{ inputs.IMAGE }} \
+                    --device-type ${{ inputs.CLUSTER_DEVICE }} \
+                    --num-nodes ${{ inputs.NUM_NODES }} \
+                    --num-slices ${{ inputs.NUM_NODES }} \
+                    --priority=high \
+                    --scheduler=gke.io/topology-aware-auto \
+                    --command "${PRELUDE} ${CMD} ${POSTLUDE}"
+  
+  - name: Wait for JobSet to unsuspend on cluster
+    shell: bash -u {0}
+    env:
+      POLL_TIMEOUT: 3600
+    run: |
+      START=$(date +%s)
+      JOBSET_ACTIVE=false
+      while ! ${JOBSET_ACTIVE}  || [ -z ${JOBSET_ACTIVE} ]; do
+        JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1')
+        NOW=$(date +%s)
+        ELAPSED=$(( NOW - START ))
+        if (( ELAPSED > POLL_TIMEOUT )) ; then
+          echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
+          exit 1
+        fi
+        echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
+        sleep 5
+      done
+  
+      echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
+  
+  - name: Set JobSet Pod name
+    shell: bash -u {0}
+    run: |
+      echo "POD=$(kubectl get pods -o json | jq -r '.items[] | select(.metadata.labels."'jobset.sigs.k8s.io/jobset-name'" == "'${WORKLOAD_NAME}'") | .metadata.name ' | sort | head -n1 )" >> ${GITHUB_ENV}
+  
+  - name: Wait for JobSet Pod readiness
+    shell: bash -u {0}
+    run: |
+      POD_READY=false
+      while [ "${POD_READY}" != "true" ]; do
+        echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
+        sleep 10
+        # jq prints nothing until the container status exists, so only a literal "true" counts as an error
+        POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
+        if [ "${POD_ERROR}" = "true" ]; then
+          echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
+          break
+        fi
+        # An empty or "false" readiness value keeps the loop polling
+        POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
+      done;
+  
+  - name: Stream logs from JobSet Pods
+    shell: bash -u {0}
+    run: |
+      jobset_pods=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' '))
+      # Follow each pod's main container in the background, copying the stream into a per-pod log file
+      for jobset_pod in "${jobset_pods[@]}"; do
+          kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c ${{ inputs.MAIN_CONTAINER_NAME }} ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}-${jobset_pod}-jobset.log &
+      done
+      wait  # block until every background log stream has ended
+  
+  - name: Set exit code from JobSet logs
+    shell: bash -u {0}
+    run: |
+      MAYBE_XPK_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}-${POD}-jobset.log | awk '{ print $3 }' )"
+      echo "${MAYBE_XPK_EXIT_CODE}" | grep -E '^EXIT_CODE=[0-9]+$'
+      # The third field of the last log line must be exactly EXIT_CODE=<digits>; anything else means an abnormal end
+      if [ $? -ne 0 ]; then
+        echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
+        exit 1
+      fi
+      # Strip the prefix instead of eval-ing text that came from the container log
+      EXIT_CODE="${MAYBE_XPK_EXIT_CODE#EXIT_CODE=}"
+      exit ${EXIT_CODE}
+  
+  - name: Clean up JobSet from cluster
+    shell: bash -x -u {0}
+    if: ${{ always() }}
+    run: |
+      kubectl delete jobset --wait ${WORKLOAD_NAME} || echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}"
+  
+  - name: Download artifacts from GCS to runner
+    shell: bash -x -u {0}
+    run: |
+      mkdir -p output/${WORKLOAD_NAME}
+      mv ${WORKLOAD_NAME}-*.log output/${WORKLOAD_NAME}
+      gsutil cp -r ${GCS_ARTIFACT_PATH} output/${WORKLOAD_NAME}
+  
+  - name: Upload artifacts to GitHub Actions from runner
+    uses: actions/upload-artifact@v4
+    with:
+      name: ${{ inputs.WORKLOAD_NAME_PREFIX }}
+      path: output/${{ env.WORKLOAD_NAME }}/*
+  
+  - name: Clean up GCS artifacts from runner
+    shell: bash -x -u {0}
+    if: ${{ always() }}
+    run: |
+      rm -rf output/${WORKLOAD_NAME}
+
+  - name: Clean up xpk environment from runner
+    shell: bash -x -u {0}
+    if: ${{ always() }}
+    run: |
+      sudo rm -rf ${WORKLOAD_NAME}
diff --git a/.github/container/Dockerfile.nccl-gke b/.github/container/Dockerfile.nccl-gke
new file mode 100644
index 000000000..7bfe9d4fe
--- /dev/null
+++ b/.github/container/Dockerfile.nccl-gke
@@ -0,0 +1,12 @@
+# NCCL test image for GKE: layers the launch scripts and a root-login sshd on top of BASE_IMAGE.
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE} AS mealkit
+FROM mealkit AS final
+COPY .github/gke-workflow/nccl/scripts /scripts
+RUN apt-get update \
+    && apt-get install -y openssh-server
+RUN passwd -d root && \
+     echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
+     echo "PermitEmptyPasswords yes" >> /etc/ssh/sshd_config && \
+     echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \
+     chmod +x /scripts/*
diff --git a/.github/gke-workflow/nccl/scripts/generate_hostfiles.sh b/.github/gke-workflow/nccl/scripts/generate_hostfiles.sh
new file mode 100644
index 000000000..8f5d3117e
--- /dev/null
+++ b/.github/gke-workflow/nccl/scripts/generate_hostfiles.sh
@@ -0,0 +1,24 @@
+# Write hostfile1/2/4/8 under <script dir>/hostfiles<N> for the N hosts given as arguments.
+len() {
+  local -r words=($@)
+  echo "${#words[@]}"
+}
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+NRANKS_FACTORS=(1 2 4 8)
+
+NHOSTS=$(len "$@")
+echo "generating hostfiles for ${NHOSTS} hosts: "
+for h in "$@"; do echo "$h"; done
+
+mkdir -p "${SCRIPT_DIR}/hostfiles${NHOSTS}"
+# Each hostfile lists every host once, with the slot count taken from NRANKS_FACTORS.
+for slots in "${NRANKS_FACTORS[@]}"; do
+  hostfile="${SCRIPT_DIR}/hostfiles${NHOSTS}/hostfile${slots}"
+  rm -f "${hostfile}"
+  touch "${hostfile}"
+  for entry in "$@"; do
+    echo "$entry port=22 slots=${slots}" >> "${hostfile}"
+  done
+done
diff --git a/.github/gke-workflow/nccl/scripts/nccl-test-launch.sh b/.github/gke-workflow/nccl/scripts/nccl-test-launch.sh
new file mode 100644
index 000000000..cf84dde8e
--- /dev/null
+++ b/.github/gke-workflow/nccl/scripts/nccl-test-launch.sh
@@ -0,0 +1,39 @@
+BENCHMARK=$1  # first argument: NCCL benchmark to run; the remaining arguments are host names
+NHOSTS=${NHOSTS:-2}
+shift
+# Attempt an SSH connection to every host, then write the hostfiles under /scripts
+/scripts/start_ssh.sh ${@};
+pushd /scripts;
+
+/scripts/generate_hostfiles.sh ${@};
+popd;
+# Marker file whose presence tells a non-zero-rank node that the benchmark has finished
+COMPLETION_FLAG=/opt/output/${BENCHMARK}_done
+
+service ssh restart
+# Rank 0 drives the benchmark; every other rank just waits for the completion marker
+if [ $NODE_RANK = 0 ] ; then
+  for host in ${@}; do
+    host_ready=false
+    while ! $host_ready; do
+      status=$(ssh $host echo "ready" 2> /dev/null || echo "unready")
+      if [ "$status" = "ready" ]; then
+        host_ready=true
+        break
+      fi
+      echo "$host not ready"
+      sleep 5
+    done
+    echo "$host ready"
+  done
+  # All hosts answered over SSH: run the benchmark through test.sh
+  NCCL_BENCHMARK=$BENCHMARK NHOSTS=$NHOSTS /scripts/test.sh
+  # Create the completion marker on every host so waiting ranks can exit
+  for host in ${@}; do
+    ssh ${host} touch ${COMPLETION_FLAG}
+  done
+
+else
+  while [ ! -f $COMPLETION_FLAG ]; do sleep 10; done
+fi
+
diff --git a/.github/gke-workflow/nccl/scripts/start_ssh.sh b/.github/gke-workflow/nccl/scripts/start_ssh.sh
new file mode 100644
index 000000000..de3d3aba1
--- /dev/null
+++ b/.github/gke-workflow/nccl/scripts/start_ssh.sh
@@ -0,0 +1,11 @@
+# Try an SSH connection to each host argument in turn, stopping at the first empty one.
+# PORT may be overridden from the environment; it defaults to 22.
+PORT=${PORT:-22}
+
+# Consume the positional arguments one by one until none (or an empty one) is left.
+until [[ -z $1 ]]; do
+  target=$1
+  ssh -p "${PORT}" "$target" \
+    echo "Connected to ${target}"
+  shift
+done
diff --git a/.github/gke-workflow/nccl/scripts/test.sh b/.github/gke-workflow/nccl/scripts/test.sh
new file mode 100644
index 000000000..4165b77d7
--- /dev/null
+++ b/.github/gke-workflow/nccl/scripts/test.sh
@@ -0,0 +1,61 @@
+set -x  # trace every command
+# Directory holding the hostfiles referenced by --hostfile below
+export SCRIPT_DIR=/scripts
+# Raise the open-file-descriptor limit before launching
+ulimit -n 1048576
+# Source the NCCL environment profile; presumably it sets the NCCL_* variables forwarded below (TODO confirm)
+NCCL_LIB_DIR=${NCCL_LIB_DIR} . /usr/local/nvidia/lib64/nccl-env-profile.sh
+
+: "${NCCL_BENCHMARK:?Must set NCCL_BENCHMARK}"
+NCCL_MINBYTES="${NCCL_MINBYTES:-8G}"
+NCCL_MAXBYTES="${NCCL_MAXBYTES:-16G}"
+NCCL_STEPFACTOR="${NCCL_STEPFACTOR:-2}"
+NCCL_ITERS="${NCCL_ITERS:-100}"
+NCCL_WARMUP_ITERS="${NCCL_WARMUP_ITERS:-0}"
+# Run ${NCCL_BENCHMARK} under mpirun with GPUS_PER_NODE * NHOSTS processes, copying its output to /opt/output
+run_nccl() {
+  mpirun --mca btl tcp,self \
+         --mca btl_tcp_if_include eth0 \
+         --allow-run-as-root \
+         -np $(( GPUS_PER_NODE * "${NHOSTS}" )) \
+         --hostfile "${SCRIPT_DIR}/hostfiles${NHOSTS}/hostfile${GPUS_PER_NODE}"     \
+         -x LD_LIBRARY_PATH \
+         -x PATH     \
+         -x NCCL_DEBUG=VERSION \
+         -x NCCL_TESTS_SPLIT_MASK="${NCCL_TESTS_SPLIT_MASK:-0x0}"     \
+         -x NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="${NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY}"     \
+         -x NCCL_LIB_DIR \
+         -x NCCL_FASTRAK_IFNAME=${NCCL_FASTRAK_IFNAME} \
+         -x NCCL_FASTRAK_CTRL_DEV="${NCCL_SOCKET_IFNAME}" \
+         -x NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME}" \
+         -x NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \
+         -x NCCL_ALGO=${NCCL_ALGO} \
+         -x NCCL_PROTO=${NCCL_PROTO} \
+         -x NCCL_MIN_NCHANNELS=${NCCL_MIN_NCHANNELS} \
+         -x NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE} \
+         -x NCCL_P2P_PCI_CHUNKSIZE=${NCCL_P2P_PCI_CHUNKSIZE} \
+         -x NCCL_P2P_NVL_CHUNKSIZE=${NCCL_P2P_NVL_CHUNKSIZE} \
+         -x NCCL_FASTRAK_NUM_FLOWS=${NCCL_FASTRAK_NUM_FLOWS} \
+         -x NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL=${NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL} \
+         -x NCCL_BUFFSIZE=${NCCL_BUFFSIZE} \
+         -x NCCL_FASTRAK_USE_SNAP=${NCCL_FASTRAK_USE_SNAP} \
+         -x NCCL_FASTRAK_USE_LLCM=${NCCL_FASTRAK_USE_LLCM} \
+         -x CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} \
+         -x NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL} \
+         -x NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=${NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING} \
+         -x NCCL_TUNER_PLUGIN=${NCCL_TUNER_PLUGIN} \
+         -x NCCL_TUNER_CONFIG_PATH=/usr/local/nvidia/lib64/a3plus_tuner_config.textproto \
+         -x NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/usr/local/nvidia/lib64/a3plus_guest_config.textproto \
+         -x NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS=${NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS} \
+         -x NCCL_NVLS_ENABLE=${NCCL_NVLS_ENABLE} \
+         ${NCCL_BENCHMARK} --minbytes ${NCCL_MINBYTES} \
+                           --maxbytes ${NCCL_MAXBYTES} \
+                           --stepfactor ${NCCL_STEPFACTOR} \
+                           --ngpus 1 \
+                           --check 1 \
+                           --warmup_iters ${NCCL_WARMUP_ITERS} \
+                           --iters ${NCCL_ITERS} 2>&1 | \
+         tee "/opt/output/${NCCL_BENCHMARK}_nh${NHOSTS}_ng${GPUS_PER_NODE}_i${NCCL_ITERS}.txt"
+}
+
+run_nccl "$@"  # arguments are forwarded, although run_nccl reads no positional parameters
diff --git a/.github/gke-workflow/nccl/service.yml b/.github/gke-workflow/nccl/service.yml
new file mode 100644
index 000000000..7bd6e049b
--- /dev/null
+++ b/.github/gke-workflow/nccl/service.yml
@@ -0,0 +1,25 @@
+apiVersion: v1
+kind: Service  # headless Service exposing port 22 on pods labelled with job completion index 0
+metadata:
+  name:  nccl-test-host-1
+spec:
+  selector:
+    batch.kubernetes.io/job-completion-index: "0"
+  clusterIP: None  # no cluster IP is allocated (headless)
+  ports:
+  - port: 22
+    targetPort: 22
+    protocol: TCP
+---
+apiVersion: v1
+kind: Service  # same as above, for pods labelled with job completion index 1
+metadata:
+  name: nccl-test-host-2
+spec:
+  selector:
+    batch.kubernetes.io/job-completion-index: "1"
+  clusterIP: None  # no cluster IP is allocated (headless)
+  ports:
+  - port: 22
+    targetPort: 22
+    protocol: TCP
diff --git a/.github/gke-workflow/xpk/blueprint.patch b/.github/gke-workflow/xpk/blueprint.patch
new file mode 100644
index 000000000..50cdf746d
--- /dev/null
+++ b/.github/gke-workflow/xpk/blueprint.patch
@@ -0,0 +1,35 @@
+diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py
+index ccbca90..22a880a 100644
+--- a/src/xpk/core/blueprint/blueprint_generator.py
++++ b/src/xpk/core/blueprint/blueprint_generator.py
+@@ -156,7 +156,6 @@ class BlueprintGenerator:
+         source="modules/scheduler/gke-cluster",
+         use=[primary_vpc_name, gpu_subnets_name],
+         settings={
+-            "release_channel": "RAPID",
+             "prefix_with_deployment_name": False,
+             "name_suffix": cluster_name,
+             "enable_private_endpoint": False,
+@@ -194,20 +193,18 @@ class BlueprintGenerator:
+     a3_megagpu_pool_0 = DeploymentModule(
+         id="a3_megagpu_pool_0",
+         source="modules/compute/gke-node-pool",
+-        use=["gke_cluster", gpu_subnets_name, "group_placement_0"],
++        use=["gke_cluster", gpu_subnets_name],
+         settings={
+             "name": f"{cluster_name}-a3-megagpu-pool-0",
+             "machine_type": system.gce_machine_type,
++            "guest_accelerator": [{"type":"nvidia-h100-mega-80gb", "count": 8, "gpu_driver_installation_config": {"gpu_driver_version": "DEFAULT"}}],
+             "static_node_count": num_nodes,
+             "zones": [zone],
+-            "host_maintenance_interval": "PERIODIC",
+             "reservation_affinity": self._getblock_reservation_affinity(
+                 reservation
+             ),
+             "run_workload_script": False,
+             "spot": capacity_type == CapacityType.SPOT,
+-            "max_pods_per_node": 32,
+-            "auto_upgrade": True,
+         },
+         outputs=["instructions"],
+     )
diff --git a/.github/gke-workflow/xpk/docker_resources.patch b/.github/gke-workflow/xpk/docker_resources.patch
new file mode 100644
index 000000000..74c0ef9e6
--- /dev/null
+++ b/.github/gke-workflow/xpk/docker_resources.patch
@@ -0,0 +1,98 @@
+diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py
+index a95c557..11e8e43 100644
+--- a/src/xpk/core/docker_resources.py
++++ b/src/xpk/core/docker_resources.py
+@@ -20,6 +20,8 @@ from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to
+ from .system_characteristics import AcceleratorType, SystemCharacteristics
+ 
+ 
++JAX_TOOLBOX_IMAGE_CONTAINER_PORT = 3389
++
+ def get_main_container_resources(
+     args, system: SystemCharacteristics, resource_type
+ ) -> str:
+@@ -64,7 +66,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
+     str:
+       YAML with the env config for the main container, as a YAML string.
+   """
+-  gpu_env_yaml = """
++  gpu_env_yaml = f"""
+                   - name: REPLICATED_JOB_NAME
+                     valueFrom:
+                       fieldRef:
+@@ -74,22 +76,22 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
+                       fieldRef:
+                         fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
+                   - name: JAX_COORDINATOR_ADDRESS
+-                    value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
++                    value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME):{JAX_TOOLBOX_IMAGE_CONTAINER_PORT}"
+                   - name: NNODES
+-                    value: "{args.num_nodes}"
++                    value: "{{args.num_nodes}}"
+                   - name: NODE_RANK
+                     valueFrom:
+                       fieldRef:
+                         fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+                   - name: USE_GPUDIRECT
+-                    value: {gpu_direct_name}
++                    value: {{gpu_direct_name}}
+                   - name: GPUS_PER_NODE
+-                    value: "{system.chips_per_vm}"
++                    value: "{{system.chips_per_vm}}"
+                   - name: JAX_COORDINATOR_PORT
+-                    value: "6002"
++                    value: "{JAX_TOOLBOX_IMAGE_CONTAINER_PORT}"
+                   - name: COMMAND
+-                    value: "{args.command}"
+-                  {args.env}"""
++                    value: "{{args.command}}"
++                  {{args.env}}"""
+ 
+   if system.accelerator_type == AcceleratorType['GPU']:
+     gpu_direct_name = 'fastrak'
+@@ -123,7 +125,7 @@ def get_cpu_env(num_slices, env_vars, system) -> str:
+   Returns:
+     str: yaml containing env variables
+   """
+-  yaml = """
++  yaml = f"""
+                 - name: REPLICATED_JOB_NAME
+                   valueFrom:
+                     fieldRef:
+@@ -137,12 +139,12 @@ def get_cpu_env(num_slices, env_vars, system) -> str:
+                     fieldRef:
+                       fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+                 - name: PROCESSES_IN_JOB
+-                  value: "{processes_in_job}"
++                  value: "{{processes_in_job}}"
+                 - name: JAX_PROCESS_COUNT
+-                  value: "{process_count}"
+-                {env_vars}
++                  value: "{{process_count}}"
++                {{env_vars}}
+                 - name: JAX_COORDINATOR_ADDRESS
+-                  value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
++                  value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME):{JAX_TOOLBOX_IMAGE_CONTAINER_PORT}"
+   """
+   return yaml.format(
+       processes_in_job=system.vms_per_slice,
+@@ -251,7 +253,9 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
+         or system.device_type == H200_DEVICE_TYPE
+         or system.device_type == B200_DEVICE_TYPE
+     ):
+-      volume_mount_yaml = ''
++      volume_mount_yaml = """- name: shared-memory
++                  mountPath: /dev/shm
++      """
+ 
+   storages: list[Storage] = get_storages_to_mount(
+       setup_k8s_env(args), args.storage
+@@ -300,7 +304,7 @@ def add_container_ports(args, system: SystemCharacteristics) -> str:
+   if args.use_pathways:
+     return ''
+ 
+-  gpu_port_yaml = """- containerPort: 6002"""
++  gpu_port_yaml = f"- containerPort: {JAX_TOOLBOX_IMAGE_CONTAINER_PORT}"
+   if system.accelerator_type == AcceleratorType['GPU']:
+     return gpu_port_yaml
+   return port_yaml
diff --git a/.github/gke-workflow/xpk/tcpxo_decorator.patch b/.github/gke-workflow/xpk/tcpxo_decorator.patch
new file mode 100644
index 000000000..62679f1e1
--- /dev/null
+++ b/.github/gke-workflow/xpk/tcpxo_decorator.patch
@@ -0,0 +1,13 @@
+diff --git a/src/xpk/core/workload_decorators/tcpxo_decorator.py b/src/xpk/core/workload_decorators/tcpxo_decorator.py
+index 322e574..5a0cc42 100644
+--- a/src/xpk/core/workload_decorators/tcpxo_decorator.py
++++ b/src/xpk/core/workload_decorators/tcpxo_decorator.py
+@@ -175,7 +175,7 @@ def update_gpu_containers(job_manifest):
+     if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
+       container.setdefault('env', [])
+       container['env'].append(
+-          {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
++              {'name': 'LD_LIBRARY_PATH', 'value': '/opt/nvidia/nccl/lib:/usr/local/cuda-12.8/targets/x86_64-local/lib:/usr/local/nvidia/lib64'}
+       )
+       container['env'].append({
+           'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY',
diff --git a/.github/gke-workflow/xpk/workload.patch b/.github/gke-workflow/xpk/workload.patch
new file mode 100644
index 000000000..447f633bd
--- /dev/null
+++ b/.github/gke-workflow/xpk/workload.patch
@@ -0,0 +1,26 @@
+diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
+index a466a5c..8a5b99e 100644
+--- a/src/xpk/commands/workload.py
++++ b/src/xpk/commands/workload.py
+@@ -227,6 +227,8 @@ spec:
+               tolerations:
+               - operator: "Exists"
+                 key: nvidia.com/gpu
++              imagePullSecrets:
++              - name: jax-toolbox-ghcr
+               containers:
+               {container}
+ """
+@@ -463,6 +465,12 @@ def workload_create(args) -> None:
+       if args.device_type == cluster_gcluster.a3mega_device_type:
+         sub_networks = get_subnetworks_for_a3mega(args.cluster)
+         yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks)
++        yml_string += """\
++            - name: shared-memory
++              emptyDir:
++                medium: Memory
++                sizeLimit: 0.5Ti
++        """
+ 
+       if args.device_type == cluster_gcluster.a3ultra_device_type:
+         sub_networks = get_subnetworks_for_a3ultra(args.cluster)
diff --git a/.github/gke-workflow/xpk/xpk-sa-rbac.yml b/.github/gke-workflow/xpk/xpk-sa-rbac.yml
new file mode 100644
index 000000000..9934c83d1
--- /dev/null
+++ b/.github/gke-workflow/xpk/xpk-sa-rbac.yml
@@ -0,0 +1,35 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: xpk-sa
+  namespace: default
+  annotations:
+    iam.gke.io/gcp-service-account: jobset-xpk-user@nv-jaxtoolboxgcp-20240925.iam.gserviceaccount.com
+---
+# Read-only (get/list/watch) access to pods, services and batch jobs.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: xpk-sa
+  namespace: default
+rules:
+  - apiGroups: [""]
+    resources: ["pods", "services"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: xpk-sa-binding
+  namespace: default
+subjects:
+  - kind: ServiceAccount
+    name: xpk-sa
+    namespace: default
+roleRef:
+  kind: Role
+  name: xpk-sa
+  apiGroup: rbac.authorization.k8s.io
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index abe7d7dae..03a04d9aa 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -631,6 +631,19 @@ jobs:
       MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
 
+  test-maxtext-gke:
+    needs: build-maxtext
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'maxtext'
+      )
+    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+    secrets: inherit
+
   test-axlearn-eks:
     needs: build-axlearn
     if: >-
diff --git a/.github/workflows/_create_gke_cluster_xpk.yml b/.github/workflows/_create_gke_cluster_xpk.yml
new file mode 100644
index 000000000..3b177ef7c
--- /dev/null
+++ b/.github/workflows/_create_gke_cluster_xpk.yml
@@ -0,0 +1,67 @@
+name: ~Create GKE cluster with XPK
+
+on:
+  workflow_call:
+    inputs:
+      CLUSTER_NAME:
+        type: string
+        description: Cluster name
+        default: jtb-2025-06-12
+        required: false
+
+jobs:
+  xpk-create-gke-cluster:
+    env:
+      # Map the workflow input into the environment; the run steps read CLUSTER_NAME
+      CLUSTER_NAME: ${{ inputs.CLUSTER_NAME }}
+      GKE_VERSION: 1.31.6-gke.1221000
+      DEVICE_TYPE: h100-mega-80gb-8
+      DEFAULT_CPU_MACHINE: e2-standard-8
+      NUM_NODES: 2
+      ZONE: us-central1-a
+      RESERVATION: jtb-reservation
+      PROJECT: nv-jaxtoolboxgcp-20240925
+
+    runs-on: gke-a3mega
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Show environment
+        run: |
+          set -x
+
+          gcloud version
+
+          source $HOME/.venv/bin/activate
+          python --version
+          xpk version
+
+      - name: Apply xpk cluster create patch
+        run: |
+          cd $HOME/xpk && git checkout src/xpk/core/blueprint/blueprint_generator.py && cd -
+          git apply --unsafe-paths .github/gke-workflow/xpk/blueprint.patch --directory $HOME/xpk
+
+      - name: Create cluster from compute reservation with xpk
+        run: |
+          CLUSTER_EXISTS=$(gcloud container clusters list --format=json | jq -r --arg name "${CLUSTER_NAME}" 'any(.[].name; . == $name)')
+
+          if ! [ "${CLUSTER_EXISTS}" = true ]; then
+            cd $HOME/xpk
+            source $HOME/.venv/bin/activate
+            python xpk.py cluster create \
+                    --cluster ${CLUSTER_NAME} \
+                    --gke-version ${GKE_VERSION} \
+                    --device-type ${DEVICE_TYPE} \
+                    --num-nodes ${NUM_NODES} \
+                    --default-pool-cpu-machine-type=${DEFAULT_CPU_MACHINE} \
+                    --project=${PROJECT} \
+                    --reservation ${RESERVATION} \
+                    --zone ${ZONE}
+          else
+            echo "Cluster ${CLUSTER_NAME} already exists, skipping creation"
+          fi
+
+      - name: Configure cluster ServiceAccount
+        run: |
+          kubectl apply -f .github/gke-workflow/xpk/xpk-sa-rbac.yml
diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml
new file mode 100644
index 000000000..94a72f6e3
--- /dev/null
+++ b/.github/workflows/_test_maxtext_gke_xpk.yaml
@@ -0,0 +1,57 @@
+name: ~Test MaxText (GKE, XPK)
+
+on:
+  workflow_call:
+    inputs:
+      MAXTEXT_IMAGE:
+        type: string
+        description: MaxText image from ghcr.io/nvidia
+        default: ghcr.io/nvidia/jax:maxtext
+        required: false
+
+jobs:
+  maxtext-gke-xpk:
+    runs-on: gke-a3mega
+
+    env:
+      WORKLOAD_NAME_PREFIX: gke-maxtext-train
+      MAXTEXT_MODEL: llama2-7b
+      MAXTEXT_ATTENTION_TYPE: cudnn_flash_te
+      MAXTEXT_REMAT_POLICY: minimal_flash
+      MAXTEXT_TRAIN_STEPS: 20
+      MAXTEXT_FSDP: 16
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+      NUM_NODES: 2
+
+    steps:
+    - name: Run XPK workload on cluster
+      uses: ./.github/actions/gke-xpk
+      with:
+        IMAGE: ${{ env.MAXTEXT_IMAGE }}
+        WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }}
+        COMMAND: |
+          export NCCL_NET_PLUGIN=/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so;
+          export NCCL_TUNER_PLUGIN=none;
+          console=/dev/stdout;
+          
+          nsys-jax --capture-range=cudaProfilerApi
+                   --capture-range-end=stop
+                   -o /opt/output/profile.zip
+                   --
+                   test-maxtext.sh -n ${{ env.NUM_NODES }}
+                                   -b ${{ env.NUM_NODES }}
+                                   --model-name=${{ env.MAXTEXT_MODEL }}
+                                   --attn-type=${{ env.MAXTEXT_ATTENTION_TYPE }}
+                                   --remat-policy=${{ env.MAXTEXT_REMAT_POLICY }}
+                                   --steps=${{ env.MAXTEXT_TRAIN_STEPS }}
+                                   --fsdp=${{ env.MAXTEXT_FSDP }}
+                                   --multiprocess
+                                   -a 'scan_layers=false
+                                       max_target_length=4096
+                                       use_iota_embed=true
+                                       logits_dot_in_fp32=false
+                                       profiler=nsys
+                                       skip_first_n_steps_for_profiler=3
+                                       profiler_steps=8' |&
+          tee /opt/output/output.log &> \${console};
+          EXIT_CODE=\$PIPESTATUS;
diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index b0436b562..8dbe3ac7b 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -14,6 +14,12 @@ permissions:
   packages: write # to upload container
 
 jobs:
+  nccl-test-gke:
+    uses: ./.github/workflows/_test_nccl_gke.yaml
+    with:
+      JAX_IMAGE: ${{ inputs.CONTAINER }}
+    secrets: inherit
+
   build-mpi-operator-compatible-base:
     runs-on: [self-hosted, "amd64", "large"]
     steps:
@@ -39,7 +45,6 @@ jobs:
       DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
       DOCKER_TAG_FINAL:   ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
 
-
   nccl-test:
     needs: build-mpi-operator-compatible-base
     strategy:
diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml
new file mode 100644
index 000000000..0f9860407
--- /dev/null
+++ b/.github/workflows/_test_nccl_gke.yaml
@@ -0,0 +1,109 @@
+name: ~Test NCCL Kubernetes (GKE)
+
+on:
+  workflow_call:
+    inputs:
+      JAX_IMAGE:
+        type: string
+        description: JAX image from ghcr.io/nvidia
+        default: ghcr.io/nvidia/jax-toolbox-internal:15729070690-base-amd64
+        required: false
+
+jobs:
+  build-nccl-gke:
+    runs-on: [self-hosted, "amd64", "large"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build NCCL image
+        id: build
+        uses: ./.github/actions/build-container
+        with:
+          ARCHITECTURE: amd64
+          ARTIFACT_NAME: artifact-nccl-gke-build
+          BADGE_FILENAME: badge-nccl-gke-build
+          BUILD_DATE: 0000-00-00 # not important; this image is never published
+          BASE_IMAGE: ${{ inputs.JAX_IMAGE }}
+          CONTAINER_NAME: nccl-gke
+          DOCKERFILE: .github/container/Dockerfile.nccl-gke
+          RUNNER_SIZE: small
+          DOCKER_CONTEXT: .
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+    outputs:
+      DOCKER_TAG_FINAL:   ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
+
+  nccl-gke:
+    runs-on: gke-a3mega
+
+    needs: build-nccl-gke 
+
+    strategy:
+      matrix:
+        test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
+
+    env:
+      BASE_IMAGE: ${{ needs.build-nccl-gke.outputs.DOCKER_TAG_FINAL }}
+      TEST_NAME: ${{ matrix.test }}
+      WORKLOAD_NAME_PREFIX: nccl-gke
+      NHOSTS: 2
+      NCCL_MINBYTES: 8
+      NCCL_MAXBYTES: 16G
+      NCCL_STEPFACTOR: 2
+      NCCL_ITERS: 100
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set workload name prefix # due to 40 char limit
+        id: workload-name
+        run: |
+          TEST_NAME=$(echo "${{ matrix.test }}" | sed 's/_perf_mpi//g' | sed 's/_/-/g')
+          WORKLOAD_PREFIX="${{ env.WORKLOAD_NAME_PREFIX }}-${TEST_NAME}"
+
+          echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT}
+
+      - name: Create NCCL test Services on cluster
+        run: |
+          WORKLOAD_NAME="${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
+          SERVICE_MANIFEST=".github/gke-workflow/nccl/service-${WORKLOAD_NAME}-${{ matrix.test }}.yaml"
+          echo "SERVICE_MANIFEST=${SERVICE_MANIFEST}" >> ${GITHUB_ENV}
+          # Set the selector's jobset-name to this run's workload, save the manifest, then apply it
+          cat .github/gke-workflow/nccl/service.yml | yq '.spec.selector."jobset.sigs.k8s.io/jobset-name" = "'${WORKLOAD_NAME}'"' --yaml-output | tee ${SERVICE_MANIFEST}
+          kubectl apply -f ${SERVICE_MANIFEST}
+
+      - name: Run XPK workload on cluster
+        uses: ./.github/actions/gke-xpk
+        with:
+          IMAGE: ${{ env.BASE_IMAGE }}
+          WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
+          COMMAND: |
+            export NHOSTS=${{ env.NHOSTS }};
+            export NCCL_LIB_DIR=/opt/nvidia/nccl/lib;
+            export SCRIPT_DIR=/scripts;
+
+            export NCCL_MINBYTES=${{ env.NCCL_MINBYTES }};
+            export NCCL_MAXBYTES=${{ env.NCCL_MAXBYTES }};
+            export NCCL_STEPFACTOR=${{ env.NCCL_STEPFACTOR }};
+            export NCCL_ITERS=${{ env.NCCL_ITERS }};
+
+            service ssh restart;
+            console=/dev/stdout;
+            declare -a hosts=('nccl-test-host-1' 'nccl-test-host-2');
+
+            /scripts/nccl-test-launch.sh ${{ matrix.test }} \${hosts[@]} |&
+            tee /opt/output/output.log &> \${console};
+
+            MAYBE_MPI_EXIT_CODE=\$(tail /opt/output/output.log | rg 'Exit code:[ ]+([0-9]+)' -or '\$1');
+            if [ -z \${MAYBE_MPI_EXIT_CODE} ]; then
+              EXIT_CODE=0;
+            else
+              EXIT_CODE=\${MAYBE_MPI_EXIT_CODE};
+            fi;
+
+      - name: Clean up NCCL test Services from cluster
+        if: ${{ always() }}
+        run: |
+          kubectl delete -f ${SERVICE_MANIFEST}
+

From 5bbd1bd832814013bde3f76829abeb5e88eacfe1 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Tue, 15 Jul 2025 11:38:41 +0100
Subject: [PATCH 05/28] Add GKE test to NGC release workflow

---
 .github/workflows/ngc-release-testing.yaml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
index 3150a07c2..4bd0f7123 100644
--- a/.github/workflows/ngc-release-testing.yaml
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -75,8 +75,15 @@ jobs:
       MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
     secrets: inherit
 
+  test-maxtext-gke:
+    if: inputs.MAXTEXT_IMAGE != ''
+    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
   finalize:
-    needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks ]
+    needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks, test-maxtext-gke ]
     if: "!cancelled()"
     uses: ./.github/workflows/_finalize.yaml
     secrets: inherit

From 39baf576c8968d4d79c377862ecf42b76be4d079 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 12:20:04 +0100
Subject: [PATCH 06/28] Version xpk patch files

---
 .github/gke-workflow/xpk/{ => v0.8.0}/blueprint.patch        | 0
 .github/gke-workflow/xpk/{ => v0.8.0}/docker_resources.patch | 0
 .github/gke-workflow/xpk/{ => v0.8.0}/tcpxo_decorator.patch  | 0
 .github/gke-workflow/xpk/{ => v0.8.0}/workload.patch         | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 rename .github/gke-workflow/xpk/{ => v0.8.0}/blueprint.patch (100%)
 rename .github/gke-workflow/xpk/{ => v0.8.0}/docker_resources.patch (100%)
 rename .github/gke-workflow/xpk/{ => v0.8.0}/tcpxo_decorator.patch (100%)
 rename .github/gke-workflow/xpk/{ => v0.8.0}/workload.patch (100%)

diff --git a/.github/gke-workflow/xpk/blueprint.patch b/.github/gke-workflow/xpk/v0.8.0/blueprint.patch
similarity index 100%
rename from .github/gke-workflow/xpk/blueprint.patch
rename to .github/gke-workflow/xpk/v0.8.0/blueprint.patch
diff --git a/.github/gke-workflow/xpk/docker_resources.patch b/.github/gke-workflow/xpk/v0.8.0/docker_resources.patch
similarity index 100%
rename from .github/gke-workflow/xpk/docker_resources.patch
rename to .github/gke-workflow/xpk/v0.8.0/docker_resources.patch
diff --git a/.github/gke-workflow/xpk/tcpxo_decorator.patch b/.github/gke-workflow/xpk/v0.8.0/tcpxo_decorator.patch
similarity index 100%
rename from .github/gke-workflow/xpk/tcpxo_decorator.patch
rename to .github/gke-workflow/xpk/v0.8.0/tcpxo_decorator.patch
diff --git a/.github/gke-workflow/xpk/workload.patch b/.github/gke-workflow/xpk/v0.8.0/workload.patch
similarity index 100%
rename from .github/gke-workflow/xpk/workload.patch
rename to .github/gke-workflow/xpk/v0.8.0/workload.patch

From 293f0dc0cc9be3f0b700175eea0d3973cb08a6d8 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 12:24:14 +0100
Subject: [PATCH 07/28] Add image pull secret name arg to xpk action template

---
 .github/actions/gke-xpk/action.yml             | 12 +++++++++---
 .github/gke-workflow/xpk/v0.8.0/workload.patch |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml
index 1b30e5ddb..876bb3250 100644
--- a/.github/actions/gke-xpk/action.yml
+++ b/.github/actions/gke-xpk/action.yml
@@ -47,6 +47,11 @@ inputs:
     required: false
     default: ghcr.io/nvidia/jax:latest
     type: string
+  IMAGE_PULL_SECRET_NAME:
+    description: 'Name of k8s Secret resource for registry ImagePullSecret'
+    required: false
+    default: jax-toolbox-ghcr
+    type: string
   COMMAND:
     description: 'Command to run in main container on JobSet start up'
     required: false
@@ -113,9 +118,10 @@ runs:
   - name: Apply XPK workload create patch
     shell: bash -x -e -u {0}
     run: |
-      git apply --unsafe-paths .github/gke-workflow/xpk/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk
-      git apply --unsafe-paths .github/gke-workflow/xpk/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk
-      git apply --unsafe-paths .github/gke-workflow/xpk/workload.patch --directory ${WORKLOAD_NAME}/xpk
+      sed -i 's/{{ IMAGE_PULL_SECRET_NAME}}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch 
+      git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk
+      git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk
+      git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch --directory ${WORKLOAD_NAME}/xpk
   
   - name: Set workload commands
     shell: bash -x -e -u {0}
diff --git a/.github/gke-workflow/xpk/v0.8.0/workload.patch b/.github/gke-workflow/xpk/v0.8.0/workload.patch
index 447f633bd..85ce8d424 100644
--- a/.github/gke-workflow/xpk/v0.8.0/workload.patch
+++ b/.github/gke-workflow/xpk/v0.8.0/workload.patch
@@ -7,7 +7,7 @@ index a466a5c..8a5b99e 100644
                - operator: "Exists"
                  key: nvidia.com/gpu
 +              imagePullSecrets:
-+              - name: jax-toolbox-ghcr
++              - name: {{ IMAGE_PULL_SECRET_NAME }}
                containers:
                {container}
  """

From 2efa29446242a3ad48f67f6751f71a56a16f1e12 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 12:26:36 +0100
Subject: [PATCH 08/28] Set custom image pull secret in maxtext GKE workflow

---
 .github/workflows/_test_maxtext_gke_xpk.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml
index 94a72f6e3..dec9804fe 100644
--- a/.github/workflows/_test_maxtext_gke_xpk.yaml
+++ b/.github/workflows/_test_maxtext_gke_xpk.yaml
@@ -24,11 +24,21 @@ jobs:
       NUM_NODES: 2
 
     steps:
+    - name: Login to nvcr.io Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: nvcr.io
+        username: '$oauthtoken'
+        password: ${{ secrets.NVCR_TOKEN }}
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr
     - name: Run XPK workload on cluster
       uses: ./.github/actions/gke-xpk
       with:
         IMAGE: ${{ env.MAXTEXT_IMAGE }}
         WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }}
+        IMAGE_PULL_SECRET_NAME: ${{ steps.store-token.outputs.token-name }}
         COMMAND: |
           export NCCL_NET_PLUGIN=/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so;
           export NCCL_TUNER_PLUGIN=none;

From 4b233e6e3154ece5ca0d96959efe43e6aed76827 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 14:42:49 +0100
Subject: [PATCH 09/28] Fix pattern typo

---
 .github/actions/gke-xpk/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml
index 876bb3250..c4d03c2dc 100644
--- a/.github/actions/gke-xpk/action.yml
+++ b/.github/actions/gke-xpk/action.yml
@@ -118,7 +118,7 @@ runs:
   - name: Apply XPK workload create patch
     shell: bash -x -e -u {0}
     run: |
-      sed -i 's/{{ IMAGE_PULL_SECRET_NAME}}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch 
+      sed -i 's/{{ IMAGE_PULL_SECRET_NAME }}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch 
       git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk
       git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk
       git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch --directory ${WORKLOAD_NAME}/xpk

From 0e4289479b821faf3129d40fd1a51bc67a48a972 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 16:13:39 +0100
Subject: [PATCH 10/28] Fix nvcr username

---
 .github/workflows/_test_maxtext_gke_xpk.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml
index dec9804fe..942527352 100644
--- a/.github/workflows/_test_maxtext_gke_xpk.yaml
+++ b/.github/workflows/_test_maxtext_gke_xpk.yaml
@@ -28,7 +28,7 @@ jobs:
       uses: docker/login-action@v3
       with:
         registry: nvcr.io
-        username: '$oauthtoken'
+        username: $oauthtoken
         password: ${{ secrets.NVCR_TOKEN }}
     - name: K8s GHCR store and delete token
       id: store-token

From 3b045aee2115e4179a7684541b3edef3963f934b Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 16:27:12 +0100
Subject: [PATCH 11/28] Use checkout action to avoid cached repo use

---
 .github/workflows/_test_maxtext_gke_xpk.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml
index 942527352..d8bd7b288 100644
--- a/.github/workflows/_test_maxtext_gke_xpk.yaml
+++ b/.github/workflows/_test_maxtext_gke_xpk.yaml
@@ -24,15 +24,19 @@ jobs:
       NUM_NODES: 2
 
     steps:
+    - uses: actions/checkout@v4
+
     - name: Login to nvcr.io Container Registry
       uses: docker/login-action@v3
       with:
         registry: nvcr.io
         username: $oauthtoken
         password: ${{ secrets.NVCR_TOKEN }}
+
     - name: K8s GHCR store and delete token
       id: store-token
       uses: ./.github/actions/store-delete-k8s-ghcr
+
     - name: Run XPK workload on cluster
       uses: ./.github/actions/gke-xpk
       with:

From 9e1d3608f55bc2b74eaef990d5d415d1ecc98d94 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 16:55:53 +0100
Subject: [PATCH 12/28] Add EKS MaxText job via 481e71b

---
 .github/eks-workflow-files/maxtext-job.yaml | 120 ++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 .github/eks-workflow-files/maxtext-job.yaml

diff --git a/.github/eks-workflow-files/maxtext-job.yaml b/.github/eks-workflow-files/maxtext-job.yaml
new file mode 100644
index 000000000..7d9728f87
--- /dev/null
+++ b/.github/eks-workflow-files/maxtext-job.yaml
@@ -0,0 +1,120 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: PLACEHOLDER
+spec:
+  clusterIP: None # clusterIP must be None to create a headless service
+  selector:
+    job-name: PLACEHOLDER # must match Job name
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
+spec:
+  completions: 2 # number of nodes
+  parallelism: 2 # number of nodes
+  completionMode: Indexed
+  backoffLimitPerIndex: 0 # max failures per index
+  maxFailedIndexes:     0 # all indices must succeed
+  template:
+    spec:
+      subdomain: PLACEHOLDER # has to match Service name
+      restartPolicy: Never
+      imagePullSecrets:
+        - name: PLACEHOLDER
+      containers:
+        - name: maxtext
+          image: PLACEHOLDER
+          ports:
+            - containerPort: 3389
+          command:
+            - bash
+            - -c
+            # The logging logic: stream stdout/stderr from the 0th process inside this pod,
+            # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file
+            - |
+              export SERVICE_NAME=$0
+              export JOB_NAME=$1
+              cat >each-process.sh <<'EOL'
+              export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME}
+              export JAX_COORDINATOR_PORT=3389
+              export NNODES=16 # actually #processes == #GPUs
+              export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK))
+              export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK
+              export NCCL_DEBUG=INFO
+              export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log
+              [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null"
+              nsys-jax \
+                --capture-range=cudaProfilerApi \
+                --capture-range-end=stop \
+                -o /opt/output/profile.$NODE_RANK.zip \
+                -- \
+                test-maxtext.sh \
+                -n 2 \
+                -b 2 \
+                --model-name=llama2-7b \
+                --attn-type=cudnn_flash_te \
+                --remat-policy=minimal_flash \
+                --steps=20 \
+                --fsdp=16 \
+                -a "scan_layers=false \
+                    max_target_length=4096 \
+                    use_iota_embed=true \
+                    logits_dot_in_fp32=false \
+                    profiler=nsys \
+                    skip_first_n_steps_for_profiler=3 \
+                    profiler_steps=8" \
+                |& tee /opt/output/output.$NODE_RANK.log >"${console}"
+              code=${PIPESTATUS[0]} # status of nsys-jax; plain $? would be tee's
+              # Should run even on failure
+              cat /opt/output/nccl.$NODE_RANK.log >"${console}"
+              exit $code
+              EOL
+              # TODO: upgrade parallel-launch to return a failure code as soon as any
+              #       of its children do (it already does this eventually, but it could
+              #       be slow)
+              parallel-launch LOCAL_RANK 8 bash each-process.sh
+              code=$?
+              # Should run even on failure
+              touch /opt/output/.done
+              exit $code
+            - PLACEHOLDER
+            - PLACEHOLDER
+          resources:
+            limits:
+              nvidia.com/gpu: 8
+              vpc.amazonaws.com/efa: 32
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: shmem
+            - mountPath: /opt/output
+              name: output
+        - name: upload
+          image: amazon/aws-cli
+          command:
+            - bash
+            - -c
+            - |
+              JOB_NAME="$0"
+              while [[ ! -f /opt/output/.done ]]; do
+                sleep 1
+              done
+              rm /opt/output/.done
+              aws s3 cp \
+                --recursive \
+                /opt/output \
+                "s3://jax-toolbox-eks-output/${JOB_NAME}/"
+            - PLACEHOLDER
+          volumeMounts:
+            - mountPath: /opt/output
+              name: output
+      volumes:
+        - name: output
+          emptyDir: {}
+        - name: shmem
+          emptyDir:
+            medium: Memory
+            sizeLimit: 16Gi

From 34d8c668f560ba1ab42bb3b9169562459c0e5509 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 17:40:27 +0100
Subject: [PATCH 13/28] Update NCCL registry

---
 .github/workflows/_test_nccl.yaml     |  6 +++---
 .github/workflows/_test_nccl_gke.yaml | 12 ++++++++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index 8dbe3ac7b..9e8fc33c5 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -60,9 +60,9 @@ jobs:
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
         with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
       - name: Create env vars
         id: var
         shell: bash
diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml
index 0f9860407..b9336869c 100644
--- a/.github/workflows/_test_nccl_gke.yaml
+++ b/.github/workflows/_test_nccl_gke.yaml
@@ -14,6 +14,18 @@ jobs:
     runs-on: [self-hosted, "amd64", "large"]
     steps:
       - uses: actions/checkout@v4
+
+      - name: Login to nvcr.io Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
+
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+
       - name: Build NCCL image
         id: build
         uses: ./.github/actions/build-container

From 9ca9854b0fcc3ee6a855452a4bef8e7ab15c2672 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 16 Jul 2025 17:42:03 +0100
Subject: [PATCH 14/28] Update jax unit test slurm registry

---
 .github/workflows/_test_unit.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml
index c376e64ab..44568669c 100644
--- a/.github/workflows/_test_unit.yaml
+++ b/.github/workflows/_test_unit.yaml
@@ -64,8 +64,8 @@ jobs:
       - name: Login to GitHub Container Registry
         uses: docker/login-action@v3
         with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
+          registry: nvcr.io
+          username: $oauthtoken
           password: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Run tests

From 6c486ce39210280d95a523951fcc9098e7a61763 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 17 Jul 2025 09:43:06 +0100
Subject: [PATCH 15/28] Fix registry login

---
 .github/workflows/_test_nccl.yaml     | 6 ++++++
 .github/workflows/_test_nccl_gke.yaml | 4 ----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index 9e8fc33c5..f81de95da 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -23,6 +23,12 @@ jobs:
   build-mpi-operator-compatible-base:
     runs-on: [self-hosted, "amd64", "large"]
     steps:
+      - name: Login to nvcr.io Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
       - name: Checkout repository
         uses: actions/checkout@v4
       - name: Build MPI operator compatible base container
diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml
index b9336869c..0a5dfaac0 100644
--- a/.github/workflows/_test_nccl_gke.yaml
+++ b/.github/workflows/_test_nccl_gke.yaml
@@ -22,10 +22,6 @@ jobs:
           username: $oauthtoken
           password: ${{ secrets.NVCR_TOKEN }}
 
-      - name: K8s GHCR store and delete token
-        id: store-token
-        uses: ./.github/actions/store-delete-k8s-ghcr
-
       - name: Build NCCL image
         id: build
         uses: ./.github/actions/build-container

From 766f71960ac41b8c4e6e428c2283beeb4f8007bb Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 17 Jul 2025 09:46:48 +0100
Subject: [PATCH 16/28] Update trigger

---
 .github/workflows/ci.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 1a6f53ec4..d602488ff 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -12,6 +12,8 @@ on:
     paths-ignore:
       - '**.md'
       - '.github/triage/**'
+    branches-ignore:
+      - '25.08-devel-add-ngc-release-testing'
   workflow_dispatch:
     inputs:
       PUBLISH:

From 580f54cb62bba712ef94dc88620b9f039f4f83ca Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 17 Jul 2025 10:37:26 +0100
Subject: [PATCH 17/28] Update registry login

---
 .github/workflows/_test_unit.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml
index 44568669c..fe74491e8 100644
--- a/.github/workflows/_test_unit.yaml
+++ b/.github/workflows/_test_unit.yaml
@@ -61,12 +61,12 @@ jobs:
       - name: Check out repository
         uses: actions/checkout@v4
 
-      - name: Login to GitHub Container Registry
+      - name: Login to nvcr Container Registry
         uses: docker/login-action@v3
         with:
           registry: nvcr.io
           username: $oauthtoken
-          password: ${{ secrets.GITHUB_TOKEN }}
+          password: ${{ secrets.NVCR_TOKEN }}
 
       - name: Run tests
         shell: bash -x -e {0}

From b54859aae9deac13402373cd107fdbb8a9891ae1 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 17 Jul 2025 14:02:47 +0100
Subject: [PATCH 18/28] Fix image pull registry

---
 .github/workflows/_test_nccl.yaml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index f81de95da..98937f251 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -63,25 +63,24 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
-      - name: Login to GitHub Container Registry
+      - name: Login to GHCR Container Registry
         uses: docker/login-action@v3
         with:
-          registry: nvcr.io
-          username: $oauthtoken
-          password: ${{ secrets.NVCR_TOKEN }}
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ inputs.github-token }}
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
       - name: Create env vars
         id: var
         shell: bash
         run: |
           JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
           LAUNCHER_NAME="${JOB_NAME}-launcher"
-          TOKEN_NAME="${JOB_NAME}-token"
           # Make these available to later steps
           echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
           echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
-      - name: K8s GHCR store and delete token
-        id: store-token
-        uses: ./.github/actions/store-delete-k8s-ghcr
       - name: Configure Kubernetes job
         run: |
           export WORKER_NAME="${JOB_NAME}-worker"

From bd1f3adc97331ab99f645b2b38cc4199e500f4cc Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Thu, 17 Jul 2025 14:12:13 +0100
Subject: [PATCH 19/28] Set correct registry password

---
 .github/workflows/_test_nccl.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index 98937f251..a2365edd7 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -68,7 +68,7 @@ jobs:
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
-          password: ${{ inputs.github-token }}
+          password: ${{ secrets.GITHUB_TOKEN }}
       - name: K8s GHCR store and delete token
         id: store-token
         uses: ./.github/actions/store-delete-k8s-ghcr

From c902d5348f13202fc36fbf09b91a01479e2a262d Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Tue, 22 Jul 2025 11:14:07 +0100
Subject: [PATCH 20/28] Remove redundant testing (covered internally)

---
 .github/workflows/ngc-release-testing.yaml | 37 ----------------------
 1 file changed, 37 deletions(-)

diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
index 4bd0f7123..b810e3232 100644
--- a/.github/workflows/ngc-release-testing.yaml
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -31,43 +31,6 @@ jobs:
       CONTAINER: ${{ inputs.JAX_IMAGE }}
     secrets: inherit
 
-  test-jax:
-    if: inputs.JAX_IMAGE != ''
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: jax
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all \
-        ${{ inputs.JAX_IMAGE }} \
-        bash <<"EOF" |& tee test-backend-independent.log
-          test-jax.sh -b backend-independent
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-        ${{ inputs.JAX_IMAGE }} \
-        bash <<"EOF" |& tee tee test-gpu.log
-          test-jax.sh -b gpu
-        EOF
-      STATISTICS_SCRIPT: |
-        errors=$(cat test-*.log | grep -c 'ERROR:' || true)
-        failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
-        passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-backend-independent.log
-        test-gpu.log
-    secrets: inherit
-
-  test-maxtext:
-    if: inputs.MAXTEXT_IMAGE != ''
-    uses: ./.github/workflows/_test_maxtext.yaml
-    with:
-      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
-    secrets: inherit
-
   test-maxtext-eks:
     if: inputs.MAXTEXT_IMAGE != ''
     uses: ./.github/workflows/_test_maxtext_k8s.yaml

From 19a2ec1623a144b387e5e5611723e60248a9e271 Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Tue, 22 Jul 2025 13:57:57 +0100
Subject: [PATCH 21/28] Remove redundant testing (covered internally)

---
 .github/workflows/ngc-release-testing.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
index b810e3232..a9f2f48d9 100644
--- a/.github/workflows/ngc-release-testing.yaml
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -46,7 +46,7 @@ jobs:
     secrets: inherit
 
   finalize:
-    needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks, test-maxtext-gke ]
+    needs: [ test-nccl, test-maxtext-eks, test-maxtext-gke ]
     if: "!cancelled()"
     uses: ./.github/workflows/_finalize.yaml
     secrets: inherit

From 92f086b15c471aede8dc61c683c1edbfdc16e533 Mon Sep 17 00:00:00 2001
From: Brian Yang <125406446+gpupuck@users.noreply.github.com>
Date: Thu, 24 Jul 2025 15:09:16 -0700
Subject: [PATCH 22/28] Remove remote and main (#1574)

---
 .github/container/git-clone.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/container/git-clone.sh b/.github/container/git-clone.sh
index f4ddbc7fb..956d189b7 100755
--- a/.github/container/git-clone.sh
+++ b/.github/container/git-clone.sh
@@ -77,6 +77,13 @@ pushd ${DESTINATION}
 git checkout ${GIT_REF}
 COMMIT_SHA=$(git rev-parse HEAD)
 git submodule update --init --recursive
+if [[ "${GIT_REPO}" == *"gitlab"* ]]; then
+  git remote remove origin
+  if grep -q -r gitlab-ci-token .git; then
+    grep -r gitlab-ci-token .git | awk -F: '{print $1}' | xargs rm -f
+  fi
+  git branch -D main
+fi
 popd
 
 ## update the manifest file

From 9a366f4bab04b1a029e5ad1453722d8d7658137f Mon Sep 17 00:00:00 2001
From: Brian Yang <125406446+gpupuck@users.noreply.github.com>
Date: Fri, 25 Jul 2025 02:33:01 -0700
Subject: [PATCH 23/28] Upgrade werkzeug for MaxText (#1575)

Upgrade werkzeug to avoid vulnerabilities in 2.0.3. To be able to do
that, google-cloud-aiplatform needs to be at least 1.90.0 (refer to
https://github.com/googleapis/python-aiplatform/blob/v1.90.0/setup.py#L51).
---
 .github/container/Dockerfile.maxtext | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext
index 49329e1aa..752a8e105 100644
--- a/.github/container/Dockerfile.maxtext
+++ b/.github/container/Dockerfile.maxtext
@@ -39,6 +39,7 @@ for pattern in \
     "s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \
     "s|sentencepiece==0.1.97|sentencepiece>=0.2|g" \
     "s|tensorflow>=2.13.0|tensorflow==2.18.1|g" \
+    "s|google-cloud-aiplatform==1.61.0|google-cloud-aiplatform>=1.90.0|g" \
   ; do
     # tensorflow-cpu==2.19.0 is incompatible with tensorflow-text
     sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt
@@ -51,6 +52,7 @@ echo >> ${SRC_PATH_MAXTEXT}/requirements.txt  # add new line
 for requirement in \
     "tensorflow-metadata>=1.15.0" \
     "seqio@git+https://github.com/google/seqio.git" \
+    "werkzeug>=3.0.3" \
   ; do
     echo "${requirement}" >> ${SRC_PATH_MAXTEXT}/requirements.txt
 done

From c8fe23a19516ffc95322e7c9e0912983943629ef Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Fri, 25 Jul 2025 13:01:55 +0100
Subject: [PATCH 24/28] Add basic sitrep steps for GKE XPK action (#1580)

---
 .github/actions/gke-xpk/action.yml | 38 ++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml
index c4d03c2dc..574e1be33 100644
--- a/.github/actions/gke-xpk/action.yml
+++ b/.github/actions/gke-xpk/action.yml
@@ -232,10 +232,12 @@ runs:
   
       if [ $? -ne 0 ]; then
         echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
+        echo "XPK_EXIT_CODE=1" >> ${GITHUB_ENV}
         exit 1
       fi
   
       eval "export ${MAYBE_XPK_EXIT_CODE}"
+      echo "XPK_EXIT_CODE=${EXIT_CODE}" >> ${GITHUB_ENV}
       exit ${EXIT_CODE}
   
   - name: Clean up JobSet from cluster
@@ -268,3 +270,39 @@ runs:
     if: ${{ always() }}
     run: |
       sudo rm -rf ${WORKLOAD_NAME}
+
+  - name: Generate sitrep
+    id: sitrep
+    shell: bash -x -e {0}
+    if: ${{ always() }}
+    run: |
+      # Write sitrep.json (summary, badge and outcome) for the XPK workload run.
+      source .github/workflows/scripts/to_json.sh
+
+      summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
+      outcome=success
+      badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"
+      badge_color=brightgreen
+      # This step always runs; a missing/empty XPK_EXIT_CODE must not read as a pass.
+      if [ "${XPK_EXIT_CODE:-1}" -ne 0 ]; then
+        badge_color=red
+        outcome=failed
+        summary+=": fail"
+      else
+        summary+=": pass"
+      fi
+
+      to_json summary \
+              badge_label \
+              badge_color \
+              outcome | \
+      tee sitrep.json
+
+  - name: Upload sitrep to GitHub Actions from runner
+    if: ${{ always() }}
+    uses: actions/upload-artifact@v4
+    with:
+      name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep
+      path: |
+        sitrep.json
+

From 0b5135b97569874ef7e41422206379f478b9254e Mon Sep 17 00:00:00 2001
From: Olli Lupton <olupton@nvidia.com>
Date: Wed, 30 Jul 2025 15:18:28 +0200
Subject: [PATCH 25/28] nccl-tests: set LD_LIBRARY_PATH through mpirun (#1589)
 (#1590)

This helps CUDA forward compatibility work when spawning processes over
SSH, as those processes do not see environment variables set by the
container entrypoint that handles forward compatibility.
`/usr/local/cuda/compat/lib` will only exist if the entrypoint detects
that forward compatibility mode is enabled.
---
 .github/eks-workflow-files/mpi-nccl-test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml
index 88cca0590..0e34cb7a2 100644
--- a/.github/eks-workflow-files/mpi-nccl-test.yml
+++ b/.github/eks-workflow-files/mpi-nccl-test.yml
@@ -41,7 +41,8 @@ spec:
                     echo "Workers were still not reachable after ${limit}, exiting"
                     exit 1
                   fi
-                  mpirun --allow-run-as-root -np 16 -N 8 $0 \
+                  mpirun --allow-run-as-root --tag-output -N 1 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib nvidia-smi
+                  mpirun --allow-run-as-root -N 8 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib $0 \
                     -b 8 \
                     -e 16G \
                     -f 2 \

From c70b23990fdb1560a0bacd2ebec559c7c72691d7 Mon Sep 17 00:00:00 2001
From: Brian Yang <125406446+gpupuck@users.noreply.github.com>
Date: Fri, 1 Aug 2025 09:27:12 -0700
Subject: [PATCH 26/28] Pin orbax-checkpoint to 0.11.19 and pip-tools to 7.4.1
 (#1594)

They just so happened to get upgraded on July 31st together, but
- orbax-checkpoint 0.11.20 has issues without internal checkpoint
testing
- pip-tools 7.5.0 will cause `ValueError: '/opt/maxtext/requirements.txt
(line 1)' is not in the subpath of '/opt/pip-tools.d'`. I'm guessing
something is not quite compatible with the Python 3.12 we currently have
in the base container. Theoretically, `-r ../maxtext/requirements.txt`
should work, but since we are using a specific version of pip, let's
play it safe for now and use 7.4.1.
---
 .github/container/Dockerfile.base    | 2 +-
 .github/container/Dockerfile.jax     | 1 +
 .github/container/Dockerfile.maxtext | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base
index 06072e8ae..8603ad054 100644
--- a/.github/container/Dockerfile.base
+++ b/.github/container/Dockerfile.base
@@ -129,7 +129,7 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
 # after upgrading to ver 23.3.1 (from /opt/pip) `pip` tries to uninstall itself (default pip-24.0) 
 # and fails due to pip-24.0 has been installed with system tool `apt` but not `python`. So we keep 
 # both pip-24.0 and pip-23.3.1 in the system, but use 23.3.1 with equivalency patch (see above).
-RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
+RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip "pip-tools==7.4.1" && rm -rf ~/.cache/*
 
 # The symlinks for CUDA/cuDNN/NCCL exist to make the container's installations
 # of those components conform to XLA's expectations for local installations.
diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax
index 4f20aa2b3..6d9bc956a 100644
--- a/.github/container/Dockerfile.jax
+++ b/.github/container/Dockerfile.jax
@@ -109,6 +109,7 @@ EOF
 ## Flax
 RUN <<"EOF" bash -ex
 git-clone.sh ${URLREF_FLAX} ${SRC_PATH_FLAX}
+sed -i 's/orbax-checkpoint/orbax-checkpoint==0.11.19/' ${SRC_PATH_FLAX}/pyproject.toml
 echo "-e file://${SRC_PATH_FLAX}" >> /opt/pip-tools.d/requirements-flax.in
 EOF
 
diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext
index 752a8e105..2033c6e4f 100644
--- a/.github/container/Dockerfile.maxtext
+++ b/.github/container/Dockerfile.maxtext
@@ -40,6 +40,7 @@ for pattern in \
     "s|sentencepiece==0.1.97|sentencepiece>=0.2|g" \
     "s|tensorflow>=2.13.0|tensorflow==2.18.1|g" \
     "s|google-cloud-aiplatform==1.61.0|google-cloud-aiplatform>=1.90.0|g" \
+    "s|orbax-checkpoint>=0.5.12|orbax-checkpoint==0.11.19|g" \
   ; do
     # tensorflow-cpu==2.19.0 is incompatible with tensorflow-text
     sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt

From d506eefa612ea5dcb69423303417330b06a08f5d Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Tue, 5 Aug 2025 15:06:32 +0100
Subject: [PATCH 27/28] Set GKE NCCL to use k8s secret action

---
 .github/workflows/_test_nccl_gke.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml
index 0a5dfaac0..eef8b2990 100644
--- a/.github/workflows/_test_nccl_gke.yaml
+++ b/.github/workflows/_test_nccl_gke.yaml
@@ -72,6 +72,17 @@ jobs:
 
           echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT}
 
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Store registry secret on cluster
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+
       - name: Create NCCL test Services on cluster
         run: |
           SERVICE_MANIFEST=".github/gke-workflow/nccl/service-${WORKLOAD_NAME}-${{ matrix.test }}.yaml"
@@ -86,6 +97,7 @@ jobs:
         with:
           IMAGE: ${{ env.BASE_IMAGE }}
           WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
+          IMAGE_PULL_SECRET_NAME: ${{ steps.store-token.outputs.token-name }}
           COMMAND: |
             export NHOSTS=${{ env.NHOSTS }};
             export NCCL_LIB_DIR=/opt/nvida/nccl/lib;

From 389bc2f52722cdcd130184080d16d997f60a107a Mon Sep 17 00:00:00 2001
From: "Alex Y. Chan" <alechan@nvidia.com>
Date: Wed, 6 Aug 2025 12:14:13 +0100
Subject: [PATCH 28/28] Update default GKE cluster

---
 .github/actions/gke-xpk/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml
index 574e1be33..57b61b411 100644
--- a/.github/actions/gke-xpk/action.yml
+++ b/.github/actions/gke-xpk/action.yml
@@ -9,7 +9,7 @@ inputs:
     type: string
   GKE_CLUSTER:
     description: 'GKE cluster name'
-    default: jtb-2025-06-12
+    default: jtb-2025-08-06
     required: false
     type: string
   GCP_ZONE: