diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml
index a0bfb0f92..cdb1437e1 100644
--- a/.github/actions/gke-xpk/action.yml
+++ b/.github/actions/gke-xpk/action.yml
@@ -9,7 +9,7 @@ inputs:
     type: string
   GKE_CLUSTER:
     description: 'GKE cluster name'
-    default: jtb-2025-08-26
+    default: jtb-2025-10-07
     required: false
     type: string
   GCP_ZONE:
@@ -247,6 +247,7 @@ runs:
         
         if [ $? -ne 0 ]; then
           echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
+          echo "XPK_EXIT_CODE=1" >> ${GITHUB_ENV}
           exit 1
         fi
 
@@ -262,11 +263,12 @@ runs:
         ALL_EXIT_CODES=$(( ALL_EXIT_CODES + POD_EXIT_CODE ))
       done
 
+      echo "XPK_EXIT_CODE=${ALL_EXIT_CODES}" >> ${GITHUB_ENV}
       if [ ${ALL_EXIT_CODES} -gt 0 ]; then
         exit 1
       fi
       exit 0
-  
+
   - name: Clean up JobSet from cluster
     shell: bash -x -u {0}
     if: ${{ always() }}
@@ -291,3 +293,38 @@ runs:
     if: ${{ always() }}
     run: |
       sudo rm -rf ${WORKLOAD_NAME}
+
+  - name: Generate sitrep
+    id: sitrep
+    shell: bash -x -e {0}
+    if: ${{ always() }}
+    run: |
+      source .github/workflows/scripts/to_json.sh
+      # XPK_EXIT_CODE is read from GITHUB_ENV; it is unset when an earlier step died before recording it.
+
+      summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
+      outcome=success
+      badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"
+      badge_color=brightgreen
+
+      if [ "${XPK_EXIT_CODE:-1}" -gt 0 ]; then  # unset => report failure, never a false pass
+        badge_color=red
+        outcome=failed
+        summary+=": fail"
+      else
+        summary+=": pass"
+      fi
+
+      to_json summary \
+              badge_label \
+              badge_color \
+              outcome | \
+      tee sitrep.json
+
+  - name: Upload sitrep to GitHub Actions from runner
+    if: ${{ always() }}
+    uses: actions/upload-artifact@v4
+    with:
+      name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep
+      path: |
+        sitrep.json
diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax
index 1f5aabed7..25136d1e3 100644
--- a/.github/container/Dockerfile.jax
+++ b/.github/container/Dockerfile.jax
@@ -64,6 +64,7 @@ RUN mkdir -p /builder/extra-targets && \
     --src-path-xla ${SRC_PATH_XLA} \
     --sm all \
     --clean \
+    --release \
     ${EXTRA_BUILD_JAX_ARGS}
 
 ## Transformer engine: check out source and build wheel
@@ -97,7 +98,6 @@ ENV BUILD_DATE=${BUILD_DATE}
 # The following environment variables tune performance
 ENV XLA_FLAGS=""
 ENV XLA_FLAGS="${XLA_FLAGS} --xla_gpu_enable_latency_hiding_scheduler=true"
-ENV NCCL_NVLS_ENABLE=0
 
 COPY --from=builder ${BUILD_PATH_JAXLIB} ${BUILD_PATH_JAXLIB}
 COPY --from=builder ${SRC_PATH_JAX} ${SRC_PATH_JAX}
diff --git a/.github/container/build-jax.sh b/.github/container/build-jax.sh
index 244b048ae..12d698a5d 100755
--- a/.github/container/build-jax.sh
+++ b/.github/container/build-jax.sh
@@ -83,7 +83,7 @@ INSTALL=1
 SRC_PATH_JAX="/opt/jax"
 SRC_PATH_XLA="/opt/xla"
 
-args=$(getopt -o h --long bazel-cache:,bazel-cache-namespace:,build-param:,build-path-jaxlib:,clean,cpu-arch:,debug,extra-targets:,extra-target-dest:,no-clean,clean-only,help,install,no-install,src-path-jax:,src-path-xla:,sm: -- "$@")
+args=$(getopt -o h,r --long bazel-cache:,bazel-cache-namespace:,build-param:,build-path-jaxlib:,clean,release,cpu-arch:,debug,extra-targets:,extra-target-dest:,no-clean,clean-only,help,install,no-install,src-path-jax:,src-path-xla:,sm: -- "$@")
 if [[ $? -ne 0 ]]; then
     exit 1
 fi
@@ -135,6 +135,10 @@ while [ : ]; do
             EXTRA_TARGET_DEST="$2"
             shift 2
             ;;
+        -r | --release)
+            IS_RELEASE=1
+            shift 1
+            ;;
         -h | --help)
             usage 1
             ;;
@@ -225,6 +229,7 @@ print_var INSTALL
 print_var PYTHON_VERSION
 print_var SRC_PATH_JAX
 print_var SRC_PATH_XLA
+print_var IS_RELEASE
 
 echo "=================================================="
 
@@ -268,6 +273,12 @@ for component in jaxlib "jax-cuda${CUDA_MAJOR_VERSION}-pjrt" "jax-cuda${CUDA_MAJ
     # version, so nvidia-*-cu12 wheels disappear from the lock file
     sed -i "s|^${component}.*$|${component} @ file://${BUILD_PATH_JAXLIB}/${component//-/_}|" build/requirements.in
 done
+
+if [[ "${IS_RELEASE:-0}" == "1" ]]; then
+    # Release builds: replace the jaxlib version range in JAX's setup.py with a fixed lower bound.
+    sed -i "s|      f'jaxlib >={_minimum_jaxlib_version}, <={_jax_version}',|      f'jaxlib>=0.5.0',|" "${SRC_PATH_JAX}/setup.py"
+fi
+
 # Bazel args to avoid cache invalidation
 BAZEL_ARGS=(
     --config=cuda_libraries_from_stubs
diff --git a/.github/container/build-te.sh b/.github/container/build-te.sh
index 2c47b725b..3271b4504 100755
--- a/.github/container/build-te.sh
+++ b/.github/container/build-te.sh
@@ -103,6 +103,12 @@ if [[ "$SM" == "all" ]]; then
     SM_LIST=$(default_compute_capabilities)
 elif [[ "$SM" == "local" ]]; then
     SM_LIST=$("${SCRIPT_DIR}/local_cuda_arch")
+    if [[ -z "${SM_LIST}" ]]; then
+        echo "Could not determine the local GPU architecture."
+        echo "You should pass --sm when compiling on a machine without GPUs."
+        nvidia-smi || true
+        exit 1
+    fi
 else
     SM_LIST=${SM}
 fi
@@ -131,8 +137,19 @@ export NVTE_FRAMEWORK=jax
 export XLA_HOME=${SRC_PATH_XLA}
 
 pushd ${SRC_PATH_TE}
-# Install required packages that were removed in https://github.com/NVIDIA/TransformerEngine/pull/1852
-pip install "pybind11[global]"
+# Install some build dependencies, but avoid installing everything
+# (jax, torch, ...) because we do not want to pull in a released version of
+# JAX, or the wheel-based installation of CUDA. Note that when we build TE as
+# part of building the JAX containers, JAX and XLA are not yet installed.
+python - << EOF
+import subprocess, sys, tomllib
+with open("pyproject.toml", "rb") as ifile:
+    data = tomllib.load(ifile)
+reqs = [r for r in data["build-system"]["requires"]
+        if r.startswith(("nvidia-mathdx", "pybind11"))]
+if reqs:  # check=True: a failed pip install must fail this script
+    subprocess.run([sys.executable, "-m", "pip", "install", *reqs], check=True)
+EOF
 
 # The wheel filename includes the TE commit; if this has changed since the last
 # incremental build then we would end up with multiple wheels.
diff --git a/.github/container/git-clone.sh b/.github/container/git-clone.sh
index f4ddbc7fb..956d189b7 100755
--- a/.github/container/git-clone.sh
+++ b/.github/container/git-clone.sh
@@ -77,6 +77,13 @@ pushd ${DESTINATION}
 git checkout ${GIT_REF}
 COMMIT_SHA=$(git rev-parse HEAD)
 git submodule update --init --recursive
+if [[ "${GIT_REPO}" == *"gitlab"* ]]; then
+  # Scrub CI credentials: drop the remote, then delete every file under .git
+  # that still mentions the token (-l lists names, NUL-separated for xargs).
+  git remote remove origin
+  { grep -rlZ gitlab-ci-token .git || true; } | xargs -0 -r rm -f
+  git branch -D main
+fi
 popd
 
 ## update the manifest file
diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh
index 285da565c..6e2a59aed 100755
--- a/.github/container/pip-finalize.sh
+++ b/.github/container/pip-finalize.sh
@@ -4,54 +4,60 @@ set -eoux pipefail
 
 pushd /opt/pip-tools.d
 
-# First pip-compile gathers all reqs, but we are care only about VCS installs
-# It's possible there are 2nd degree transitive dependencies that are VCS, so
-# this is more robust to gather VCS requirements at the cost of pip-compiling
-# twice
-pip-compile -o requirements.pre $(ls requirements-*.in)
+# If requirements-pinned.txt exists, skip compilation
+if [[ -f "requirements-pinned.txt" ]]; then
+  sed -E 's/#sha256=[a-f0-9]+//g' requirements-pinned.txt > requirements.txt
+else
+  # First pip-compile gathers all reqs, but we care only about VCS installs
+  # It's possible there are 2nd degree transitive dependencies that are VCS, so
+  # this is more robust to gather VCS requirements at the cost of pip-compiling
+  # twice
+  pip-compile -o requirements.pre $(ls requirements-*.in)
 
-IFS=$'\n'
-for line in $(cat requirements.pre | egrep '^[^#].+ @ git\+' || true); do
-  # VCS installs are of the form "PACKAGE @ git+..."
-  PACKAGE=$(echo "$line" | awk '{print $1}')
-  ref=$(yq e ".${PACKAGE}.latest_verified_commit" ${MANIFEST_FILE})
-  if [[ "$line" == *"#subdirectory="* ]]; then
-    # This is required b/c git-refs/commits cannot come after
-    # the subdirectory fragment.
-    # An example of an install that is of this form is:
-    # 'orbax-checkpoint @ git+https://github.com/google/orbax/#subdirectory=checkpoint'
-    echo "${line}" | sed "s/#subdirectory=/@${ref}#subdirectory=/"
-  else
-    echo "${line}@${ref}"
-  fi
-done | tee requirements.vcs
-unset IFS
+  IFS=$'\n'
+  for line in $(cat requirements.pre | egrep '^[^#].+ @ git\+' || true); do
+    # VCS installs are of the form "PACKAGE @ git+..."
+    PACKAGE=$(echo "$line" | awk '{print $1}')
+    ref=$(yq e ".${PACKAGE}.latest_verified_commit" ${MANIFEST_FILE})
+    if [[ "$line" == *"#subdirectory="* ]]; then
+      # This is required b/c git-refs/commits cannot come after
+      # the subdirectory fragment.
+      # An example of an install that is of this form is:
+      # 'orbax-checkpoint @ git+https://github.com/google/orbax/#subdirectory=checkpoint'
+      echo "${line}" | sed "s/#subdirectory=/@${ref}#subdirectory=/"
+    else
+      echo "${line}@${ref}"
+    fi
+  done | tee requirements.vcs
+  unset IFS
 
-# Second pip-compile includes one more requirements file that pins all vcs installs
-# Uses a special env var to let our custom pip impl know to treat the following as
-# equivalent:
-#
-# fiddle @ git+https://github.com/google/fiddle
-# fiddle @ git+https://github.com/google/fiddle@cd4497e4c09bdf95dcccaa1e138c2c125d32d39f
-#
-# JAX_TOOLBOX_VCS_EQUIVALENCY is an environment variable enabling custom logic in pip
-# that treats the above as equivalent and prefers the URI wit the SHA
-JAX_TOOLBOX_VCS_EQUIVALENCY=true pip-compile -o requirements.txt requirements.vcs $(ls requirements-*.in)
+  # Second pip-compile includes one more requirements file that pins all vcs installs
+  # Uses a special env var to let our custom pip impl know to treat the following as
+  # equivalent:
+  #
+  # fiddle @ git+https://github.com/google/fiddle
+  # fiddle @ git+https://github.com/google/fiddle@cd4497e4c09bdf95dcccaa1e138c2c125d32d39f
+  #
+  # JAX_TOOLBOX_VCS_EQUIVALENCY is an environment variable enabling custom logic in pip
+  # that treats the above as equivalent and prefers the URI with the SHA
+  JAX_TOOLBOX_VCS_EQUIVALENCY=true pip-compile -o requirements.txt requirements.vcs $(ls requirements-*.in)
 
-# If there are unpinned VCS dependencies, error since these should be included in the manifest
-unpinned_vcs_dependencies=$(cat requirements.txt | egrep '^[^#].+ @ git\+' | egrep -v '^[^#].+ @ git\+.+@' || true)
-if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then
-  echo "Unpinned VCS installs found in $(readlink -f requirements.txt):"
-  echo "$unpinned_vcs_dependencies"
-  exit 1
-fi
+  # If there are unpinned VCS dependencies, error since these should be included in the manifest
+  unpinned_vcs_dependencies=$(cat requirements.txt | egrep '^[^#].+ @ git\+' | egrep -v '^[^#].+ @ git\+.+@' || true)
+  if [[ -n "$unpinned_vcs_dependencies" ]]; then
+    echo "Unpinned VCS installs found in $(readlink -f requirements.txt):"
+    echo "$unpinned_vcs_dependencies"
+    exit 1
+  fi
 
-# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64
-if [ "$(uname -m)" = "x86_64" ]; then
-  sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt
-else
-  echo "Skipping TF on $(uname -m)"
+  # Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64
+  if [[ "$(uname -m)" = "x86_64" ]]; then
+    sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt
+  else
+    echo "Skipping TF on $(uname -m)"
+  fi
 fi
+
 # --no-deps is required since conflicts can still appear during pip-sync
 pip-sync --pip-args '--no-deps --src /opt' requirements.txt
 
@@ -63,3 +69,6 @@ for post_install in $(ls /opt/pip-tools-post-install.d/*); do
     "${post_install}"
   fi
 done
+
+echo "######## Frozen requirements ########"
+pip freeze
diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh
index 3398b72c8..73aab9fd0 100755
--- a/.github/container/test-jax.sh
+++ b/.github/container/test-jax.sh
@@ -119,7 +119,15 @@ fi
 
 readarray -t GPU_MEMORIES < <(nvidia-smi --query-gpu=memory.total --format=csv,noheader)
 NGPUS="${#GPU_MEMORIES[@]}"
-GPU_MEMORIES_MIB=("${GPU_MEMORIES[@]/ MiB/}")
+if [[ " ${GPU_MEMORIES[*]} " =~ [[:space:]]\[N/A\][[:space:]] ]]; then
+    # On iGPU devices, nvidia-smi reports [N/A] GPU memory; use the system memory
+    # size instead (truncated to whole MiB, since bash arithmetic is integer-only)
+    SYSTEM_MEMORY_MIB=$(grep MemTotal /proc/meminfo | awk '{print int($2 / 1024)}')
+    declare -a GPU_MEMORIES_MIB
+    for (( i = 0; i < NGPUS; i++ )); do GPU_MEMORIES_MIB+=($(( SYSTEM_MEMORY_MIB / NGPUS ))); done
+else
+    GPU_MEMORIES_MIB=("${GPU_MEMORIES[@]/ MiB/}")
+fi
 
 FLAGS=()
 
diff --git a/.github/eks-workflow-files/maxtext-job.yaml b/.github/eks-workflow-files/maxtext-job.yaml
new file mode 100644
index 000000000..7d9728f87
--- /dev/null
+++ b/.github/eks-workflow-files/maxtext-job.yaml
@@ -0,0 +1,120 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: PLACEHOLDER
+spec:
+  clusterIP: None # clusterIP must be None to create a headless service
+  selector:
+    job-name: PLACEHOLDER # must match Job name
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
+spec:
+  completions: 2 # number of nodes
+  parallelism: 2 # number of nodes
+  completionMode: Indexed
+  backoffLimitPerIndex: 0 # max failures per index
+  maxFailedIndexes:     0 # all indices must succeed
+  template:
+    spec:
+      subdomain: PLACEHOLDER # has to match Service name
+      restartPolicy: Never
+      imagePullSecrets:
+        - name: PLACEHOLDER
+      containers:
+        - name: maxtext
+          image: PLACEHOLDER
+          ports:
+            - containerPort: 3389
+          command:
+            - bash
+            - -c
+            # The logging logic: stream stdout/stderr from the 0th process inside this pod,
+            # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file
+            - |
+              export SERVICE_NAME=$0
+              export JOB_NAME=$1
+              cat >each-process.sh <<'EOL'
+              export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME}
+              export JAX_COORDINATOR_PORT=3389
+              export NNODES=16 # actually #processes == #GPUs
+              export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK))
+              export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK
+              export NCCL_DEBUG=INFO
+              export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log
+              [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null"
+              nsys-jax \
+                --capture-range=cudaProfilerApi \
+                --capture-range-end=stop \
+                -o /opt/output/profile.$NODE_RANK.zip \
+                -- \
+                test-maxtext.sh \
+                -n 2 \
+                -b 2 \
+                --model-name=llama2-7b \
+                --attn-type=cudnn_flash_te \
+                --remat-policy=minimal_flash \
+                --steps=20 \
+                --fsdp=16 \
+                -a "scan_layers=false \
+                    max_target_length=4096 \
+                    use_iota_embed=true \
+                    logits_dot_in_fp32=false \
+                    profiler=nsys \
+                    skip_first_n_steps_for_profiler=3 \
+                    profiler_steps=8" \
+                |& tee /opt/output/output.$NODE_RANK.log >"${console}"
+              code=${PIPESTATUS[0]}  # exit status of nsys-jax; plain $? would be tee's
+              # Should run even on failure
+              cat /opt/output/nccl.$NODE_RANK.log >"${console}"
+              exit $code
+              EOL
+              # TODO: upgrade parallel-launch to return a failure code as soon as any
+              #       of its children do (it already does this eventually, but it could
+              #       be slow)
+              parallel-launch LOCAL_RANK 8 bash each-process.sh
+              code=$?
+              # Should run even on failure
+              touch /opt/output/.done
+              exit $code
+            - PLACEHOLDER
+            - PLACEHOLDER
+          resources:
+            limits:
+              nvidia.com/gpu: 8
+              vpc.amazonaws.com/efa: 32
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: shmem
+            - mountPath: /opt/output
+              name: output
+        - name: upload
+          image: amazon/aws-cli
+          command:
+            - bash
+            - -c
+            - |
+              JOB_NAME="$0"
+              while [[ ! -f /opt/output/.done ]]; do
+                sleep 1
+              done
+              rm /opt/output/.done
+              aws s3 cp \
+                --recursive \
+                /opt/output \
+                "s3://jax-toolbox-eks-output/${JOB_NAME}/"
+            - PLACEHOLDER
+          volumeMounts:
+            - mountPath: /opt/output
+              name: output
+      volumes:
+        - name: output
+          emptyDir: {}
+        - name: shmem
+          emptyDir:
+            medium: Memory
+            sizeLimit: 16Gi
diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml
index 88feb716a..464b6af51 100644
--- a/.github/workflows/_test_maxtext_gke_xpk.yaml
+++ b/.github/workflows/_test_maxtext_gke_xpk.yaml
@@ -26,14 +26,14 @@ jobs:
     steps:
     - uses: actions/checkout@v4
 
-    - name: Login to GitHub Container Registry
+    - name: Login to nvcr.io Container Registry
       uses: docker/login-action@v3
       with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
+        registry: nvcr.io
+        username: $oauthtoken
+        password: ${{ secrets.NVCR_TOKEN }}
 
-    - name: K8s GHCR store and delete token
+    - name: K8s store and delete token
       id: store-token
       uses: ./.github/actions/store-delete-k8s-ghcr
 
diff --git a/.github/workflows/_test_maxtext_k8s.yaml b/.github/workflows/_test_maxtext_k8s.yaml
new file mode 100644
index 000000000..7f82d3f42
--- /dev/null
+++ b/.github/workflows/_test_maxtext_k8s.yaml
@@ -0,0 +1,107 @@
+name: ~test MaxText functionality on Kubernetes
+
+on:
+  workflow_call:
+    inputs:
+      MAXTEXT_IMAGE:
+        type: string
+        description: MaxText container to test
+        required: true
+
+permissions:
+  contents: read  # to fetch code
+
+jobs:
+  maxtext:
+    runs-on: eks
+    env:
+      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
+      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to NVIDIA Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
+      - name: Store GitHub Container Registry token as Kubernetes secret
+        run: |
+          # Make this available to later steps
+          TOKEN_NAME="${JOB_NAME}-token"
+          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
+          kubectl create secret generic \
+            ${TOKEN_NAME} \
+            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
+            --type=kubernetes.io/dockerconfigjson
+      - name: Configure Kubernetes job
+        run: |
+          export SERVICE_NAME="${JOB_NAME}-svc"
+          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
+            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+            | select(di == 1).metadata.name = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
+            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
+            .github/eks-workflow-files/maxtext-job.yaml
+          git diff .github/eks-workflow-files/maxtext-job.yaml
+      - name: Submit Kubernetes job
+        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Wait for Kubernetes job to start
+        run: |
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${JOB_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output
+        run: |
+          # Streaming logs will fail if the container/pod is still pending
+          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
+            sleep 1
+          done
+          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
+      - name: Retrieve Kubernetes job status
+        shell: bash -exo pipefail {0}
+        run: |
+          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
+            failure=${status[0]:-0}
+            success=${status[1]:-0}
+            total=$((failure+success))
+            if [[ ${total} -lt 2 ]]; then  # numeric test; '<' inside [[ ]] compares strings
+              sleep 1
+            elif [[ ${total} -eq 2 ]]; then
+              break
+            else
+              # FIXME: failed + succeeded exceeds the 2 this loop waits for
+              exit 255
+            fi
+          done
+          exit ${failure}
+      # Provide more debug output in case of failure; note that some kinds of launch
+      # failure do not produce any log output.
+      - name: Debug failed Kubernetes job
+        if: failure()
+        run: |
+          # Provide better debug in case of launch failures that will not produce log output
+          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
+          if [[ -n "${pods}" ]]; then
+            kubectl describe ${pods}
+          fi
+      # Clean up in case of errors as well as success
+      - name: Delete Kubernetes job
+        if: always()
+        run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Delete GitHub Container Registry token
+        if: always()
+        run: kubectl delete secret ${TOKEN_NAME}
diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml
index 987ccb34c..c68ebd9fc 100644
--- a/.github/workflows/_test_nccl.yaml
+++ b/.github/workflows/_test_nccl.yaml
@@ -23,6 +23,12 @@ jobs:
   build-mpi-operator-compatible-base:
     runs-on: [self-hosted, "amd64", "large"]
     steps:
+      - name: Login to nvcr.io Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
       - name: Checkout repository
         uses: actions/checkout@v4
       - name: Build MPI operator compatible base container
diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml
index ed10c0f47..7814fd858 100644
--- a/.github/workflows/_test_nccl_gke.yaml
+++ b/.github/workflows/_test_nccl_gke.yaml
@@ -14,6 +14,12 @@ jobs:
     runs-on: [self-hosted, "amd64", "large"]
     steps:
       - uses: actions/checkout@v4
+      - name: Login to nvcr.io Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
       - name: Build NCCL image
         id: build
         uses: ./.github/actions/build-container
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 1a6f53ec4..d328c993b 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -12,6 +12,8 @@ on:
     paths-ignore:
       - '**.md'
       - '.github/triage/**'
+    branches-ignore:
+      - '25.*' # workflows for release to be triggered via dispatch event only
   workflow_dispatch:
     inputs:
       PUBLISH:
diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
new file mode 100644
index 000000000..2a0d7c57e
--- /dev/null
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -0,0 +1,52 @@
+name: ~NGC release testing
+
+on:
+  workflow_dispatch:
+    inputs:
+      JAX_IMAGE:
+        type: string
+        description: "JAX image to run tests on"
+        required: false
+        default: ''
+      MAXTEXT_IMAGE:
+        type: string
+        description: "MaxText image to run tests on"
+        required: false
+        default: ''
+
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+
+permissions:
+  contents: read  # to fetch code
+  actions:  write # to cancel previous workflows
+  packages: write # to upload container
+
+jobs:
+  test-nccl:
+    if: inputs.JAX_IMAGE != ''
+    uses: ./.github/workflows/_test_nccl.yaml
+    with:
+      CONTAINER: ${{ inputs.JAX_IMAGE }}
+    secrets: inherit
+
+  test-maxtext-eks:
+    if: inputs.MAXTEXT_IMAGE != ''
+    uses: ./.github/workflows/_test_maxtext_k8s.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
+  test-maxtext-gke:
+    if: inputs.MAXTEXT_IMAGE != ''
+    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
+  finalize:
+    needs: [ test-nccl, test-maxtext-gke,test-maxtext-eks ]
+    if: "!cancelled()"
+    uses: ./.github/workflows/_finalize.yaml
+    secrets: inherit
diff --git a/README.md b/README.md
index 9d3be4a34..d928e614c 100644
--- a/README.md
+++ b/README.md
@@ -218,10 +218,6 @@ The [JAX image](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax) is emb
 | --------- | ----- | ----------- |
 | `--xla_gpu_enable_latency_hiding_scheduler` | `true`  | allows XLA to move communication collectives to increase overlap with compute kernels |
 
-| Environment Variable | Value | Explanation |
-| -------------------- | ----- | ----------- |
-| `NCCL_NVLS_ENABLE` | `0` | Disables NVLink SHARP ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. |
-
 There are various other XLA flags users can set to improve performance. For a detailed explanation of these flags, please refer to the [GPU performance](./rosetta/docs/GPU_performance.md) doc. XLA flags can also be tuned per workload. For example, each script includes a directory [xla_flags](./rosetta/rosetta/projects/maxtext/xla_flags).
 
 For a list of previously used XLA flags that are no longer needed, please also refer to the [GPU performance](./rosetta/docs/GPU_performance.md#previously-used-xla-flags) page.