From cd29eab220cb4fa266d69a692de1fbc0358ad205 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Thu, 26 Jun 2025 14:50:06 +0100 Subject: [PATCH 01/28] Add NGC release test workflow --- .github/workflows/ngc-release-testing.yaml | 82 ++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 .github/workflows/ngc-release-testing.yaml diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml new file mode 100644 index 000000000..3150a07c2 --- /dev/null +++ b/.github/workflows/ngc-release-testing.yaml @@ -0,0 +1,82 @@ +name: ~NGC release testing + +on: + workflow_dispatch: + inputs: + JAX_IMAGE: + type: string + description: "JAX image to run tests on" + required: false + default: '' + MAXTEXT_IMAGE: + type: string + description: "MaxText image to run tests on" + required: false + default: '' + + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container + +jobs: + test-nccl: + if: inputs.JAX_IMAGE != '' + uses: ./.github/workflows/_test_nccl.yaml + with: + CONTAINER: ${{ inputs.JAX_IMAGE }} + secrets: inherit + + test-jax: + if: inputs.JAX_IMAGE != '' + uses: ./.github/workflows/_test_unit.yaml + with: + TEST_NAME: jax + EXECUTE: | + docker run -i --shm-size=1g --gpus all \ + ${{ inputs.JAX_IMAGE }} \ + bash <<"EOF" |& tee test-backend-independent.log + test-jax.sh -b backend-independent + EOF + docker run -i --shm-size=1g --gpus all \ + ${{ inputs.JAX_IMAGE }} \ + bash <<"EOF" |& tee test-gpu.log + test-jax.sh -b gpu + EOF + STATISTICS_SCRIPT: | + errors=$(cat test-*.log | grep -c 'ERROR:' || true) + failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + total_tests=$((failed_tests + passed_tests)) + echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + echo 
"ERRORS=${errors}" >> $GITHUB_OUTPUT + echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + ARTIFACTS: | + test-backend-independent.log + test-gpu.log + secrets: inherit + + test-maxtext: + if: inputs.MAXTEXT_IMAGE != '' + uses: ./.github/workflows/_test_maxtext.yaml + with: + MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} + secrets: inherit + + test-maxtext-eks: + if: inputs.MAXTEXT_IMAGE != '' + uses: ./.github/workflows/_test_maxtext_k8s.yaml + with: + MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} + secrets: inherit + + finalize: + needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks ] + if: "!cancelled()" + uses: ./.github/workflows/_finalize.yaml + secrets: inherit From 013d4a515629c249bd481ee2614c25efb6741f6d Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Thu, 26 Jun 2025 17:15:15 +0200 Subject: [PATCH 02/28] test-jax.sh: fix typo (#1526) This meant that ~40-80GB GPUs would run >4 parallel jobs. --- .github/container/test-jax.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh index 2671bcb65..3398b72c8 100755 --- a/.github/container/test-jax.sh +++ b/.github/container/test-jax.sh @@ -140,7 +140,7 @@ FLAGS+=("--//jaxlib/tools:add_pypi_cuda_wheel_deps=false") # Default parallelism: at least 10GB per test, no more than 4 tests per GPU. DEFAULT_JOBS_PER_GPU=$(( GPU_MEMORIES_MIB[0] / 10000)) -if (( DEFAULT_JOBS_PER_GPU > 8 )); then DEFAULT_JOBS_PER_GPU=4; fi +if (( DEFAULT_JOBS_PER_GPU > 4 )); then DEFAULT_JOBS_PER_GPU=4; fi set_default JOBS_PER_GPU ${DEFAULT_JOBS_PER_GPU} FLAGS+=( "--cache_test_results=${CACHE_TEST_RESULTS}" From 25ad36549e0d1081e019164c55c919f046f37178 Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Tue, 15 Jul 2025 11:33:43 +0100 Subject: [PATCH 03/28] Add k8s maxtext workflow --- .github/workflows/_test_maxtext_k8s.yaml | 107 +++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 .github/workflows/_test_maxtext_k8s.yaml diff --git a/.github/workflows/_test_maxtext_k8s.yaml b/.github/workflows/_test_maxtext_k8s.yaml new file mode 100644 index 000000000..7f82d3f42 --- /dev/null +++ b/.github/workflows/_test_maxtext_k8s.yaml @@ -0,0 +1,107 @@ +name: ~test MaxText functionality on Kubernetes + +on: + workflow_call: + inputs: + MAXTEXT_IMAGE: + type: string + description: MaxText container to test + required: true + +permissions: + contents: read # to fetch code + +jobs: + maxtext: + runs-on: eks + env: + CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}" + JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}" + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to NVIDIA Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} + - name: Store GitHub Container Registry token as Kubernetes secret + run: | + # Make this available to later steps + TOKEN_NAME="${JOB_NAME}-token" + echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV" + kubectl create secret generic \ + ${TOKEN_NAME} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + - name: Configure Kubernetes job + run: | + export SERVICE_NAME="${JOB_NAME}-svc" + yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME) + | select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) + | select(di == 
1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME) + | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \ + .github/eks-workflow-files/maxtext-job.yaml + git diff .github/eks-workflow-files/maxtext-job.yaml + - name: Submit Kubernetes job + run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml + - name: Wait for Kubernetes job to start + run: | + # Launcher job is created eagerly, but suspended. Kueue un-suspends it when + # resources are available, but that is where there can be a long wait if the + # cluster is busy executing other jobs. + kubectl wait --for=create job/${JOB_NAME} + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s + - name: Stream Kubernetes job output + run: | + # Streaming logs will fail if the container/pod is still pending + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 1 + done + kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME} + - name: Retrieve Kubernetes job status + shell: bash -exo pipefail {0} + run: | + while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failure=${status[0]:-0} + success=${status[1]:-0} + total=$((failure+success)) + if [[ ${total} -lt 2 ]]; then # -lt/-eq compare numerically; < inside [[ ]] compares as strings + sleep 1 + elif [[ ${total} -eq 2 ]]; then + break + else + # FIXME + exit 255 + fi + done + exit ${failure} + # Provide more debug output in case of failure; note that some kinds of launch + # failure do not produce any log output. 
+ - name: Debug failed Kubernetes job + if: failure() + run: | + # Provide better debug in case of launch failures that will not produce log output + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi + # Clean up in case of errors as well as success + - name: Delete Kubernetes job + if: always() + run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${TOKEN_NAME} From a598d8d3a63dda7f157b685fa45ca1da0e8d6cbc Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Thu, 3 Jul 2025 15:15:00 +0100 Subject: [PATCH 04/28] Add GKE example (#1481) Add `GKE` `MaxText` train ([example run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15744603099/job/44379358307)) and `NCCL` test ([example run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15744603099/job/44378422712)) workflows with reusable composite action for managing `xpk` job lifecycle (launch, logs streaming, clean up, artifact upload). 
Patches on `xpk` address the following identified issues: - https://github.com/AI-Hypercomputer/xpk/issues/476 - https://github.com/AI-Hypercomputer/xpk/issues/488 - https://github.com/AI-Hypercomputer/xpk/issues/490 - https://github.com/AI-Hypercomputer/xpk/issues/491 - https://github.com/AI-Hypercomputer/xpk/issues/492 Cluster create with `xpk` ([example run](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/15591134618/job/43910254644#step:5:1)) - added as a separate [workflow](https://github.com/NVIDIA/JAX-Toolbox/pull/1481/files#diff-801fc28cafbf1e0fa0ea521355fa8a1c9e6c01dcb8b1083c47f66e2ead4d560a) for demonstration purposes (will not be operational in the CI) --------- Co-authored-by: Olli Lupton --- .github/actions/gke-xpk/action.yml | 264 ++++++++++++++++++ .github/container/Dockerfile.nccl-gke | 12 + .../nccl/scripts/generate_hostfiles.sh | 24 ++ .../nccl/scripts/nccl-test-launch.sh | 39 +++ .../gke-workflow/nccl/scripts/start_ssh.sh | 11 + .github/gke-workflow/nccl/scripts/test.sh | 61 ++++ .github/gke-workflow/nccl/service.yml | 25 ++ .github/gke-workflow/xpk/blueprint.patch | 35 +++ .../gke-workflow/xpk/docker_resources.patch | 98 +++++++ .../gke-workflow/xpk/tcpxo_decorator.patch | 13 + .github/gke-workflow/xpk/workload.patch | 26 ++ .github/gke-workflow/xpk/xpk-sa-rbac.yml | 33 +++ .github/workflows/_ci.yaml | 13 + .github/workflows/_create_gke_cluster_xpk.yml | 65 +++++ .github/workflows/_test_maxtext_gke_xpk.yaml | 57 ++++ .github/workflows/_test_nccl.yaml | 7 +- .github/workflows/_test_nccl_gke.yaml | 109 ++++++++ 17 files changed, 891 insertions(+), 1 deletion(-) create mode 100644 .github/actions/gke-xpk/action.yml create mode 100644 .github/container/Dockerfile.nccl-gke create mode 100644 .github/gke-workflow/nccl/scripts/generate_hostfiles.sh create mode 100644 .github/gke-workflow/nccl/scripts/nccl-test-launch.sh create mode 100644 .github/gke-workflow/nccl/scripts/start_ssh.sh create mode 100644 .github/gke-workflow/nccl/scripts/test.sh 
create mode 100644 .github/gke-workflow/nccl/service.yml create mode 100644 .github/gke-workflow/xpk/blueprint.patch create mode 100644 .github/gke-workflow/xpk/docker_resources.patch create mode 100644 .github/gke-workflow/xpk/tcpxo_decorator.patch create mode 100644 .github/gke-workflow/xpk/workload.patch create mode 100644 .github/gke-workflow/xpk/xpk-sa-rbac.yml create mode 100644 .github/workflows/_create_gke_cluster_xpk.yml create mode 100644 .github/workflows/_test_maxtext_gke_xpk.yaml create mode 100644 .github/workflows/_test_nccl_gke.yaml diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml new file mode 100644 index 000000000..1b30e5ddb --- /dev/null +++ b/.github/actions/gke-xpk/action.yml @@ -0,0 +1,264 @@ +name: Launch workload on GKE with XPK + +description: "Launch a JobSet workload on GKE with XPK. Upload artifacts from container to GCS and GitHub Actions." + +inputs: + GCP_PROJECT: + description: 'GCP project ID' + default: nv-jaxtoolboxgcp-20240925 + type: string + GKE_CLUSTER: + description: 'GKE cluster name' + default: jtb-2025-06-12 + required: false + type: string + GCP_ZONE: + description: 'GCP zone of the cluster' + default: us-central1-a + required: false + type: string + CLUSTER_DEVICE: + description: 'GPU device type in the cluster' + default: h100-mega-80gb-8 + required: false + type: string + NUM_NODES: + description: 'Number of nodes to use in JobSet (n.b. each a3-megagpu-8g node has 8xGPU)' + default: '2' + required: false + type: string + MAIN_CONTAINER_NAME: + description: 'Name of the main container in an XPK JobSet (fixed)' + default: gpu-image + required: false + type: string + CONTAINER_OUTPUT_PATH: + description: 'Output directory for artifacts' + default: /opt/output + required: false + type: string + GCS_BUCKET: + description: 'GCS bucket to which CI output artifacts will be uploaded' + default: jaxtoolbox-ci + required: false + type: string + IMAGE: + description: 'URI of image to use in 
JobSet' + required: false + default: ghcr.io/nvidia/jax:latest + type: string + COMMAND: + description: 'Command to run in main container on JobSet start up' + required: false + default: 'nvidia-smi; free -h;' + type: string + EXIT_COMMAND: + description: 'Command to set exit code' + required: false + default: 'exit \$EXIT_CODE' + type: string + WORKLOAD_NAME_PREFIX: + description: 'Workload name prefix for XPK, also used to name uploaded artifact' + required: false + default: 'xpk' + type: string + XPK_VERSION: + description: 'XPK release tag' + required: false + default: 'v0.8.0' + type: string + XPK_PYTHON: + description: 'Python version for XPK' + required: false + default: '3.12.10' + type: string + +runs: + using: 'composite' + steps: + + - name: Set workload name + shell: bash -x -e -u {0} + run: | + WORKLOAD_NAME="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" + DATE=$(date +'%Y-%m-%d') + GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME}" + + echo "WORKLOAD_NAME=${WORKLOAD_NAME}" >> ${GITHUB_ENV} + echo "DATE=${DATE}" >> ${GITHUB_ENV} + echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV} + + - name: Setup environment + shell: bash -x -e -u {0} + run: | + mkdir -p ${WORKLOAD_NAME} + uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME} + source ${WORKLOAD_NAME}/.venv/bin/activate + + # install xpk + git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME}/xpk + + sed 's@pip install \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME}/xpk/Makefile + cd ${WORKLOAD_NAME}/xpk && sudo make install; cd - + + - name: Show environment + shell: bash -x -e -u {0} + run: | + gcloud version + + source ${WORKLOAD_NAME}/.venv/bin/activate + python --version + xpk version + + - name: Apply XPK workload create patch + shell: bash -x -e -u {0} + run: | + git apply --unsafe-paths 
.github/gke-workflow/xpk/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk + git apply --unsafe-paths .github/gke-workflow/xpk/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk + git apply --unsafe-paths .github/gke-workflow/xpk/workload.patch --directory ${WORKLOAD_NAME}/xpk + + - name: Set workload commands + shell: bash -x -e -u {0} + run: | + PRELUDE=" + apt install -y ripgrep > /dev/null; + curl -LO https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz; + tar xf google-cloud-cli-linux-x86_64.tar.gz; + ./google-cloud-sdk/install.sh --quiet > /dev/null; + ./google-cloud-sdk/bin/gcloud init; + + mkdir -p /usr/share/workload; + mkdir -p ${{ inputs.CONTAINER_OUTPUT_PATH }}; + " + + POSTLUDE=" + ./google-cloud-sdk/bin/gsutil cp -r ${{ inputs.CONTAINER_OUTPUT_PATH }}/ ${GCS_ARTIFACT_PATH}/node-0\$NODE_RANK; + ${{ inputs.EXIT_COMMAND }} + " + + CMD="${{ inputs.COMMAND }}" + + # set container commands in-line + PRELUDE=$(echo ${PRELUDE} | sed 's/\n/\ /g') + POSTLUDE=$(echo ${POSTLUDE} | sed 's/\n/\ /g') + CMD=$(echo ${CMD} | sed 's/\n/\ /g') + + echo "PRELUDE=${PRELUDE}" >> ${GITHUB_ENV} + echo "CMD=${CMD}" >> ${GITHUB_ENV} + echo "POSTLUDE=${POSTLUDE}" >> ${GITHUB_ENV} + + - name: Create workload on cluster with XPK + shell: bash -x -e -u {0} + run: | + source ${WORKLOAD_NAME}/.venv/bin/activate + cd ${WORKLOAD_NAME}/xpk + python xpk.py workload create \ + --project ${{ inputs.GCP_PROJECT }} \ + --cluster ${{ inputs.GKE_CLUSTER }} \ + --zone ${{ inputs.GCP_ZONE }} \ + --workload ${WORKLOAD_NAME} \ + --docker-image ${{ inputs.IMAGE }} \ + --device-type ${{ inputs.CLUSTER_DEVICE }} \ + --num-nodes ${{ inputs.NUM_NODES }} \ + --num-slices ${{ inputs.NUM_NODES }} \ + --priority=high \ + --scheduler=gke.io/topology-aware-auto \ + --command "${PRELUDE} ${CMD} ${POSTLUDE}" + + - name: Wait for JobSet to unsuspend on cluster + shell: bash -u {0} + env: + POLL_TIMEOUT: 3600 + run: | + START=$(date +%s) + JOBSET_ACTIVE=false 
+ while ! ${JOBSET_ACTIVE} || [ -z ${JOBSET_ACTIVE} ]; do + JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1') + NOW=$(date +%s) + ELAPSED=$(( NOW - START )) + if (( ELAPSED > POLL_TIMEOUT )) ; then + echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}" + exit 1 + fi + echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}" + sleep 5 + done + + echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}" + + - name: Set JobSet Pod name + shell: bash -u {0} + run: | + echo "POD=$(kubectl get pods -o json | jq -r '.items[] | select(.metadata.labels."'jobset.sigs.k8s.io/jobset-name'" == "'${WORKLOAD_NAME}'") | .metadata.name ' | sort | head -n1 )" >> ${GITHUB_ENV} + + - name: Wait for JobSet Pod readiness + shell: bash -u {0} + run: | + POD_READY=false + while ! ${POD_READY} || [ -z ${POD_READY} ]; do + echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready" + sleep 10 + + POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))') + if ${POD_ERROR} ; then + echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}" + break + fi + + POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? 
| select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready') + done; + + - name: Stream logs from JobSet Pods + shell: bash -u {0} + run: | + jobset_pods=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' ')) + + for jobset_pod in ${jobset_pods[@]}; do + kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c ${{ inputs.MAIN_CONTAINER_NAME }} ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}-${jobset_pod}-jobset.log & + done + wait < <(jobs -p) + + - name: Set exit code from JobSet logs + shell: bash -u {0} + run: | + MAYBE_XPK_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}-${POD}-jobset.log | awk '{ print $3 }' )" + echo ${MAYBE_XPK_EXIT_CODE} | grep -E 'EXIT\_CODE=[0-9]+$' + + if [ $? -ne 0 ]; then + echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected " + exit 1 + fi + + eval "export ${MAYBE_XPK_EXIT_CODE}" + exit ${EXIT_CODE} + + - name: Clean up JobSet from cluster + shell: bash -x -u {0} + if: ${{ always() }} + run: | + kubectl delete jobset --wait ${WORKLOAD_NAME} || echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}" + + - name: Download artifacts from GCS to runner + shell: bash -x -u {0} + run: | + mkdir -p output/${WORKLOAD_NAME} + mv ${WORKLOAD_NAME}-*.log output/${WORKLOAD_NAME} + gsutil cp -r ${GCS_ARTIFACT_PATH} output/${WORKLOAD_NAME} + + - name: Upload artifacts to GitHub Actions from runner + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.WORKLOAD_NAME_PREFIX }} + path: output/${{ env.WORKLOAD_NAME }}/* + + - name: Clean up GCS artifacts from runner + shell: bash -x -u {0} + if: ${{ always() }} + run: | + rm -rf output/${WORKLOAD_NAME} + + - name: Clean up xpk environment from runner + shell: bash -x -u {0} + if: ${{ always() }} + run: | + sudo rm -rf ${WORKLOAD_NAME} diff --git a/.github/container/Dockerfile.nccl-gke b/.github/container/Dockerfile.nccl-gke new file mode 100644 index 
000000000..7bfe9d4fe --- /dev/null +++ b/.github/container/Dockerfile.nccl-gke @@ -0,0 +1,12 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} as mealkit +FROM mealkit as final +COPY .github/gke-workflow/nccl/scripts /scripts +RUN apt-get update \ + && apt install -y openssh-server +RUN passwd -d root && \ + echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \ + echo "PermitEmptyPasswords yes" >> /etc/ssh/sshd_config && \ + echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \ + chmod +x /scripts/* + diff --git a/.github/gke-workflow/nccl/scripts/generate_hostfiles.sh b/.github/gke-workflow/nccl/scripts/generate_hostfiles.sh new file mode 100644 index 000000000..8f5d3117e --- /dev/null +++ b/.github/gke-workflow/nccl/scripts/generate_hostfiles.sh @@ -0,0 +1,24 @@ +len() { + local -r arr=($@) + echo "${#arr[@]}" +} + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +NRANKS_FACTORS=(1 2 4 8) + +NHOSTS=$(len "$@") +echo "generating hostfiles for ${NHOSTS} hosts: " +for h in "$@"; do echo "$h"; done + +mkdir -p "${SCRIPT_DIR}/hostfiles${NHOSTS}" + +for nr in "${NRANKS_FACTORS[@]}"; +do + rm -f "${SCRIPT_DIR}/hostfiles${NHOSTS}/hostfile${nr}" + touch "${SCRIPT_DIR}/hostfiles${NHOSTS}/hostfile${nr}" + for h in "$@"; + do + echo "$h port=22 slots=${nr}" >> "${SCRIPT_DIR}/hostfiles${NHOSTS}/hostfile${nr}" + done +done diff --git a/.github/gke-workflow/nccl/scripts/nccl-test-launch.sh b/.github/gke-workflow/nccl/scripts/nccl-test-launch.sh new file mode 100644 index 000000000..cf84dde8e --- /dev/null +++ b/.github/gke-workflow/nccl/scripts/nccl-test-launch.sh @@ -0,0 +1,39 @@ +BENCHMARK=$1 +NHOSTS=${NHOSTS:-2} +shift + +/scripts/start_ssh.sh ${@}; +pushd /scripts; + +/scripts/generate_hostfiles.sh ${@}; +popd; + +COMPLETION_FLAG=/opt/output/${BENCHMARK}_done + +service ssh restart + +if [ $NODE_RANK = 0 ] ; then + for host in ${@}; do + host_ready=false + while ! 
$host_ready; do + status=$(ssh $host echo "ready" 2> /dev/null || echo "unready") + if [ "$status" = "ready" ]; then + host_ready=true + break + fi + echo "$host not ready" + sleep 5 + done + echo "$host ready" + done + + NCCL_BENCHMARK=$BENCHMARK NHOSTS=$NHOSTS /scripts/test.sh + + for host in ${@}; do + ssh ${host} touch ${COMPLETION_FLAG} + done + +else + while [ ! -f $COMPLETION_FLAG ]; do sleep 10; done +fi + diff --git a/.github/gke-workflow/nccl/scripts/start_ssh.sh b/.github/gke-workflow/nccl/scripts/start_ssh.sh new file mode 100644 index 000000000..de3d3aba1 --- /dev/null +++ b/.github/gke-workflow/nccl/scripts/start_ssh.sh @@ -0,0 +1,11 @@ +PORT=${PORT:-22} + +while true; do + host=$1 + if [[ -z $host ]]; then + break + fi + ssh -p "${PORT}" "$host" \ + echo "Connected to ${host}" + shift +done diff --git a/.github/gke-workflow/nccl/scripts/test.sh b/.github/gke-workflow/nccl/scripts/test.sh new file mode 100644 index 000000000..4165b77d7 --- /dev/null +++ b/.github/gke-workflow/nccl/scripts/test.sh @@ -0,0 +1,61 @@ +set -x + +export SCRIPT_DIR=/scripts + +ulimit -n 1048576 + +NCCL_LIB_DIR=${NCCL_LIB_DIR} . 
/usr/local/nvidia/lib64/nccl-env-profile.sh + +: "${NCCL_BENCHMARK:?Must set NCCL_BENCHMARK}" +NCCL_MINBYTES="${NCCL_MINBYTES:-8G}" +NCCL_MAXBYTES="${NCCL_MAXBYTES:-16G}" +NCCL_STEPFACTOR="${NCCL_STEPFACTOR:-2}" +NCCL_ITERS="${NCCL_ITERS:-100}" +NCCL_WARMUP_ITERS="${NCCL_WARMUP_ITERS:-0}" + +run_nccl() { + mpirun --mca btl tcp,self \ + --mca btl_tcp_if_include eth0 \ + --allow-run-as-root \ + -np $(( GPUS_PER_NODE * "${NHOSTS}" )) \ + --hostfile "${SCRIPT_DIR}/hostfiles${NHOSTS}/hostfile${GPUS_PER_NODE}" \ + -x LD_LIBRARY_PATH \ + -x PATH \ + -x NCCL_DEBUG=VERSION \ + -x NCCL_TESTS_SPLIT_MASK="${NCCL_TESTS_SPLIT_MASK:-0x0}" \ + -x NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY="${NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY}" \ + -x NCCL_LIB_DIR \ + -x NCCL_FASTRAK_IFNAME=${NCCL_FASTRAK_IFNAME} \ + -x NCCL_FASTRAK_CTRL_DEV="${NCCL_SOCKET_IFNAME}" \ + -x NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME}" \ + -x NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \ + -x NCCL_ALGO=${NCCL_ALGO} \ + -x NCCL_PROTO=${NCCL_PROTO} \ + -x NCCL_MIN_NCHANNELS=${NCCL_MIN_NCHANNELS} \ + -x NCCL_P2P_NET_CHUNKSIZE=${NCCL_P2P_NET_CHUNKSIZE} \ + -x NCCL_P2P_PCI_CHUNKSIZE=${NCCL_P2P_PCI_CHUNKSIZE} \ + -x NCCL_P2P_NVL_CHUNKSIZE=${NCCL_P2P_NVL_CHUNKSIZE} \ + -x NCCL_FASTRAK_NUM_FLOWS=${NCCL_FASTRAK_NUM_FLOWS} \ + -x NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL=${NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL} \ + -x NCCL_BUFFSIZE=${NCCL_BUFFSIZE} \ + -x NCCL_FASTRAK_USE_SNAP=${NCCL_FASTRAK_USE_SNAP} \ + -x NCCL_FASTRAK_USE_LLCM=${NCCL_FASTRAK_USE_LLCM} \ + -x CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} \ + -x NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL} \ + -x NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=${NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING} \ + -x NCCL_TUNER_PLUGIN=${NCCL_TUNER_PLUGIN} \ + -x NCCL_TUNER_CONFIG_PATH=/usr/local/nvidia/lib64/a3plus_tuner_config.textproto \ + -x NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/usr/local/nvidia/lib64/a3plus_guest_config.textproto \ + -x NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS=${NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS} \ + 
-x NCCL_NVLS_ENABLE=${NCCL_NVLS_ENABLE} \ + ${NCCL_BENCHMARK} --minbytes ${NCCL_MINBYTES} \ + --maxbytes ${NCCL_MAXBYTES} \ + --stepfactor ${NCCL_STEPFACTOR} \ + --ngpus 1 \ + --check 1 \ + --warmup_iters ${NCCL_WARMUP_ITERS} \ + --iters ${NCCL_ITERS} 2>&1 | \ + tee "/opt/output/${NCCL_BENCHMARK}_nh${NHOSTS}_ng${GPUS_PER_NODE}_i${NCCL_ITERS}.txt" +} + +run_nccl "$@" diff --git a/.github/gke-workflow/nccl/service.yml b/.github/gke-workflow/nccl/service.yml new file mode 100644 index 000000000..7bd6e049b --- /dev/null +++ b/.github/gke-workflow/nccl/service.yml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: nccl-test-host-1 +spec: + selector: + batch.kubernetes.io/job-completion-index: "0" + clusterIP: None + ports: + - port: 22 + targetPort: 22 + protocol: TCP +--- +apiVersion: v1 +kind: Service +metadata: + name: nccl-test-host-2 +spec: + selector: + batch.kubernetes.io/job-completion-index: "1" + clusterIP: None + ports: + - port: 22 + targetPort: 22 + protocol: TCP diff --git a/.github/gke-workflow/xpk/blueprint.patch b/.github/gke-workflow/xpk/blueprint.patch new file mode 100644 index 000000000..50cdf746d --- /dev/null +++ b/.github/gke-workflow/xpk/blueprint.patch @@ -0,0 +1,35 @@ +diff --git a/src/xpk/core/blueprint/blueprint_generator.py b/src/xpk/core/blueprint/blueprint_generator.py +index ccbca90..22a880a 100644 +--- a/src/xpk/core/blueprint/blueprint_generator.py ++++ b/src/xpk/core/blueprint/blueprint_generator.py +@@ -156,7 +156,6 @@ class BlueprintGenerator: + source="modules/scheduler/gke-cluster", + use=[primary_vpc_name, gpu_subnets_name], + settings={ +- "release_channel": "RAPID", + "prefix_with_deployment_name": False, + "name_suffix": cluster_name, + "enable_private_endpoint": False, +@@ -194,20 +193,18 @@ class BlueprintGenerator: + a3_megagpu_pool_0 = DeploymentModule( + id="a3_megagpu_pool_0", + source="modules/compute/gke-node-pool", +- use=["gke_cluster", gpu_subnets_name, "group_placement_0"], ++ use=["gke_cluster", 
gpu_subnets_name], + settings={ + "name": f"{cluster_name}-a3-megagpu-pool-0", + "machine_type": system.gce_machine_type, ++ "guest_accelerator": [{"type":"nvidia-h100-mega-80gb", "count": 8, "gpu_driver_installation_config": {"gpu_driver_version": "DEFAULT"}}], + "static_node_count": num_nodes, + "zones": [zone], +- "host_maintenance_interval": "PERIODIC", + "reservation_affinity": self._getblock_reservation_affinity( + reservation + ), + "run_workload_script": False, + "spot": capacity_type == CapacityType.SPOT, +- "max_pods_per_node": 32, +- "auto_upgrade": True, + }, + outputs=["instructions"], + ) diff --git a/.github/gke-workflow/xpk/docker_resources.patch b/.github/gke-workflow/xpk/docker_resources.patch new file mode 100644 index 000000000..74c0ef9e6 --- /dev/null +++ b/.github/gke-workflow/xpk/docker_resources.patch @@ -0,0 +1,98 @@ +diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py +index a95c557..11e8e43 100644 +--- a/src/xpk/core/docker_resources.py ++++ b/src/xpk/core/docker_resources.py +@@ -20,6 +20,8 @@ from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to + from .system_characteristics import AcceleratorType, SystemCharacteristics + + ++JAX_TOOLBOX_IMAGE_CONTAINER_PORT = 3389 ++ + def get_main_container_resources( + args, system: SystemCharacteristics, resource_type + ) -> str: +@@ -64,7 +66,7 @@ def get_env_container(args, system: SystemCharacteristics) -> str: + str: + YAML with the env config for the main container, as a YAML string. 
+ """ +- gpu_env_yaml = """ ++ gpu_env_yaml = f""" + - name: REPLICATED_JOB_NAME + valueFrom: + fieldRef: +@@ -74,22 +76,22 @@ def get_env_container(args, system: SystemCharacteristics) -> str: + fieldRef: + fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] + - name: JAX_COORDINATOR_ADDRESS +- value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)" ++ value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME):{JAX_TOOLBOX_IMAGE_CONTAINER_PORT}" + - name: NNODES +- value: "{args.num_nodes}" ++ value: "{{args.num_nodes}}" + - name: NODE_RANK + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: USE_GPUDIRECT +- value: {gpu_direct_name} ++ value: {{gpu_direct_name}} + - name: GPUS_PER_NODE +- value: "{system.chips_per_vm}" ++ value: "{{system.chips_per_vm}}" + - name: JAX_COORDINATOR_PORT +- value: "6002" ++ value: "{JAX_TOOLBOX_IMAGE_CONTAINER_PORT}" + - name: COMMAND +- value: "{args.command}" +- {args.env}""" ++ value: "{{args.command}}" ++ {{args.env}}""" + + if system.accelerator_type == AcceleratorType['GPU']: + gpu_direct_name = 'fastrak' +@@ -123,7 +125,7 @@ def get_cpu_env(num_slices, env_vars, system) -> str: + Returns: + str: yaml containing env variables + """ +- yaml = """ ++ yaml = f""" + - name: REPLICATED_JOB_NAME + valueFrom: + fieldRef: +@@ -137,12 +139,12 @@ def get_cpu_env(num_slices, env_vars, system) -> str: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: PROCESSES_IN_JOB +- value: "{processes_in_job}" ++ value: "{{processes_in_job}}" + - name: JAX_PROCESS_COUNT +- value: "{process_count}" +- {env_vars} ++ value: "{{process_count}}" ++ {{env_vars}} + - name: JAX_COORDINATOR_ADDRESS +- value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)" ++ value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME):{JAX_TOOLBOX_IMAGE_CONTAINER_PORT}" + """ + return yaml.format( + 
processes_in_job=system.vms_per_slice, +@@ -251,7 +253,9 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str: + or system.device_type == H200_DEVICE_TYPE + or system.device_type == B200_DEVICE_TYPE + ): +- volume_mount_yaml = '' ++ volume_mount_yaml = """- name: shared-memory ++ mountPath: /dev/shm ++ """ + + storages: list[Storage] = get_storages_to_mount( + setup_k8s_env(args), args.storage +@@ -300,7 +304,7 @@ def add_container_ports(args, system: SystemCharacteristics) -> str: + if args.use_pathways: + return '' + +- gpu_port_yaml = """- containerPort: 6002""" ++ gpu_port_yaml = f"- containerPort: {JAX_TOOLBOX_IMAGE_CONTAINER_PORT}" + if system.accelerator_type == AcceleratorType['GPU']: + return gpu_port_yaml + return port_yaml diff --git a/.github/gke-workflow/xpk/tcpxo_decorator.patch b/.github/gke-workflow/xpk/tcpxo_decorator.patch new file mode 100644 index 000000000..62679f1e1 --- /dev/null +++ b/.github/gke-workflow/xpk/tcpxo_decorator.patch @@ -0,0 +1,13 @@ +diff --git a/src/xpk/core/workload_decorators/tcpxo_decorator.py b/src/xpk/core/workload_decorators/tcpxo_decorator.py +index 322e574..5a0cc42 100644 +--- a/src/xpk/core/workload_decorators/tcpxo_decorator.py ++++ b/src/xpk/core/workload_decorators/tcpxo_decorator.py +@@ -175,7 +175,7 @@ def update_gpu_containers(job_manifest): + if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}): + container.setdefault('env', []) + container['env'].append( +- {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'} ++ {'name': 'LD_LIBRARY_PATH', 'value': '/opt/nvidia/nccl/lib:/usr/local/cuda-12.8/targets/x86_64-local/lib:/usr/local/nvidia/lib64'} + ) + container['env'].append({ + 'name': 'NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY', diff --git a/.github/gke-workflow/xpk/workload.patch b/.github/gke-workflow/xpk/workload.patch new file mode 100644 index 000000000..447f633bd --- /dev/null +++ b/.github/gke-workflow/xpk/workload.patch @@ -0,0 +1,26 @@ +diff --git 
a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py +index a466a5c..8a5b99e 100644 +--- a/src/xpk/commands/workload.py ++++ b/src/xpk/commands/workload.py +@@ -227,6 +227,8 @@ spec: + tolerations: + - operator: "Exists" + key: nvidia.com/gpu ++ imagePullSecrets: ++ - name: jax-toolbox-ghcr + containers: + {container} + """ +@@ -463,6 +465,12 @@ def workload_create(args) -> None: + if args.device_type == cluster_gcluster.a3mega_device_type: + sub_networks = get_subnetworks_for_a3mega(args.cluster) + yml_string = tcpxo_decorator.decorate_jobset(yml_string, sub_networks) ++ yml_string += """\ ++ - name: shared-memory ++ emptyDir: ++ medium: Memory ++ sizeLimit: 0.5Ti ++ """ + + if args.device_type == cluster_gcluster.a3ultra_device_type: + sub_networks = get_subnetworks_for_a3ultra(args.cluster) diff --git a/.github/gke-workflow/xpk/xpk-sa-rbac.yml b/.github/gke-workflow/xpk/xpk-sa-rbac.yml new file mode 100644 index 000000000..9934c83d1 --- /dev/null +++ b/.github/gke-workflow/xpk/xpk-sa-rbac.yml @@ -0,0 +1,33 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: xpk-sa + namespace: default + annotations: + iam.gke.io/gcp-service-account: jobset-xpk-user@nv-jaxtoolboxgcp-20240925.iam.gserviceaccount.com +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: xpk-sa +rules: + - apiGroups: [""] + resources: ["pods", "services"] + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: xpk-sa-binding + namespace: default +subjects: + - kind: ServiceAccount + name: xpk-sa + namespace: default +roleRef: + kind: Role + name: xpk-sa + apiGroup: rbac.authorization.k8s.io diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index abe7d7dae..03a04d9aa 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -631,6 +631,19 @@ jobs: MAXTEXT_IMAGE: ${{ 
needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit + test-maxtext-gke: + needs: build-maxtext + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'maxtext' + ) + uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml + with: + MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + secrets: inherit + test-axlearn-eks: needs: build-axlearn if: >- diff --git a/.github/workflows/_create_gke_cluster_xpk.yml b/.github/workflows/_create_gke_cluster_xpk.yml new file mode 100644 index 000000000..3b177ef7c --- /dev/null +++ b/.github/workflows/_create_gke_cluster_xpk.yml @@ -0,0 +1,65 @@ +name: ~Create GKE cluster with XPK + +on: + workflow_call: + inputs: + CLUSTER_NAME: + type: string + description: Cluster name + default: jtb-2025-06-12 + required: false + +jobs: + xpk-create-gke-cluster: + env: + GKE_VERSION: 1.31.6-gke.1221000 + DEVICE_TYPE: h100-mega-80gb-8 + DEFAULT_CPU_MACHINE: e2-standard-8 + NUM_NODES: 2 + ZONE: us-central1-a + RESERVATION: jtb-reservation + PROJECT: nv-jaxtoolboxgcp-20240925 + + runs-on: gke-a3mega + + steps: + - uses: actions/checkout@v4 + + - name: Show environment + run: | + set -x + + gcloud version + + source $HOME/.venv/bin/activate + python --version + xpk version + + - name: Apply xpk cluster create patch + run: | + cd $HOME/xpk && git checkout src/xpk/core/blueprint/blueprint_generator.py && cd - + git apply --unsafe-paths .github/gke-workflow/xpk/blueprint.patch --directory $HOME/xpk + + - name: Create cluster from compute reservation with xpk + run: | + CLUSTER_EXISTS=$(gcloud container clusters list --format=json | jq -r 'any(.[].name; . == "'${CLUSTER_NAME}'")') + + if ! 
[ $CLUSTER_EXISTS = true ]; then + cd $HOME/xpk + source $HOME/.venv/bin/activate + python xpk.py cluster create \ + --cluster ${CLUSTER_NAME} \ + --gke-version ${GKE_VERSION} \ + --device-type ${DEVICE_TYPE} \ + --num-nodes ${NUM_NODES} \ + --default-pool-cpu-machine-type=${DEFAULT_CPU_MACHINE} \ + --project=${PROJECT} \ + --reservation ${RESERVATION} \ + --zone ${ZONE} + else + echo "Cluster ${CLUSTER_NAME} already exists, skipping creation" + fi + + - name: Configure cluster ServiceAccount + run: | + kubectl apply -f .github/gke-workflow/xpk/xpk-sa-rbac.yml diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml new file mode 100644 index 000000000..94a72f6e3 --- /dev/null +++ b/.github/workflows/_test_maxtext_gke_xpk.yaml @@ -0,0 +1,57 @@ +name: ~Test MaxText (GKE, XPK) + +on: + workflow_call: + inputs: + MAXTEXT_IMAGE: + type: string + description: MaxText image from ghcr.io/nvidia + default: ghcr.io/nvidia/jax:maxtext + required: false + +jobs: + maxtext-gke-xpk: + runs-on: gke-a3mega + + env: + WORKLOAD_NAME_PREFIX: gke-maxtext-train + MAXTEXT_MODEL: llama2-7b + MAXTEXT_ATTENTION_TYPE: cudnn_flash_te + MAXTEXT_REMAT_POLICY: minimal_flash + MAXTEXT_TRAIN_STEPS: 20 + MAXTEXT_FSDP: 16 + MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} + NUM_NODES: 2 + + steps: + - name: Run XPK workload on cluster + uses: ./.github/actions/gke-xpk + with: + IMAGE: ${{ env.MAXTEXT_IMAGE }} + WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }} + COMMAND: | + export NCCL_NET_PLUGIN=/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so; + export NCCL_TUNER_PLUGIN=none; + console=/dev/stdout; + + nsys-jax --capture-range=cudaProfilerApi + --capture-range-end=stop + -o /opt/output/profile.zip + -- + test-maxtext.sh -n ${{ env.NUM_NODES }} + -b ${{ env.NUM_NODES }} + --model-name=${{ env.MAXTEXT_MODEL }} + --attn-type=${{ env.MAXTEXT_ATTENTION_TYPE }} + --remat-policy=${{ env.MAXTEXT_REMAT_POLICY }} + --steps=${{ env.MAXTEXT_TRAIN_STEPS }} + 
--fsdp=${{ env.MAXTEXT_FSDP }} + --multiprocess + -a 'scan_layers=false + max_target_length=4096 + use_iota_embed=true + logits_dot_in_fp32=false + profiler=nsys + skip_first_n_steps_for_profiler=3 + profiler_steps=8' |& + tee /opt/output/output.log &> \${console}; + EXIT_CODE=\$PIPESTATUS; diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index b0436b562..8dbe3ac7b 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -14,6 +14,12 @@ permissions: packages: write # to upload container jobs: + nccl-test-gke: + uses: ./.github/workflows/_test_nccl_gke.yaml + with: + JAX_IMAGE: ${{ inputs.CONTAINER }} + secrets: inherit + build-mpi-operator-compatible-base: runs-on: [self-hosted, "amd64", "large"] steps: @@ -39,7 +45,6 @@ jobs: DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }} - nccl-test: needs: build-mpi-operator-compatible-base strategy: diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml new file mode 100644 index 000000000..0f9860407 --- /dev/null +++ b/.github/workflows/_test_nccl_gke.yaml @@ -0,0 +1,109 @@ +name: ~Test NCCL Kubernetes (GKE) + +on: + workflow_call: + inputs: + JAX_IMAGE: + type: string + description: JAX image from ghcr.io/nvidia + default: ghcr.io/nvidia/jax-toolbox-internal:15729070690-base-amd64 + required: false + +jobs: + build-nccl-gke: + runs-on: [self-hosted, "amd64", "large"] + steps: + - uses: actions/checkout@v4 + - name: Build NCCL image + id: build + uses: ./.github/actions/build-container + with: + ARCHITECTURE: amd64 + ARTIFACT_NAME: artifact-nccl-gke-build + BADGE_FILENAME: badge-nccl-gke-build + BUILD_DATE: 0000-00-00 # not important; this image is never published + BASE_IMAGE: ${{ inputs.JAX_IMAGE }} + CONTAINER_NAME: nccl-gke + DOCKERFILE: .github/container/Dockerfile.nccl-gke + RUNNER_SIZE: small + DOCKER_CONTEXT: . 
+ ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + outputs: + DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }} + + nccl-gke: + runs-on: gke-a3mega + + needs: build-nccl-gke + + strategy: + matrix: + test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi] + + env: + BASE_IMAGE: ${{ needs.build-nccl-gke.outputs.DOCKER_TAG_FINAL }} + TEST_NAME: ${{ matrix.test }} + WORKLOAD_NAME_PREFIX: nccl-gke + NHOSTS: 2 + NCCL_MINBYTES: 8 + NCCL_MAXBYTES: 16G + NCCL_STEPFACTOR: 2 + NCCL_ITERS: 100 + + steps: + - uses: actions/checkout@v4 + + - name: Set workload name prefix # due to 40 char limit + id: workload-name + run: | + TEST_NAME=$(echo "${{ matrix.test }}" | sed 's/_perf_mpi//g' | sed 's/_/-/g') + WORKLOAD_PREFIX="${{ env.WORKLOAD_NAME_PREFIX }}-${TEST_NAME}" + + echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT} + + - name: Create NCCL test Services on cluster + run: | + SERVICE_MANIFEST=".github/gke-workflow/nccl/service-${WORKLOAD_NAME}-${{ matrix.test }}.yaml" + WORKLOAD_NAME="${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" + echo "SERVICE_MANIFEST=${SERVICE_MANIFEST}" >> ${GITHUB_ENV} + + cat .github/gke-workflow/nccl/service.yml | yq '.spec.selector."jobset.sigs.k8s.io/jobset-name" = "'${WORKLOAD_NAME}'"' --yaml-output | tee ${SERVICE_MANIFEST} + kubectl apply -f ${SERVICE_MANIFEST} + + - name: Run XPK workload on cluster + uses: ./.github/actions/gke-xpk + with: + IMAGE: ${{ env.BASE_IMAGE }} + WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }} + COMMAND: | + export NHOSTS=${{ env.NHOSTS }}; + export NCCL_LIB_DIR=/opt/nvida/nccl/lib; + export SCRIPT_DIR=/scripts; + + export NCCL_MINBYTES=${{ env.NCCL_MINBYTES }}; + export NCCL_MAXBYTES=${{ env.NCCL_MAXBYTES }}; + export NCCL_STEPFACTOR=${{ 
env.NCCL_STEPFACTOR }}; + export NCCL_ITERS=${{ env.NCCL_ITERS }}; + + service ssh restart; + console=/dev/stdout; + declare -a hosts=('nccl-test-host-1' 'nccl-test-host-2'); + + /scripts/nccl-test-launch.sh ${{ matrix.test }} \${hosts[@]} |& + tee /opt/output/output.log &> \${console}; + + MAYBE_MPI_EXIT_CODE=\$(tail /opt/output/output.log | rg 'Exit code:[ ]+([0-9]+)' -or '\$1'); + if [ -z \${MAYBE_MPI_EXIT_CODE} ]; then + EXIT_CODE=0; + else + EXIT_CODE=\${MAYBE_MPI_EXIT_CODE}; + fi; + + - name: Clean up NCCL test Services from cluster + if: ${{ always() }} + run: | + kubectl delete -f ${SERVICE_MANIFEST} + From 5bbd1bd832814013bde3f76829abeb5e88eacfe1 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Tue, 15 Jul 2025 11:38:41 +0100 Subject: [PATCH 05/28] Add GKE test to NGC release workflow --- .github/workflows/ngc-release-testing.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml index 3150a07c2..4bd0f7123 100644 --- a/.github/workflows/ngc-release-testing.yaml +++ b/.github/workflows/ngc-release-testing.yaml @@ -75,8 +75,15 @@ jobs: MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} secrets: inherit + test-maxtext-gke: + if: inputs.MAXTEXT_IMAGE != '' + uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml + with: + MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} + secrets: inherit + finalize: - needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks ] + needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks, test-maxtext-gke ] if: "!cancelled()" uses: ./.github/workflows/_finalize.yaml secrets: inherit From 39baf576c8968d4d79c377862ecf42b76be4d079 Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Wed, 16 Jul 2025 12:20:04 +0100 Subject: [PATCH 06/28] Version xpk patch files --- .github/gke-workflow/xpk/{ => v0.8.0}/blueprint.patch | 0 .github/gke-workflow/xpk/{ => v0.8.0}/docker_resources.patch | 0 .github/gke-workflow/xpk/{ => v0.8.0}/tcpxo_decorator.patch | 0 .github/gke-workflow/xpk/{ => v0.8.0}/workload.patch | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename .github/gke-workflow/xpk/{ => v0.8.0}/blueprint.patch (100%) rename .github/gke-workflow/xpk/{ => v0.8.0}/docker_resources.patch (100%) rename .github/gke-workflow/xpk/{ => v0.8.0}/tcpxo_decorator.patch (100%) rename .github/gke-workflow/xpk/{ => v0.8.0}/workload.patch (100%) diff --git a/.github/gke-workflow/xpk/blueprint.patch b/.github/gke-workflow/xpk/v0.8.0/blueprint.patch similarity index 100% rename from .github/gke-workflow/xpk/blueprint.patch rename to .github/gke-workflow/xpk/v0.8.0/blueprint.patch diff --git a/.github/gke-workflow/xpk/docker_resources.patch b/.github/gke-workflow/xpk/v0.8.0/docker_resources.patch similarity index 100% rename from .github/gke-workflow/xpk/docker_resources.patch rename to .github/gke-workflow/xpk/v0.8.0/docker_resources.patch diff --git a/.github/gke-workflow/xpk/tcpxo_decorator.patch b/.github/gke-workflow/xpk/v0.8.0/tcpxo_decorator.patch similarity index 100% rename from .github/gke-workflow/xpk/tcpxo_decorator.patch rename to .github/gke-workflow/xpk/v0.8.0/tcpxo_decorator.patch diff --git a/.github/gke-workflow/xpk/workload.patch b/.github/gke-workflow/xpk/v0.8.0/workload.patch similarity index 100% rename from .github/gke-workflow/xpk/workload.patch rename to .github/gke-workflow/xpk/v0.8.0/workload.patch From 293f0dc0cc9be3f0b700175eea0d3973cb08a6d8 Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Wed, 16 Jul 2025 12:24:14 +0100 Subject: [PATCH 07/28] Add image pull secret name arg to xpk action template --- .github/actions/gke-xpk/action.yml | 12 +++++++++--- .github/gke-workflow/xpk/v0.8.0/workload.patch | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml index 1b30e5ddb..876bb3250 100644 --- a/.github/actions/gke-xpk/action.yml +++ b/.github/actions/gke-xpk/action.yml @@ -47,6 +47,11 @@ inputs: required: false default: ghcr.io/nvidia/jax:latest type: string + IMAGE_PULL_SECRET_NAME: + description: 'Name of k8s Secret resource for registry ImagePullSecret' + required: false + default: jax-toolbox-ghcr + type: string COMMAND: description: 'Command to run in main container on JobSet start up' required: false @@ -113,9 +118,10 @@ runs: - name: Apply XPK workload create patch shell: bash -x -e -u {0} run: | - git apply --unsafe-paths .github/gke-workflow/xpk/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk - git apply --unsafe-paths .github/gke-workflow/xpk/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk - git apply --unsafe-paths .github/gke-workflow/xpk/workload.patch --directory ${WORKLOAD_NAME}/xpk + sed -i 's/{{ IMAGE_PULL_SECRET_NAME}}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch + git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk + git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk + git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch --directory ${WORKLOAD_NAME}/xpk - name: Set workload commands shell: bash -x -e -u {0} diff --git a/.github/gke-workflow/xpk/v0.8.0/workload.patch b/.github/gke-workflow/xpk/v0.8.0/workload.patch index 447f633bd..85ce8d424 100644 --- 
a/.github/gke-workflow/xpk/v0.8.0/workload.patch +++ b/.github/gke-workflow/xpk/v0.8.0/workload.patch @@ -7,7 +7,7 @@ index a466a5c..8a5b99e 100644 - operator: "Exists" key: nvidia.com/gpu + imagePullSecrets: -+ - name: jax-toolbox-ghcr ++ - name: {{ IMAGE_PULL_SECRET_NAME }} containers: {container} """ From 2efa29446242a3ad48f67f6751f71a56a16f1e12 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Wed, 16 Jul 2025 12:26:36 +0100 Subject: [PATCH 08/28] Set custom image pull secret in maxtext GKE workflow --- .github/workflows/_test_maxtext_gke_xpk.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml index 94a72f6e3..dec9804fe 100644 --- a/.github/workflows/_test_maxtext_gke_xpk.yaml +++ b/.github/workflows/_test_maxtext_gke_xpk.yaml @@ -24,11 +24,21 @@ jobs: NUM_NODES: 2 steps: + - name: Login to nvcr.io Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + username: '$oauthtoken' + password: ${{ secrets.NVCR_TOKEN }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Run XPK workload on cluster uses: ./.github/actions/gke-xpk with: IMAGE: ${{ env.MAXTEXT_IMAGE }} WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }} + IMAGE_PULL_SECRET_NAME: ${{ steps.store-token.outputs.token-name }} COMMAND: | export NCCL_NET_PLUGIN=/opt/hpcx/nccl_rdma_sharp_plugin/lib/libnccl-net.so; export NCCL_TUNER_PLUGIN=none; From 4b233e6e3154ece5ca0d96959efe43e6aed76827 Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Wed, 16 Jul 2025 14:42:49 +0100 Subject: [PATCH 09/28] Fix pattern typo --- .github/actions/gke-xpk/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml index 876bb3250..c4d03c2dc 100644 --- a/.github/actions/gke-xpk/action.yml +++ b/.github/actions/gke-xpk/action.yml @@ -118,7 +118,7 @@ runs: - name: Apply XPK workload create patch shell: bash -x -e -u {0} run: | - sed -i 's/{{ IMAGE_PULL_SECRET_NAME}}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch + sed -i 's/{{ IMAGE_PULL_SECRET_NAME }}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/tcpxo_decorator.patch --directory ${WORKLOAD_NAME}/xpk git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/docker_resources.patch --directory ${WORKLOAD_NAME}/xpk git apply --unsafe-paths .github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}/workload.patch --directory ${WORKLOAD_NAME}/xpk From 0e4289479b821faf3129d40fd1a51bc67a48a972 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Wed, 16 Jul 2025 16:13:39 +0100 Subject: [PATCH 10/28] Fix nvcr username --- .github/workflows/_test_maxtext_gke_xpk.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml index dec9804fe..942527352 100644 --- a/.github/workflows/_test_maxtext_gke_xpk.yaml +++ b/.github/workflows/_test_maxtext_gke_xpk.yaml @@ -28,7 +28,7 @@ jobs: uses: docker/login-action@v3 with: registry: nvcr.io - username: '$oauthtoken' + username: $oauthtoken password: ${{ secrets.NVCR_TOKEN }} - name: K8s GHCR store and delete token id: store-token From 3b045aee2115e4179a7684541b3edef3963f934b Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Wed, 16 Jul 2025 16:27:12 +0100 Subject: [PATCH 11/28] Use checkout action to avoid cached repo use --- .github/workflows/_test_maxtext_gke_xpk.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml index 942527352..d8bd7b288 100644 --- a/.github/workflows/_test_maxtext_gke_xpk.yaml +++ b/.github/workflows/_test_maxtext_gke_xpk.yaml @@ -24,15 +24,19 @@ jobs: NUM_NODES: 2 steps: + - uses: actions/checkout@v4 + - name: Login to nvcr.io Container Registry uses: docker/login-action@v3 with: registry: nvcr.io username: $oauthtoken password: ${{ secrets.NVCR_TOKEN }} + - name: K8s GHCR store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr + - name: Run XPK workload on cluster uses: ./.github/actions/gke-xpk with: From 9e1d3608f55bc2b74eaef990d5d415d1ecc98d94 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Wed, 16 Jul 2025 16:55:53 +0100 Subject: [PATCH 12/28] Add EKS MaxText job via 481e71b --- .github/eks-workflow-files/maxtext-job.yaml | 120 ++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 .github/eks-workflow-files/maxtext-job.yaml diff --git a/.github/eks-workflow-files/maxtext-job.yaml b/.github/eks-workflow-files/maxtext-job.yaml new file mode 100644 index 000000000..7d9728f87 --- /dev/null +++ b/.github/eks-workflow-files/maxtext-job.yaml @@ -0,0 +1,120 @@ +apiVersion: v1 +kind: Service +metadata: + name: PLACEHOLDER +spec: + clusterIP: None # clusterIP must be None to create a headless service + selector: + job-name: PLACEHOLDER # must match Job name +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 2 # number of nodes + parallelism: 2 # number of nodes + completionMode: Indexed + backoffLimitPerIndex: 0 # max failures per index + maxFailedIndexes: 0 # all indices must succeed + template: + spec: + subdomain: 
PLACEHOLDER # has to match Service name + restartPolicy: Never + imagePullSecrets: + - name: PLACEHOLDER + containers: + - name: maxtext + image: PLACEHOLDER + ports: + - containerPort: 3389 + command: + - bash + - -c + # The logging logic: stream stdout/stderr from the 0th process inside this pod, + # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file + - | + export SERVICE_NAME=$0 + export JOB_NAME=$1 + cat >each-process.sh <<'EOL' + export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME} + export JAX_COORDINATOR_PORT=3389 + export NNODES=16 # actually #processes == #GPUs + export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK)) + export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK + export NCCL_DEBUG=INFO + export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log + [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null" + nsys-jax \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + -o /opt/output/profile.$NODE_RANK.zip \ + -- \ + test-maxtext.sh \ + -n 2 \ + -b 2 \ + --model-name=llama2-7b \ + --attn-type=cudnn_flash_te \ + --remat-policy=minimal_flash \ + --steps=20 \ + --fsdp=16 \ + -a "scan_layers=false \ + max_target_length=4096 \ + use_iota_embed=true \ + logits_dot_in_fp32=false \ + profiler=nsys \ + skip_first_n_steps_for_profiler=3 \ + profiler_steps=8" \ + |& tee /opt/output/output.$NODE_RANK.log >"${console}" + code=$? + # Should run even on failure + cat /opt/output/nccl.$NODE_RANK.log >"${console}" + exit $code + EOL + # TODO: upgrade parallel-launch to return a failure code as soon as any + # of its children do (it already does this eventually, but it could + # be slow) + parallel-launch LOCAL_RANK 8 bash each-process.sh + code=$? 
+ # Should run even on failure + touch /opt/output/.done + exit $code + - PLACEHOLDER + - PLACEHOLDER + resources: + limits: + nvidia.com/gpu: 8 + vpc.amazonaws.com/efa: 32 + volumeMounts: + - mountPath: /dev/shm + name: shmem + - mountPath: /opt/output + name: output + - name: upload + image: amazon/aws-cli + command: + - bash + - -c + - | + JOB_NAME="$0" + while [[ ! -f /opt/output/.done ]]; do + sleep 1 + done + rm /opt/output/.done + aws s3 cp \ + --recursive \ + /opt/output \ + "s3://jax-toolbox-eks-output/${JOB_NAME}/" + - PLACEHOLDER + volumeMounts: + - mountPath: /opt/output + name: output + volumes: + - name: output + emptyDir: {} + - name: shmem + emptyDir: + medium: Memory + sizeLimit: 16Gi From 34d8c668f560ba1ab42bb3b9169562459c0e5509 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Wed, 16 Jul 2025 17:40:27 +0100 Subject: [PATCH 13/28] Update NCCL registry --- .github/workflows/_test_nccl.yaml | 6 +++--- .github/workflows/_test_nccl_gke.yaml | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 8dbe3ac7b..9e8fc33c5 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -60,9 +60,9 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} - name: Create env vars id: var shell: bash diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml index 0f9860407..b9336869c 100644 --- a/.github/workflows/_test_nccl_gke.yaml +++ b/.github/workflows/_test_nccl_gke.yaml @@ -14,6 +14,18 @@ jobs: runs-on: [self-hosted, "amd64", "large"] steps: - uses: actions/checkout@v4 + + - name: Login to nvcr.io Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + 
username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} + + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr + - name: Build NCCL image id: build uses: ./.github/actions/build-container From 9ca9854b0fcc3ee6a855452a4bef8e7ab15c2672 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Wed, 16 Jul 2025 17:42:03 +0100 Subject: [PATCH 14/28] Update jax unit test slurm registry --- .github/workflows/_test_unit.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml index c376e64ab..44568669c 100644 --- a/.github/workflows/_test_unit.yaml +++ b/.github/workflows/_test_unit.yaml @@ -64,8 +64,8 @@ jobs: - name: Login to GitHub Container Registry uses: docker/login-action@v3 with: - registry: ghcr.io - username: ${{ github.repository_owner }} + registry: nvcr.io + username: $oauthtoken password: ${{ secrets.GITHUB_TOKEN }} - name: Run tests From 6c486ce39210280d95a523951fcc9098e7a61763 Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Thu, 17 Jul 2025 09:43:06 +0100 Subject: [PATCH 15/28] Fix registry login --- .github/workflows/_test_nccl.yaml | 6 ++++++ .github/workflows/_test_nccl_gke.yaml | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 9e8fc33c5..f81de95da 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -23,6 +23,12 @@ jobs: build-mpi-operator-compatible-base: runs-on: [self-hosted, "amd64", "large"] steps: + - name: Login to nvcr.io Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} - name: Checkout repository uses: actions/checkout@v4 - name: Build MPI operator compatible base container diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml index b9336869c..0a5dfaac0 100644 --- a/.github/workflows/_test_nccl_gke.yaml +++ b/.github/workflows/_test_nccl_gke.yaml @@ -22,10 +22,6 @@ jobs: username: $oauthtoken password: ${{ secrets.NVCR_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Build NCCL image id: build uses: ./.github/actions/build-container From 766f71960ac41b8c4e6e428c2283beeb4f8007bb Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Thu, 17 Jul 2025 09:46:48 +0100 Subject: [PATCH 16/28] Update trigger --- .github/workflows/ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..d602488ff 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,6 +12,8 @@ on: paths-ignore: - '**.md' - '.github/triage/**' + branches-ignore: + - '25.08-devel-add-ngc-release-testing' workflow_dispatch: inputs: PUBLISH: From 580f54cb62bba712ef94dc88620b9f039f4f83ca Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Thu, 17 Jul 2025 10:37:26 +0100 Subject: [PATCH 17/28] Update registry login --- .github/workflows/_test_unit.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_test_unit.yaml b/.github/workflows/_test_unit.yaml index 44568669c..fe74491e8 100644 --- a/.github/workflows/_test_unit.yaml +++ b/.github/workflows/_test_unit.yaml @@ -61,12 +61,12 @@ jobs: - name: Check out repository uses: actions/checkout@v4 - - name: Login to GitHub Container Registry + - name: Login to nvcr Container Registry uses: docker/login-action@v3 with: registry: nvcr.io username: $oauthtoken - password: ${{ secrets.GITHUB_TOKEN }} + password: ${{ secrets.NVCR_TOKEN }} - name: Run tests shell: bash -x -e {0} From b54859aae9deac13402373cd107fdbb8a9891ae1 Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Thu, 17 Jul 2025 14:02:47 +0100 Subject: [PATCH 18/28] Fix image pull registry --- .github/workflows/_test_nccl.yaml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index f81de95da..98937f251 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -63,25 +63,24 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - - name: Login to GitHub Container Registry + - name: Login to GHCR Container Registry uses: docker/login-action@v3 with: - registry: nvcr.io - username: $oauthtoken - password: ${{ secrets.NVCR_TOKEN }} + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ inputs.github-token }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Create env vars id: var shell: bash run: | JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" LAUNCHER_NAME="${JOB_NAME}-launcher" - TOKEN_NAME="${JOB_NAME}-token" # Make these available to later steps echo "JOB_NAME=${JOB_NAME}" 
>> "$GITHUB_ENV" echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | export WORKER_NAME="${JOB_NAME}-worker" From bd1f3adc97331ab99f645b2b38cc4199e500f4cc Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Thu, 17 Jul 2025 14:12:13 +0100 Subject: [PATCH 19/28] Set correct registry password --- .github/workflows/_test_nccl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 98937f251..a2365edd7 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -68,7 +68,7 @@ jobs: with: registry: ghcr.io username: ${{ github.repository_owner }} - password: ${{ inputs.github-token }} + password: ${{ secrets.GITHUB_TOKEN }} - name: K8s GHCR store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr From c902d5348f13202fc36fbf09b91a01479e2a262d Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Tue, 22 Jul 2025 11:14:07 +0100 Subject: [PATCH 20/28] Remove redundant testing (covered internally) --- .github/workflows/ngc-release-testing.yaml | 37 ---------------------- 1 file changed, 37 deletions(-) diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml index 4bd0f7123..b810e3232 100644 --- a/.github/workflows/ngc-release-testing.yaml +++ b/.github/workflows/ngc-release-testing.yaml @@ -31,43 +31,6 @@ jobs: CONTAINER: ${{ inputs.JAX_IMAGE }} secrets: inherit - test-jax: - if: inputs.JAX_IMAGE != '' - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ inputs.JAX_IMAGE }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ inputs.JAX_IMAGE }} \ - bash <<"EOF" |& tee tee test-gpu.log - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit - - test-maxtext: - if: inputs.MAXTEXT_IMAGE != '' - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} - secrets: inherit - test-maxtext-eks: if: inputs.MAXTEXT_IMAGE != '' uses: ./.github/workflows/_test_maxtext_k8s.yaml From 19a2ec1623a144b387e5e5611723e60248a9e271 Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Tue, 22 Jul 2025 13:57:57 +0100 Subject: [PATCH 21/28] Remove redundant testing (covered internally) --- .github/workflows/ngc-release-testing.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml index b810e3232..a9f2f48d9 100644 --- a/.github/workflows/ngc-release-testing.yaml +++ b/.github/workflows/ngc-release-testing.yaml @@ -46,7 +46,7 @@ jobs: secrets: inherit finalize: - needs: [ test-nccl, test-jax, test-maxtext, test-maxtext-eks, test-maxtext-gke ] + needs: [ test-nccl, test-maxtext-eks, test-maxtext-gke ] if: "!cancelled()" uses: ./.github/workflows/_finalize.yaml secrets: inherit From 92f086b15c471aede8dc61c683c1edbfdc16e533 Mon Sep 17 00:00:00 2001 From: Brian Yang <125406446+gpupuck@users.noreply.github.com> Date: Thu, 24 Jul 2025 15:09:16 -0700 Subject: [PATCH 22/28] Remove remote and main (#1574) --- .github/container/git-clone.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/container/git-clone.sh b/.github/container/git-clone.sh index f4ddbc7fb..956d189b7 100755 --- a/.github/container/git-clone.sh +++ b/.github/container/git-clone.sh @@ -77,6 +77,13 @@ pushd ${DESTINATION} git checkout ${GIT_REF} COMMIT_SHA=$(git rev-parse HEAD) git submodule update --init --recursive +if [[ "${GIT_REPO}" == *"gitlab"* ]]; then + git remote remove origin + if grep -q -r gitlab-ci-token .git; then + grep -r gitlab-ci-token .git | awk -F: '{print $1}' | xargs rm -f + fi + git branch -D main +fi popd ## update the manifest file From 9a366f4bab04b1a029e5ad1453722d8d7658137f Mon Sep 17 00:00:00 2001 From: Brian Yang <125406446+gpupuck@users.noreply.github.com> Date: Fri, 25 Jul 2025 02:33:01 -0700 Subject: [PATCH 23/28] Upgrade werkzeug for MaxText (#1575) Upgrade werkzeug to avoid vulnerabilities in 2.0.3. 
To be able to do that, google-cloud-aiplatform needs to at least >= 1.90.0 (refer to https://github.com/googleapis/python-aiplatform/blob/v1.90.0/setup.py#L51) --- .github/container/Dockerfile.maxtext | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 49329e1aa..752a8e105 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -39,6 +39,7 @@ for pattern in \ "s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \ "s|sentencepiece==0.1.97|sentencepiece>=0.2|g" \ "s|tensorflow>=2.13.0|tensorflow==2.18.1|g" \ + "s|google-cloud-aiplatform==1.61.0|google-cloud-aiplatform>=1.90.0|g" \ ; do # tensorflow-cpu==2.19.0 is incompatible with tensorflow-text sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt @@ -51,6 +52,7 @@ echo >> ${SRC_PATH_MAXTEXT}/requirements.txt # add new line for requirement in \ "tensorflow-metadata>=1.15.0" \ "seqio@git+https://github.com/google/seqio.git" \ + "werkzeug>=3.0.3" \ ; do echo "${requirement}" >> ${SRC_PATH_MAXTEXT}/requirements.txt done From c8fe23a19516ffc95322e7c9e0912983943629ef Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Fri, 25 Jul 2025 13:01:55 +0100 Subject: [PATCH 24/28] Add basic sitrep steps for GKE XPK action (#1580) --- .github/actions/gke-xpk/action.yml | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml index c4d03c2dc..574e1be33 100644 --- a/.github/actions/gke-xpk/action.yml +++ b/.github/actions/gke-xpk/action.yml @@ -232,10 +232,12 @@ runs: if [ $? 
-ne 0 ]; then echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected " + echo "XPK_EXIT_CODE=1" >> ${GITHUB_ENV} exit 1 fi eval "export ${MAYBE_XPK_EXIT_CODE}" + echo "XPK_EXIT_CODE=${EXIT_CODE}" >> ${GITHUB_ENV} exit ${EXIT_CODE} - name: Clean up JobSet from cluster @@ -268,3 +270,39 @@ runs: if: ${{ always() }} run: | sudo rm -rf ${WORKLOAD_NAME} + + - name: Generate sitrep + id: sitrep + shell: bash -x -e {0} + if: ${{ always() }} + run: | + source .github/workflows/scripts/to_json.sh + badge_label="${{ matrix.test }}" + + summary="${{ inputs.WORKLOAD_NAME_PREFIX }}" + outcome=success + badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}" + badge_color=brightgreen + + if [ "${XPK_EXIT_CODE}" -gt 0 ]; then + badge_color=red + outcome=failed + summary+=": fail" + else + summary+=": pass" + fi + + to_json summary \ + badge_label \ + badge_color \ + outcome | \ + tee sitrep.json + + - name: Upload sitrep to GitHub Actions from runner + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep + path: | + sitrep.json + From 0b5135b97569874ef7e41422206379f478b9254e Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 30 Jul 2025 15:18:28 +0200 Subject: [PATCH 25/28] nccl-tests: set LD_LIBRARY_PATH through mpirun (#1589) (#1590) This helps CUDA forward compatibility work when spawning processes over SSH, as those processes do not see environment variables set by the container entrypoint that handles forward compatibility. `/usr/local/cuda/compat/lib` will only exist if the entrypoint detects that forward compatibility mode is enabled. 
--- .github/eks-workflow-files/mpi-nccl-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml index 88cca0590..0e34cb7a2 100644 --- a/.github/eks-workflow-files/mpi-nccl-test.yml +++ b/.github/eks-workflow-files/mpi-nccl-test.yml @@ -41,7 +41,8 @@ spec: echo "Workers were still not reachable after ${limit}, exiting" exit 1 fi - mpirun --allow-run-as-root -np 16 -N 8 $0 \ + mpirun --allow-run-as-root --tag-output -N 1 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib nvidia-smi + mpirun --allow-run-as-root -N 8 -x LD_LIBRARY_PATH=/usr/local/cuda/compat/lib $0 \ -b 8 \ -e 16G \ -f 2 \ From c70b23990fdb1560a0bacd2ebec559c7c72691d7 Mon Sep 17 00:00:00 2001 From: Brian Yang <125406446+gpupuck@users.noreply.github.com> Date: Fri, 1 Aug 2025 09:27:12 -0700 Subject: [PATCH 26/28] Pin orbax-checkpoint to 0.11.19 and pip-tools to 7.4.1 (#1594) They just so happened to get upgraded on July 31st together, but - orbax-checkpoint 0.11.20 has issues without internal checkpoint testing - pip-tools 7.5.0 will cause `ValueError: '/opt/maxtext/requirements.txt (line 1)' is not in the subpath of '/opt/pip-tools.d'`. I'm guessing something is not quite compatible with the Python 3.12 we current have in the base container. Theoretically, `-r ../maxtext/requirements.txt` should work, but since we are using a specific version of pip. 
Let's play safe at this point and use 7.4.1 --- .github/container/Dockerfile.base | 2 +- .github/container/Dockerfile.jax | 1 + .github/container/Dockerfile.maxtext | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base index 06072e8ae..8603ad054 100644 --- a/.github/container/Dockerfile.base +++ b/.github/container/Dockerfile.base @@ -129,7 +129,7 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1 # after upgrading to ver 23.3.1 (from /opt/pip) `pip` tries to uninstall itself (default pip-24.0) # and fails due to pip-24.0 has been installed with system tool `apt` but not `python`. So we keep # both pip-24.0 and pip-23.3.1 in the system, but use 23.3.1 with equivalency patch (see above). -RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/* +RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip "pip-tools==7.4.1" && rm -rf ~/.cache/* # The symlinks for CUDA/cuDNN/NCCL exist to make the container's installations # of those components conform to XLA's expectations for local installations. 
diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 4f20aa2b3..6d9bc956a 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -109,6 +109,7 @@ EOF ## Flax RUN <<"EOF" bash -ex git-clone.sh ${URLREF_FLAX} ${SRC_PATH_FLAX} +sed -i 's/orbax-checkpoint/orbax-checkpoint==0.11.19/' ${SRC_PATH_FLAX}/pyproject.toml echo "-e file://${SRC_PATH_FLAX}" >> /opt/pip-tools.d/requirements-flax.in EOF diff --git a/.github/container/Dockerfile.maxtext b/.github/container/Dockerfile.maxtext index 752a8e105..2033c6e4f 100644 --- a/.github/container/Dockerfile.maxtext +++ b/.github/container/Dockerfile.maxtext @@ -40,6 +40,7 @@ for pattern in \ "s|sentencepiece==0.1.97|sentencepiece>=0.2|g" \ "s|tensorflow>=2.13.0|tensorflow==2.18.1|g" \ "s|google-cloud-aiplatform==1.61.0|google-cloud-aiplatform>=1.90.0|g" \ + "s|orbax-checkpoint>=0.5.12|orbax-checkpoint==0.11.19|g" \ ; do # tensorflow-cpu==2.19.0 is incompatible with tensorflow-text sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt From d506eefa612ea5dcb69423303417330b06a08f5d Mon Sep 17 00:00:00 2001 From: "Alex Y. 
Chan" Date: Tue, 5 Aug 2025 15:06:32 +0100 Subject: [PATCH 27/28] Set GKE NCCL to use k8s secret action --- .github/workflows/_test_nccl_gke.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml index 0a5dfaac0..eef8b2990 100644 --- a/.github/workflows/_test_nccl_gke.yaml +++ b/.github/workflows/_test_nccl_gke.yaml @@ -72,6 +72,17 @@ jobs: echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT} + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Store registry secret on cluster + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr + - name: Create NCCL test Services on cluster run: | SERVICE_MANIFEST=".github/gke-workflow/nccl/service-${WORKLOAD_NAME}-${{ matrix.test }}.yaml" @@ -86,6 +97,7 @@ jobs: with: IMAGE: ${{ env.BASE_IMAGE }} WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }} + IMAGE_PULL_SECRET_NAME: ${{ steps.store-token.outputs.token-name }} COMMAND: | export NHOSTS=${{ env.NHOSTS }}; export NCCL_LIB_DIR=/opt/nvida/nccl/lib; From 389bc2f52722cdcd130184080d16d997f60a107a Mon Sep 17 00:00:00 2001 From: "Alex Y. Chan" Date: Wed, 6 Aug 2025 12:14:13 +0100 Subject: [PATCH 28/28] Update default GKE cluster --- .github/actions/gke-xpk/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml index 574e1be33..57b61b411 100644 --- a/.github/actions/gke-xpk/action.yml +++ b/.github/actions/gke-xpk/action.yml @@ -9,7 +9,7 @@ inputs: type: string GKE_CLUSTER: description: 'GKE cluster name' - default: jtb-2025-06-12 + default: jtb-2025-08-06 required: false type: string GCP_ZONE: