Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cd29eab
Add NGC release test workflow
aybchan Jun 26, 2025
8bbcfa3
Merge branch '25.07-devel' into 25.07-devel-add-ngc-release-testing
aybchan Jul 1, 2025
013d4a5
test-jax.sh: fix typo (#1526)
olupton Jun 26, 2025
25ad365
Add k8s maxtext workflow
aybchan Jul 15, 2025
a598d8d
Add GKE example (#1481)
aybchan Jul 3, 2025
5bbd1bd
Add GKE test to NGC release workflow
aybchan Jul 15, 2025
39baf57
Version xpk patch files
aybchan Jul 16, 2025
293f0dc
Add image pull secret name arg to xpk action template
aybchan Jul 16, 2025
2efa294
Set custom image pull secret in maxtext GKE workflow
aybchan Jul 16, 2025
4b233e6
Fix pattern typo
aybchan Jul 16, 2025
0e42894
Fix nvcr username
aybchan Jul 16, 2025
3b045ae
Use checkout action to avoid cached repo use
aybchan Jul 16, 2025
9e1d360
Add EKS MaxText job via 481e71b
aybchan Jul 16, 2025
34d8c66
Update NCCL registry
aybchan Jul 16, 2025
9ca9854
Update jax unit test slurm registry
aybchan Jul 16, 2025
6c486ce
Fix registry login
aybchan Jul 17, 2025
766f719
Update trigger
aybchan Jul 17, 2025
580f54c
Update registry login
aybchan Jul 17, 2025
b54859a
Fix image pull registry
aybchan Jul 17, 2025
bd1f3ad
Set correct registry password
aybchan Jul 17, 2025
c902d53
Remove redundant testing (covered internally)
aybchan Jul 22, 2025
19a2ec1
Remove redundant testing (covered internally)
aybchan Jul 22, 2025
92f086b
Remove remote and main (#1574)
gpupuck Jul 24, 2025
9a366f4
Upgrade werkzeug for MaxText (#1575)
gpupuck Jul 25, 2025
c8fe23a
Add basic sitrep steps for GKE XPK action (#1580)
aybchan Jul 25, 2025
0b5135b
nccl-tests: set LD_LIBRARY_PATH through mpirun (#1589) (#1590)
olupton Jul 30, 2025
c70b239
Pin orbax-checkpoint to 0.11.19 and pip-tools to 7.4.1 (#1594)
gpupuck Aug 1, 2025
d506eef
Set GKE NCCL to use k8s secret action
aybchan Aug 5, 2025
389bc2f
Update default GKE cluster
aybchan Aug 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 308 additions & 0 deletions .github/actions/gke-xpk/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
name: Launch workload on GKE with XPK

description: "Launch a JobSet workload on GKE with XPK. Upload artifacts from container to GCS and GitHub Actions."

# Every input is optional and falls back to the default declared here.
# Input values are consumed as strings by the steps of this action, so
# number-like defaults are quoted.
inputs:
  GCP_PROJECT:
    description: 'GCP project ID'
    default: nv-jaxtoolboxgcp-20240925
    required: false
    type: string
  GKE_CLUSTER:
    description: 'GKE cluster name'
    default: jtb-2025-08-06
    required: false
    type: string
  GCP_ZONE:
    description: 'GCP zone of the cluster'
    default: us-central1-a
    required: false
    type: string
  CLUSTER_DEVICE:
    description: 'GPU device type in the cluster'
    default: h100-mega-80gb-8
    required: false
    type: string
  NUM_NODES:
    description: 'Number of nodes to use in JobSet (n.b each a3-megagpu-8g node has 8xGPU)'
    default: '2'
    required: false
    type: string
  MAIN_CONTAINER_NAME:
    description: 'Name of the main container in an XPK JobSet (fixed)'
    default: gpu-image
    required: false
    type: string
  CONTAINER_OUTPUT_PATH:
    description: 'Output directory for artifacts'
    default: /opt/output
    required: false
    type: string
  GCS_BUCKET:
    description: 'GCS bucket to which CI output artifacts will be uploaded'
    default: jaxtoolbox-ci
    required: false
    type: string
  IMAGE:
    description: 'URI of image to use in JobSet'
    required: false
    default: ghcr.io/nvidia/jax:latest
    type: string
  IMAGE_PULL_SECRET_NAME:
    description: 'Name of k8s Secret resource for registry ImagePullSecret'
    required: false
    default: jax-toolbox-ghcr
    type: string
  COMMAND:
    description: 'Command to run in main container on JobSet start up'
    required: false
    default: 'nvidia-smi; free -h;'
    type: string
  # The backslash keeps $EXIT_CODE from being expanded on the runner when the
  # command string is assembled; it is meant to be evaluated in the container.
  EXIT_COMMAND:
    description: 'Command to set exit code'
    required: false
    default: 'exit \$EXIT_CODE'
    type: string
  WORKLOAD_NAME_PREFIX:
    description: 'Workload name prefix for XPK, also used to name uploaded artifact'
    required: false
    default: 'xpk'
    type: string
  XPK_VERSION:
    description: 'XPK release tag'
    required: false
    default: 'v0.8.0'
    type: string
  XPK_PYTHON:
    description: 'Python version for XPK'
    required: false
    default: '3.12.10'
    type: string

# Composite action definition; the steps that make up the action follow.
runs:
using: 'composite'
steps:

- name: Set workload name
  shell: bash -x -e -u {0}
  run: |
    # Build a workload name unique to this run attempt, plus the dated GCS
    # prefix its artifacts are written under.
    DATE=$(date +'%Y-%m-%d')
    WORKLOAD_NAME="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME}"

    # Export all three values to the steps that follow.
    {
      echo "WORKLOAD_NAME=${WORKLOAD_NAME}"
      echo "DATE=${DATE}"
      echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}"
    } >> ${GITHUB_ENV}

- name: Setup environment
  shell: bash -x -e -u {0}
  run: |
    # Everything for this run lives under a directory named after the workload.
    VENV_DIR=${WORKLOAD_NAME}/.venv
    XPK_DIR=${WORKLOAD_NAME}/xpk

    # Create and enter a virtualenv with the requested Python version.
    mkdir -p ${WORKLOAD_NAME}
    uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME}
    source ${VENV_DIR}/bin/activate

    # Fetch the requested xpk release tag.
    git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${XPK_DIR}

    # Rewrite the Makefile so "pip install ." is run through uv, then install.
    sed -i 's@pip install \.@'$(which uv)' pip install \.@g' ${XPK_DIR}/Makefile
    cd ${XPK_DIR} && sudo make install; cd -

- name: Show environment
  shell: bash -x -e -u {0}
  run: |
    # Print tool versions for the job log.
    gcloud version

    # python and xpk are reported from inside the virtualenv created for this run.
    . ${WORKLOAD_NAME}/.venv/bin/activate
    python --version
    xpk version

- name: Apply XPK workload create patch
  shell: bash -x -e -u {0}
  run: |
    # Patch files are stored per XPK release tag.
    PATCH_DIR=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}

    # Fill in the image pull secret placeholder before the workload patch is applied.
    sed -i 's/{{ IMAGE_PULL_SECRET_NAME }}/${{ inputs.IMAGE_PULL_SECRET_NAME }}/g' ${PATCH_DIR}/workload.patch

    # Apply the patches to the cloned xpk tree, in this order.
    for PATCH in tcpxo_decorator docker_resources workload; do
      git apply --unsafe-paths ${PATCH_DIR}/${PATCH}.patch --directory ${WORKLOAD_NAME}/xpk
    done

- name: Set workload commands
  shell: bash -x -e -u {0}
  run: |
    # Commands run in the container before the user command: install ripgrep
    # and the Google Cloud CLI, then create the working and output directories.
    PRELUDE="
      apt install -y ripgrep > /dev/null;
      curl -LO https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz;
      tar xf google-cloud-cli-linux-x86_64.tar.gz;
      ./google-cloud-sdk/install.sh --quiet > /dev/null;
      ./google-cloud-sdk/bin/gcloud init;

      mkdir -p /usr/share/workload;
      mkdir -p ${{ inputs.CONTAINER_OUTPUT_PATH }};
    "

    # Commands run after the user command: copy the output directory to GCS
    # and run the exit command. \$NODE_RANK is escaped so it is left for the
    # container to expand rather than the runner.
    POSTLUDE="
      ./google-cloud-sdk/bin/gsutil cp -r ${{ inputs.CONTAINER_OUTPUT_PATH }}/ ${GCS_ARTIFACT_PATH}/node-0\$NODE_RANK;
      ${{ inputs.EXIT_COMMAND }}
    "

    CMD="${{ inputs.COMMAND }}"

    # Flatten each command string onto one line so it can be written to
    # GITHUB_ENV as a plain NAME=value entry. The unquoted expansion is what
    # collapses newlines and repeated whitespace. Pathname expansion is
    # switched off around it so glob characters in a command (e.g. '*') are
    # passed through instead of being expanded against the runner filesystem.
    set -f
    PRELUDE=$(echo ${PRELUDE})
    POSTLUDE=$(echo ${POSTLUDE})
    CMD=$(echo ${CMD})
    set +f

    echo "PRELUDE=${PRELUDE}" >> ${GITHUB_ENV}
    echo "CMD=${CMD}" >> ${GITHUB_ENV}
    echo "POSTLUDE=${POSTLUDE}" >> ${GITHUB_ENV}

- name: Create workload on cluster with XPK
  shell: bash -x -e -u {0}
  run: |
    source ${WORKLOAD_NAME}/.venv/bin/activate
    cd ${WORKLOAD_NAME}/xpk

    # Arguments for "xpk workload create". The container command is the
    # prelude, user command and postlude joined into a single string.
    XPK_ARGS=(
      --project ${{ inputs.GCP_PROJECT }}
      --cluster ${{ inputs.GKE_CLUSTER }}
      --zone ${{ inputs.GCP_ZONE }}
      --workload ${WORKLOAD_NAME}
      --docker-image ${{ inputs.IMAGE }}
      --device-type ${{ inputs.CLUSTER_DEVICE }}
      --num-nodes ${{ inputs.NUM_NODES }}
      --num-slices ${{ inputs.NUM_NODES }}
      --priority=high
      --scheduler=gke.io/topology-aware-auto
      --command "${PRELUDE} ${CMD} ${POSTLUDE}"
    )

    python xpk.py workload create "${XPK_ARGS[@]}"

- name: Wait for JobSet to unsuspend on cluster
  shell: bash -u {0}
  env:
    POLL_TIMEOUT: 3600
  run: |
    # Poll until the first replicated job of the JobSet reports exactly one
    # active job, giving up once POLL_TIMEOUT seconds have passed.
    DEADLINE=$(( $(date +%s) + POLL_TIMEOUT ))
    JOBSET_ACTIVE=false
    # The value is run as a command ("true"/"false"); an empty value (no
    # matching JobSet in the query result) also keeps the loop going.
    until ${JOBSET_ACTIVE} && [ -n "${JOBSET_ACTIVE}" ]; do
      JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r --arg name "${WORKLOAD_NAME}" '.items[] | select(.metadata.name == $name).status.replicatedJobsStatus[0] | .active == 1')
      if (( $(date +%s) > DEADLINE )) ; then
        echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
        exit 1
      fi
      echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
      sleep 5
    done

    echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"

- name: Set JobSet Pod name
  shell: bash -u {0}
  run: |
    # Of the pods labelled with this JobSet's name, export the one whose name
    # sorts first as POD for the later steps.
    POD=$(kubectl get pods -o json \
      | jq -r --arg jobset "${WORKLOAD_NAME}" '.items[] | select(.metadata.labels."jobset.sigs.k8s.io/jobset-name" == $jobset) | .metadata.name ' \
      | sort \
      | head -n1 )
    echo "POD=${POD}" >> ${GITHUB_ENV}

- name: Wait for JobSet Pod readiness
  shell: bash -u {0}
  env:
    # Upper bound, in seconds, on how long to wait for the main container.
    POD_READY_TIMEOUT: '3600'
  run: |
    # Poll the pod every 10 seconds until its main container reports ready.
    # The loop ends early (without failing this step) if the container has
    # terminated with reason "Error", and fails the step on timeout.
    START=$(date +%s)
    POD_READY=false
    # Anything other than the literal "true" (including an empty result while
    # the pod has no container statuses yet) means "not ready".
    while [ "${POD_READY}" != "true" ]; do
      if (( $(date +%s) - START > POD_READY_TIMEOUT )); then
        echo "Timeout after waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
        exit 1
      fi

      echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
      sleep 10

      POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
      # Compare explicitly: an empty POD_ERROR must not count as an error.
      # Running the value as a command would succeed when it is empty.
      if [ "${POD_ERROR}" = "true" ]; then
        echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
        break
      fi

      POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
    done

- name: Stream logs from JobSet Pods
  shell: bash -u {0}
  run: |
    # Collect the names of all pods labelled with this JobSet's name.
    mapfile -t jobset_pods < <(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name')

    # Follow the main container's log of every pod in the background, showing
    # it in the job output and appending it to a per-pod log file.
    for jobset_pod in "${jobset_pods[@]}"; do
      kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c ${{ inputs.MAIN_CONTAINER_NAME }} ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}-${jobset_pod}-jobset.log &
    done

    # Block until every background log stream has ended.
    wait

- name: Set exit code from JobSet logs
  shell: bash -u {0}
  run: |
    # The third whitespace-separated field of the last log line of the first
    # pod is expected to read EXIT_CODE=<number>.
    MAYBE_XPK_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}-${POD}-jobset.log | awk '{ print $3 }' )"
    echo "${MAYBE_XPK_EXIT_CODE}"

    # Validate the whole field and take the number from the match. The text
    # comes from pod logs, so it is parsed rather than evaluated as shell code.
    if [[ ! "${MAYBE_XPK_EXIT_CODE}" =~ ^EXIT_CODE=([0-9]+)$ ]]; then
      echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
      echo "XPK_EXIT_CODE=1" >> ${GITHUB_ENV}
      exit 1
    fi

    # Export the workload's exit code for later steps and make it this
    # step's own exit status.
    EXIT_CODE="${BASH_REMATCH[1]}"
    echo "XPK_EXIT_CODE=${EXIT_CODE}" >> ${GITHUB_ENV}
    exit ${EXIT_CODE}

- name: Clean up JobSet from cluster
  if: ${{ always() }}
  shell: bash -x -u {0}
  run: |
    # Delete the JobSet and wait for the deletion; a failed delete is only
    # reported, it does not fail the step.
    if ! kubectl delete jobset --wait ${WORKLOAD_NAME}; then
      echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}"
    fi

- name: Download artifacts from GCS to runner
  shell: bash -x -u {0}
  run: |
    # Gather the streamed pod logs and the artifacts uploaded to GCS into one
    # directory for this workload.
    OUTPUT_DIR=output/${WORKLOAD_NAME}
    mkdir -p ${OUTPUT_DIR}
    mv ${WORKLOAD_NAME}-*.log ${OUTPUT_DIR}
    gsutil cp -r ${GCS_ARTIFACT_PATH} ${OUTPUT_DIR}

# Publishes the contents of the workload output directory as an artifact
# named after the workload name prefix.
# NOTE(review): this step has no `if:` guard, so it presumably does not run
# after an earlier step has failed (e.g. a non-zero workload exit code) —
# confirm that is intended.
- name: Upload artifacts to GitHub Actions from runner
  uses: actions/upload-artifact@v4
  with:
    path: output/${{ env.WORKLOAD_NAME }}/*
    name: ${{ inputs.WORKLOAD_NAME_PREFIX }}

- name: Clean up GCS artifacts from runner
  if: ${{ always() }}
  shell: bash -x -u {0}
  run: |
    # Remove this workload's output directory from the runner.
    RUN_OUTPUT_DIR=output/${WORKLOAD_NAME}
    rm -rf ${RUN_OUTPUT_DIR}

- name: Clean up xpk environment from runner
  if: ${{ always() }}
  shell: bash -x -u {0}
  run: |
    # Remove the per-workload directory holding the virtualenv and xpk clone.
    # sudo is used here; xpk was installed into it with "sudo make install".
    XPK_WORKDIR=${WORKLOAD_NAME}
    sudo rm -rf ${XPK_WORKDIR}

- name: Generate sitrep
  id: sitrep
  shell: bash -x -e {0}
  if: ${{ always() }}
  run: |
    # Write sitrep.json with a summary, outcome and badge for this workload.
    source .github/workflows/scripts/to_json.sh

    summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
    badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"

    # XPK_EXIT_CODE is exported by the "Set exit code from JobSet logs" step.
    # This step always runs, so the variable may be unset when an earlier step
    # failed; anything other than an explicit 0 is reported as a failure.
    if [ "${XPK_EXIT_CODE:-1}" != "0" ]; then
      outcome=failed
      badge_color=red
      summary+=": fail"
    else
      outcome=success
      badge_color=brightgreen
      summary+=": pass"
    fi

    to_json summary \
            badge_label \
            badge_color \
            outcome | \
    tee sitrep.json

# Publishes sitrep.json as "<workload name prefix>-sitrep", whether or not
# earlier steps succeeded.
- name: Upload sitrep to GitHub Actions from runner
  uses: actions/upload-artifact@v4
  if: ${{ always() }}
  with:
    path: |
      sitrep.json
    name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep

2 changes: 1 addition & 1 deletion .github/container/Dockerfile.base
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
# after upgrading to ver 23.3.1 (from /opt/pip) `pip` tries to uninstall itself (default pip-24.0)
# and fails due to pip-24.0 has been installed with system tool `apt` but not `python`. So we keep
# both pip-24.0 and pip-23.3.1 in the system, but use 23.3.1 with equivalency patch (see above).
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip "pip-tools==7.4.1" && rm -rf ~/.cache/*

# The symlinks for CUDA/cuDNN/NCCL exist to make the container's installations
# of those components conform to XLA's expectations for local installations.
Expand Down
1 change: 1 addition & 0 deletions .github/container/Dockerfile.jax
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ EOF
## Flax
RUN <<"EOF" bash -ex
git-clone.sh ${URLREF_FLAX} ${SRC_PATH_FLAX}
sed -i 's/orbax-checkpoint/orbax-checkpoint==0.11.19/' ${SRC_PATH_FLAX}/pyproject.toml
echo "-e file://${SRC_PATH_FLAX}" >> /opt/pip-tools.d/requirements-flax.in
EOF

Expand Down
3 changes: 3 additions & 0 deletions .github/container/Dockerfile.maxtext
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ for pattern in \
"s|tensorflow-datasets|tensorflow-datasets>=4.8.0|g" \
"s|sentencepiece==0.1.97|sentencepiece>=0.2|g" \
"s|tensorflow>=2.13.0|tensorflow==2.18.1|g" \
"s|google-cloud-aiplatform==1.61.0|google-cloud-aiplatform>=1.90.0|g" \
"s|orbax-checkpoint>=0.5.12|orbax-checkpoint==0.11.19|g" \
; do
# tensorflow-cpu==2.19.0 is incompatible with tensorflow-text
sed -i "${pattern}" ${SRC_PATH_MAXTEXT}/requirements.txt
Expand All @@ -51,6 +53,7 @@ echo >> ${SRC_PATH_MAXTEXT}/requirements.txt # add new line
for requirement in \
"tensorflow-metadata>=1.15.0" \
"seqio@git+https://github.com/google/seqio.git" \
"werkzeug>=3.0.3" \
; do
echo "${requirement}" >> ${SRC_PATH_MAXTEXT}/requirements.txt
done
Expand Down
12 changes: 12 additions & 0 deletions .github/container/Dockerfile.nccl-gke
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Adds the NCCL test scripts and an SSH server on top of BASE_IMAGE.
ARG BASE_IMAGE
FROM ${BASE_IMAGE} AS mealkit
FROM mealkit AS final
COPY .github/gke-workflow/nccl/scripts /scripts
# apt-get is used for both commands; the `apt` front end is intended for
# interactive use and warns when scripted.
RUN apt-get update \
    && apt-get install -y openssh-server
# Removes the root password and allows root SSH login with an empty password;
# outgoing SSH skips host key checking.
# NOTE(review): presumably so the NCCL test pods can reach each other over SSH
# inside the test cluster — confirm, and do not expose this image's sshd
# outside of it.
RUN passwd -d root && \
    echo "PermitRootLogin yes" >> /etc/ssh/sshd_config && \
    echo "PermitEmptyPasswords yes" >> /etc/ssh/sshd_config && \
    echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \
    chmod +x /scripts/*

7 changes: 7 additions & 0 deletions .github/container/git-clone.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ pushd ${DESTINATION}
git checkout ${GIT_REF}
COMMIT_SHA=$(git rev-parse HEAD)
git submodule update --init --recursive
if [[ "${GIT_REPO}" == *"gitlab"* ]]; then
git remote remove origin
if grep -q -r gitlab-ci-token .git; then
grep -r gitlab-ci-token .git | awk -F: '{print $1}' | xargs rm -f
fi
git branch -D main
fi
popd

## update the manifest file
Expand Down
2 changes: 1 addition & 1 deletion .github/container/test-jax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ FLAGS+=("--//jaxlib/tools:add_pypi_cuda_wheel_deps=false")

# Default parallelism: at least 10GB per test, no more than 4 tests per GPU.
DEFAULT_JOBS_PER_GPU=$(( GPU_MEMORIES_MIB[0] / 10000))
if (( DEFAULT_JOBS_PER_GPU > 8 )); then DEFAULT_JOBS_PER_GPU=4; fi
if (( DEFAULT_JOBS_PER_GPU > 4 )); then DEFAULT_JOBS_PER_GPU=4; fi
set_default JOBS_PER_GPU ${DEFAULT_JOBS_PER_GPU}
FLAGS+=(
"--cache_test_results=${CACHE_TEST_RESULTS}"
Expand Down
Loading
Loading