diff --git a/.github/actions/gke-xpk/action.yml b/.github/actions/gke-xpk/action.yml index a0bfb0f92..cdb1437e1 100644 --- a/.github/actions/gke-xpk/action.yml +++ b/.github/actions/gke-xpk/action.yml @@ -9,7 +9,7 @@ inputs: type: string GKE_CLUSTER: description: 'GKE cluster name' - default: jtb-2025-08-26 + default: jtb-2025-10-07 required: false type: string GCP_ZONE: @@ -247,6 +247,7 @@ runs: if [ $? -ne 0 ]; then echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected " + echo "XPK_EXIT_CODE=1" >> ${GITHUB_ENV} exit 1 fi @@ -262,11 +263,12 @@ runs: ALL_EXIT_CODES=$(( ALL_EXIT_CODES + POD_EXIT_CODE )) done + echo "XPK_EXIT_CODE=${ALL_EXIT_CODES}" >> ${GITHUB_ENV} if [ ${ALL_EXIT_CODES} -gt 0 ]; then exit 1 fi exit 0 - + - name: Clean up JobSet from cluster shell: bash -x -u {0} if: ${{ always() }} @@ -291,3 +293,38 @@ runs: if: ${{ always() }} run: | sudo rm -rf ${WORKLOAD_NAME} + + - name: Generate sitrep + id: sitrep + shell: bash -x -e {0} + if: ${{ always() }} + run: | + source .github/workflows/scripts/to_json.sh + badge_label="${{ matrix.test }}" + + summary="${{ inputs.WORKLOAD_NAME_PREFIX }}" + outcome=success + badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}" + badge_color=brightgreen + + if [ "${XPK_EXIT_CODE:-1}" -gt 0 ]; then # unset => an earlier step died before recording a code: report failure + badge_color=red + outcome=failed + summary+=": fail" + else + summary+=": pass" + fi + + to_json summary \ + badge_label \ + badge_color \ + outcome | \ + tee sitrep.json + + - name: Upload sitrep to GitHub Actions from runner + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep + path: | + sitrep.json diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 1f5aabed7..25136d1e3 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -64,6 +64,7 @@ RUN mkdir -p /builder/extra-targets && \ --src-path-xla ${SRC_PATH_XLA} \ --sm all \ --clean \ + --release \ 
${EXTRA_BUILD_JAX_ARGS} ## Transformer engine: check out source and build wheel @@ -97,7 +98,6 @@ ENV BUILD_DATE=${BUILD_DATE} # The following environment variables tune performance ENV XLA_FLAGS="" ENV XLA_FLAGS="${XLA_FLAGS} --xla_gpu_enable_latency_hiding_scheduler=true" -ENV NCCL_NVLS_ENABLE=0 COPY --from=builder ${BUILD_PATH_JAXLIB} ${BUILD_PATH_JAXLIB} COPY --from=builder ${SRC_PATH_JAX} ${SRC_PATH_JAX} diff --git a/.github/container/build-jax.sh b/.github/container/build-jax.sh index 244b048ae..12d698a5d 100755 --- a/.github/container/build-jax.sh +++ b/.github/container/build-jax.sh @@ -83,7 +83,7 @@ INSTALL=1 SRC_PATH_JAX="/opt/jax" SRC_PATH_XLA="/opt/xla" -args=$(getopt -o h --long bazel-cache:,bazel-cache-namespace:,build-param:,build-path-jaxlib:,clean,cpu-arch:,debug,extra-targets:,extra-target-dest:,no-clean,clean-only,help,install,no-install,src-path-jax:,src-path-xla:,sm: -- "$@") +args=$(getopt -o h,r --long bazel-cache:,bazel-cache-namespace:,build-param:,build-path-jaxlib:,clean,release,cpu-arch:,debug,extra-targets:,extra-target-dest:,no-clean,clean-only,help,install,no-install,src-path-jax:,src-path-xla:,sm: -- "$@") if [[ $? 
-ne 0 ]]; then exit 1 fi @@ -135,6 +135,10 @@ while [ : ]; do EXTRA_TARGET_DEST="$2" shift 2 ;; + -r | --release) + IS_RELEASE=1 + shift 1 + ;; -h | --help) usage 1 ;; @@ -225,6 +229,7 @@ print_var INSTALL print_var PYTHON_VERSION print_var SRC_PATH_JAX print_var SRC_PATH_XLA +print_var IS_RELEASE echo "==================================================" @@ -268,6 +273,12 @@ for component in jaxlib "jax-cuda${CUDA_MAJOR_VERSION}-pjrt" "jax-cuda${CUDA_MAJ # version, so nvidia-*-cu12 wheels disappear from the lock file sed -i "s|^${component}.*$|${component} @ file://${BUILD_PATH_JAXLIB}/${component//-/_}|" build/requirements.in done + +if [[ "${IS_RELEASE}" == "1" ]]; then + jaxlib_version=$(pip show jaxlib | grep Version | tr ':' '\n' | tail -1) + sed -i "s| f'jaxlib >={_minimum_jaxlib_version}, <={_jax_version}',| f'jaxlib>=0.5.0',|" /opt/jax/setup.py +fi + # Bazel args to avoid cache invalidation BAZEL_ARGS=( --config=cuda_libraries_from_stubs diff --git a/.github/container/build-te.sh b/.github/container/build-te.sh index 2c47b725b..3271b4504 100755 --- a/.github/container/build-te.sh +++ b/.github/container/build-te.sh @@ -103,6 +103,12 @@ if [[ "$SM" == "all" ]]; then SM_LIST=$(default_compute_capabilities) elif [[ "$SM" == "local" ]]; then SM_LIST=$("${SCRIPT_DIR}/local_cuda_arch") + if [[ -z "${SM_LIST}" ]]; then + echo "Could not determine the local GPU architecture." + echo "You should pass --sm when compiling on a machine without GPUs." + nvidia-smi || true + exit 1 + fi else SM_LIST=${SM} fi @@ -131,8 +137,19 @@ export NVTE_FRAMEWORK=jax export XLA_HOME=${SRC_PATH_XLA} pushd ${SRC_PATH_TE} -# Install required packages that were removed in https://github.com/NVIDIA/TransformerEngine/pull/1852 -pip install "pybind11[global]" +# Install some build dependencies, but avoid installing everything +# (jax, torch, ...) because we do not want to pull in a released version of +# JAX, or the wheel-based installation of CUDA. 
Note that when we build TE as +# part of building the JAX containers, JAX and XLA are not yet installed. +python - << EOF +import subprocess, sys, tomllib +with open("pyproject.toml", "rb") as ifile: + data = tomllib.load(ifile) +subprocess.run( + [sys.executable, "-m", "pip", "install"] + + [r for r in data["build-system"]["requires"] + if r.startswith("nvidia-mathdx") or r.startswith("pybind11")]) +EOF # The wheel filename includes the TE commit; if this has changed since the last # incremental build then we would end up with multiple wheels. diff --git a/.github/container/git-clone.sh b/.github/container/git-clone.sh index f4ddbc7fb..956d189b7 100755 --- a/.github/container/git-clone.sh +++ b/.github/container/git-clone.sh @@ -77,6 +77,13 @@ pushd ${DESTINATION} git checkout ${GIT_REF} COMMIT_SHA=$(git rev-parse HEAD) git submodule update --init --recursive +if [[ "${GIT_REPO}" == *"gitlab"* ]]; then + git remote remove origin + if grep -q -r gitlab-ci-token .git; then + grep -r gitlab-ci-token .git | awk -F: '{print $1}' | xargs rm -f + fi + git branch -D main +fi popd ## update the manifest file diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 285da565c..6e2a59aed 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -4,54 +4,60 @@ set -eoux pipefail pushd /opt/pip-tools.d -# First pip-compile gathers all reqs, but we are care only about VCS installs -# It's possible there are 2nd degree transitive dependencies that are VCS, so -# this is more robust to gather VCS requirements at the cost of pip-compiling -# twice -pip-compile -o requirements.pre $(ls requirements-*.in) +# If requirements-pinned.txt exists, skip compilation +if [[ -f "requirements-pinned.txt" ]]; then + sed -E 's/#sha256=[a-f0-9]+//g' requirements-pinned.txt > requirements.txt +else + # First pip-compile gathers all reqs, but we are care only about VCS installs + # It's possible there are 2nd degree transitive 
dependencies that are VCS, so + # this is more robust to gather VCS requirements at the cost of pip-compiling + # twice + pip-compile -o requirements.pre $(ls requirements-*.in) -IFS=$'\n' -for line in $(cat requirements.pre | egrep '^[^#].+ @ git\+' || true); do - # VCS installs are of the form "PACKAGE @ git+..." - PACKAGE=$(echo "$line" | awk '{print $1}') - ref=$(yq e ".${PACKAGE}.latest_verified_commit" ${MANIFEST_FILE}) - if [[ "$line" == *"#subdirectory="* ]]; then - # This is required b/c git-refs/commits cannot come after - # the subdirectory fragment. - # An example of an install that is of this form is: - # 'orbax-checkpoint @ git+https://github.com/google/orbax/#subdirectory=checkpoint' - echo "${line}" | sed "s/#subdirectory=/@${ref}#subdirectory=/" - else - echo "${line}@${ref}" - fi -done | tee requirements.vcs -unset IFS + IFS=$'\n' + for line in $(cat requirements.pre | egrep '^[^#].+ @ git\+' || true); do + # VCS installs are of the form "PACKAGE @ git+..." + PACKAGE=$(echo "$line" | awk '{print $1}') + ref=$(yq e ".${PACKAGE}.latest_verified_commit" ${MANIFEST_FILE}) + if [[ "$line" == *"#subdirectory="* ]]; then + # This is required b/c git-refs/commits cannot come after + # the subdirectory fragment. 
+ # An example of an install that is of this form is: + # 'orbax-checkpoint @ git+https://github.com/google/orbax/#subdirectory=checkpoint' + echo "${line}" | sed "s/#subdirectory=/@${ref}#subdirectory=/" + else + echo "${line}@${ref}" + fi + done | tee requirements.vcs + unset IFS -# Second pip-compile includes one more requirements file that pins all vcs installs -# Uses a special env var to let our custom pip impl know to treat the following as -# equivalent: -# -# fiddle @ git+https://github.com/google/fiddle -# fiddle @ git+https://github.com/google/fiddle@cd4497e4c09bdf95dcccaa1e138c2c125d32d39f -# -# JAX_TOOLBOX_VCS_EQUIVALENCY is an environment variable enabling custom logic in pip -# that treats the above as equivalent and prefers the URI wit the SHA -JAX_TOOLBOX_VCS_EQUIVALENCY=true pip-compile -o requirements.txt requirements.vcs $(ls requirements-*.in) + # Second pip-compile includes one more requirements file that pins all vcs installs + # Uses a special env var to let our custom pip impl know to treat the following as + # equivalent: + # + # fiddle @ git+https://github.com/google/fiddle + # fiddle @ git+https://github.com/google/fiddle@cd4497e4c09bdf95dcccaa1e138c2c125d32d39f + # + # JAX_TOOLBOX_VCS_EQUIVALENCY is an environment variable enabling custom logic in pip + # that treats the above as equivalent and prefers the URI wit the SHA + JAX_TOOLBOX_VCS_EQUIVALENCY=true pip-compile -o requirements.txt requirements.vcs $(ls requirements-*.in) -# If there are unpinned VCS dependencies, error since these should be included in the manifest -unpinned_vcs_dependencies=$(cat requirements.txt | egrep '^[^#].+ @ git\+' | egrep -v '^[^#].+ @ git\+.+@' || true) -if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then - echo "Unpinned VCS installs found in $(readlink -f requirements.txt):" - echo "$unpinned_vcs_dependencies" - exit 1 -fi + # If there are unpinned VCS dependencies, error since these should be included in the manifest + 
unpinned_vcs_dependencies=$(cat requirements.txt | egrep '^[^#].+ @ git\+' | egrep -v '^[^#].+ @ git\+.+@' || true) + if [[ -n "$unpinned_vcs_dependencies" ]]; then # non-empty => at least one unpinned install (wc -l missed the single-line case) + echo "Unpinned VCS installs found in $(readlink -f requirements.txt):" + echo "$unpinned_vcs_dependencies" + exit 1 + fi -# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64 -if [ "$(uname -m)" = "x86_64" ]; then - sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt -else - echo "Skipping TF on $(uname -m)" + # Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64 + if [[ "$(uname -m)" = "x86_64" ]]; then + sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt + else + echo "Skipping TF on $(uname -m)" + fi fi + # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' requirements.txt @@ -63,3 +69,6 @@ for post_install in $(ls /opt/pip-tools-post-install.d/*); do "${post_install}" fi done + +echo "######## Frozen requirements ########" +pip freeze diff --git a/.github/container/test-jax.sh b/.github/container/test-jax.sh index 3398b72c8..73aab9fd0 100755 --- a/.github/container/test-jax.sh +++ b/.github/container/test-jax.sh @@ -119,7 +119,15 @@ fi readarray -t GPU_MEMORIES < <(nvidia-smi --query-gpu=memory.total --format=csv,noheader) NGPUS="${#GPU_MEMORIES[@]}" -GPU_MEMORIES_MIB=("${GPU_MEMORIES[@]/ MiB/}") +if [[ " ${GPU_MEMORIES[*]} " =~ [[:space:]]\[N/A\][[:space:]] ]]; then + # On iGPU devices, nvidia-smi reports [N/A] GPU memory; use the system + # memory size instead to estimate what each GPU can use + SYSTEM_MEMORY_MIB=$(grep MemTotal /proc/meminfo | awk '{print int($2 / 1024)}') # int(): the $(( )) division below only accepts integers + declare -a GPU_MEMORIES_MIB + for (( i = 0; i < NGPUS; i++ )); do GPU_MEMORIES_MIB+=($(( SYSTEM_MEMORY_MIB / NGPUS ))); done +else + GPU_MEMORIES_MIB=("${GPU_MEMORIES[@]/ MiB/}") +fi FLAGS=() diff --git 
a/.github/eks-workflow-files/maxtext-job.yaml b/.github/eks-workflow-files/maxtext-job.yaml new file mode 100644 index 000000000..7d9728f87 --- /dev/null +++ b/.github/eks-workflow-files/maxtext-job.yaml @@ -0,0 +1,120 @@ +apiVersion: v1 +kind: Service +metadata: + name: PLACEHOLDER +spec: + clusterIP: None # clusterIP must be None to create a headless service + selector: + job-name: PLACEHOLDER # must match Job name +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 2 # number of nodes + parallelism: 2 # number of nodes + completionMode: Indexed + backoffLimitPerIndex: 0 # max failures per index + maxFailedIndexes: 0 # all indices must succeed + template: + spec: + subdomain: PLACEHOLDER # has to match Service name + restartPolicy: Never + imagePullSecrets: + - name: PLACEHOLDER + containers: + - name: maxtext + image: PLACEHOLDER + ports: + - containerPort: 3389 + command: + - bash + - -c + # The logging logic: stream stdout/stderr from the 0th process inside this pod, + # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file + - | + export SERVICE_NAME=$0 + export JOB_NAME=$1 + cat >each-process.sh <<'EOL' + export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME} + export JAX_COORDINATOR_PORT=3389 + export NNODES=16 # actually #processes == #GPUs + export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK)) + export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK + export NCCL_DEBUG=INFO + export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log + [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null" + nsys-jax \ + --capture-range=cudaProfilerApi \ + --capture-range-end=stop \ + -o /opt/output/profile.$NODE_RANK.zip \ + -- \ + test-maxtext.sh \ + -n 2 \ + -b 2 \ + --model-name=llama2-7b \ + --attn-type=cudnn_flash_te \ + --remat-policy=minimal_flash \ + --steps=20 \ + --fsdp=16 \ + -a "scan_layers=false \ + max_target_length=4096 \ + use_iota_embed=true 
\ + logits_dot_in_fp32=false \ + profiler=nsys \ + skip_first_n_steps_for_profiler=3 \ + profiler_steps=8" \ + |& tee /opt/output/output.$NODE_RANK.log >"${console}" + code=$? + # Should run even on failure + cat /opt/output/nccl.$NODE_RANK.log >"${console}" + exit $code + EOL + # TODO: upgrade parallel-launch to return a failure code as soon as any + # of its children do (it already does this eventually, but it could + # be slow) + parallel-launch LOCAL_RANK 8 bash each-process.sh + code=$? + # Should run even on failure + touch /opt/output/.done + exit $code + - PLACEHOLDER + - PLACEHOLDER + resources: + limits: + nvidia.com/gpu: 8 + vpc.amazonaws.com/efa: 32 + volumeMounts: + - mountPath: /dev/shm + name: shmem + - mountPath: /opt/output + name: output + - name: upload + image: amazon/aws-cli + command: + - bash + - -c + - | + JOB_NAME="$0" + while [[ ! -f /opt/output/.done ]]; do + sleep 1 + done + rm /opt/output/.done + aws s3 cp \ + --recursive \ + /opt/output \ + "s3://jax-toolbox-eks-output/${JOB_NAME}/" + - PLACEHOLDER + volumeMounts: + - mountPath: /opt/output + name: output + volumes: + - name: output + emptyDir: {} + - name: shmem + emptyDir: + medium: Memory + sizeLimit: 16Gi diff --git a/.github/workflows/_test_maxtext_gke_xpk.yaml b/.github/workflows/_test_maxtext_gke_xpk.yaml index 88feb716a..464b6af51 100644 --- a/.github/workflows/_test_maxtext_gke_xpk.yaml +++ b/.github/workflows/_test_maxtext_gke_xpk.yaml @@ -26,14 +26,14 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry + - name: Login to nvcr.io Container Registry uses: docker/login-action@v3 with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} - - name: K8s GHCR store and delete token + - name: K8s store and delete token id: store-token uses: ./.github/actions/store-delete-k8s-ghcr diff --git 
a/.github/workflows/_test_maxtext_k8s.yaml b/.github/workflows/_test_maxtext_k8s.yaml new file mode 100644 index 000000000..7f82d3f42 --- /dev/null +++ b/.github/workflows/_test_maxtext_k8s.yaml @@ -0,0 +1,107 @@ +name: ~test MaxText functionality on Kubernetes + +on: + workflow_call: + inputs: + MAXTEXT_IMAGE: + type: string + description: MaxText container to test + required: true + +permissions: + contents: read # to fetch code + +jobs: + maxtext: + runs-on: eks + env: + CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}" + JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}" + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to NVIDIA Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} + - name: Store GitHub Container Registry token as Kubernetes secret + run: | + # Make this available to later steps + TOKEN_NAME="${JOB_NAME}-token" + echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV" + kubectl create secret generic \ + ${TOKEN_NAME} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + - name: Configure Kubernetes job + run: | + export SERVICE_NAME="${JOB_NAME}-svc" + yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME) + | select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + | select(di == 1).metadata.name = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE) + | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME) + | select(di == 
1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME) + | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \ + .github/eks-workflow-files/maxtext-job.yaml + git diff .github/eks-workflow-files/maxtext-job.yaml + - name: Submit Kubernetes job + run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml + - name: Wait for Kubernetes job to start + run: | + # Launcher job is created eagerly, but suspended. Kueue un-suspends it when + # resources are available, but that is where there can be a long wait if the + # cluster is busy executing other jobs. + kubectl wait --for=create job/${JOB_NAME} + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s + - name: Stream Kubernetes job output + run: | + # Streaming logs will fail if the container/pod is still pending + while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do + sleep 1 + done + kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME} + - name: Retrieve Kubernetes job status + shell: bash -exo pipefail {0} + run: | + while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failure=${status[0]:-0} + success=${status[1]:-0} + total=$((failure+success)) + if [[ ${total} -lt 2 ]]; then # numeric test; "<" inside [[ ]] compares strings + sleep 1 + elif [[ ${total} -eq 2 ]]; then + break + else + # FIXME + exit 255 + fi + done + exit ${failure} + # Provide more debug output in case of failure; note that some kinds of launch + # failure do not produce any log output. 
+ - name: Debug failed Kubernetes job + if: failure() + run: | + # Provide better debug in case of launch failures that will not produce log output + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi + # Clean up in case of errors as well as success + - name: Delete Kubernetes job + if: always() + run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml + - name: Delete GitHub Container Registry token + if: always() + run: kubectl delete secret ${TOKEN_NAME} diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 987ccb34c..c68ebd9fc 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -23,6 +23,12 @@ jobs: build-mpi-operator-compatible-base: runs-on: [self-hosted, "amd64", "large"] steps: + - name: Login to nvcr.io Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} - name: Checkout repository uses: actions/checkout@v4 - name: Build MPI operator compatible base container diff --git a/.github/workflows/_test_nccl_gke.yaml b/.github/workflows/_test_nccl_gke.yaml index ed10c0f47..7814fd858 100644 --- a/.github/workflows/_test_nccl_gke.yaml +++ b/.github/workflows/_test_nccl_gke.yaml @@ -14,6 +14,12 @@ jobs: runs-on: [self-hosted, "amd64", "large"] steps: - uses: actions/checkout@v4 + - name: Login to nvcr.io Container Registry + uses: docker/login-action@v3 + with: + registry: nvcr.io + username: $oauthtoken + password: ${{ secrets.NVCR_TOKEN }} - name: Build NCCL image id: build uses: ./.github/actions/build-container diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..d328c993b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,6 +12,8 @@ on: paths-ignore: - '**.md' - '.github/triage/**' + branches-ignore: + - '25.*' # workflows for release to be 
triggered via dispatch event only workflow_dispatch: inputs: PUBLISH: diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml new file mode 100644 index 000000000..2a0d7c57e --- /dev/null +++ b/.github/workflows/ngc-release-testing.yaml @@ -0,0 +1,52 @@ +name: ~NGC release testing + +on: + workflow_dispatch: + inputs: + JAX_IMAGE: + type: string + description: "JAX image to run tests on" + required: false + default: '' + MAXTEXT_IMAGE: + type: string + description: "MaxText image to run tests on" + required: false + default: '' + + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + +permissions: + contents: read # to fetch code + actions: write # to cancel previous workflows + packages: write # to upload container + +jobs: + test-nccl: + if: inputs.JAX_IMAGE != '' + uses: ./.github/workflows/_test_nccl.yaml + with: + CONTAINER: ${{ inputs.JAX_IMAGE }} + secrets: inherit + + test-maxtext-eks: + if: inputs.MAXTEXT_IMAGE != '' + uses: ./.github/workflows/_test_maxtext_k8s.yaml + with: + MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} + secrets: inherit + + test-maxtext-gke: + if: inputs.MAXTEXT_IMAGE != '' + uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml + with: + MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }} + secrets: inherit + + finalize: + needs: [ test-nccl, test-maxtext-gke,test-maxtext-eks ] + if: "!cancelled()" + uses: ./.github/workflows/_finalize.yaml + secrets: inherit diff --git a/README.md b/README.md index 9d3be4a34..d928e614c 100644 --- a/README.md +++ b/README.md @@ -218,10 +218,6 @@ The [JAX image](https://github.com/NVIDIA/JAX-Toolbox/pkgs/container/jax) is emb | --------- | ----- | ----------- | | `--xla_gpu_enable_latency_hiding_scheduler` | `true` | allows XLA to move communication collectives to increase overlap with compute kernels | -| Environment Variable | Value | Explanation | -| -------------------- | ----- | ----------- | -| `NCCL_NVLS_ENABLE` | `0` | Disables 
NVLink SHARP ([1](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-nvls-enable)). Future releases will re-enable this feature. | - There are various other XLA flags users can set to improve performance. For a detailed explanation of these flags, please refer to the [GPU performance](./rosetta/docs/GPU_performance.md) doc. XLA flags can also be tuned per workload. For example, each script includes a directory [xla_flags](./rosetta/rosetta/projects/maxtext/xla_flags). For a list of previously used XLA flags that are no longer needed, please also refer to the [GPU performance](./rosetta/docs/GPU_performance.md#previously-used-xla-flags) page.