diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 736e04a50..8a44385a0 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1-labs ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_AXLEARN=https://github.com/Steboss/axlearn.git#sbosisio/working_branch +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git ARG SRC_PATH_AXLEARN=/opt/axlearn ############################################################################### diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index a304033b0..961a7d6c6 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -102,8 +102,8 @@ pathwaysutils: latest_verified_commit: 359776d454940ffaa337c36d1df16308d44a95a9 mode: pip-vcs axlearn: - url: https://github.com/Steboss/axlearn.git - tracking_ref: sbosisio/working_branch + url: https://github.com/apple/axlearn.git + tracking_ref: main mode: git-clone qwix: url: https://github.com/google/qwix.git diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 288a03a4e..4cd21cd9d 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -11,9 +11,9 @@ usage() { echo "" echo " OPTIONS DESCRIPTION" echo " -d, --directory DIR Directory to run tests in." - echo " Default: 'axlearn/axlearn/common'." + echo " Default: 'opt/axlearn'." echo " -t, --test-files FILES Pattern for test files to run." - echo " Default: '*_test.py'." + echo " Default: 'axlearn/common/*_test.py'." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." echo " -h, --help Show this help message and exit." 
@@ -39,7 +39,7 @@ run_tests() { } # DEFAULT VALUES -DIR='/opt/axlearn/axlearn/common' +DIR='/opt/axlearn' TEST_FILES=() OUTPUT_DIRECTORY='' @@ -95,15 +95,6 @@ LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" mkdir -p "${LOG_DIRECTORY}" -if [ "${#TEST_FILES[@]}" -gt 0 ]; then - echo " Test Files:" - for f in "${TEST_FILES[@]}"; do - echo " $f" - done -else - echo " Test Files Pattern: '*_test.py' (default)" -fi - # DEPENDENCIES pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu pip install timm transformers scikit-learn grain evaluate prefixed wandb @@ -115,26 +106,30 @@ curl https://huggingface.co/FacebookAI/roberta-base/raw/main/merges.txt -o /opt/ curl https://huggingface.co/FacebookAI/roberta-base/raw/main/vocab.json -o /opt/axlearn/axlearn/data/tokenizers/bpe/roberta-base-vocab.json # RETRIEVE TEST FILES +expanded_test_files=() if [ "${#TEST_FILES[@]}" -eq 0 ]; then - TEST_FILES=("*_test.py") + # if nothing is given for --test-files, then match all the *_test.py files + readarray -t expanded_test_files < <(find . -name "*_test.py" -type f) + # otherwise, look up each --test-files pattern +else + for pattern in "${TEST_FILES[@]}"; do + echo "looking for pattern: $pattern" + echo "Cmd: find . -name \"$pattern\" -type f" + readarray -t found_files < <(find . -path "./$pattern" -type f) + if [ ${#found_files[@]} -gt 0 ]; then + expanded_test_files+=( "${found_files[@]}" ) + else + echo "Warning: No files found matching pattern '$pattern'" + fi + done fi -expanded_test_files=() -for pattern in "${TEST_FILES[@]}"; do - # retrieve all the files - files=( $pattern ) - if [ "${#files[@]}" -gt 0 ]; then - expanded_test_files+=( "${files[@]}" ) - else - echo "Warning: No files matched pattern '$pattern'" - fi -done - if [ "${#expanded_test_files[@]}" -eq 0 ]; then echo "No test files found to run." 
exit 1 fi +# EXCLUDE PATTERNS EXCLUDE_PATTERNS=("array_serialization_test.py" "t5_test.py" # tensorflow bug "loss_test.py" @@ -185,23 +180,13 @@ done # RUN TESTS -TEST_8_DEVICES_FILES=("gda_test.py" - "input_base_test.py" - "input_dispatch_test.py" - "trainer_test.py" - "utils_test.py" +TEST_8_DEVICES_WITH_PATHS=( + "./axlearn/common/gda_test.py" + "./axlearn/common/input_base_test.py" + "./axlearn/common/input_dispatch_test.py" + "./axlearn/common/trainer_test.py" + "./axlearn/common/utils_test.py" ) -TEST_8_DEVICES_WITH_PATHS=() -for file in "${TEST_8_DEVICES_FILES[@]}"; do - found_files=$(find . -name "$file" -type f 2>/dev/null) - if [[ -n "$found_files" ]]; then - while IFS= read -r found_file; do - TEST_8_DEVICES_WITH_PATHS+=("$found_file") - done <<< "$found_files" - else - echo "Warning: Test file $file not found in current directory structure" - fi -done run_tests "" "for_8_devices" "8_dev" "${TEST_8_DEVICES_WITH_PATHS[@]}" # All the other tests diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 24e7ca8e7..0a77f4c86 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -22,7 +22,7 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} # test on JAX, make sure 8 devices are visible - pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." 
--output ${LOG_DIR} --test-files "/opt/axlearn/axlearn/common/*_test.py" + pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" env: - name: RUN_ID value: PLACEHOLDER diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml index 0e34cb7a2..e02ddf45c 100644 --- a/.github/eks-workflow-files/mpi-nccl-test.yml +++ b/.github/eks-workflow-files/mpi-nccl-test.yml @@ -71,7 +71,7 @@ spec: resources: limits: nvidia.com/gpu: 8 - hugepages-2Mi: 5120Mi + #hugepages-2Mi: 5120Mi vpc.amazonaws.com/efa: 32 memory: 32000Mi imagePullSecrets: diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index aafc32fdb..86ee9b239 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -54,7 +54,12 @@ jobs: secrets: inherit test-nccl: - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'nccl' + ) needs: build-base uses: ./.github/workflows/_test_nccl.yaml with: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..3553e46fd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -51,7 +51,8 @@ on: - t5x - run build rosetta - maxtext - run only the tests for maxtext - axlearn - run only the tests for axlearn - options: [full, jax, te, t5x, maxtext, axlearn] + - nccl - run only the nccl tests + options: [full, jax, te, t5x, maxtext, axlearn, nccl] default: full concurrency: @@ -210,7 +211,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "nccl" # ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 
+223,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "nccl" # ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds