@@ -25,26 +25,43 @@ permissions:
2525 contents : read
2626
2727jobs :
# Step 1: Dynamically compute the build matrix from the triggering event.
# Pushes to main and scheduled runs test CUDA and ROCm; every other event
# tests CUDA only.
  set-matrix:
    runs-on: ubuntu-latest
    outputs:
      # Single-line JSON object of the form {"include":[...]}, consumed by the
      # build-test job through fromJSON().
      matrix: ${{ steps.set.outputs.matrix }}
    steps:
      - id: set
        # Hand the event context to the script through environment variables
        # rather than expanding expressions inside the script text, so the
        # values are always treated as data and never parsed as shell code.
        env:
          EVENT_NAME: ${{ github.event_name }}
          GIT_REF: ${{ github.ref }}
        run: |
          set -euo pipefail

          # Each matrix entry is defined exactly once so the two branches
          # below cannot drift apart.
          # The CUDA image is faster to clone than the default, but it lacks
          # CC needed by triton (1m25s vs 2m37s).
          cuda='{"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}'
          rocm='{"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}'

          # Decide which matrix entries to include based on the event type.
          if [[ "$EVENT_NAME" == "push" && "$GIT_REF" == "refs/heads/main" ]] || [[ "$EVENT_NAME" == "schedule" ]]; then
            # Include both CUDA and ROCm.
            matrix="{\"include\":[${cuda},${rocm}]}"
          else
            # Include only CUDA.
            matrix="{\"include\":[${cuda}]}"
          fi

          # Export the matrix as a job output. The JSON is a single line, so
          # the plain name=value form is sufficient (no heredoc delimiter).
          printf 'matrix=%s\n' "$matrix" >> "$GITHUB_OUTPUT"
56+
57+
58+ # Step 2: Use the dynamic matrix in the build-test job
2859 build-test :
60+ needs : set-matrix
2961 uses : pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
3062 strategy :
3163 fail-fast : false
32- matrix :
33- include :
34- - name : cuda
35- runner : linux.g5.48xlarge.nvidia.gpu
36- gpu-arch-type : cuda
37- gpu-arch-version : " 12.6"
38- # This image is faster to clone than the default, but it lacks CC needed by triton
39- # (1m25s vs 2m37s).
40- docker-image : torchtitan-ubuntu-20.04-clang12
41- index-url : https://download.pytorch.org/whl/nightly/cu126
42- - name : rocm
43- runner : linux.rocm.gpu.gfx942.8
44- gpu-arch-type : rocm
45- gpu-arch-version : " 7.0"
46- docker-image : torchtitan-rocm-ubuntu-22.04-clang12
47- index-url : https://download.pytorch.org/whl/nightly/rocm7.0
64+ matrix : ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
4865 with :
4966 runner : ${{ matrix.runner }}
5067 gpu-arch-type : ${{ matrix.gpu-arch-type }}
7390 sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
7491 sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"
7592
76- export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
77- python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
93+ python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
7894
7995 rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
8096 rm -rf artifacts-to-be-uploaded/*/checkpoint
0 commit comments