Skip to content

Commit 4a5fa99

Browse files
authored
Re:Run Torchtitan ROCm workflow on cron schedule & push to Main branch only (#2018)
This PR addresses the following issues: (1) The Torchtitan ROCm workflow now runs only on the cron schedule and on pushes to the main branch; the CUDA workflow continues to run as before. (2) The Torchtitan test run is refactored to address an earlier review comment on PR #1786 (comment).
1 parent bfdc974 commit 4a5fa99

File tree

2 files changed

+41
-22
lines changed

2 files changed

+41
-22
lines changed

.github/workflows/integration_test_8gpu_features.yaml

Lines changed: 34 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,43 @@ permissions:
2525
contents: read
2626

2727
jobs:
28+
# Step 1: Dynamically compute the matrix based on conditions
29+
set-matrix:
30+
runs-on: ubuntu-latest
31+
outputs:
32+
matrix: ${{ steps.set.outputs.matrix }}
33+
steps:
34+
- id: set
35+
run: |
36+
# Decide which matrix entries to include based on event type
37+
if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then
38+
# Include both CUDA and ROCm
39+
echo '{"include":[
40+
{"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"},
41+
{"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}
42+
]}' > matrix.json
43+
else
44+
# Include only CUDA
45+
echo '{"include":[
46+
{"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}
47+
]}' > matrix.json
48+
fi
49+
50+
# Export matrix to job outputs
51+
{
52+
echo 'matrix<<EOF'
53+
cat matrix.json
54+
echo 'EOF'
55+
} >> $GITHUB_OUTPUT
56+
57+
58+
# Step 2: Use the dynamic matrix in the build-test job
2859
build-test:
60+
needs: set-matrix
2961
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
3062
strategy:
3163
fail-fast: false
32-
matrix:
33-
include:
34-
- name: cuda
35-
runner: linux.g5.48xlarge.nvidia.gpu
36-
gpu-arch-type: cuda
37-
gpu-arch-version: "12.6"
38-
# This image is faster to clone than the default, but it lacks CC needed by triton
39-
# (1m25s vs 2m37s).
40-
docker-image: torchtitan-ubuntu-20.04-clang12
41-
index-url: https://download.pytorch.org/whl/nightly/cu126
42-
- name: rocm
43-
runner: linux.rocm.gpu.gfx942.8
44-
gpu-arch-type: rocm
45-
gpu-arch-version: "7.0"
46-
docker-image: torchtitan-rocm-ubuntu-22.04-clang12
47-
index-url: https://download.pytorch.org/whl/nightly/rocm7.0
64+
matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
4865
with:
4966
runner: ${{ matrix.runner }}
5067
gpu-arch-type: ${{ matrix.gpu-arch-type }}
@@ -73,8 +90,7 @@ jobs:
7390
sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
7491
sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"
7592
76-
export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
77-
python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
93+
python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
7894
7995
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
8096
rm -rf artifacts-to-be-uploaded/*/checkpoint

tests/integration_tests/run_tests.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,6 @@
2525
}
2626

2727

28-
TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
29-
30-
3128
def _run_cmd(cmd):
3229
return subprocess.run([cmd], text=True, shell=True)
3330

@@ -92,7 +89,7 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
9289
continue
9390

9491
# Skip the test for ROCm
95-
if TEST_WITH_ROCM and test_flavor.skip_rocm_test:
92+
if args.gpu_arch_type == "rocm" and test_flavor.skip_rocm_test:
9693
continue
9794

9895
# Check if we have enough GPUs
@@ -110,6 +107,12 @@ def main():
110107
parser.add_argument(
111108
"output_dir", help="Directory to dump results generated by tests"
112109
)
110+
parser.add_argument(
111+
"--gpu_arch_type",
112+
default="cuda",
113+
choices=["cuda", "rocm"],
114+
help="GPU architecture type. Must be specified as either 'cuda' or 'rocm'.",
115+
)
113116
parser.add_argument(
114117
"--test_suite",
115118
default="features",

0 commit comments

Comments
 (0)