diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index ea876649cc..09b58195cc 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,11 +3,11 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string @@ -71,9 +71,9 @@ runs: fi TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" fi git clone ${PYTORCH_REPO} pytorch cd pytorch @@ -99,14 +99,9 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - fi + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} git status && git diff && git show -s - name: Install E2E Requirements shell: bash -xe {0} diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 847d71b8ac..452ebc4bb3 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -157,7 +157,8 @@ runs: export CCL_ROOT=$(dirname $(which python))/../ export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" - python run_distributed.py \ + export TEMP_DIR=/tmp + python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index bb9bd7a304..099a9ed449 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -19,7 +19,7 @@ done # Set pytorch rm -rf ${WORKSPACE}/pytorch -git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch +git clone https://github.com/daisyden/pytorch.git ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s @@ -44,7 +44,6 @@ git remote -v && git branch && git show -s # Pre Build cd ${WORKSPACE}/pytorch python -m pip install requests -python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 1bed161f2b..7e30e885ac 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,11 +10,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index dc10647124..b306317e59 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,11 +9,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string @@ -99,12 +99,12 @@ jobs: test-in-baremetal: needs: runner - timeout-minutes: 600 + timeout-minutes: 1200 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 --max-worker-restart 10000 + PYTEST_ADDOPTS: -v steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 @@ -175,6 +175,7 @@ jobs: else ut_list="${{ inputs.ut }}" fi + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ for ut_name in ${ut_list} do cp Known_issue.log.tmp Known_issue.log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 6c26af3c71..85281b43fc 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,8 +100,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - runner: pvc_rolling - pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + pytorch: distributed_2.10 + runner: PVC-7358 linux-ut: needs: [conditions-filter, linux-build] @@ -130,9 +130,8 @@ jobs: ut_name: [xpu_distributed] uses: ./.github/workflows/_linux_ut.yml with: - runner: pvc_rolling - pytorch: ${{ needs.conditions-filter.outputs.pytorch }} - torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} + runner: PVC-7358 + pytorch: distributed_2.10 ut: ${{ matrix.ut_name }} linux-e2e: