From 5ac5b25a999708051ac0c22e66b9f2c6b93ad40b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 9 Jul 2025 15:24:57 +0000 Subject: [PATCH 01/10] distributed_weekly test --- .github/scripts/build.sh | 1 - .github/workflows/_linux_build.yml | 8 +- .github/workflows/_linux_ut.yml | 37 ++++--- .github/workflows/pull.yml | 162 +---------------------------- 4 files changed, 29 insertions(+), 179 deletions(-) diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 9dcd170aa1..a9c43eef5b 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -45,7 +45,6 @@ cp -r ${WORKSPACE}/torch-xpu-ops third_party/torch-xpu-ops # Pre Build cd ${WORKSPACE}/pytorch python -m pip install requests -python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt python -m pip install mkl-static mkl-include diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 60dd2c49b6..0004243ed9 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -11,7 +11,7 @@ on: keep_torch_xpu_ops: required: false type: string - default: 'false' + default: 'https://github.com/intel/torch-xpu-ops/tree/daisyden/distributed_2.8' description: Keep torch-xpu-ops pin. `true` means use pined commit driver: required: false @@ -80,12 +80,12 @@ jobs: PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" PYTORCH_VERSION="$(echo ${{ inputs.pytorch }} |sed 's/.*@//')" else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" PYTORCH_VERSION="${{ inputs.pytorch }}" fi if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then - TORCH_XPU_OPS_REPO="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/@.*//')" - TORCH_XPU_OPS_VERSION="$(echo ${{ inputs.keep_torch_xpu_ops }} |sed 's/.*@//')" + TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" + TORCH_XPU_OPS_VERSION="daisyden/distributed_2.8" elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then TORCH_XPU_OPS_VERSION="pinned" else diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 7261c66e10..cf2c665551 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -16,7 +16,7 @@ on: triton: required: false type: string - default: '' + default: 'bdd0656b' description: Triton commit. Use pytorch pined commit by default ut: required: true @@ -402,7 +402,7 @@ jobs: run: | cd ../ rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone https://github.com/pytorch/pytorch pytorch + git clone -b distributed_2.8 https://github.com/daisyden/pytorch.git pytorch source activate xpu_op_${ZE_AFFINITY_MASK} if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then pip install --force-reinstall ${{ github.workspace }}/torch*.whl @@ -426,7 +426,10 @@ jobs: cd ../pytorch rm -rf third_party/torch-xpu-ops if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then - cp -r ${{ github.workspace }} third_party + cd third_party + git clone https://github.com/intel/torch-xpu-ops.git + cd torch-xpu-ops + git checkout daisyden/distributed_2.8 else TORCH_XPU_OPS_COMMIT=$(${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ - tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log + python run_distributed_local.py \ + 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \ + tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log + cp *.xml ${{ github.workspace }}/ut_log + cp - name: Reset Ptrace_scope if: ${{ always() }} run: | if [ -f ptrace_scope.bk ]; then sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope fi + - name: UT Test Results Summary + run: | + source activate xpu_op_${ZE_AFFINITY_MASK} + pip install junitparser + python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml \ + 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log | \ + >> $GITHUB_STEP_SUMMARY || true - name: Upload Inductor XPU UT Log if: ${{ ! cancelled() }} uses: actions/upload-artifact@v4 @@ -534,15 +547,7 @@ jobs: echo "UT_NAME=$(echo ${{ inputs.ut }} |sed 's/,/-/g')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" cd ${{ github.workspace }}/ut_log/xpu_distributed gh --repo $repo issue view $ut_skip_issue --json body -q .body | sed '/^$/d' > Known_issue.log - gh api "repos/${{ github.repository }}/issues?labels=skipped" \ - --jq '.[] | select(.pull_request == null) | "Issue #\(.number): \(.title)\n\(.body)\n"' \ - > issues.log - awk '/Cases:/ {flag=1; next} /^\|\||^$/ {flag=0} flag' issues.log | grep -Eo 'test[^[:space:]]+( \|\| [^[:space:]]+)?' | sed 's/ *|| */ /g' | sort -u > issues_temp.log - awk '$2 == "op_ut" {print $1}' issues_temp.log > issues_op_ut.log - cat issues_temp.log | awk '{print $1}' >> Known_issue.log - awk -F'::' '{print $1}' issues_op_ut.log | sort -u | paste -sd ',' >> Known_issue.log - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ - bash ut_result_check.sh 'xpu_distributed' + bash ut_result_check.sh 'pytorch_distributed' - name: Upload Inductor XPU UT Log if: always() uses: actions/upload-artifact@v4 diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 88ecf43df5..3014cced3a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -99,8 +99,8 @@ jobs: secrets: inherit uses: ./.github/workflows/_linux_build.yml with: - pytorch: main - runner: pvc_e2e + pytorch: distributed_2.8 + runner: PVC-7358 preci-linux-ut: name: preci-linux @@ -108,159 +108,5 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} - ut: op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed - runner: linux.idc.xpu - - preci-linux-e2e: - if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }} - name: preci-linux / e2e_test - needs: [preci-conditions-filter, preci-linux-build] - runs-on: pvc_e2e - env: - GH_TOKEN: ${{ github.token }} - reference_issue: 1645 - timeout-minutes: 300 - steps: - - name: Checkout torch-xpu-ops - uses: actions/checkout@v4 - - name: Prepare Conda ENV - run: | - which conda && conda clean -ay - conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci - conda create -n e2e_ci python=3.10 cmake ninja -y - source activate e2e_ci - pip install pandas scipy psutil requests - - name: Download Pytorch wheel - uses: actions/download-artifact@v4 - with: - name: Torch-XPU-Wheel-${{ github.event.pull_request.number }} - - name: Install Pytorch XPU - run: | - source activate e2e_ci - pip install --force-reinstall ${{ github.workspace }}/torch*.whl - TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') - cd ../ - rm -rf pytorch || sudo rm -rf pytorch - git clone https://github.com/pytorch/pytorch pytorch - cd pytorch && git checkout ${TORCH_COMMIT_ID} - # apply PRs for stock pytorch - # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list - python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940 - git show -s && git status && git diff - - name: Triton Installation - run: | - source activate e2e_ci - cd ../pytorch - pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - python .github/scripts/build_triton_wheel.py --device xpu - pip install pytorch_triton_xpu-*.whl - - name: Identify pinned versions - run: | - cd ../pytorch - echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}" - echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}" - echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}" - echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" - echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" - echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" - . /etc/os-release - echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - source ../torch-xpu-ops/.github/scripts/env.sh - echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" - - name: Torch Config - run: | - echo "$GITHUB_ENV" - rm -rf ../pytorch/inductor_log - rm -rf /tmp/torchinductor_* - rm -rf ~/.triton/cache - cd .. - source activate e2e_ci - python -c "import triton; print(triton.__version__)" - python pytorch/torch/utils/collect_env.py - - name: Huggingface BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Huggingface FP16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: huggingface - dt: float16 - mode: training - scenario: accuracy,performance - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Timm_models BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: timm_models - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Torchbench BF16 Training Accuracy Test - uses: ./.github/actions/inductor-xpu-e2e-test - with: - suite: torchbench - dt: bfloat16 - mode: training - scenario: accuracy,performance - env_prepare: true - hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - name: Download Reference Artifact - id: reference_id - run: | - set -xe - source activate e2e_ci - conda install gh --channel conda-forge -y - REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \ - --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')" - gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*" - rm -rf reference && mv Inductor-*-XPU-E2E-* reference - - name: Summarize archieve files - if: ${{ ! cancelled() }} - run: | - set -x -e -o pipefail - rm -rf ${{ github.workspace }}/upload_files - cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files - # Print summary - source activate e2e_ci - export IS_PR=1 - bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \ - ${{ github.workspace }}/upload_files \ - ${{ github.workspace }}/reference \ - >> ${GITHUB_STEP_SUMMARY} - exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt) - if [ ${exit_label} -ne 0 ];then - grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1 - echo "There are ${exit_label} cases that need look into!!! Please check them" - exit ${exit_label} - fi - - name: Upload Inductor XPU E2E Data - if: ${{ ! cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }} - path: ${{ github.workspace }}/upload_files - - preci-windows: - name: preci-windows - if: ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }} - needs: [preci-conditions-filter] - uses: ./.github/workflows/_windows_ut.yml - with: - ut: op_extended,torch_xpu - runner: Windows_CI - src_changed: ${{ needs.preci-conditions-filter.outputs.src_changed }} - has_label: ${{ needs.preci-conditions-filter.outputs.has_label }} + ut: xpu_distributed + runner: PVC-7358 From c8a8490b8ee4883341babe94ac3ee7148559580e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 9 Jul 2025 16:14:41 +0000 Subject: [PATCH 02/10] update --- .github/workflows/_linux_ut.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index cf2c665551..edb68089c4 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -374,7 +374,7 @@ jobs: path: ${{ github.workspace }}/ut_log distributed_ut_test: - runs-on: pytorch-06 + runs-on: PVC-7358 if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} timeout-minutes: 60 env: @@ -447,13 +447,9 @@ jobs: fi if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then pip install cmake ninja pybind11 - rm -rf pytorch_triton_xpu-*.whl - TRITON_VERSION_NAME="$( - curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ - grep '__version__' |head -n 1 |awk -F "'" '{print $2}' - )" - python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} - pip install pytorch_triton_xpu-*.whl + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + TRITON_COMMIT_ID="bdd0656b" + pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python" fi - name: Torch Config run: | @@ -491,7 +487,6 @@ jobs: 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log cp *.xml ${{ github.workspace }}/ut_log - cp - name: Reset Ptrace_scope if: ${{ always() }} run: | From 1f89b657cdba6e4fffae7ed3dccb812bed17de35 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Wed, 9 Jul 2025 21:39:00 +0800 Subject: [PATCH 03/10] Update _linux_ut.yml timeout --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index edb68089c4..aaafec8e47 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -376,7 +376,7 @@ jobs: distributed_ut_test: runs-on: PVC-7358 if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} - timeout-minutes: 60 + timeout-minutes: 600 env: GH_TOKEN: ${{ github.token }} NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }} From 2fc532f7d71b0e69d38664d51239b85bf4ed9738 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Thu, 10 Jul 2025 11:18:44 +0800 Subject: [PATCH 04/10] Update _linux_ut.yml --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index aaafec8e47..0d03fa3366 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -485,7 +485,7 @@ jobs: fi python run_distributed_local.py \ 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_error.log | \ - tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log + tee ${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test.log || true cp *.xml ${{ github.workspace }}/ut_log - name: Reset Ptrace_scope if: ${{ always() }} From 5efbb6988b81495ee00dc5910cb3885be2ad16e2 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 10 Jul 2025 23:30:10 +0000 Subject: [PATCH 05/10] update --- .github/scripts/env.sh | 8 ++++---- .github/workflows/_linux_ut.yml | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh index 831864d6d4..db95edb946 100644 --- a/.github/scripts/env.sh +++ b/.github/scripts/env.sh @@ -1,9 +1,9 @@ #!/bin/bash -source /opt/intel/oneapi/compiler/latest/env/vars.sh +source /opt/intel/oneapi/compiler/2025.1/env/vars.sh source /opt/intel/oneapi/pti/latest/env/vars.sh -source /opt/intel/oneapi/umf/latest/env/vars.sh -source /opt/intel/oneapi/ccl/latest/env/vars.sh -source /opt/intel/oneapi/mpi/latest/env/vars.sh +source /opt/intel/oneapi/umf/0.10/env/vars.sh +source /opt/intel/oneapi/ccl/2021.15/env/vars.sh +source /opt/intel/oneapi/mpi/2021.15/env/vars.sh icpx --version sycl-ls diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0d03fa3366..8fe4ef3900 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -474,9 +474,11 @@ jobs: run: | set -x -e -o pipefail source activate xpu_op_${ZE_AFFINITY_MASK} - pip install pytest pytest-timeout xmlrunner unittest-xml-reporting + pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers + pip install hypothesis==6.131.27 mkdir -p ut_log/xpu_distributed cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/ + cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/check-ut.py ut_log/ cd ../pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then @@ -497,7 +499,8 @@ jobs: run: | source activate xpu_op_${ZE_AFFINITY_MASK} pip install junitparser - python .github/scripts/check-ut.py ${{ github.workspace }}/ut_log/*.xml \ + cd ${{ github.workspace }}/ut_log/ + python check-ut.py ${{ github.workspace }}/ut_log/*.xml \ 2>${{ github.workspace }}/ut_log/xpu_distributed/pytorch_distributed_test_summary.log | \ >> $GITHUB_STEP_SUMMARY || true - name: Upload Inductor XPU UT Log From 8fa07f6faab8f2f47862d294691f49257badc427 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Fri, 18 Jul 2025 15:26:19 +0800 Subject: [PATCH 06/10] Update lintrunner.sh --- .github/scripts/lintrunner.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index 6d3122b6f3..12a9ac6e66 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -4,7 +4,7 @@ set -ex # Creat a venv for lint check python3 -m venv lint source lint/bin/activate -python3 -m pip install setuptools +python3 -m pip install -U pip setuptools wheel # Use uv to speed up lintrunner init python3 -m pip install uv==0.1.45 From ac3d5dbf2842abb17112ff4532d96cef090cb347 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 18 Jul 2025 15:32:09 +0000 Subject: [PATCH 07/10] weekly test --- .github/workflows/pull.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c08a272bb2..3014cced3a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -107,6 +107,6 @@ jobs: needs: [preci-conditions-filter, preci-linux-build] uses: ./.github/workflows/_linux_ut.yml with: - disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests + disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} ut: xpu_distributed runner: PVC-7358 From 680c2cce3a18293f28a702418eab9feb4dada37f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 17:23:17 +0800 Subject: [PATCH 08/10] update --- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 28454fedd2..998b50b848 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -374,7 +374,7 @@ jobs: path: ${{ github.workspace }}/ut_log distributed_ut_test: - runs-on: PVC-7358 + runs-on: pvc_e2e if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} timeout-minutes: 600 env: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3014cced3a..5f685d0a3f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: uses: ./.github/workflows/_linux_build.yml with: pytorch: distributed_2.8 - runner: PVC-7358 + runner: pvc_e2e preci-linux-ut: name: preci-linux @@ -109,4 +109,4 @@ jobs: with: disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }} ut: xpu_distributed - runner: PVC-7358 + runner: pvc_e2e From 8b2f627b50c3a27d15ad91634f5f11d93f60a70a Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 23:18:35 +0800 Subject: [PATCH 09/10] update --- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 6 +++--- .github/workflows/pull.yml | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 0004243ed9..6470b81db8 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -11,7 +11,7 @@ on: keep_torch_xpu_ops: required: false type: string - default: 'https://github.com/intel/torch-xpu-ops/tree/daisyden/distributed_2.8' + default: 'https://github.com/intel/torch-xpu-ops/tree/daisyden/distributed_2.9' description: Keep torch-xpu-ops pin. `true` means use pined commit driver: required: false @@ -85,7 +85,7 @@ jobs: fi if [[ "${{ inputs.keep_torch_xpu_ops }}" == *"https://"* ]];then TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" - TORCH_XPU_OPS_VERSION="daisyden/distributed_2.8" + TORCH_XPU_OPS_VERSION="daisyden/distributed_2.9" elif [ "${{ inputs.keep_torch_xpu_ops }}" == "true" ];then TORCH_XPU_OPS_VERSION="pinned" else diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 998b50b848..3b5fb692d0 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -374,7 +374,7 @@ jobs: path: ${{ github.workspace }}/ut_log distributed_ut_test: - runs-on: pvc_e2e + runs-on: PVC-7358 if: ${{ contains(inputs.ut, 'xpu_distributed') && !contains(inputs.disabled_tests, 'disable_distribute') }} timeout-minutes: 600 env: @@ -402,7 +402,7 @@ jobs: run: | cd ../ rm -rf ./pytorch || sudo rm -rf ./pytorch - git clone -b distributed_2.8 https://github.com/daisyden/pytorch.git pytorch + git clone -b distributed_2.9 https://github.com/daisyden/pytorch.git pytorch source activate xpu_op_${ZE_AFFINITY_MASK} if [ "${{ inputs.pytorch }}" != "nightly_wheel" ]; then pip install --force-reinstall ${{ github.workspace }}/torch*.whl @@ -429,7 +429,7 @@ jobs: cd third_party git clone https://github.com/intel/torch-xpu-ops.git cd torch-xpu-ops - git checkout daisyden/distributed_2.8 + git checkout daisyden/distributed_2.9 else TORCH_XPU_OPS_COMMIT=$( Date: Fri, 8 Aug 2025 16:50:19 +0800 Subject: [PATCH 10/10] update --- .github/workflows/pull.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c737e06117..103f7c3428 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -95,7 +95,6 @@ jobs: preci-linux-build: name: preci-linux if: ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all')}} - needs: [preci-conditions-filter] secrets: inherit uses: ./.github/workflows/_linux_build.yml with: @@ -104,7 +103,7 @@ jobs: preci-linux-ut: name: preci-linux - needs: [preci-conditions-filter, preci-linux-build] + needs: [preci-linux-build] uses: ./.github/workflows/_linux_ut.yml with: disabled_tests: ${{ needs.preci-conditions-filter.outputs.disabled_tests }}