@@ -99,168 +99,14 @@ jobs:
9999 secrets : inherit
100100 uses : ./.github/workflows/_linux_build.yml
101101 with :
102- pytorch : main
103- runner : pvc_e2e
102+ pytorch : distributed_2.8
103+ runner : PVC-7358
104104
105105 preci-linux-ut :
106106 name : preci-linux
107107 needs : [preci-conditions-filter, preci-linux-build]
108108 uses : ./.github/workflows/_linux_ut.yml
109109 with :
110110 disabled_tests : ${{ needs.preci-conditions-filter.outputs.disabled_tests }}
111- ut : op_regression,op_regression_dev1,op_transformers,op_extended,op_ut,xpu_distributed
112- runner : linux.idc.xpu
113-
114- preci-linux-e2e :
115- if : ${{ !contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_e2e') }}
116- name : preci-linux / e2e_test
117- needs : [preci-conditions-filter, preci-linux-build]
118- runs-on : pvc_e2e
119- env :
120- GH_TOKEN : ${{ github.token }}
121- reference_issue : 1645
122- timeout-minutes : 300
123- steps :
124- - name : Checkout torch-xpu-ops
125- uses : actions/checkout@v4
126- - name : Prepare Conda ENV
127- run : |
128- which conda && conda clean -ay
129- conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
130- conda create -n e2e_ci python=3.10 cmake ninja -y
131- source activate e2e_ci
132- pip install pandas scipy psutil requests
133- - name : Download Pytorch wheel
134- uses : actions/download-artifact@v4
135- with :
136- name : Torch-XPU-Wheel-${{ github.event.pull_request.number }}
137- - name : Install Pytorch XPU
138- run : |
139- source activate e2e_ci
140- pip install --force-reinstall ${{ github.workspace }}/torch*.whl
141- TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
142- cd ../
143- rm -rf pytorch || sudo rm -rf pytorch
144- git clone https://github.com/pytorch/pytorch pytorch
145- cd pytorch && git checkout ${TORCH_COMMIT_ID}
146- # apply PRs for stock pytorch
147- # https://github.com/pytorch/pytorch/pull/152940 internal use only for subset model list
148- python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py -e https://github.com/pytorch/pytorch/pull/152940
149- git show -s && git status && git diff
150- - name : Triton Installation
151- run : |
152- source activate e2e_ci
153- cd ../pytorch
154- pip install cmake ninja pybind11
155- rm -rf pytorch_triton_xpu-*.whl
156- python .github/scripts/build_triton_wheel.py --device xpu
157- pip install pytorch_triton_xpu-*.whl
158- - name : Identify pinned versions
159- run : |
160- cd ../pytorch
161- echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
162- echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
163- echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
164- echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
165- echo "TORCHBENCH_COMMIT_ID=$(<.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
166- echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
167- echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
168- echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
169- . /etc/os-release
170- echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
171- echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
172- source ../torch-xpu-ops/.github/scripts/env.sh
173- echo "DRIVER_VERSION=$(sycl-ls |grep 'opencl:gpu' |awk '{print $NF}' |sort |uniq -c |sed 's/ //g;s/\[/*[/')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
174- echo "KERNEL_VERSION=$(uname -rv 2>&1)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
175- echo "BUNDLE_VERSION=$(icpx --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
176- - name : Torch Config
177- run : |
178- echo "$GITHUB_ENV"
179- rm -rf ../pytorch/inductor_log
180- rm -rf /tmp/torchinductor_*
181- rm -rf ~/.triton/cache
182- cd ..
183- source activate e2e_ci
184- python -c "import triton; print(triton.__version__)"
185- python pytorch/torch/utils/collect_env.py
186- - name : Huggingface BF16 Training Accuracy Test
187- uses : ./.github/actions/inductor-xpu-e2e-test
188- with :
189- suite : huggingface
190- dt : bfloat16
191- mode : training
192- scenario : accuracy,performance
193- env_prepare : true
194- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
195- - name : Huggingface FP16 Training Accuracy Test
196- uses : ./.github/actions/inductor-xpu-e2e-test
197- with :
198- suite : huggingface
199- dt : float16
200- mode : training
201- scenario : accuracy,performance
202- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
203- - name : Timm_models BF16 Training Accuracy Test
204- uses : ./.github/actions/inductor-xpu-e2e-test
205- with :
206- suite : timm_models
207- dt : bfloat16
208- mode : training
209- scenario : accuracy,performance
210- env_prepare : true
211- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
212- - name : Torchbench BF16 Training Accuracy Test
213- uses : ./.github/actions/inductor-xpu-e2e-test
214- with :
215- suite : torchbench
216- dt : bfloat16
217- mode : training
218- scenario : accuracy,performance
219- env_prepare : true
220- hf_token : ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
221- - name : Download Reference Artifact
222- id : reference_id
223- run : |
224- set -xe
225- source activate e2e_ci
226- conda install gh --channel conda-forge -y
227- REFERENCE_RUN_ID="$(gh --repo ${GITHUB_REPOSITORY} issue view ${reference_issue} \
228- --json body -q .body |grep "Inductor-weekly-LTS-XPU-E2E" |sed 's/.*: *//')"
229- gh --repo ${GITHUB_REPOSITORY} run download ${REFERENCE_RUN_ID} -p "Inductor-*-XPU-E2E-*"
230- rm -rf reference && mv Inductor-*-XPU-E2E-* reference
231- - name : Summarize archieve files
232- if : ${{ ! cancelled() }}
233- run : |
234- set -x -e -o pipefail
235- rm -rf ${{ github.workspace }}/upload_files
236- cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
237- # Print summary
238- source activate e2e_ci
239- export IS_PR=1
240- bash ${{ github.workspace }}/.github/scripts/e2e_summary.sh \
241- ${{ github.workspace }}/upload_files \
242- ${{ github.workspace }}/reference \
243- >> ${GITHUB_STEP_SUMMARY}
244- exit_label=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
245- if [ ${exit_label} -ne 0 ];then
246- grep -E "(Real failed|to passed|Warning timeout).*: [1-9]|Summary for" /tmp/tmp-*.txt |grep -E "failed|passed|timeout" -B 1
247- echo "There are ${exit_label} cases that need look into!!! Please check them"
248- exit ${exit_label}
249- fi
250- - name : Upload Inductor XPU E2E Data
251- if : ${{ ! cancelled() }}
252- uses : actions/upload-artifact@v4
253- with :
254- name : Inductor-CI-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
255- path : ${{ github.workspace }}/upload_files
256-
257- preci-windows :
258- name : preci-windows
259- if : ${{ !(contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.preci-conditions-filter.outputs.disabled_tests, 'disable_win')) }}
260- needs : [preci-conditions-filter]
261- uses : ./.github/workflows/_windows_ut.yml
262- with :
263- ut : op_extended,torch_xpu
264- runner : Windows_CI
265- src_changed : ${{ needs.preci-conditions-filter.outputs.src_changed }}
266- has_label : ${{ needs.preci-conditions-filter.outputs.has_label }}
111+ ut : xpu_distributed
112+ runner : PVC-7358
0 commit comments