Skip to content

Commit 3460951

Browse files
[BE] [CI] Set up 1xL4, 1xH100, 4xH100 CI workflows (#2561)
1 parent 2885dca commit 3460951

File tree

5 files changed

+144
-16
lines changed

5 files changed

+144
-16
lines changed

.github/workflows/1xH100_tests.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
# CI workflow: run the single-GPU H100 test suite (integration tests,
# float8 affine-quantization tests, and the float8 single-GPU shell suite)
# on every push/PR to main or gh/** branches.
name: Run 1xH100 Tests

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  # One run per ref; pushes to main get a unique group per run_number so
  # they are never cancelled, while PR updates cancel in-flight runs.
  group: 1xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: H100
            runs-on: linux.aws.h100
            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            # Must match the CUDA minor of the nightly wheel index above
            # (cu126). The original value "12.4" disagreed with the cu126
            # torch-spec and with the 1xL4 workflow, which pairs cu126
            # with "12.6".
            gpu-arch-version: "12.6"
    permissions:
      id-token: write
      contents: read
    # Delegate provisioning/checkout to the shared pytorch test-infra
    # reusable workflow; `script` below runs inside the provisioned runner.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        pip install uv
        pip install ${{ matrix.torch-spec }}
        uv pip install -r dev-requirements.txt
        uv pip install vllm
        pip install .
        pytest test/integration --verbose -s
        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
        ./test/float8/test_everything_single_gpu.sh

.github/workflows/float8_test.yml renamed to .github/workflows/1xL4_tests.yml

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Run Float8 Tests
1+
name: Run 1xL4 Tests
22

33
on:
44
push:
@@ -11,7 +11,7 @@ on:
1111
- 'gh/**'
1212

1313
concurrency:
14-
group: float8_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
14+
group: 1xL4_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1515
cancel-in-progress: true
1616

1717
env:
@@ -28,11 +28,6 @@ jobs:
2828
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
2929
gpu-arch-type: "cuda"
3030
gpu-arch-version: "12.6"
31-
- name: H100
32-
runs-on: linux.aws.h100.4
33-
torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
34-
gpu-arch-type: "cuda"
35-
gpu-arch-version: "12.4"
3631
permissions:
3732
id-token: write
3833
contents: read
@@ -53,14 +48,6 @@ jobs:
5348
uv pip install -r dev-requirements.txt
5449
uv pip install vllm
5550
pip install .
56-
pytest test/float8 --verbose -s
5751
pytest test/integration --verbose -s
5852
pytest test/dtypes/test_affine_quantized_float.py --verbose -s
59-
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l)
60-
if [ "$GPU_COUNT" -ge 4 ]; then
61-
echo "Found $GPU_COUNT GPUs - running test_everything.sh"
62-
./test/float8/test_everything.sh
63-
else
64-
echo "Only $GPU_COUNT GPUs available. Need at least 4 GPUs to run test_everything.sh"
65-
exit 0
66-
fi
53+
./test/float8/test_everything_single_gpu.sh

.github/workflows/4xH100_tests.yml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
# CI workflow: run the multi-GPU float8 test suite (FSDP/DTensor) on a
# 4xH100 runner for every push/PR to main or gh/** branches.
# Name is Title Case ("Tests") for consistency with the 1xH100/1xL4 workflows.
name: Run 4xH100 Tests

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  # One run per ref; pushes to main get a unique group per run_number so
  # they are never cancelled, while PR updates cancel in-flight runs.
  group: 4xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: H100
            runs-on: linux.aws.h100.4
            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            # Must match the CUDA minor of the nightly wheel index above
            # (cu126). The original value "12.4" disagreed with the cu126
            # torch-spec and with the 1xL4 workflow, which pairs cu126
            # with "12.6".
            gpu-arch-version: "12.6"
    permissions:
      id-token: write
      contents: read
    # Delegate provisioning/checkout to the shared pytorch test-infra
    # reusable workflow; `script` below runs inside the provisioned runner.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        pip install uv
        pip install ${{ matrix.torch-spec }}
        uv pip install -r dev-requirements.txt
        uv pip install vllm
        pip install .
        ./test/float8/test_everything_multi_gpu.sh
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
#
# Runs the float8 multi-GPU test suite (FSDP, FSDP+compile, DTensor, FSDP2).
# NOTE: the shebang must be the very first line of the file — in the original
# it appeared after the license header, where the kernel ignores it and the
# script would run under whatever shell invoked it.

# terminate script on first error
set -e

# Empty when rocm-smi is absent/fails; `|| true` keeps `set -e` from aborting
# on non-ROCm machines.
IS_ROCM=$(rocm-smi --version || true)

# These tests do not work on ROCm yet
if [ -z "$IS_ROCM" ]
then
./test/float8/test_fsdp.sh
./test/float8/test_fsdp_compile.sh
./test/float8/test_dtensor.sh
python test/float8/test_fsdp2/test_fsdp2.py
fi

echo "all multi gpu tests successful"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
#
# Runs the float8 single-GPU pytest suite.
# NOTE: the shebang must be the very first line of the file — in the original
# it appeared after the license header, where the kernel ignores it and the
# script would run under whatever shell invoked it.

# terminate script on first error
set -e

pytest test/float8/test_base.py --verbose -s
pytest test/float8/test_compile.py --verbose -s
pytest test/float8/test_numerics_integration.py --verbose -s
pytest test/float8/test_auto_filter.py --verbose -s

echo "all float8 single gpu tests successful"

0 commit comments

Comments
 (0)