Skip to content

Commit 3460951

Browse files
[BE] [CI] Set up 1xL4, 1xH100, 4xH100 CI workflows (#2561)
1 parent 2885dca commit 3460951

File tree

5 files changed

+144
-16
lines changed

5 files changed

+144
-16
lines changed

.github/workflows/1xH100_tests.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
# CI workflow: run the single-GPU H100 test suite (integration tests,
# float8 affine-quantization tests, and the float8 single-GPU shell suite)
# on every push/PR to main or gh/** branches.
name: Run 1xH100 Tests

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  # One run per ref; pushes to main get a unique group per run_number so
  # they are never cancelled, while PR updates cancel in-flight runs.
  group: 1xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: H100
            runs-on: linux.aws.h100
            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            # Must match the CUDA minor of the nightly wheel index above
            # (cu126). The original value "12.4" disagreed with the cu126
            # torch-spec and with the 1xL4 workflow, which pairs cu126
            # with "12.6".
            gpu-arch-version: "12.6"
    permissions:
      id-token: write
      contents: read
    # Delegate provisioning/checkout to the shared pytorch test-infra
    # reusable workflow; `script` below runs inside the provisioned runner.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        pip install uv
        pip install ${{ matrix.torch-spec }}
        uv pip install -r dev-requirements.txt
        uv pip install vllm
        pip install .
        pytest test/integration --verbose -s
        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
        ./test/float8/test_everything_single_gpu.sh

.github/workflows/float8_test.yml renamed to .github/workflows/1xL4_tests.yml

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Run Float8 Tests
1+
name: Run 1xL4 Tests
22

33
on:
44
push:
@@ -11,7 +11,7 @@ on:
1111
- 'gh/**'
1212

1313
concurrency:
14-
group: float8_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
14+
group: 1xL4_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
1515
cancel-in-progress: true
1616

1717
env:
@@ -28,11 +28,6 @@ jobs:
2828
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
2929
gpu-arch-type: "cuda"
3030
gpu-arch-version: "12.6"
31-
- name: H100
32-
runs-on: linux.aws.h100.4
33-
torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
34-
gpu-arch-type: "cuda"
35-
gpu-arch-version: "12.4"
3631
permissions:
3732
id-token: write
3833
contents: read
@@ -53,14 +48,6 @@ jobs:
5348
uv pip install -r dev-requirements.txt
5449
uv pip install vllm
5550
pip install .
56-
pytest test/float8 --verbose -s
5751
pytest test/integration --verbose -s
5852
pytest test/dtypes/test_affine_quantized_float.py --verbose -s
59-
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l)
60-
if [ "$GPU_COUNT" -ge 4 ]; then
61-
echo "Found $GPU_COUNT GPUs - running test_everything.sh"
62-
./test/float8/test_everything.sh
63-
else
64-
echo "Only $GPU_COUNT GPUs available. Need at least 4 GPUs to run test_everything.sh"
65-
exit 0
66-
fi
53+
./test/float8/test_everything_single_gpu.sh

.github/workflows/4xH100_tests.yml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
# CI workflow: run the multi-GPU float8 test suite (FSDP/DTensor) on a
# 4xH100 runner for every push/PR to main or gh/** branches.
# Name is Title Case ("Tests") for consistency with the 1xH100/1xL4 workflows.
name: Run 4xH100 Tests

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

concurrency:
  # One run per ref; pushes to main get a unique group per run_number so
  # they are never cancelled, while PR updates cancel in-flight runs.
  group: 4xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test:
    strategy:
      fail-fast: false
      matrix:
        include:
          - name: H100
            runs-on: linux.aws.h100.4
            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            # Must match the CUDA minor of the nightly wheel index above
            # (cu126). The original value "12.4" disagreed with the cu126
            # torch-spec and with the 1xL4 workflow, which pairs cu126
            # with "12.6".
            gpu-arch-version: "12.6"
    permissions:
      id-token: write
      contents: read
    # Delegate provisioning/checkout to the shared pytorch test-infra
    # reusable workflow; `script` below runs inside the provisioned runner.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        pip install uv
        pip install ${{ matrix.torch-spec }}
        uv pip install -r dev-requirements.txt
        uv pip install vllm
        pip install .
        ./test/float8/test_everything_multi_gpu.sh
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
#
# Runs the float8 multi-GPU test suite (FSDP, FSDP+compile, DTensor, FSDP2).
# NOTE: the shebang must be the very first line of the file — in the original
# it appeared after the license header, where the kernel ignores it and the
# script would run under whatever shell invoked it.

# terminate script on first error
set -e

# Empty when rocm-smi is absent/fails; `|| true` keeps `set -e` from aborting
# on non-ROCm machines.
IS_ROCM=$(rocm-smi --version || true)

# These tests do not work on ROCm yet
if [ -z "$IS_ROCM" ]
then
./test/float8/test_fsdp.sh
./test/float8/test_fsdp_compile.sh
./test/float8/test_dtensor.sh
python test/float8/test_fsdp2/test_fsdp2.py
fi

echo "all multi gpu tests successful"
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
#
# Runs the float8 single-GPU pytest suite.
# NOTE: the shebang must be the very first line of the file — in the original
# it appeared after the license header, where the kernel ignores it and the
# script would run under whatever shell invoked it.

# terminate script on first error
set -e

pytest test/float8/test_base.py --verbose -s
pytest test/float8/test_compile.py --verbose -s
pytest test/float8/test_numerics_integration.py --verbose -s
pytest test/float8/test_auto_filter.py --verbose -s

echo "all float8 single gpu tests successful"

0 commit comments

Comments
 (0)