pytorch · akashveramd · Jun 4, 2025 · Jun 5, 2025 · Jun 7, 2025 · Jun 11, 2025
@@ -13,14 +13,20 @@ shift
 echo "Building ${IMAGE_NAME} Docker image"
 
 OS=ubuntu
-OS_VERSION=20.04
 CLANG_VERSION=""
 PYTHON_VERSION=3.11
 MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
+    OS_VERSION=20.04
     CLANG_VERSION=12
+    BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+    ;;
+  torchtitan-rocm-ubuntu-22.04-clang12)
+    OS_VERSION=22.04
+    CLANG_VERSION=12
+    BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -30,6 +36,7 @@ esac
 docker build \
   --no-cache \
   --progress=plain \
+  --build-arg "BASE_IMAGE=${BASE_IMAGE}" \
   --build-arg "OS_VERSION=${OS_VERSION}" \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
@@ -38,3 +45,4 @@ docker build \
   -f "${OS}"/Dockerfile \
   "$@" \
   .
+
@@ -1,6 +1,6 @@
-ARG OS_VERSION
+ARG BASE_IMAGE
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM ${BASE_IMAGE}
 
 ARG OS_VERSION
 

@@ -22,13 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge]
-    timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
+            runner: [self-hosted, linux.2xlarge]
+          - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12
+            runner: [linux.2xlarge]
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 240
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:

@@ -23,15 +23,33 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      # Let each backend run to completion: without this, GitHub Actions'
+      # default (fail-fast: true) cancels the remaining matrix entry as soon
+      # as the other one fails.
+      fail-fast: false
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default (1m25s vs 2m37s),
+            # but it lacks the CC needed by triton.
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            # Nightly wheel index used by the pip installs in the job script.
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            # Passed to the test run as TEST_WITH_ROCM.
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.mi300.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            # Nightly wheel index used by the pip installs in the job script.
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            # Passed to the test run as TEST_WITH_ROCM.
+            is-rocm: 1
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45
       script: |
         set -eux
 
@@ -41,9 +59,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
@@ -20,6 +20,16 @@
 except ModuleNotFoundError:
     import tomli as tomllib
 
+# Integration test flavors (by test_name) that are skipped when TEST_WITH_ROCM=1.
+skip_for_rocm_test_list = [
+    "pp_looped_zero_bubble",
+    "pp_zbv",
+    "pp_custom_csv",
+    "last_save_model_weights_only_bf16",
+    "last_save_model_weights_only_fp32",
+]
+TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
+
 
 @dataclass
 class OverrideDefinitions:
@@ -568,6 +578,11 @@ def run_tests(args):
                 )
                 if is_integration_test:
                     for test_flavor in integration_tests_flavors[config_file]:
+                        if (
+                            TEST_WITH_ROCM
+                            and test_flavor.test_name in skip_for_rocm_test_list
+                        ):
+                            continue
                         if args.test == "all" or test_flavor.test_name == args.test:
                             if args.ngpu < test_flavor.ngpu:
                                 logger.info(