From 4074d0c1dfdca14aece5621279e05af3a3f0221c Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 3 Jun 2025 23:45:19 -0500
Subject: [PATCH 01/17] Added support to run torchtitan tests on ROCm.

---
 .ci/docker/build.sh               | 26 +++++++++++++++-----
 .ci/docker/ubuntu-cuda/Dockerfile | 41 +++++++++++++++++++++++++++++++
 .ci/docker/ubuntu-rocm/Dockerfile | 16 ++++++++++++
 3 files changed, 77 insertions(+), 6 deletions(-)
 create mode 100644 .ci/docker/ubuntu-cuda/Dockerfile
 create mode 100644 .ci/docker/ubuntu-rocm/Dockerfile

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 34c1f0d45..5511acabf 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -12,15 +12,29 @@ shift
 
 echo "Building ${IMAGE_NAME} Docker image"
 
+# set operating system
 OS=ubuntu
-OS_VERSION=20.04
-CLANG_VERSION=""
-PYTHON_VERSION=3.11
-MINICONDA_VERSION=24.3.0-0
+
+# set Dockerfile
+DOCKERFILE="${OS}/Dockerfile"
+if [[ "$IMAGE_NAME" == *cuda* ]]; then
+  DOCKERFILE="${OS}-cuda/Dockerfile"
+elif [[ "$IMAGE_NAME" == *rocm* ]]; then
+  DOCKERFILE="${OS}-rocm/Dockerfile"
+fi
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
+    OS_VERSION=20.04
     CLANG_VERSION=12
+    PYTHON_VERSION=3.11
+    MINICONDA_VERSION=24.3.0-0
+    ;;
+  torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3)
+    OS_VERSION=22.04
+    CLANG_VERSION=19
+    PYTHON_VERSION=3.10
+    MINICONDA_VERSION=25.3.1-0
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -34,7 +48,7 @@ docker build \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  --shm-size=1g \
-  -f "${OS}"/Dockerfile \
+  -f $(dirname ${DOCKERFILE})/Dockerfile \
   "$@" \
   .
+
diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile
new file mode 100644
index 000000000..39e4d8ec5
--- /dev/null
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@@ -0,0 +1,41 @@
+ARG OS_VERSION
+
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+
+ARG OS_VERSION
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# Install common dependencies
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install clang
+ARG CLANG_VERSION
+COPY ./common/install_clang.sh install_clang.sh
+RUN bash ./install_clang.sh && rm install_clang.sh
+
+# Install gcc
+ARG GCC_VERSION
+COPY ./common/install_gcc.sh install_gcc.sh
+RUN bash ./install_gcc.sh && rm install_gcc.sh
+
+# Setup user
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install conda and other dependencies
+ARG MINICONDA_VERSION
+ARG PYTHON_VERSION
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-dev.txt /opt/conda/
+COPY requirements.txt /opt/conda/
+COPY requirements-flux.txt /opt/conda/
+COPY conda-env-ci.txt /opt/conda/
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/utils.sh utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
+
+USER ci-user
+CMD ["bash"]
diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
new file mode 100644
index 000000000..6f292dfdd
--- /dev/null
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -0,0 +1,16 @@
+# base image
+FROM rocm/pytorch-nightly:latest
+
+# args
+ARG OS_VERSION
+ARG CLANG_VERSION
+ARG GCC_VERSION
+ARG MINICONDA_VERSION
+ARG PYTHON_VERSION
+
+# install dependencies
+COPY requirements.txt requirements.txt
+RUN pip install -r ./requirements.txt
+
+CMD ["bash"]
+

From 340478a047b95359aefe51eaf6f6cc0c7a9661cc Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 4 Jun 2025 22:53:57 -0500
Subject: [PATCH 02/17] Added ROCm CI workflow for the 8-GPU H100
 integration tests.

---
 .../integration_test_8gpu_h100_rocm.yaml      | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/integration_test_8gpu_h100_rocm.yaml

diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_h100_rocm.yaml
new file mode 100644
index 000000000..3debb1760
--- /dev/null
+++ b/.github/workflows/integration_test_8gpu_h100_rocm.yaml
@@ -0,0 +1,37 @@
+name: 8 GPU Integration Test at H100
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.rocm.gpu.mi300.8
+      gpu-arch-type: rocm
+      gpu-arch-version: "6.4"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        USE_CPP=0 python -m pip install --pre torchao
+
+        mkdir artifacts-to-be-uploaded
+        python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
+

From 51427a7aa7ff3b1ae7787cdaa38fd33e20b787e6 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Sat, 7 Jun 2025 00:40:28 -0500
Subject: [PATCH 03/17] Fixed a bug in the build script. Removed the
 ubuntu-cuda folder; the ubuntu folder's Dockerfile is used for CUDA.

---
 .ci/docker/build.sh                           |  4 +-
 .ci/docker/ubuntu-cuda/Dockerfile             | 41 -------------------
 ...m.yaml => integration_test_8gpu_rocm.yaml} |  8 ++--
 3 files changed, 4 insertions(+), 49 deletions(-)
 delete mode 100644 .ci/docker/ubuntu-cuda/Dockerfile
 rename .github/workflows/{integration_test_8gpu_h100_rocm.yaml => integration_test_8gpu_rocm.yaml} (70%)

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 5511acabf..a1aafe3d3 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -17,9 +17,7 @@ OS=ubuntu
 
 # set Dockerfile
 DOCKERFILE="${OS}/Dockerfile"
-if [[ "$IMAGE_NAME" == *cuda* ]]; then
-  DOCKERFILE="${OS}-cuda/Dockerfile"
-elif [[ "$IMAGE_NAME" == *rocm* ]]; then
+if [[ "$IMAGE_NAME" == *rocm* ]]; then
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
 
diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile
deleted file mode 100644
index 39e4d8ec5..000000000
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ /dev/null
@@ -1,41 +0,0 @@
-ARG OS_VERSION
-
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
-
-ARG OS_VERSION
-
-ENV DEBIAN_FRONTEND noninteractive
-
-# Install common dependencies
-COPY ./common/install_base.sh install_base.sh
-RUN bash ./install_base.sh && rm install_base.sh
-
-# Install clang
-ARG CLANG_VERSION
-COPY ./common/install_clang.sh install_clang.sh
-RUN bash ./install_clang.sh && rm install_clang.sh
-
-# Install gcc
-ARG GCC_VERSION
-COPY ./common/install_gcc.sh install_gcc.sh
-RUN bash ./install_gcc.sh && rm install_gcc.sh
-
-# Setup user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-# Install conda and other dependencies
-ARG MINICONDA_VERSION
-ARG PYTHON_VERSION
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-COPY requirements-dev.txt /opt/conda/
-COPY requirements.txt /opt/conda/
-COPY requirements-flux.txt /opt/conda/
-COPY conda-env-ci.txt /opt/conda/
-COPY ./common/install_conda.sh install_conda.sh
-COPY ./common/utils.sh utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt
-
-USER ci-user
-CMD ["bash"]
diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml
similarity index 70%
rename from .github/workflows/integration_test_8gpu_h100_rocm.yaml
rename to .github/workflows/integration_test_8gpu_rocm.yaml
index 3debb1760..bb64ad528 100644
--- a/.github/workflows/integration_test_8gpu_h100_rocm.yaml
+++ b/.github/workflows/integration_test_8gpu_rocm.yaml
@@ -1,4 +1,4 @@
-name: 8 GPU Integration Test at H100
+name: 8 GPU Integration Test
 
 on:
   push:
@@ -17,13 +17,11 @@ defaults:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: rocm
       gpu-arch-version: "6.4"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
       docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
       repository: pytorch/torchtitan
       upload-artifact: outputs
@@ -33,5 +31,5 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
+        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
 

From 2848d518f3c2aef5e89822850c16979ce65db6f4 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 10 Jun 2025 20:07:10 -0500
Subject: [PATCH 04/17] Added TEST_WITH_ROCM gating of pipeline-parallel
 tests to integration_tests.py after rebase.

---
 tests/integration_tests.py | 76 ++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 31 deletions(-)

diff --git a/tests/integration_tests.py b/tests/integration_tests.py
index 3ccbc1890..e89a6c374 100755
--- a/tests/integration_tests.py
+++ b/tests/integration_tests.py
@@ -20,6 +20,8 @@
 except ModuleNotFoundError:
     import tomli as tomllib
 
+test_with_rocm = os.getenv("TEST_WITH_ROCM", "0")
+
 
 @dataclass
 class OverrideDefinitions:
@@ -139,28 +141,34 @@ def build_test_list():
             "Checkpoint Integration Test - Save Model Weights Only bf16",
             "last_save_model_weights_only_bf16",
         ),
-        OverrideDefinitions(
-            [
+    ]
+    # check test_with_rocm
+    if test_with_rocm != "1":
+        integration_tests_flavors["debug_model.toml"].extend([
+            OverrideDefinitions(
                 [
-                    "--parallelism.pipeline_parallel_degree 4",
-                    "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
+                    [
+                        "--parallelism.pipeline_parallel_degree 4",
+                        "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
+                    ],
                 ],
-            ],
-            "PP looped zero bubble test",
-            "pp_looped_zero_bubble",
-            ngpu=4,
-        ),
-        OverrideDefinitions(
-            [
+                "PP looped zero bubble test",
+                "pp_looped_zero_bubble",
+                ngpu=4,
+            ),
+            OverrideDefinitions(
                 [
-                    "--parallelism.pipeline_parallel_degree 2",
-                    "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
+                    [
+                        "--parallelism.pipeline_parallel_degree 2",
+                        "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
+                    ],
                 ],
-            ],
-            "PP zero bubble test (v shaped)",
-            "pp_zbv",
-            ngpu=2,
-        ),
+                "PP zero bubble test (v shaped)",
+                "pp_zbv",
+                ngpu=2,
+            ),
+        ])
+    integration_tests_flavors["debug_model.toml"].extend([
         OverrideDefinitions(
             [
                 [
@@ -272,18 +280,24 @@ def build_test_list():
             "pp_looped_1f1b",
             ngpu=4,
         ),
-        OverrideDefinitions(
-            [
-                [
-                    "--parallelism.pipeline_parallel_degree 2",
-                    "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
-                    "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
-                ],
-            ],
-            "PP with custom pipeline schedule loaded from CSV file",
-            "pp_custom_csv",
-            ngpu=2,
-        ),
+    ])
+    # check test_with_rocm
+    if test_with_rocm != "1":
+        integration_tests_flavors["debug_model.toml"].extend(
+            OverrideDefinitions(
+                [
+                    [
+                        "--parallelism.pipeline_parallel_degree 2",
+                        "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
+                        "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
+                    ],
+                ],
+                "PP with custom pipeline schedule loaded from CSV file",
+                "pp_custom_csv",
+                ngpu=2,
+            ),
+        )
+    integration_tests_flavors["debug_model.toml"].extend([
         OverrideDefinitions(
             [
                 [
@@ -509,7 +523,7 @@ def build_test_list():
             "gradient_accumulation",
             ngpu=2,
         ),
-    ]
+    ])
     return integration_tests_flavors
 
 
From cb13ad44722613a2cff045d2d700c57e7b0fadc1 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Fri, 13 Jun 2025 13:51:04 -0500
Subject: [PATCH 05/17] Modified docker-builds.yml to build rocm docker image
 for torchtitan.

---
 .github/workflows/docker-builds.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 11ff5390c..75d5082c5 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -22,13 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge]
+    runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8]
     timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
-          - docker-image-name: torchtitan-ubuntu-20.04-clang12
+          - docker-image-name: [
+            torchtitan-ubuntu-20.04-clang12,
+            torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+          ]
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:

From de9bdcce016c8c247392f715932b195864e976b4 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 18 Jun 2025 13:04:59 -0500
Subject: [PATCH 06/17] Fixed runner for cuda and rocm images in
 docker-builds.yml.

---
 .github/workflows/docker-builds.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 75d5082c5..4289a07ab 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -22,16 +22,16 @@ concurrency:
 
 jobs:
   docker-build:
-    runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8]
-    timeout-minutes: 240
     strategy:
       fail-fast: false
       matrix:
         include:
-          - docker-image-name: [
-            torchtitan-ubuntu-20.04-clang12,
-            torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-          ]
+          - docker-image-name: torchtitan-ubuntu-20.04-clang12
+            runner: [self-hosted, linux.2xlarge]
+          - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+            runner: linux.rocm.gpu.mi300.8
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 240
     env:
       DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
     steps:

From f634f000a9d8dc52cf88fd46eb33e80bae5d589b Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 19 Jun 2025 01:01:43 -0500
Subject: [PATCH 07/17] Added TEST_WITH_ROCM environment variable for running
 tests on rocm. Fixed error in integration_tests.py. Fixed lint errors.

---
 .ci/docker/ubuntu-rocm/Dockerfile                 | 1 -
 .github/workflows/integration_test_8gpu_rocm.yaml | 3 +--
 tests/integration_tests.py                        | 4 ++--
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
index 6f292dfdd..ae944279c 100644
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -13,4 +13,3 @@ COPY requirements.txt requirements.txt
 RUN pip install -r ./requirements.txt
 
 CMD ["bash"]
-
diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml
index bb64ad528..c5069164d 100644
--- a/.github/workflows/integration_test_8gpu_rocm.yaml
+++ b/.github/workflows/integration_test_8gpu_rocm.yaml
@@ -31,5 +31,4 @@ jobs:
         USE_CPP=0 python -m pip install --pre torchao
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
-
+        python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests.py b/tests/integration_tests.py
index e89a6c374..ce894aea3 100755
--- a/tests/integration_tests.py
+++ b/tests/integration_tests.py
@@ -283,7 +283,7 @@ def build_test_list():
     ])
     # check test_with_rocm
     if test_with_rocm != "1":
-        integration_tests_flavors["debug_model.toml"].extend(
+        integration_tests_flavors["debug_model.toml"].extend([
             OverrideDefinitions(
                 [
                     [
@@ -296,7 +296,7 @@ def build_test_list():
                 "pp_custom_csv",
                 ngpu=2,
             ),
-        )
+        ])
     integration_tests_flavors["debug_model.toml"].extend([
         OverrideDefinitions(
             [

From 87a5a5982feda275e6e059ab0e7e8499adb5d970 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Tue, 24 Jun 2025 13:38:01 -0500
Subject: [PATCH 08/17] Refactored integration_tests.py with skip tests for
 ROCm.

---
 tests/integration_tests.py | 85 +++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 42 deletions(-)
 mode change 100755 => 100644 tests/integration_tests.py

diff --git a/tests/integration_tests.py b/tests/integration_tests.py
old mode 100755
new mode 100644
index ce894aea3..b10b17953
--- a/tests/integration_tests.py
+++ b/tests/integration_tests.py
@@ -20,7 +20,15 @@
 except ModuleNotFoundError:
     import tomli as tomllib
 
-test_with_rocm = os.getenv("TEST_WITH_ROCM", "0")
+# tests skipped for ROCm
+skip_for_rocm_test_list = [
+    "pp_looped_zero_bubble",
+    "pp_zbv",
+    "pp_custom_csv",
+    "last_save_model_weights_only_bf16",
+    "last_save_model_weights_only_fp32",
+]
+TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"
 
 
 @dataclass
@@ -141,34 +149,28 @@ def build_test_list():
             "Checkpoint Integration Test - Save Model Weights Only bf16",
             "last_save_model_weights_only_bf16",
         ),
-    ]
-    # check test_with_rocm
-    if test_with_rocm != "1":
-        integration_tests_flavors["debug_model.toml"].extend([
-            OverrideDefinitions(
+        OverrideDefinitions(
+            [
                 [
-                    [
-                        "--parallelism.pipeline_parallel_degree 4",
-                        "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
-                    ],
+                    "--parallelism.pipeline_parallel_degree 4",
+                    "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble",
                 ],
-                "PP looped zero bubble test",
-                "pp_looped_zero_bubble",
-                ngpu=4,
-            ),
-            OverrideDefinitions(
+            ],
+            "PP looped zero bubble test",
+            "pp_looped_zero_bubble",
+            ngpu=4,
+        ),
+        OverrideDefinitions(
+            [
                 [
-                    [
-                        "--parallelism.pipeline_parallel_degree 2",
-                        "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
-                    ],
+                    "--parallelism.pipeline_parallel_degree 2",
+                    "--parallelism.pipeline_parallel_schedule ZBVZeroBubble",
                 ],
-                "PP zero bubble test (v shaped)",
-                "pp_zbv",
-                ngpu=2,
-            ),
-        ])
-    integration_tests_flavors["debug_model.toml"].extend([
+            ],
+            "PP zero bubble test (v shaped)",
+            "pp_zbv",
+            ngpu=2,
+        ),
         OverrideDefinitions(
             [
                 [
@@ -280,24 +282,18 @@ def build_test_list():
             "pp_looped_1f1b",
             ngpu=4,
         ),
-    ])
-    # check test_with_rocm
-    if test_with_rocm != "1":
-        integration_tests_flavors["debug_model.toml"].extend([
-            OverrideDefinitions(
+        OverrideDefinitions(
+            [
                 [
-                    [
-                        "--parallelism.pipeline_parallel_degree 2",
-                        "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
-                        "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
-                    ],
+                    "--parallelism.pipeline_parallel_degree 2",
+                    "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti",
+                    "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv",
                 ],
-                "PP with custom pipeline schedule loaded from CSV file",
-                "pp_custom_csv",
-                ngpu=2,
-            ),
-        ])
-    integration_tests_flavors["debug_model.toml"].extend([
+            ],
+            "PP with custom pipeline schedule loaded from CSV file",
+            "pp_custom_csv",
+            ngpu=2,
+        ),
         OverrideDefinitions(
             [
                 [
@@ -523,7 +519,7 @@ def build_test_list():
             "gradient_accumulation",
             ngpu=2,
         ),
-    ])
+    ]
     return integration_tests_flavors
 
 
@@ -582,6 +578,11 @@ def run_tests(args):
                 )
                 if is_integration_test:
                     for test_flavor in integration_tests_flavors[config_file]:
+                        if (
+                            TEST_WITH_ROCM
+                            and test_flavor.test_name in skip_for_rocm_test_list
+                        ):
+                            continue
                         if args.test == "all" or test_flavor.test_name == args.test:
                             if args.ngpu < test_flavor.ngpu:
                                 logger.info(

From d748586ab794318b6223a6cb1787f87ff417b6b8 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Sun, 29 Jun 2025 02:14:23 -0500
Subject: [PATCH 09/17] Changed runner to i-0962598bd0e8298b3 for building ROCm
 docker image.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 4289a07ab..bfe2fac82 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -29,7 +29,7 @@ jobs:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
           - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-            runner: linux.rocm.gpu.mi300.8
+            runner: i-0962598bd0e8298b3
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240
     env:

From 66eba9f37b4cc0c57f1f12084e7c60a81fea442c Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 30 Jun 2025 10:59:25 -0500
Subject: [PATCH 10/17] Changed runner to linux.12xlarge for building ROCm
 docker image.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index bfe2fac82..be12cceaa 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -29,7 +29,7 @@ jobs:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
           - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-            runner: i-0962598bd0e8298b3
+            runner: [linux.12xlarge]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240
     env:

From 2d317c3ae8d2e5fdf2ca544274ce4c92b3655ccc Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Mon, 30 Jun 2025 11:21:03 -0500
Subject: [PATCH 11/17] Changed runner to linux.2xlarge for building ROCm
 docker image.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index be12cceaa..84afe8bd2 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -29,7 +29,7 @@ jobs:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
           - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-            runner: [linux.12xlarge]
+            runner: [linux.2xlarge]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240
     env:

From 18025adf8adaf574efdfccba522aa0a1f3b39f8b Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 3 Jul 2025 00:14:42 -0500
Subject: [PATCH 12/17] Use a single Dockerfile for both CUDA and ROCm. Run
 the integration tests, including the H100 suite, for both CUDA and ROCm
 from the existing workflow files. integration_test_8gpu_h100.yaml needs a
 different name, as it now also runs on ROCm. Fixed the file permission of
 integration_tests.py.

---
 .ci/docker/build.sh                           | 24 ++++++-------
 .ci/docker/ubuntu-rocm/Dockerfile             | 15 --------
 .ci/docker/ubuntu/Dockerfile                  |  4 +--
 .github/workflows/integration_test_8gpu.yaml  | 35 ++++++++++++++-----
 .../workflows/integration_test_8gpu_h100.yaml | 31 +++++++++++-----
 .../workflows/integration_test_8gpu_rocm.yaml | 34 ------------------
 tests/integration_tests.py                    |  0
 7 files changed, 61 insertions(+), 82 deletions(-)
 delete mode 100644 .ci/docker/ubuntu-rocm/Dockerfile
 delete mode 100644 .github/workflows/integration_test_8gpu_rocm.yaml
 mode change 100644 => 100755 tests/integration_tests.py

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index a1aafe3d3..597b2ee5c 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -12,27 +12,21 @@ shift
 
 echo "Building ${IMAGE_NAME} Docker image"
 
-# set operating system
 OS=ubuntu
-
-# set Dockerfile
-DOCKERFILE="${OS}/Dockerfile"
-if [[ "$IMAGE_NAME" == *rocm* ]]; then
-  DOCKERFILE="${OS}-rocm/Dockerfile"
-fi
+CLANG_VERSION=""
+PYTHON_VERSION=3.11
+MINICONDA_VERSION=24.3.0-0
 
 case "${IMAGE_NAME}" in
   torchtitan-ubuntu-20.04-clang12)
     OS_VERSION=20.04
     CLANG_VERSION=12
-    PYTHON_VERSION=3.11
-    MINICONDA_VERSION=24.3.0-0
+    BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
     ;;
-  torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3)
+  torchtitan-rocm-ubuntu-22.04-clang12)
     OS_VERSION=22.04
-    CLANG_VERSION=19
-    PYTHON_VERSION=3.10
-    MINICONDA_VERSION=25.3.1-0
+    CLANG_VERSION=12
+    BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
@@ -42,11 +36,13 @@ esac
 docker build \
   --no-cache \
   --progress=plain \
+  --build-arg "BASE_IMAGE=${BASE_IMAGE}" \
   --build-arg "OS_VERSION=${OS_VERSION}" \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
-  -f $(dirname ${DOCKERFILE})/Dockerfile \
+  --shm-size=1g \
+  -f "${OS}"/Dockerfile \
   "$@" \
   .
 
diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
deleted file mode 100644
index ae944279c..000000000
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-# base image
-FROM rocm/pytorch-nightly:latest
-
-# args
-ARG OS_VERSION
-ARG CLANG_VERSION
-ARG GCC_VERSION
-ARG MINICONDA_VERSION
-ARG PYTHON_VERSION
-
-# install dependencies
-COPY requirements.txt requirements.txt
-RUN pip install -r ./requirements.txt
-
-CMD ["bash"]
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
index 39e4d8ec5..5d10c01b7 100644
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -1,6 +1,6 @@
-ARG OS_VERSION
+ARG BASE_IMAGE
 
-FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION}
+FROM ${BASE_IMAGE}
 
 ARG OS_VERSION
 
diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index a2469a913..5ba4be2d4 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -23,13 +23,30 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.g5.48xlarge.nvidia.gpu
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+            is-rocm: 0
+          - name: rocm
+            runner: linux.rocm.gpu.mi300.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
+            is-rocm: 1
     with:
-      runner: linux.g5.48xlarge.nvidia.gpu
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
@@ -41,9 +58,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
+        TEST_WITH_ROCM=${{ matrix.is-rocm }} python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 813669748..48147f72b 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -24,13 +24,28 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      matrix:
+        include:
+          - name: cuda
+            runner: linux.aws.h100.8
+            gpu-arch-type: cuda
+            gpu-arch-version: "12.6"
+            # This image is faster to clone than the default, but it lacks CC needed by triton
+            # (1m25s vs 2m37s).
+            docker-image: torchtitan-ubuntu-20.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/cu126
+          - name: rocm
+            runner: linux.rocm.gpu.mi300.8
+            gpu-arch-type: rocm
+            gpu-arch-version: "6.4"
+            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
+            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
     with:
-      runner: linux.aws.h100.8
-      gpu-arch-type: cuda
-      gpu-arch-version: "12.6"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
+      runner: ${{ matrix.runner }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
@@ -42,9 +57,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
 
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8
diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml
deleted file mode 100644
index c5069164d..000000000
--- a/.github/workflows/integration_test_8gpu_rocm.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: 8 GPU Integration Test
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-  schedule:
-    # Runs every 6 hours
-    - cron: '0 */6 * * *'
-concurrency:
-  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
-  cancel-in-progress: true
-
-defaults:
-  run:
-    shell: bash -l -eo pipefail {0}
-
-jobs:
-  build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    with:
-      runner: linux.rocm.gpu.mi300.8
-      gpu-arch-type: rocm
-      gpu-arch-version: "6.4"
-      docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
-      repository: pytorch/torchtitan
-      upload-artifact: outputs
-      script: |
-        set -eux
-
-        USE_CPP=0 python -m pip install --pre torchao
-
-        mkdir artifacts-to-be-uploaded
-        python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests.py b/tests/integration_tests.py
old mode 100644
new mode 100755

From 724e202c14fbc120af8f8c933d8c3eba3706798c Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 3 Jul 2025 00:35:08 -0500
Subject: [PATCH 13/17] Changed rocm docker image name in docker-builds.yml.

---
 .github/workflows/docker-builds.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 84afe8bd2..d5f52824d 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -28,7 +28,7 @@ jobs:
         include:
           - docker-image-name: torchtitan-ubuntu-20.04-clang12
             runner: [self-hosted, linux.2xlarge]
-          - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3
+          - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12
             runner: [linux.2xlarge]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 240

From 15a9554102d3b2e43f03c95c26092e4b24028dc8 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 9 Jul 2025 00:54:14 -0500
Subject: [PATCH 14/17] Reverted the changes to
 integration_test_8gpu_h100.yaml.

---
 .../workflows/integration_test_8gpu_h100.yaml | 31 +++++--------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml
index 48147f72b..813669748 100644
--- a/.github/workflows/integration_test_8gpu_h100.yaml
+++ b/.github/workflows/integration_test_8gpu_h100.yaml
@@ -24,28 +24,13 @@ defaults:
 jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    strategy:
-      matrix:
-        include:
-          - name: cuda
-            runner: linux.aws.h100.8
-            gpu-arch-type: cuda
-            gpu-arch-version: "12.6"
-            # This image is faster to clone than the default, but it lacks CC needed by triton
-            # (1m25s vs 2m37s).
-            docker-image: torchtitan-ubuntu-20.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/cu126
-          - name: rocm
-            runner: linux.rocm.gpu.mi300.8
-            gpu-arch-type: rocm
-            gpu-arch-version: "6.4"
-            docker-image: torchtitan-rocm-ubuntu-22.04-clang12
-            index-url: https://download.pytorch.org/whl/nightly/rocm6.4
     with:
-      runner: ${{ matrix.runner }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
-      docker-image: ${{ matrix.docker-image }}
+      runner: linux.aws.h100.8
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
@@ -57,9 +42,9 @@ jobs:
 
         pip config --user set global.progress_bar off
 
-        python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }}
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
 
-        USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }}
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8

From cb528bc6a23a6ef222141c4f72b6c41ccb681589 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 16 Jul 2025 13:47:13 -0500
Subject: [PATCH 15/17] Empty dummy commit.


From 66e5c95d8092e4e68b3442933332413054e12a78 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Wed, 16 Jul 2025 20:30:01 -0500
Subject: [PATCH 16/17] Increased the timeout to 45 minutes to override the
 timeout used in linux_job_v2.yml for integration_test_8gpu.yaml.

---
 .github/workflows/integration_test_8gpu.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml
index 5ba4be2d4..8173e1225 100644
--- a/.github/workflows/integration_test_8gpu.yaml
+++ b/.github/workflows/integration_test_8gpu.yaml
@@ -49,6 +49,7 @@ jobs:
       docker-image: ${{ matrix.docker-image }}
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      timeout: 45
       script: |
         set -eux
 

From efd11a8b42b7aa064187bd5d101999359ef88dd9 Mon Sep 17 00:00:00 2001
From: Akash Verma <Akash.Verma3@amd.com>
Date: Thu, 17 Jul 2025 13:29:35 -0500
Subject: [PATCH 17/17] Empty dummy commit.