From d47f51fb267409da68e378fe90466a3fc23a0aae Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Thu, 29 May 2025 13:00:57 -0700
Subject: [PATCH 01/44] add integration_test_4gpu_amd.yaml

---
 .../workflows/integration_test_4gpu_amd.yaml  | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .github/workflows/integration_test_4gpu_amd.yaml

diff --git a/.github/workflows/integration_test_4gpu_amd.yaml b/.github/workflows/integration_test_4gpu_amd.yaml
new file mode 100644
index 000000000..81e6c8d4d
--- /dev/null
+++ b/.github/workflows/integration_test_4gpu_amd.yaml
@@ -0,0 +1,39 @@
+name: 4 AMD GPU Integration Test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+  schedule:
+    # Runs every 6 hours
+    - cron: '0 */6 * * *'
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.rocm.gpu.mi300.4
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        pip config --user set global.progress_bar off
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        mkdir artifacts-to-be-uploaded
+        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 4

From ec83247da567f5b752b0e0d7a728175df3e37310 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Thu, 29 May 2025 16:32:23 -0700
Subject: [PATCH 02/44] Update .github/workflows/integration_test_4gpu_amd.yaml

---
 .github/workflows/integration_test_4gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_4gpu_amd.yaml b/.github/workflows/integration_test_4gpu_amd.yaml
index 81e6c8d4d..c49e9641b 100644
--- a/.github/workflows/integration_test_4gpu_amd.yaml
+++ b/.github/workflows/integration_test_4gpu_amd.yaml
@@ -19,7 +19,7 @@ jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
-      runner: linux.rocm.gpu.mi300.4
+      runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: cuda
       gpu-arch-version: "12.6"
       # This image is faster to clone than the default, but it lacks CC needed by triton

From 7cd90e3c430e5cb70fc38dd1d3816d6ea980b3b6 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Thu, 29 May 2025 16:39:21 -0700
Subject: [PATCH 03/44] correct gpu number

---
 ...ration_test_4gpu_amd.yaml => integration_test_8gpu_amd.yaml} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename .github/workflows/{integration_test_4gpu_amd.yaml => integration_test_8gpu_amd.yaml} (99%)

diff --git a/.github/workflows/integration_test_4gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
similarity index 99%
rename from .github/workflows/integration_test_4gpu_amd.yaml
rename to .github/workflows/integration_test_8gpu_amd.yaml
index c49e9641b..8cfd9d559 100644
--- a/.github/workflows/integration_test_4gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -36,4 +36,4 @@ jobs:
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 4
+        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8

From 2bfcf3582d5f406bd7cbd1d5d570dfd2c20fec52 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Thu, 29 May 2025 16:46:23 -0700
Subject: [PATCH 04/44] typo

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 8cfd9d559..15f3da628 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -1,4 +1,4 @@
-name: 4 AMD GPU Integration Test
+name: 8 AMD GPU Integration Test
 
 on:
   push:

From b130aeeb15cdffebce0c77325cd66729c732595d Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Mon, 2 Jun 2025 10:33:09 -0700
Subject: [PATCH 05/44] update job version

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 15f3da628..5f2deef20 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -17,7 +17,7 @@ defaults:
 
 jobs:
   build-test:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: cuda

From e26285cc90aed6aed288ce0355b5e9a92232ce1f Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:57:09 -0700
Subject: [PATCH 06/44] Update .github/workflows/integration_test_8gpu_amd.yaml

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 5f2deef20..d8358a1a0 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -20,7 +20,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.rocm.gpu.mi300.8
-      gpu-arch-type: cuda
+      gpu-arch-type: rocm
       gpu-arch-version: "12.6"
       # This image is faster to clone than the default, but it lacks CC needed by triton
       # (1m25s vs 2m37s).

From 9e8e99727fd8fb9bc375b21cca0f526423e5f0b3 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Fri, 13 Jun 2025 12:10:50 -0700
Subject: [PATCH 07/44] Update .github/workflows/integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index d8358a1a0..8a03676f4 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -21,7 +21,7 @@ jobs:
     with:
       runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: rocm
-      gpu-arch-version: "12.6"
+      gpu-arch-version: "6.3"
       # This image is faster to clone than the default, but it lacks CC needed by triton
       # (1m25s vs 2m37s).
       docker-image: torchtitan-ubuntu-20.04-clang12

From c9e733aca0fbeb964238938cf97f5e4f097e27a7 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Fri, 13 Jun 2025 13:52:38 -0700
Subject: [PATCH 08/44] Remove custom docker image

---
 .github/workflows/integration_test_8gpu_amd.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 8a03676f4..cf2383c2e 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -22,9 +22,6 @@ jobs:
       runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
-      # This image is faster to clone than the default, but it lacks CC needed by triton
-      # (1m25s vs 2m37s).
-      docker-image: torchtitan-ubuntu-20.04-clang12
       repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |

From a9aa64d49ab5a086e99299e8b12bb569696a888b Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Fri, 13 Jun 2025 13:58:06 -0700
Subject: [PATCH 09/44] Update .github/workflows/integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index cf2383c2e..1d3da8386 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -22,7 +22,6 @@ jobs:
       runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
-      repository: pytorch/torchtitan
       upload-artifact: outputs
       script: |
         set -eux

From cf8a9f013d39e4940d5456a07426fd7f17da2952 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Mon, 23 Jun 2025 14:08:06 -0700
Subject: [PATCH 10/44] 2 gpu

---
 .../workflows/integration_test_8gpu_amd.yaml  |   4 +-
 tests/integration_tests_amd.py                | 148 ++++++++++++++++++
 2 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration_tests_amd.py

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 1d3da8386..02e208510 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -19,7 +19,7 @@ jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
-      runner: linux.rocm.gpu.mi300.8
+      runner: linux.rocm.gpu.mi300.2
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
       upload-artifact: outputs
@@ -32,4 +32,4 @@ jobs:
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
+        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 2
diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
new file mode 100644
index 000000000..0b6c1dbd0
--- /dev/null
+++ b/tests/integration_tests_amd.py
@@ -0,0 +1,148 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+import subprocess
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Sequence
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+try:
+    import tomllib
+except ModuleNotFoundError:
+    import tomli as tomllib
+
+
+@dataclass
+class OverrideDefinitions:
+    """
+    This class is used to define the override definitions for the integration tests.
+    """
+
+    override_args: Sequence[Sequence[str]] = tuple(tuple(" "))
+    test_descr: str = "default"
+    test_name: str = "default"
+    ngpu: int = 4
+
+    def __repr__(self):
+        return self.test_descr
+
+
+def build_test_list():
+    """
+    key is the config file name and value is a list of OverrideDefinitions
+    that is used to generate variations of integration tests based on the
+    same root config file.
+    """
+    integration_tests_flavors = defaultdict(list)
+    integration_tests_flavors["debug_model.toml"] = [
+        OverrideDefinitions(
+            [
+                [
+                    "--training.compile",
+                    "--parallelism.tensor_parallel_degree 2",
+                ],
+            ],
+            "2D TP compile",
+            "2d_tp_compile",
+        ),
+    ]
+    return integration_tests_flavors
+
+
+def _run_cmd(cmd):
+    return subprocess.run([cmd], text=True, shell=True)
+
+
+def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
+    # run_test supports sequence of tests.
+    test_name = test_flavor.test_name
+    dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
+    all_ranks = ",".join(map(str, range(test_flavor.ngpu)))
+
+    for idx, override_arg in enumerate(test_flavor.override_args):
+        cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh"
+        # dump compile trace for debugging purpose
+        cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd
+        if test_name == "fsdp2_memory_estimation":
+            cmd = (
+                f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
+                "./scripts/estimate/run_memory_estimation.sh"
+            )
+        cmd += " " + dump_folder_arg
+        if override_arg:
+            cmd += " " + " ".join(override_arg)
+        logger.info(
+            f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
+        )
+
+        # save checkpoint (idx == 0) and load it for generation (idx == 1)
+        if test_name == "test_generate" and idx == 1:
+            cmd = (
+                f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
+                f"CHECKPOINT_DIR={output_dir}/{test_name}/checkpoint/step-10 "
+                "PROMPT='What is the meaning of life?' "
+                f"./scripts/generate/run_llama_generate.sh --out > {output_dir}/{test_name}/generated_output.json"
+            )
+
+        result = _run_cmd(cmd)
+        logger.info(result.stdout)
+        if result.returncode != 0:
+            raise Exception(
+                f"Integration test failed, flavor : {test_flavor.test_descr}, command : {cmd}"
+            )
+
+
+def run_tests(args):
+    integration_tests_flavors = build_test_list()
+    for config_file in os.listdir(args.config_dir):
+        if config_file.endswith(".toml"):
+            full_path = os.path.join(args.config_dir, config_file)
+            with open(full_path, "rb") as f:
+                config = tomllib.load(f)
+                is_integration_test = config["job"].get(
+                    "use_for_integration_test", False
+                )
+                if is_integration_test:
+                    for test_flavor in integration_tests_flavors[config_file]:
+                        if args.test == "all" or test_flavor.test_name == args.test:
+                            if args.ngpu < test_flavor.ngpu:
+                                logger.info(
+                                    f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus,"
+                                    f" because --ngpu arg is {args.ngpu}"
+                                )
+                            else:
+                                run_test(test_flavor, full_path, args.output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("output_dir")
+    parser.add_argument(
+        "--config_dir", default="./torchtitan/models/llama3/train_configs"
+    )
+    parser.add_argument(
+        "--test",
+        default="all",
+        help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
+    )
+    parser.add_argument("--ngpu", default=2, type=int)
+    args = parser.parse_args()
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    if os.listdir(args.output_dir):
+        raise RuntimeError("Please provide an empty output directory.")
+    run_tests(args)
+
+
+if __name__ == "__main__":
+    main()

From 47d9d613c42ba939689f24ec58faba3202ff5770 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Mon, 23 Jun 2025 14:48:32 -0700
Subject: [PATCH 11/44] Update integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 02e208510..f7d31f138 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -19,7 +19,7 @@ jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
-      runner: linux.rocm.gpu.mi300.2
+      runner: linux.rocm.gpu.2
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
       upload-artifact: outputs

From 7a4cc6827477df8c70fd0fe60e49b7f678d1e058 Mon Sep 17 00:00:00 2001
From: Zain Rizvi <ZainRizvi@users.noreply.github.com>
Date: Tue, 24 Jun 2025 11:00:04 -0500
Subject: [PATCH 12/44] Update .github/workflows/integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index f7d31f138..02e208510 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -19,7 +19,7 @@ jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
-      runner: linux.rocm.gpu.2
+      runner: linux.rocm.gpu.mi300.2
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
       upload-artifact: outputs

From ae636c5a62f45ee165cf0d8937551901850988e2 Mon Sep 17 00:00:00 2001
From: Zain Rizvi <ZainRizvi@users.noreply.github.com>
Date: Wed, 25 Jun 2025 12:32:27 -0500
Subject: [PATCH 13/44] Apply suggestions from code review

test change
---
 .github/workflows/integration_test_8gpu_amd.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 02e208510..eb3928da5 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -23,6 +23,7 @@ jobs:
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
       upload-artifact: outputs
+      docker-build-dir: "fake_dir"
       script: |
         set -eux
         # The generic Linux job chooses to use base env, not the one setup by the image

From e4235c946cdc7eec2007c59ef2ca97f63cc8ea40 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 11:37:12 -0700
Subject: [PATCH 14/44] install jq

---
 .github/workflows/integration_test_8gpu_amd.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index eb3928da5..d32b24a3f 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -27,6 +27,7 @@ jobs:
       script: |
         set -eux
         # The generic Linux job chooses to use base env, not the one setup by the image
+        conda install -c conda-forge jq
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off

From 687fdf42368ad327683c1f95b18a0797efaed3c5 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 12:02:44 -0700
Subject: [PATCH 15/44] install jq

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index d32b24a3f..182a0d5f5 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -27,7 +27,7 @@ jobs:
       script: |
         set -eux
         # The generic Linux job chooses to use base env, not the one setup by the image
-        conda install -c conda-forge jq
+        conda install -c conda-forge jq -y
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off

From a781bb3b9aa7a5c335e8662f295032e2f276e7e8 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 12:32:17 -0700
Subject: [PATCH 16/44] set env

---
 .github/workflows/integration_test_8gpu_amd.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 182a0d5f5..030191e2f 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -31,6 +31,7 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
+        pip install -r requirements.txt
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded

From 6fefa814ba86f6e99de0fd288f27356d0b509c17 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 12:33:14 -0700
Subject: [PATCH 17/44] set env

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 030191e2f..7a130fb45 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -32,7 +32,7 @@ jobs:
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 2

From 9404612ecb6b68582b852e221453ebc54a58a3c0 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 12:39:43 -0700
Subject: [PATCH 18/44] to 8 gpu

---
 .../workflows/integration_test_8gpu_amd.yaml  |  4 +--
 tests/integration_tests_amd.py                | 32 +++++++++++++++++--
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 7a130fb45..a109ceedc 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -19,7 +19,7 @@ jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
-      runner: linux.rocm.gpu.mi300.2
+      runner: linux.rocm.gpu.mi300.8
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
       upload-artifact: outputs
@@ -35,4 +35,4 @@ jobs:
         pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 2
+        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 0b6c1dbd0..8e568c980 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -51,8 +51,34 @@ def build_test_list():
                     "--parallelism.tensor_parallel_degree 2",
                 ],
             ],
-            "2D TP compile",
-            "2d_tp_compile",
+            "2D async TP compile",
+            "2d_asynctp_compile",
+        ),
+        OverrideDefinitions(
+            [
+                [
+                    "--training.compile",
+                    "--parallelism.data_parallel_shard_degree=2",
+                    "--parallelism.tensor_parallel_degree=2",
+                    "--parallelism.pipeline_parallel_degree=2",
+                ]
+            ],
+            "FSDP+TP+PP+torch.compile",
+            "fsdp+tp+cp+compile",
+            ngpu=8,
+        ),
+        OverrideDefinitions(
+            [
+                [
+                    "--training.compile",
+                    "--parallelism.data_parallel_shard_degree=2",
+                    "--parallelism.data_parallel_replicate_degree=2",
+                    "--parallelism.context_parallel_degree=2",
+                ]
+            ],
+            "HSDP+CP+torch.compile",
+            "hsdp+cp+compile",
+            ngpu=8,
         ),
     ]
     return integration_tests_flavors
@@ -134,7 +160,7 @@ def main():
         default="all",
         help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
     )
-    parser.add_argument("--ngpu", default=2, type=int)
+    parser.add_argument("--ngpu", default=8, type=int)
     args = parser.parse_args()
 
     if not os.path.exists(args.output_dir):

From 98a74d20287c589fd1f63a593d48606c90aad69c Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 13:11:11 -0700
Subject: [PATCH 19/44] Update .github/workflows/integration_test_8gpu_amd.yaml

Co-authored-by: Zain Rizvi <ZainRizvi@users.noreply.github.com>
---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index a109ceedc..06204305e 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -23,7 +23,7 @@ jobs:
       gpu-arch-type: rocm
       gpu-arch-version: "6.3"
       upload-artifact: outputs
-      docker-build-dir: "fake_dir"
+      use-custom-docker-registry: false
       script: |
         set -eux
         # The generic Linux job chooses to use base env, not the one setup by the image

From d59f9782449cfe98f66809f5bb4667b9d3d1821e Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 13:48:52 -0700
Subject: [PATCH 20/44] try cu126

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 06204305e..a3edc3f21 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -32,7 +32,7 @@ jobs:
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
-        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall
+        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8

From f352e776e2711e83dc2c185edfda4aac6fdf4d21 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 14:38:20 -0700
Subject: [PATCH 21/44] Update integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index a3edc3f21..b15a8ce0f 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -32,7 +32,7 @@ jobs:
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
-        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8

From 921461841f04aabeb1df5a20c716180dfe61b428 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 14:54:07 -0700
Subject: [PATCH 22/44] Update integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index b15a8ce0f..9359434ef 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -32,7 +32,7 @@ jobs:
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8

From c3728b6c831e18cb23fab9aa32e3810f6ce67038 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 25 Jun 2025 15:10:20 -0700
Subject: [PATCH 23/44] lint

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 9359434ef..97db7db9c 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -32,7 +32,7 @@ jobs:
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
-        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
         python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8

From 02e141b85c0fd62e91f9797b64bcaca799d64b89 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Thu, 26 Jun 2025 00:22:34 -0700
Subject: [PATCH 24/44] Update integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 97db7db9c..36d53ebc3 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -35,4 +35,4 @@ jobs:
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8
+        python ./tests/integration_tests_amd.py artifacts-to-be-uploaded --ngpu 8

From 6e7c6dd3578aa947ffc76505a4e562fb66979a5e Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Thu, 26 Jun 2025 08:49:04 -0700
Subject: [PATCH 25/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 8e568c980..bfacc8685 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -60,12 +60,11 @@ def build_test_list():
                     "--training.compile",
                     "--parallelism.data_parallel_shard_degree=2",
                     "--parallelism.tensor_parallel_degree=2",
-                    "--parallelism.pipeline_parallel_degree=2",
                 ]
             ],
-            "FSDP+TP+PP+torch.compile",
-            "fsdp+tp+cp+compile",
-            ngpu=8,
+            "FSDP+TP+torch.compile",
+            "fsdp+tp+compile",
+            ngpu=4,
         ),
         OverrideDefinitions(
             [
@@ -78,7 +77,7 @@ def build_test_list():
             ],
             "HSDP+CP+torch.compile",
             "hsdp+cp+compile",
-            ngpu=8,
+            ngpu=4,
         ),
     ]
     return integration_tests_flavors

From 0e645b213e82816af2f88eaaabf9b9ba889efaaf Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Thu, 26 Jun 2025 09:37:28 -0700
Subject: [PATCH 26/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index bfacc8685..cc90ad80a 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -77,7 +77,7 @@ def build_test_list():
             ],
             "HSDP+CP+torch.compile",
             "hsdp+cp+compile",
-            ngpu=4,
+            ngpu=8,
         ),
     ]
     return integration_tests_flavors

From ab49ddb26c2a06654c2bd2af90a670916897c3ea Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Thu, 26 Jun 2025 11:09:18 -0700
Subject: [PATCH 27/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index cc90ad80a..faf5ffb82 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -58,25 +58,24 @@ def build_test_list():
             [
                 [
                     "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=2",
+                    "--parallelism.data_parallel_shard_degree=4",
                     "--parallelism.tensor_parallel_degree=2",
                 ]
             ],
             "FSDP+TP+torch.compile",
             "fsdp+tp+compile",
-            ngpu=4,
+            ngpu=8,
         ),
         OverrideDefinitions(
             [
                 [
                     "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=2",
+                    "--parallelism.data_parallel_shard_degree=4",
                     "--parallelism.data_parallel_replicate_degree=2",
-                    "--parallelism.context_parallel_degree=2",
                 ]
             ],
-            "HSDP+CP+torch.compile",
-            "hsdp+cp+compile",
+            "HSDP+torch.compile",
+            "hsdp+compile",
             ngpu=8,
         ),
     ]

From 237b9559cbd21e7e78c86aaa1cb10d368a78f6bc Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Thu, 26 Jun 2025 11:37:48 -0700
Subject: [PATCH 28/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index faf5ffb82..c21e97a29 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -66,18 +66,6 @@ def build_test_list():
             "fsdp+tp+compile",
             ngpu=8,
         ),
-        OverrideDefinitions(
-            [
-                [
-                    "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=4",
-                    "--parallelism.data_parallel_replicate_degree=2",
-                ]
-            ],
-            "HSDP+torch.compile",
-            "hsdp+compile",
-            ngpu=8,
-        ),
     ]
     return integration_tests_flavors
 

From f4c53d7db46adb35f6eeaac5e70b88c852c3937b Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Thu, 26 Jun 2025 13:25:20 -0700
Subject: [PATCH 29/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index c21e97a29..e0b0b56d5 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -51,19 +51,18 @@ def build_test_list():
                     "--parallelism.tensor_parallel_degree 2",
                 ],
             ],
-            "2D async TP compile",
-            "2d_asynctp_compile",
+            "TP compile",
+            "tp_compile",
         ),
         OverrideDefinitions(
             [
                 [
                     "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=4",
-                    "--parallelism.tensor_parallel_degree=2",
+                    "--parallelism.data_parallel_shard_degree=8",
                 ]
             ],
-            "FSDP+TP+torch.compile",
-            "fsdp+tp+compile",
+            "FSDP+torch.compile",
+            "fsdp+compile",
             ngpu=8,
         ),
     ]

From 496ee046164935693d550063f882852d829b9672 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Fri, 27 Jun 2025 13:19:17 -0700
Subject: [PATCH 30/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index e0b0b56d5..caf162d5e 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -65,6 +65,20 @@ def build_test_list():
             "fsdp+compile",
             ngpu=8,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--training.compile",
+                    "--parallelism.data_parallel_shard_degree=2",
+                    "--parallelism.tensor_parallel_degree=2",
+                    "--parallelism.pipeline_parallel_degree=2",
+                ]
+            ],
+            "FSDP+TP+PP+torch.compile",
+            "fsdp+tp+cp+compile",
+            ngpu=8,
+        ),
+        OverrideDefinitions(
     ]
     return integration_tests_flavors
 

From bc43677cedc1d642ff7a2c6e0bc0aec721cc391f Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Fri, 27 Jun 2025 14:47:19 -0700
Subject: [PATCH 31/44] Update integration_tests_amd.py

---
 tests/integration_tests_amd.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index caf162d5e..5508fe777 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -78,7 +78,6 @@ def build_test_list():
             "fsdp+tp+cp+compile",
             ngpu=8,
         ),
-        OverrideDefinitions(
     ]
     return integration_tests_flavors
 

From cbbe7ac8767a71f46ebeeb49d717bb665e54ee26 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Mon, 30 Jun 2025 10:18:31 -0700
Subject: [PATCH 32/44] change folder

---
 .github/workflows/integration_test_8gpu_amd.yaml | 4 +++-
 tests/integration_tests_amd.py                   | 9 +++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 36d53ebc3..982c0f67c 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -35,4 +35,6 @@ jobs:
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
         mkdir artifacts-to-be-uploaded
-        python ./tests/integration_tests_amd.py artifacts-to-be-uploaded --ngpu 8
+        mkdir generated-artifacts
+        python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8
+        mv -r generated-artifacts artifacts-to-be-uploaded
diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 5508fe777..ba2f933ea 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -68,14 +68,11 @@ def build_test_list():
         OverrideDefinitions(
             [
                 [
-                    "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=2",
-                    "--parallelism.tensor_parallel_degree=2",
-                    "--parallelism.pipeline_parallel_degree=2",
+                    "--parallelism.pipeline_parallel_degree=8",
                 ]
             ],
-            "FSDP+TP+PP+torch.compile",
-            "fsdp+tp+cp+compile",
+            "PP",
+            "pp",
             ngpu=8,
         ),
     ]

From e243a546cc0d35486c4d86c2b92c97ed2f7fc3b5 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Mon, 30 Jun 2025 10:35:29 -0700
Subject: [PATCH 33/44] limit test

---
 tests/integration_tests_amd.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index ba2f933ea..fba2d082e 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -41,6 +41,8 @@ def build_test_list():
     key is the config file name and value is a list of OverrideDefinitions
     that is used to generate variations of integration tests based on the
     same root config file.
+    TODO: 8*amd gpu current only support 1D TP/DP test, ebale test for PP/CP
+    and xD test later.
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
@@ -65,16 +67,6 @@ def build_test_list():
             "fsdp+compile",
             ngpu=8,
         ),
-        OverrideDefinitions(
-            [
-                [
-                    "--parallelism.pipeline_parallel_degree=8",
-                ]
-            ],
-            "PP",
-            "pp",
-            ngpu=8,
-        ),
     ]
     return integration_tests_flavors
 

From f1830eac59d32b9fbef47557afbae790ecea4a01 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Mon, 30 Jun 2025 10:35:46 -0700
Subject: [PATCH 34/44] amend

---
 tests/integration_tests_amd.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index fba2d082e..9b08a4a21 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -67,6 +67,17 @@ def build_test_list():
             "fsdp+compile",
             ngpu=8,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--parallelism.context_parallel_degree=8",
+                    "--parallelism.context_parallel_rotate_method='allgather'",
+                ]
+            ],
+            "CP (allgather)",
+            "cp_allgather",
+            ngpu=8,
+        ),
     ]
     return integration_tests_flavors
 

From d27971375c379fe6b24840165b9e0ba0fa5db15d Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Mon, 30 Jun 2025 10:49:00 -0700
Subject: [PATCH 35/44] use cp

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 tests/integration_tests_amd.py                   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 982c0f67c..e9558f5fd 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -37,4 +37,4 @@ jobs:
         mkdir artifacts-to-be-uploaded
         mkdir generated-artifacts
         python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8
-        mv -r generated-artifacts artifacts-to-be-uploaded
+        cp -r generated-artifacts/* artifacts-to-be-uploaded/
diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 9b08a4a21..6d11efd16 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -41,8 +41,8 @@ def build_test_list():
     key is the config file name and value is a list of OverrideDefinitions
     that is used to generate variations of integration tests based on the
     same root config file.
-    TODO: 8*amd gpu current only support 1D TP/DP test, ebale test for PP/CP
-    and xD test later.
+    TODO: 8*amd gpu current only support 1D TP/DP/CP test, ebale tests for PP
+    and xD later.
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [

From 79c52538793d0f08c156ec758da23940442b14a9 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Mon, 30 Jun 2025 16:36:53 -0700
Subject: [PATCH 36/44] Update integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index e9558f5fd..5c39c83fa 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -30,6 +30,7 @@ jobs:
         conda install -c conda-forge jq -y
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
+        conda create -yn test-mps-ops-env python=3.11
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3

From a532866cda1ec02224e3da886d312d817e835d43 Mon Sep 17 00:00:00 2001
From: yifanmao <yifanmao@meta.com>
Date: Mon, 30 Jun 2025 17:42:22 -0700
Subject: [PATCH 37/44] Update integration_test_8gpu_amd.yaml

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 5c39c83fa..d52d919bc 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -29,8 +29,8 @@ jobs:
         # The generic Linux job chooses to use base env, not the one setup by the image
         conda install -c conda-forge jq -y
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda create -yn "${CONDA_ENV}"
         conda activate "${CONDA_ENV}"
-        conda create -yn test-mps-ops-env python=3.11
         pip config --user set global.progress_bar off
         pip install -r requirements.txt
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3

From d841f29f61f42b78261eb11359d07318cb924b05 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Tue, 8 Jul 2025 15:18:40 -0700
Subject: [PATCH 38/44] remove artifacts-to-be-uploaded

---
 .github/workflows/integration_test_8gpu_amd.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index d52d919bc..4a3f58a8d 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -35,7 +35,5 @@ jobs:
         pip install -r requirements.txt
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
-        mkdir artifacts-to-be-uploaded
         mkdir generated-artifacts
         python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8
-        cp -r generated-artifacts/* artifacts-to-be-uploaded/

From 9e1ca9eac347014f576729891622a50c5810ea83 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Tue, 8 Jul 2025 16:03:44 -0700
Subject: [PATCH 39/44] remove conda create

---
 .github/workflows/integration_test_8gpu_amd.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 4a3f58a8d..166d9501f 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -29,7 +29,6 @@ jobs:
         # The generic Linux job chooses to use base env, not the one setup by the image
         conda install -c conda-forge jq -y
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda create -yn "${CONDA_ENV}"
         conda activate "${CONDA_ENV}"
         pip config --user set global.progress_bar off
         pip install -r requirements.txt

From 470a7fa3f22178487e68a152457eb4d4e4a0b1db Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Tue, 8 Jul 2025 19:51:11 -0700
Subject: [PATCH 40/44] try other tests

---
 .github/workflows/integration_test_8gpu_amd.yaml |  2 ++
 tests/integration_tests_amd.py                   | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml
index 166d9501f..81aa8c8c0 100644
--- a/.github/workflows/integration_test_8gpu_amd.yaml
+++ b/.github/workflows/integration_test_8gpu_amd.yaml
@@ -34,5 +34,7 @@ jobs:
         pip install -r requirements.txt
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
         USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+        # There's a permission issue uploading files from artifacts-to-be-uploaded on AMD nodes, so skip it by creating and storing
+        # results in generated-artifacts.
         mkdir generated-artifacts
         python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8
diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 6d11efd16..29a7f9b34 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -41,7 +41,7 @@ def build_test_list():
     key is the config file name and value is a list of OverrideDefinitions
     that is used to generate variations of integration tests based on the
     same root config file.
-    TODO: 8*amd gpu current only support 1D TP/DP/CP test, ebale tests for PP
+    TODO: 8*amd gpu current only support 1D TP/DP/CP test, enbale tests for PP
     and xD later.
     """
     integration_tests_flavors = defaultdict(list)
@@ -70,7 +70,7 @@ def build_test_list():
         OverrideDefinitions(
             [
                 [
-                    "--parallelism.context_parallel_degree=8",
+                    "--parallelism.context_parallel_degree 2",
                     "--parallelism.context_parallel_rotate_method='allgather'",
                 ]
             ],
@@ -78,6 +78,18 @@ def build_test_list():
             "cp_allgather",
             ngpu=8,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--training.compile",
+                    "--parallelism.data_parallel_shard_degree=4",
+                    "--parallelism.data_parallel_replicate_degree=2",
+                ]
+            ],
+            "HSDP+CP+torch.compile+Float8",
+            "hsdp+cp+compile+float8",
+            ngpu=8,
+        ),
     ]
     return integration_tests_flavors
 

From ece88102298326c51a005dd0a501d75c3f4b14d0 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Tue, 8 Jul 2025 20:06:57 -0700
Subject: [PATCH 41/44] update tests

---
 tests/integration_tests_amd.py | 38 +++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 29a7f9b34..91db8770a 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -41,8 +41,8 @@ def build_test_list():
     key is the config file name and value is a list of OverrideDefinitions
     that is used to generate variations of integration tests based on the
     same root config file.
-    TODO: 8*amd gpu current only support 1D TP/DP/CP test, enbale tests for PP
-    and xD later.
+    TODO: 8*amd gpu current only support TP, DP, CP test.
+    HSDP, PP are not supported yet.
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
@@ -53,41 +53,51 @@ def build_test_list():
                     "--parallelism.tensor_parallel_degree 2",
                 ],
             ],
-            "TP compile",
-            "tp_compile",
+            "TP+DP compile",
+            "tp_dp_compile",
         ),
         OverrideDefinitions(
             [
                 [
-                    "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=8",
+                    "--parallelism.context_parallel_degree 2",
+                    "--parallelism.context_parallel_rotate_method='allgather'",
                 ]
             ],
-            "FSDP+torch.compile",
-            "fsdp+compile",
+            "DP+CP(allgather)",
+            "dp_cp_allgather",
             ngpu=8,
         ),
         OverrideDefinitions(
             [
                 [
+                    "--parallelism.tensor_parallel_degree 2",
                     "--parallelism.context_parallel_degree 2",
                     "--parallelism.context_parallel_rotate_method='allgather'",
                 ]
             ],
-            "CP (allgather)",
-            "cp_allgather",
+            "DP+CP(allgather)",
+            "dp_cp_allgather",
             ngpu=8,
         ),
         OverrideDefinitions(
             [
                 [
                     "--training.compile",
-                    "--parallelism.data_parallel_shard_degree=4",
-                    "--parallelism.data_parallel_replicate_degree=2",
+                    "--parallelism.tensor_parallel_degree 2",
+                    "--parallelism.enable_async_tensor_parallel",
+                ],
+            ],
+            "TP async+ compile",
+            "tp_async_compile",
+        ),
+        OverrideDefinitions(
+            [
+                [
+                    "--parallelism.pipeline_parallel_degree=2",
                 ]
             ],
-            "HSDP+CP+torch.compile+Float8",
-            "hsdp+cp+compile+float8",
+            "PP",
+            "PP",
             ngpu=8,
         ),
     ]

From 72232122e4a492be58933b94dbbbb6149980a04a Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 9 Jul 2025 09:54:56 -0700
Subject: [PATCH 42/44] update test

---
 tests/integration_tests_amd.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 91db8770a..17d65754c 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -42,7 +42,7 @@ def build_test_list():
     that is used to generate variations of integration tests based on the
     same root config file.
     TODO: 8*amd gpu current only support TP, DP, CP test.
-    HSDP, PP are not supported yet.
+    HSDP, PP and their related test, TP+DP+CP are not supported yet.
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
@@ -70,13 +70,13 @@ def build_test_list():
         OverrideDefinitions(
             [
                 [
-                    "--parallelism.tensor_parallel_degree 2",
+                    "--parallelism.tensor_parallel_degree 4",
                     "--parallelism.context_parallel_degree 2",
                     "--parallelism.context_parallel_rotate_method='allgather'",
                 ]
             ],
-            "DP+CP(allgather)",
-            "dp_cp_allgather",
+            "TP+CP(allgather)",
+            "tp_cp_allgather",
             ngpu=8,
         ),
         OverrideDefinitions(

From 342a0d3ad8a4f530d3d7b5faa5da718beef1fba5 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 9 Jul 2025 10:09:37 -0700
Subject: [PATCH 43/44] update test

---
 tests/integration_tests_amd.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index 17d65754c..bbb8f3288 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -42,7 +42,7 @@ def build_test_list():
     that is used to generate variations of integration tests based on the
     same root config file.
     TODO: 8*amd gpu current only support TP, DP, CP test.
-    HSDP, PP and their related test, TP+DP+CP are not supported yet.
+    HSDP, PP, TP+CP and their composability tests are not supported yet.
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
@@ -67,18 +67,6 @@ def build_test_list():
             "dp_cp_allgather",
             ngpu=8,
         ),
-        OverrideDefinitions(
-            [
-                [
-                    "--parallelism.tensor_parallel_degree 4",
-                    "--parallelism.context_parallel_degree 2",
-                    "--parallelism.context_parallel_rotate_method='allgather'",
-                ]
-            ],
-            "TP+CP(allgather)",
-            "tp_cp_allgather",
-            ngpu=8,
-        ),
         OverrideDefinitions(
             [
                 [
@@ -100,6 +88,17 @@ def build_test_list():
             "PP",
             ngpu=8,
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--parallelism.tensor_parallel_degree 4",
+                    "--parallelism.context_parallel_degree 2",
+                ]
+            ],
+            "TP+CP",
+            "tp_cp",
+            ngpu=8,
+        ),
     ]
     return integration_tests_flavors
 

From fc280f5e7b309c892596415f8735a1b2a6fc1462 Mon Sep 17 00:00:00 2001
From: mori360 <yifanmao@meta.com>
Date: Wed, 9 Jul 2025 10:39:10 -0700
Subject: [PATCH 44/44] update test

---
 tests/integration_tests_amd.py | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py
index bbb8f3288..1147a092f 100644
--- a/tests/integration_tests_amd.py
+++ b/tests/integration_tests_amd.py
@@ -46,16 +46,6 @@ def build_test_list():
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
-        OverrideDefinitions(
-            [
-                [
-                    "--training.compile",
-                    "--parallelism.tensor_parallel_degree 2",
-                ],
-            ],
-            "TP+DP compile",
-            "tp_dp_compile",
-        ),
         OverrideDefinitions(
             [
                 [
@@ -65,7 +55,6 @@ def build_test_list():
             ],
             "DP+CP(allgather)",
             "dp_cp_allgather",
-            ngpu=8,
         ),
         OverrideDefinitions(
             [
@@ -75,18 +64,20 @@ def build_test_list():
                     "--parallelism.enable_async_tensor_parallel",
                 ],
             ],
-            "TP async+ compile",
-            "tp_async_compile",
+            "DP+TP async+ compile",
+            "dp_tp_async_compile",
         ),
         OverrideDefinitions(
             [
                 [
-                    "--parallelism.pipeline_parallel_degree=2",
-                ]
+                    "--model.converters float8",
+                    "--float8.enable_fsdp_float8_all_gather",
+                    "--float8.precompute_float8_dynamic_scale_for_fsdp",
+                    "--float8.force_recompute_fp8_weight_in_bwd",
+                ],
             ],
-            "PP",
-            "PP",
-            ngpu=8,
+            "Float8 test",
+            "float8",
         ),
         OverrideDefinitions(
             [