From d47f51fb267409da68e378fe90466a3fc23a0aae Mon Sep 17 00:00:00 2001 From: mori360 Date: Thu, 29 May 2025 13:00:57 -0700 Subject: [PATCH 01/44] add integration_test_4gpu_amd.yaml --- .../workflows/integration_test_4gpu_amd.yaml | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 .github/workflows/integration_test_4gpu_amd.yaml diff --git a/.github/workflows/integration_test_4gpu_amd.yaml b/.github/workflows/integration_test_4gpu_amd.yaml new file mode 100644 index 000000000..81e6c8d4d --- /dev/null +++ b/.github/workflows/integration_test_4gpu_amd.yaml @@ -0,0 +1,39 @@ +name: 4 AMD GPU Integration Test + +on: + push: + branches: [ main ] + pull_request: + schedule: + # Runs every 6 hours + - cron: '0 */6 * * *' +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + build-test: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.rocm.gpu.mi300.4 + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 + repository: pytorch/torchtitan + upload-artifact: outputs + script: | + set -eux + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + pip config --user set global.progress_bar off + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + mkdir artifacts-to-be-uploaded + python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 4 From ec83247da567f5b752b0e0d7a728175df3e37310 Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Thu, 29 May 2025 16:32:23 -0700 Subject: [PATCH 02/44] Update .github/workflows/integration_test_4gpu_amd.yaml --- .github/workflows/integration_test_4gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_4gpu_amd.yaml b/.github/workflows/integration_test_4gpu_amd.yaml index 81e6c8d4d..c49e9641b 100644 --- a/.github/workflows/integration_test_4gpu_amd.yaml +++ b/.github/workflows/integration_test_4gpu_amd.yaml @@ -19,7 +19,7 @@ jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: - runner: linux.rocm.gpu.mi300.4 + runner: linux.rocm.gpu.mi300.8 gpu-arch-type: cuda gpu-arch-version: "12.6" # This image is faster to clone than the default, but it lacks CC needed by triton From 7cd90e3c430e5cb70fc38dd1d3816d6ea980b3b6 Mon Sep 17 00:00:00 2001 From: mori360 Date: Thu, 29 May 2025 16:39:21 -0700 Subject: [PATCH 03/44] correct gpu number --- ...ration_test_4gpu_amd.yaml => integration_test_8gpu_amd.yaml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{integration_test_4gpu_amd.yaml => integration_test_8gpu_amd.yaml} (99%) diff --git a/.github/workflows/integration_test_4gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml similarity index 99% rename from .github/workflows/integration_test_4gpu_amd.yaml rename to .github/workflows/integration_test_8gpu_amd.yaml index c49e9641b..8cfd9d559 100644 --- a/.github/workflows/integration_test_4gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -36,4 +36,4 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 4 + python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From 2bfcf3582d5f406bd7cbd1d5d570dfd2c20fec52 Mon Sep 17 00:00:00 2001 From: mori360 Date: Thu, 29 May 2025 16:46:23 -0700 Subject: [PATCH 04/44] typo --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 8cfd9d559..15f3da628 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -1,4 +1,4 @@ -name: 4 AMD GPU Integration Test +name: 8 AMD GPU Integration Test on: push: From b130aeeb15cdffebce0c77325cd66729c732595d Mon Sep 17 00:00:00 2001 From: mori360 Date: Mon, 2 Jun 2025 10:33:09 -0700 Subject: [PATCH 05/44] update job version --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 15f3da628..5f2deef20 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -17,7 +17,7 @@ defaults: jobs: build-test: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.rocm.gpu.mi300.8 gpu-arch-type: cuda From e26285cc90aed6aed288ce0355b5e9a92232ce1f Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Fri, 13 Jun 2025 10:57:09 -0700 Subject: [PATCH 06/44] Update .github/workflows/integration_test_8gpu_amd.yaml Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 5f2deef20..d8358a1a0 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -20,7 +20,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.rocm.gpu.mi300.8 - gpu-arch-type: cuda + gpu-arch-type: rocm gpu-arch-version: "12.6" # This image is faster to clone than the default, but it lacks CC needed by triton # (1m25s vs 2m37s). From 9e8e99727fd8fb9bc375b21cca0f526423e5f0b3 Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:10:50 -0700 Subject: [PATCH 07/44] Update .github/workflows/integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index d8358a1a0..8a03676f4 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -21,7 +21,7 @@ jobs: with: runner: linux.rocm.gpu.mi300.8 gpu-arch-type: rocm - gpu-arch-version: "12.6" + gpu-arch-version: "6.3" # This image is faster to clone than the default, but it lacks CC needed by triton # (1m25s vs 2m37s). docker-image: torchtitan-ubuntu-20.04-clang12 From c9e733aca0fbeb964238938cf97f5e4f097e27a7 Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:52:38 -0700 Subject: [PATCH 08/44] Remove custom docker image --- .github/workflows/integration_test_8gpu_amd.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 8a03676f4..cf2383c2e 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -22,9 +22,6 @@ jobs: runner: linux.rocm.gpu.mi300.8 gpu-arch-type: rocm gpu-arch-version: "6.3" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 repository: pytorch/torchtitan upload-artifact: outputs script: | From a9aa64d49ab5a086e99299e8b12bb569696a888b Mon Sep 17 00:00:00 2001 From: Eli Uriegas <1700823+seemethere@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:58:06 -0700 Subject: [PATCH 09/44] Update .github/workflows/integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index cf2383c2e..1d3da8386 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -22,7 +22,6 @@ jobs: runner: linux.rocm.gpu.mi300.8 gpu-arch-type: rocm gpu-arch-version: "6.3" - repository: pytorch/torchtitan upload-artifact: outputs script: | set -eux From cf8a9f013d39e4940d5456a07426fd7f17da2952 Mon Sep 17 00:00:00 2001 From: mori360 Date: Mon, 23 Jun 2025 14:08:06 -0700 Subject: [PATCH 10/44] 2 gpu --- .../workflows/integration_test_8gpu_amd.yaml | 4 +- tests/integration_tests_amd.py | 148 ++++++++++++++++++ 2 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 tests/integration_tests_amd.py diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 1d3da8386..02e208510 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -19,7 +19,7 @@ jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: - runner: linux.rocm.gpu.mi300.8 + runner: linux.rocm.gpu.mi300.2 gpu-arch-type: rocm gpu-arch-version: "6.3" upload-artifact: outputs @@ -32,4 +32,4 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 + python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 2 diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py new file mode 100644 index 000000000..0b6c1dbd0 --- /dev/null +++ b/tests/integration_tests_amd.py @@ -0,0 +1,148 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os +import subprocess +from collections import defaultdict +from dataclasses import dataclass +from typing import Sequence + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +try: + import tomllib +except ModuleNotFoundError: + import tomli as tomllib + + +@dataclass +class OverrideDefinitions: + """ + This class is used to define the override definitions for the integration tests. + """ + + override_args: Sequence[Sequence[str]] = tuple(tuple(" ")) + test_descr: str = "default" + test_name: str = "default" + ngpu: int = 4 + + def __repr__(self): + return self.test_descr + + +def build_test_list(): + """ + key is the config file name and value is a list of OverrideDefinitions + that is used to generate variations of integration tests based on the + same root config file. + """ + integration_tests_flavors = defaultdict(list) + integration_tests_flavors["debug_model.toml"] = [ + OverrideDefinitions( + [ + [ + "--training.compile", + "--parallelism.tensor_parallel_degree 2", + ], + ], + "2D TP compile", + "2d_tp_compile", + ), + ] + return integration_tests_flavors + + +def _run_cmd(cmd): + return subprocess.run([cmd], text=True, shell=True) + + +def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): + # run_test supports sequence of tests. + test_name = test_flavor.test_name + dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}" + all_ranks = ",".join(map(str, range(test_flavor.ngpu))) + + for idx, override_arg in enumerate(test_flavor.override_args): + cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh" + # dump compile trace for debugging purpose + cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd + if test_name == "fsdp2_memory_estimation": + cmd = ( + f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} " + "./scripts/estimate/run_memory_estimation.sh" + ) + cmd += " " + dump_folder_arg + if override_arg: + cmd += " " + " ".join(override_arg) + logger.info( + f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}=====" + ) + + # save checkpoint (idx == 0) and load it for generation (idx == 1) + if test_name == "test_generate" and idx == 1: + cmd = ( + f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} " + f"CHECKPOINT_DIR={output_dir}/{test_name}/checkpoint/step-10 " + "PROMPT='What is the meaning of life?' " + f"./scripts/generate/run_llama_generate.sh --out > {output_dir}/{test_name}/generated_output.json" + ) + + result = _run_cmd(cmd) + logger.info(result.stdout) + if result.returncode != 0: + raise Exception( + f"Integration test failed, flavor : {test_flavor.test_descr}, command : {cmd}" + ) + + +def run_tests(args): + integration_tests_flavors = build_test_list() + for config_file in os.listdir(args.config_dir): + if config_file.endswith(".toml"): + full_path = os.path.join(args.config_dir, config_file) + with open(full_path, "rb") as f: + config = tomllib.load(f) + is_integration_test = config["job"].get( + "use_for_integration_test", False + ) + if is_integration_test: + for test_flavor in integration_tests_flavors[config_file]: + if args.test == "all" or test_flavor.test_name == args.test: + if args.ngpu < test_flavor.ngpu: + logger.info( + f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus," + f" because --ngpu arg is {args.ngpu}" + ) + else: + run_test(test_flavor, full_path, args.output_dir) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("output_dir") + parser.add_argument( + "--config_dir", default="./torchtitan/models/llama3/train_configs" + ) + parser.add_argument( + "--test", + default="all", + help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)", + ) + parser.add_argument("--ngpu", default=2, type=int) + args = parser.parse_args() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + if os.listdir(args.output_dir): + raise RuntimeError("Please provide an empty output directory.") + run_tests(args) + + +if __name__ == "__main__": + main() From 47d9d613c42ba939689f24ec58faba3202ff5770 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Mon, 23 Jun 2025 14:48:32 -0700 Subject: [PATCH 11/44] Update integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 02e208510..f7d31f138 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -19,7 +19,7 @@ jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: - runner: linux.rocm.gpu.mi300.2 + runner: linux.rocm.gpu.2 gpu-arch-type: rocm gpu-arch-version: "6.3" upload-artifact: outputs From 7a4cc6827477df8c70fd0fe60e49b7f678d1e058 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 24 Jun 2025 11:00:04 -0500 Subject: [PATCH 12/44] Update .github/workflows/integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index f7d31f138..02e208510 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -19,7 +19,7 @@ jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: - runner: linux.rocm.gpu.2 + runner: linux.rocm.gpu.mi300.2 gpu-arch-type: rocm gpu-arch-version: "6.3" upload-artifact: outputs From ae636c5a62f45ee165cf0d8937551901850988e2 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Wed, 25 Jun 2025 12:32:27 -0500 Subject: [PATCH 13/44] Apply suggestions from code review test change --- .github/workflows/integration_test_8gpu_amd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 02e208510..eb3928da5 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -23,6 +23,7 @@ jobs: gpu-arch-type: rocm gpu-arch-version: "6.3" upload-artifact: outputs + docker-build-dir: "fake_dir" script: | set -eux # The generic Linux job chooses to use base env, not the one setup by the image From e4235c946cdc7eec2007c59ef2ca97f63cc8ea40 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 11:37:12 -0700 Subject: [PATCH 14/44] isntall jq --- .github/workflows/integration_test_8gpu_amd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index eb3928da5..d32b24a3f 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -27,6 +27,7 @@ jobs: script: | set -eux # The generic Linux job chooses to use base env, not the one setup by the image + conda install -c conda-forge jq CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off From 687fdf42368ad327683c1f95b18a0797efaed3c5 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 12:02:44 -0700 Subject: [PATCH 15/44] isntall jq --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index d32b24a3f..182a0d5f5 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -27,7 +27,7 @@ jobs: script: | set -eux # The generic Linux job chooses to use base env, not the one setup by the image - conda install -c conda-forge jq + conda install -c conda-forge jq -y CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off From a781bb3b9aa7a5c335e8662f295032e2f276e7e8 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 12:32:17 -0700 Subject: [PATCH 16/44] set env --- .github/workflows/integration_test_8gpu_amd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 182a0d5f5..030191e2f 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -31,6 +31,7 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off + pip install -r requirements.txt python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded From 6fefa814ba86f6e99de0fd288f27356d0b509c17 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 12:33:14 -0700 Subject: [PATCH 17/44] set env --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 030191e2f..7a130fb45 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -32,7 +32,7 @@ jobs: conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off pip install -r requirements.txt - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 2 From 9404612ecb6b68582b852e221453ebc54a58a3c0 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 12:39:43 -0700 Subject: [PATCH 18/44] to 8 gpu --- .../workflows/integration_test_8gpu_amd.yaml | 4 +-- tests/integration_tests_amd.py | 32 +++++++++++++++++-- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 7a130fb45..a109ceedc 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -19,7 +19,7 @@ jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: - runner: linux.rocm.gpu.mi300.2 + runner: linux.rocm.gpu.mi300.8 gpu-arch-type: rocm gpu-arch-version: "6.3" upload-artifact: outputs @@ -35,4 +35,4 @@ jobs: pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 2 + python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 0b6c1dbd0..8e568c980 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -51,8 +51,34 @@ def build_test_list(): "--parallelism.tensor_parallel_degree 2", ], ], - "2D TP compile", - "2d_tp_compile", + "2D async TP compile", + "2d_asynctp_compile", + ), + OverrideDefinitions( + [ + [ + "--training.compile", + "--parallelism.data_parallel_shard_degree=2", + "--parallelism.tensor_parallel_degree=2", + "--parallelism.pipeline_parallel_degree=2", + ] + ], + "FSDP+TP+PP+torch.compile", + "fsdp+tp+cp+compile", + ngpu=8, + ), + OverrideDefinitions( + [ + [ + "--training.compile", + "--parallelism.data_parallel_shard_degree=2", + "--parallelism.data_parallel_replicate_degree=2", + "--parallelism.context_parallel_degree=2", + ] + ], + "HSDP+CP+torch.compile", + "hsdp+cp+compile", + ngpu=8, ), ] return integration_tests_flavors @@ -134,7 +160,7 @@ def main(): default="all", help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)", ) - parser.add_argument("--ngpu", default=2, type=int) + parser.add_argument("--ngpu", default=8, type=int) args = parser.parse_args() if not os.path.exists(args.output_dir): From 98a74d20287c589fd1f63a593d48606c90aad69c Mon Sep 17 00:00:00 2001 From: yifanmao Date: Wed, 25 Jun 2025 13:11:11 -0700 Subject: [PATCH 19/44] Update .github/workflows/integration_test_8gpu_amd.yaml Co-authored-by: Zain Rizvi --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index a109ceedc..06204305e 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -23,7 +23,7 @@ jobs: gpu-arch-type: rocm gpu-arch-version: "6.3" upload-artifact: outputs - docker-build-dir: "fake_dir" + use-custom-docker-registry: false script: | set -eux # The generic Linux job chooses to use base env, not the one setup by the image From d59f9782449cfe98f66809f5bb4667b9d3d1821e Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 13:48:52 -0700 Subject: [PATCH 20/44] try cu126 --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 06204305e..a3edc3f21 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -32,7 +32,7 @@ jobs: conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off pip install -r requirements.txt - pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 --force-reinstall + pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From f352e776e2711e83dc2c185edfda4aac6fdf4d21 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Wed, 25 Jun 2025 14:38:20 -0700 Subject: [PATCH 21/44] Update integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index a3edc3f21..b15a8ce0f 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -32,7 +32,7 @@ jobs: conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off pip install -r requirements.txt - pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 --force-reinstall + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From 921461841f04aabeb1df5a20c716180dfe61b428 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Wed, 25 Jun 2025 14:54:07 -0700 Subject: [PATCH 22/44] Update integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index b15a8ce0f..9359434ef 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -32,7 +32,7 @@ jobs: conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off pip install -r requirements.txt - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From c3728b6c831e18cb23fab9aa32e3810f6ce67038 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 25 Jun 2025 15:10:20 -0700 Subject: [PATCH 23/44] lint --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 9359434ef..97db7db9c 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -32,7 +32,7 @@ jobs: conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off pip install -r requirements.txt - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From 02e141b85c0fd62e91f9797b64bcaca799d64b89 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Thu, 26 Jun 2025 00:22:34 -0700 Subject: [PATCH 24/44] Update integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 97db7db9c..36d53ebc3 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -35,4 +35,4 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 + python ./tests/integration_tests_amd.py artifacts-to-be-uploaded --ngpu 8 From 6e7c6dd3578aa947ffc76505a4e562fb66979a5e Mon Sep 17 00:00:00 2001 From: yifanmao Date: Thu, 26 Jun 2025 08:49:04 -0700 Subject: [PATCH 25/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 8e568c980..bfacc8685 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -60,12 +60,11 @@ def build_test_list(): "--training.compile", "--parallelism.data_parallel_shard_degree=2", "--parallelism.tensor_parallel_degree=2", - "--parallelism.pipeline_parallel_degree=2", ] ], - "FSDP+TP+PP+torch.compile", - "fsdp+tp+cp+compile", - ngpu=8, + "FSDP+TP+torch.compile", + "fsdp+tp+compile", + ngpu=4, ), OverrideDefinitions( [ @@ -78,7 +77,7 @@ def build_test_list(): ], "HSDP+CP+torch.compile", "hsdp+cp+compile", - ngpu=8, + ngpu=4, ), ] return integration_tests_flavors From 0e645b213e82816af2f88eaaabf9b9ba889efaaf Mon Sep 17 00:00:00 2001 From: yifanmao Date: Thu, 26 Jun 2025 09:37:28 -0700 Subject: [PATCH 26/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index bfacc8685..cc90ad80a 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -77,7 +77,7 @@ def build_test_list(): ], "HSDP+CP+torch.compile", "hsdp+cp+compile", - ngpu=4, + ngpu=8, ), ] return integration_tests_flavors From ab49ddb26c2a06654c2bd2af90a670916897c3ea Mon Sep 17 00:00:00 2001 From: yifanmao Date: Thu, 26 Jun 2025 11:09:18 -0700 Subject: [PATCH 27/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index cc90ad80a..faf5ffb82 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -58,25 +58,24 @@ def build_test_list(): [ [ "--training.compile", - "--parallelism.data_parallel_shard_degree=2", + "--parallelism.data_parallel_shard_degree=4", "--parallelism.tensor_parallel_degree=2", ] ], "FSDP+TP+torch.compile", "fsdp+tp+compile", - ngpu=4, + ngpu=8, ), OverrideDefinitions( [ [ "--training.compile", - "--parallelism.data_parallel_shard_degree=2", + "--parallelism.data_parallel_shard_degree=4", "--parallelism.data_parallel_replicate_degree=2", - "--parallelism.context_parallel_degree=2", ] ], - "HSDP+CP+torch.compile", - "hsdp+cp+compile", + "HSDP+torch.compile", + "hsdp+compile", ngpu=8, ), ] From 237b9559cbd21e7e78c86aaa1cb10d368a78f6bc Mon Sep 17 00:00:00 2001 From: yifanmao Date: Thu, 26 Jun 2025 11:37:48 -0700 Subject: [PATCH 28/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index faf5ffb82..c21e97a29 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -66,18 +66,6 @@ def build_test_list(): "fsdp+tp+compile", ngpu=8, ), - OverrideDefinitions( - [ - [ - "--training.compile", - "--parallelism.data_parallel_shard_degree=4", - "--parallelism.data_parallel_replicate_degree=2", - ] - ], - "HSDP+torch.compile", - "hsdp+compile", - ngpu=8, - ), ] return integration_tests_flavors From f4c53d7db46adb35f6eeaac5e70b88c852c3937b Mon Sep 17 00:00:00 2001 From: yifanmao Date: Thu, 26 Jun 2025 13:25:20 -0700 Subject: [PATCH 29/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index c21e97a29..e0b0b56d5 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -51,19 +51,18 @@ def build_test_list(): "--parallelism.tensor_parallel_degree 2", ], ], - "2D async TP compile", - "2d_asynctp_compile", + "TP compile", + "tp_compile", ), OverrideDefinitions( [ [ "--training.compile", - "--parallelism.data_parallel_shard_degree=4", - "--parallelism.tensor_parallel_degree=2", + "--parallelism.data_parallel_shard_degree=8", ] ], - "FSDP+TP+torch.compile", - "fsdp+tp+compile", + "FSDP+torch.compile", + "fsdp+compile", ngpu=8, ), ] From 496ee046164935693d550063f882852d829b9672 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Fri, 27 Jun 2025 13:19:17 -0700 Subject: [PATCH 30/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index e0b0b56d5..caf162d5e 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -65,6 +65,20 @@ def build_test_list(): "fsdp+compile", ngpu=8, ), + OverrideDefinitions( + [ + [ + "--training.compile", + "--parallelism.data_parallel_shard_degree=2", + "--parallelism.tensor_parallel_degree=2", + "--parallelism.pipeline_parallel_degree=2", + ] + ], + "FSDP+TP+PP+torch.compile", + "fsdp+tp+cp+compile", + ngpu=8, + ), + OverrideDefinitions( ] return integration_tests_flavors From bc43677cedc1d642ff7a2c6e0bc0aec721cc391f Mon Sep 17 00:00:00 2001 From: yifanmao Date: Fri, 27 Jun 2025 14:47:19 -0700 Subject: [PATCH 31/44] Update integration_tests_amd.py --- tests/integration_tests_amd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index caf162d5e..5508fe777 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -78,7 +78,6 @@ def build_test_list(): "fsdp+tp+cp+compile", ngpu=8, ), - OverrideDefinitions( ] return integration_tests_flavors From cbbe7ac8767a71f46ebeeb49d717bb665e54ee26 Mon Sep 17 00:00:00 2001 From: mori360 Date: Mon, 30 Jun 2025 10:18:31 -0700 Subject: [PATCH 32/44] change folder --- .github/workflows/integration_test_8gpu_amd.yaml | 4 +++- tests/integration_tests_amd.py | 9 +++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 36d53ebc3..982c0f67c 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -35,4 +35,6 @@ jobs: python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded - python ./tests/integration_tests_amd.py artifacts-to-be-uploaded --ngpu 8 + mkdir generated-artifacts + python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8 + mv -r generated-artifacts artifacts-to-be-uploaded diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 5508fe777..ba2f933ea 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -68,14 +68,11 @@ def build_test_list(): OverrideDefinitions( [ [ - "--training.compile", - "--parallelism.data_parallel_shard_degree=2", - "--parallelism.tensor_parallel_degree=2", - "--parallelism.pipeline_parallel_degree=2", + "--parallelism.pipeline_parallel_degree=8", ] ], - "FSDP+TP+PP+torch.compile", - "fsdp+tp+cp+compile", + "PP", + "pp", ngpu=8, ), ] From e243a546cc0d35486c4d86c2b92c97ed2f7fc3b5 Mon Sep 17 00:00:00 2001 From: mori360 Date: Mon, 30 Jun 2025 10:35:29 -0700 Subject: [PATCH 33/44] limit test --- tests/integration_tests_amd.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index ba2f933ea..fba2d082e 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -41,6 +41,8 @@ def build_test_list(): key is the config file name and value is a list of OverrideDefinitions that is used to generate variations of integration tests based on the same root config file. + TODO: 8*amd gpu current only support 1D TP/DP test, ebale test for PP/CP + and xD test later. """ integration_tests_flavors = defaultdict(list) integration_tests_flavors["debug_model.toml"] = [ @@ -65,16 +67,6 @@ def build_test_list(): "fsdp+compile", ngpu=8, ), - OverrideDefinitions( - [ - [ - "--parallelism.pipeline_parallel_degree=8", - ] - ], - "PP", - "pp", - ngpu=8, - ), ] return integration_tests_flavors From f1830eac59d32b9fbef47557afbae790ecea4a01 Mon Sep 17 00:00:00 2001 From: mori360 Date: Mon, 30 Jun 2025 10:35:46 -0700 Subject: [PATCH 34/44] amend --- tests/integration_tests_amd.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index fba2d082e..9b08a4a21 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -67,6 +67,17 @@ def build_test_list(): "fsdp+compile", ngpu=8, ), + OverrideDefinitions( + [ + [ + "--parallelism.context_parallel_degree=8", + "--parallelism.context_parallel_rotate_method='allgather'", + ] + ], + "CP (allgather)", + "cp_allgather", + ngpu=8, + ), ] return integration_tests_flavors From d27971375c379fe6b24840165b9e0ba0fa5db15d Mon Sep 17 00:00:00 2001 From: mori360 Date: Mon, 30 Jun 2025 10:49:00 -0700 Subject: [PATCH 35/44] use cp --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- tests/integration_tests_amd.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 982c0f67c..e9558f5fd 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -37,4 +37,4 @@ jobs: mkdir artifacts-to-be-uploaded mkdir generated-artifacts python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8 - mv -r generated-artifacts artifacts-to-be-uploaded + cp -r generated-artifacts/* artifacts-to-be-uploaded/ diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 9b08a4a21..6d11efd16 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -41,8 +41,8 @@ def build_test_list(): key is the config file name and value is a list of OverrideDefinitions that is used to generate variations of integration tests based on the same root config file. - TODO: 8*amd gpu current only support 1D TP/DP test, ebale test for PP/CP - and xD test later. + TODO: 8*amd gpu current only support 1D TP/DP/CP test, ebale tests for PP + and xD later. """ integration_tests_flavors = defaultdict(list) integration_tests_flavors["debug_model.toml"] = [ From 79c52538793d0f08c156ec758da23940442b14a9 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Mon, 30 Jun 2025 16:36:53 -0700 Subject: [PATCH 36/44] Update integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index e9558f5fd..5c39c83fa 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -30,6 +30,7 @@ jobs: conda install -c conda-forge jq -y CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + conda create -yn test-mps-ops-env python=3.11 pip config --user set global.progress_bar off pip install -r requirements.txt python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 From a532866cda1ec02224e3da886d312d817e835d43 Mon Sep 17 00:00:00 2001 From: yifanmao Date: Mon, 30 Jun 2025 17:42:22 -0700 Subject: [PATCH 37/44] Update integration_test_8gpu_amd.yaml --- .github/workflows/integration_test_8gpu_amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 5c39c83fa..d52d919bc 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -29,8 +29,8 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image conda install -c conda-forge jq -y CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda create -yn "${CONDA_ENV}" conda activate "${CONDA_ENV}" - conda create -yn test-mps-ops-env python=3.11 pip config --user set global.progress_bar off pip install -r requirements.txt python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 From d841f29f61f42b78261eb11359d07318cb924b05 Mon Sep 17 00:00:00 2001 From: mori360 Date: Tue, 8 Jul 2025 15:18:40 -0700 Subject: [PATCH 38/44] remove artifacts-to-be-uploaded --- .github/workflows/integration_test_8gpu_amd.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index d52d919bc..4a3f58a8d 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -35,7 +35,5 @@ jobs: pip install -r requirements.txt python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - mkdir artifacts-to-be-uploaded mkdir generated-artifacts python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8 - cp -r generated-artifacts/* artifacts-to-be-uploaded/ From 9e1ca9eac347014f576729891622a50c5810ea83 Mon Sep 17 00:00:00 2001 From: mori360 Date: Tue, 8 Jul 2025 16:03:44 -0700 Subject: [PATCH 39/44] remove conda create --- .github/workflows/integration_test_8gpu_amd.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 4a3f58a8d..166d9501f 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -29,7 +29,6 @@ jobs: # The generic Linux job chooses to use base env, not the one setup by the image conda install -c conda-forge jq -y CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda create -yn "${CONDA_ENV}" conda activate "${CONDA_ENV}" pip config --user set global.progress_bar off pip install -r requirements.txt From 470a7fa3f22178487e68a152457eb4d4e4a0b1db Mon Sep 17 00:00:00 2001 From: mori360 Date: Tue, 8 Jul 2025 19:51:11 -0700 Subject: [PATCH 40/44] try other tests --- .github/workflows/integration_test_8gpu_amd.yaml | 2 ++ tests/integration_tests_amd.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_amd.yaml b/.github/workflows/integration_test_8gpu_amd.yaml index 166d9501f..81aa8c8c0 100644 --- a/.github/workflows/integration_test_8gpu_amd.yaml +++ b/.github/workflows/integration_test_8gpu_amd.yaml @@ -34,5 +34,7 @@ jobs: pip install -r requirements.txt python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3 USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + # There's perrmission issue in uploading files in artifacts-to-be-uploaded on AMD nodes, thus skip it by creating and storing + # results in generated-artifacts. mkdir generated-artifacts python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8 diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 6d11efd16..29a7f9b34 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -41,7 +41,7 @@ def build_test_list(): key is the config file name and value is a list of OverrideDefinitions that is used to generate variations of integration tests based on the same root config file. - TODO: 8*amd gpu current only support 1D TP/DP/CP test, ebale tests for PP + TODO: 8*amd gpu current only support 1D TP/DP/CP test, enbale tests for PP and xD later. """ integration_tests_flavors = defaultdict(list) @@ -70,7 +70,7 @@ def build_test_list(): OverrideDefinitions( [ [ - "--parallelism.context_parallel_degree=8", + "--parallelism.context_parallel_degree 2", "--parallelism.context_parallel_rotate_method='allgather'", ] ], @@ -78,6 +78,18 @@ def build_test_list(): "cp_allgather", ngpu=8, ), + OverrideDefinitions( + [ + [ + "--training.compile", + "--parallelism.data_parallel_shard_degree=4", + "--parallelism.data_parallel_replicate_degree=2", + ] + ], + "HSDP+CP+torch.compile+Float8", + "hsdp+cp+compile+float8", + ngpu=8, + ), ] return integration_tests_flavors From ece88102298326c51a005dd0a501d75c3f4b14d0 Mon Sep 17 00:00:00 2001 From: mori360 Date: Tue, 8 Jul 2025 20:06:57 -0700 Subject: [PATCH 41/44] update tests --- tests/integration_tests_amd.py | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 29a7f9b34..91db8770a 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -41,8 +41,8 @@ def build_test_list(): key is the config file name and value is a list of OverrideDefinitions that is used to generate variations of integration tests based on the same root config file. - TODO: 8*amd gpu current only support 1D TP/DP/CP test, enbale tests for PP - and xD later. + TODO: 8*amd gpu current only support TP, DP, CP test. + HSDP, PP are not supported yet. """ integration_tests_flavors = defaultdict(list) integration_tests_flavors["debug_model.toml"] = [ @@ -53,41 +53,51 @@ def build_test_list(): "--parallelism.tensor_parallel_degree 2", ], ], - "TP compile", - "tp_compile", + "TP+DP compile", + "tp_dp_compile", ), OverrideDefinitions( [ [ - "--training.compile", - "--parallelism.data_parallel_shard_degree=8", + "--parallelism.context_parallel_degree 2", + "--parallelism.context_parallel_rotate_method='allgather'", ] ], - "FSDP+torch.compile", - "fsdp+compile", + "DP+CP(allgather)", + "dp_cp_allgather", ngpu=8, ), OverrideDefinitions( [ [ + "--parallelism.tensor_parallel_degree 2", "--parallelism.context_parallel_degree 2", "--parallelism.context_parallel_rotate_method='allgather'", ] ], - "CP (allgather)", - "cp_allgather", + "DP+CP(allgather)", + "dp_cp_allgather", ngpu=8, ), OverrideDefinitions( [ [ "--training.compile", - "--parallelism.data_parallel_shard_degree=4", - "--parallelism.data_parallel_replicate_degree=2", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.enable_async_tensor_parallel", + ], + ], + "TP async+ compile", + "tp_async_compile", + ), + OverrideDefinitions( + [ + [ + "--parallelism.pipeline_parallel_degree=2", ] ], - "HSDP+CP+torch.compile+Float8", - "hsdp+cp+compile+float8", + "PP", + "PP", ngpu=8, ), ] From 72232122e4a492be58933b94dbbbb6149980a04a Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 9 Jul 2025 09:54:56 -0700 Subject: [PATCH 42/44] update test --- tests/integration_tests_amd.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 91db8770a..17d65754c 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -42,7 +42,7 @@ def build_test_list(): that is used to generate variations of integration tests based on the same root config file. TODO: 8*amd gpu current only support TP, DP, CP test. - HSDP, PP are not supported yet. + HSDP, PP and their related test, TP+DP+CP are not supported yet. """ integration_tests_flavors = defaultdict(list) integration_tests_flavors["debug_model.toml"] = [ @@ -70,13 +70,13 @@ def build_test_list(): OverrideDefinitions( [ [ - "--parallelism.tensor_parallel_degree 2", + "--parallelism.tensor_parallel_degree 4", "--parallelism.context_parallel_degree 2", "--parallelism.context_parallel_rotate_method='allgather'", ] ], - "DP+CP(allgather)", - "dp_cp_allgather", + "TP+CP(allgather)", + "tp_cp_allgather", ngpu=8, ), OverrideDefinitions( From 342a0d3ad8a4f530d3d7b5faa5da718beef1fba5 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 9 Jul 2025 10:09:37 -0700 Subject: [PATCH 43/44] update test --- tests/integration_tests_amd.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index 17d65754c..bbb8f3288 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -42,7 +42,7 @@ def build_test_list(): that is used to generate variations of integration tests based on the same root config file. TODO: 8*amd gpu current only support TP, DP, CP test. - HSDP, PP and their related test, TP+DP+CP are not supported yet. + HSDP, PP , TP+CP and their composability tests are not supported yet. """ integration_tests_flavors = defaultdict(list) integration_tests_flavors["debug_model.toml"] = [ @@ -67,18 +67,6 @@ def build_test_list(): "dp_cp_allgather", ngpu=8, ), - OverrideDefinitions( - [ - [ - "--parallelism.tensor_parallel_degree 4", - "--parallelism.context_parallel_degree 2", - "--parallelism.context_parallel_rotate_method='allgather'", - ] - ], - "TP+CP(allgather)", - "tp_cp_allgather", - ngpu=8, - ), OverrideDefinitions( [ [ @@ -100,6 +88,17 @@ def build_test_list(): "PP", ngpu=8, ), + OverrideDefinitions( + [ + [ + "--parallelism.tensor_parallel_degree 4", + "--parallelism.context_parallel_degree 2", + ] + ], + "TP+CP", + "tp_cp", + ngpu=8, + ), ] return integration_tests_flavors From fc280f5e7b309c892596415f8735a1b2a6fc1462 Mon Sep 17 00:00:00 2001 From: mori360 Date: Wed, 9 Jul 2025 10:39:10 -0700 Subject: [PATCH 44/44] update test --- tests/integration_tests_amd.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/tests/integration_tests_amd.py b/tests/integration_tests_amd.py index bbb8f3288..1147a092f 100644 --- a/tests/integration_tests_amd.py +++ b/tests/integration_tests_amd.py @@ -46,16 +46,6 @@ def build_test_list(): """ integration_tests_flavors = defaultdict(list) integration_tests_flavors["debug_model.toml"] = [ - OverrideDefinitions( - [ - [ - "--training.compile", - "--parallelism.tensor_parallel_degree 2", - ], - ], - "TP+DP compile", - "tp_dp_compile", - ), OverrideDefinitions( [ [ @@ -65,7 +55,6 @@ def build_test_list(): ], "DP+CP(allgather)", "dp_cp_allgather", - ngpu=8, ), OverrideDefinitions( [ @@ -75,18 +64,20 @@ def build_test_list(): "--parallelism.enable_async_tensor_parallel", ], ], - "TP async+ compile", - "tp_async_compile", + "DP+TP async+ compile", + "dp_tp_async_compile", ), OverrideDefinitions( [ [ - "--parallelism.pipeline_parallel_degree=2", - ] + "--model.converters float8", + "--float8.enable_fsdp_float8_all_gather", + "--float8.precompute_float8_dynamic_scale_for_fsdp", + "--float8.force_recompute_fp8_weight_in_bwd", + ], ], - "PP", - "PP", - ngpu=8, + "Float8 test", + "float8", ), OverrideDefinitions( [