
Add AMD GPU node for integration test #1241

Draft: wants to merge 45 commits into main from add_amd_test

Commits (45)
d47f51f
add integration_test_4gpu_amd.yaml
mori360 May 29, 2025
ec83247
Update .github/workflows/integration_test_4gpu_amd.yaml
seemethere May 29, 2025
7cd90e3
correct gpu number
mori360 May 29, 2025
2bfcf35
typo
mori360 May 29, 2025
b130aee
update job version
mori360 Jun 2, 2025
e26285c
Update .github/workflows/integration_test_8gpu_amd.yaml
seemethere Jun 13, 2025
9e8e997
Update .github/workflows/integration_test_8gpu_amd.yaml
seemethere Jun 13, 2025
c9e733a
Remove custom docker image
seemethere Jun 13, 2025
a9aa64d
Update .github/workflows/integration_test_8gpu_amd.yaml
seemethere Jun 13, 2025
41e3021
Merge branch 'pytorch:main' into add_amd_test
mori360 Jun 23, 2025
cf8a9f0
2 gpu
mori360 Jun 23, 2025
47d9d61
Update integration_test_8gpu_amd.yaml
mori360 Jun 23, 2025
7a4cc68
Update .github/workflows/integration_test_8gpu_amd.yaml
ZainRizvi Jun 24, 2025
ae636c5
Apply suggestions from code review
ZainRizvi Jun 25, 2025
e4235c9
isntall jq
mori360 Jun 25, 2025
687fdf4
isntall jq
mori360 Jun 25, 2025
a781bb3
set env
mori360 Jun 25, 2025
6fefa81
set env
mori360 Jun 25, 2025
9404612
to 8 gpu
mori360 Jun 25, 2025
98a74d2
Update .github/workflows/integration_test_8gpu_amd.yaml
mori360 Jun 25, 2025
d59f978
try cu126
mori360 Jun 25, 2025
f352e77
Update integration_test_8gpu_amd.yaml
mori360 Jun 25, 2025
9214618
Update integration_test_8gpu_amd.yaml
mori360 Jun 25, 2025
c3728b6
lint
mori360 Jun 25, 2025
02e141b
Update integration_test_8gpu_amd.yaml
mori360 Jun 26, 2025
6e7c6dd
Update integration_tests_amd.py
mori360 Jun 26, 2025
0e645b2
Update integration_tests_amd.py
mori360 Jun 26, 2025
ab49ddb
Update integration_tests_amd.py
mori360 Jun 26, 2025
237b955
Update integration_tests_amd.py
mori360 Jun 26, 2025
f4c53d7
Update integration_tests_amd.py
mori360 Jun 26, 2025
496ee04
Update integration_tests_amd.py
mori360 Jun 27, 2025
bc43677
Update integration_tests_amd.py
mori360 Jun 27, 2025
cbbe7ac
change folder
mori360 Jun 30, 2025
e243a54
limit test
mori360 Jun 30, 2025
f1830ea
amend
mori360 Jun 30, 2025
d279713
use cp
mori360 Jun 30, 2025
79c5253
Update integration_test_8gpu_amd.yaml
mori360 Jun 30, 2025
a532866
Update integration_test_8gpu_amd.yaml
mori360 Jul 1, 2025
d841f29
remove artifacts-to-be-uploaded
mori360 Jul 8, 2025
9e1ca9e
remove conda create
mori360 Jul 8, 2025
470a7fa
try other tests
mori360 Jul 9, 2025
ece8810
update tests
mori360 Jul 9, 2025
7223212
update test
mori360 Jul 9, 2025
342a0d3
update test
mori360 Jul 9, 2025
fc280f5
update test
mori360 Jul 9, 2025
40 changes: 40 additions & 0 deletions .github/workflows/integration_test_8gpu_amd.yaml
@@ -0,0 +1,40 @@
name: 8 AMD GPU Integration Test

on:
push:
branches: [ main ]
pull_request:
schedule:
# Runs every 6 hours
- cron: '0 */6 * * *'
concurrency:
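  # On main, the group keys on run_number, so runs are never cancelled;
  # on any other ref, a new push cancels the in-progress run for that ref.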
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
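    # Login shell (-l) so conda is initialized; -e and -o pipefail fail fast on errors.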
shell: bash -l -eo pipefail {0}

jobs:
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.rocm.gpu.mi300.8
gpu-arch-type: rocm
gpu-arch-version: "6.3"
upload-artifact: outputs
use-custom-docker-registry: false
@jithunnair-amd commented on Jun 26, 2025:

@mori360 It looks like you plan to use the pytorch/manylinux2_28-builder:rocm6.3 image for ROCm and just install the requirements as part of the steps below. Is that the plan for the CUDA yml file as well? Otherwise, we have two inconsistent flows for ROCm and CUDA.

mori360 (Contributor, Author) replied:

Yeah, I followed the CUDA yml you linked to set up the yaml here. Do you have any suggestions?

Reply:

My suggestion here would be to keep the workflow consistent with how we do it for CUDA i.e. build a docker image with the required dependencies installed and then use that docker image to run the tests. This is the approach we are following in #1260.


Please also read my comment #1241 (review) to understand how we view the work in this PR as being complementary to the work in #1260.

script: |
set -eux
        # The generic Linux job chooses to use the base env, not the one set up by the image
conda install -c conda-forge jq -y
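        # jq pulls the last entry from conda's JSON env list, i.e. the env set up by the image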
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
pip config --user set global.progress_bar off
pip install -r requirements.txt
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
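        # USE_CPP=0 skips building torchao's C++/CUDA extensions; note the nightly index is cu126 even though torch above is rocm6.3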
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
        # There's a permission issue when uploading files in artifacts-to-be-uploaded on AMD nodes, so skip it by creating
        # and storing results in generated-artifacts instead.
mkdir generated-artifacts
python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8
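For reference, the prebuilt-docker-image flow suggested in the review thread above might look roughly like the following. This is a minimal sketch, assuming linux_job_v2.yml accepts a docker-image input the way the CUDA workflows use one; the image name is hypothetical:

jobs:
  build-test:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.rocm.gpu.mi300.8
      gpu-arch-type: rocm
      gpu-arch-version: "6.3"
      # Hypothetical image with ROCm torch and the test requirements preinstalled
      docker-image: torchtitan-rocm6.3-test
      script: |
        set -eux
        mkdir generated-artifacts
        python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8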
184 changes: 184 additions & 0 deletions tests/integration_tests_amd.py
@@ -0,0 +1,184 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import logging
import os
import subprocess
from collections import defaultdict
from dataclasses import dataclass
from typing import Sequence

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
import tomllib
except ModuleNotFoundError:
import tomli as tomllib


@dataclass
class OverrideDefinitions:
Contributor comment: can we reuse this class in other tests, instead of reinventing it?
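A minimal sketch of the suggested reuse, assuming the shared definitions live in tests/integration_tests.py (the import path is an assumption):

from integration_tests import OverrideDefinitions  # hypothetical shared module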

"""
    This class defines the override-argument variations for the integration tests.
"""

override_args: Sequence[Sequence[str]] = tuple(tuple(" "))
test_descr: str = "default"
test_name: str = "default"
ngpu: int = 4

def __repr__(self):
return self.test_descr


def build_test_list():
"""
key is the config file name and value is a list of OverrideDefinitions
that is used to generate variations of integration tests based on the
same root config file.
    TODO: the 8-GPU AMD runner currently only supports TP, DP, and CP tests.
    HSDP, PP, TP+CP and their composability tests are not supported yet.
"""
integration_tests_flavors = defaultdict(list)
integration_tests_flavors["debug_model.toml"] = [
OverrideDefinitions(
[
[
"--parallelism.context_parallel_degree 2",
"--parallelism.context_parallel_rotate_method='allgather'",
]
],
"DP+CP(allgather)",
"dp_cp_allgather",
),
OverrideDefinitions(
[
[
"--training.compile",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.enable_async_tensor_parallel",
],
],
"DP+TP async+ compile",
"dp_tp_async_compile",
),
OverrideDefinitions(
[
[
"--model.converters float8",
"--float8.enable_fsdp_float8_all_gather",
"--float8.precompute_float8_dynamic_scale_for_fsdp",
"--float8.force_recompute_fp8_weight_in_bwd",
],
],
"Float8 test",
"float8",
),
OverrideDefinitions(
[
[
"--parallelism.tensor_parallel_degree 4",
"--parallelism.context_parallel_degree 2",
]
],
"TP+CP",
"tp_cp",
ngpu=8,
),
]
return integration_tests_flavors


def _run_cmd(cmd):
    # Capture combined stdout/stderr so the caller can log it; otherwise result.stdout is None.
    return subprocess.run(
        [cmd], text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    )


def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
    # run_test supports a sequence of tests per flavor.
test_name = test_flavor.test_name
dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
all_ranks = ",".join(map(str, range(test_flavor.ngpu)))
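    # LOG_RANK covering every rank surfaces each GPU's logs in the CI output.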

for idx, override_arg in enumerate(test_flavor.override_args):
cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh"
        # dump compile trace for debugging purposes
cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd
if test_name == "fsdp2_memory_estimation":
cmd = (
f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
"./scripts/estimate/run_memory_estimation.sh"
)
cmd += " " + dump_folder_arg
if override_arg:
cmd += " " + " ".join(override_arg)
logger.info(
f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
)

# save checkpoint (idx == 0) and load it for generation (idx == 1)
if test_name == "test_generate" and idx == 1:
cmd = (
f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
f"CHECKPOINT_DIR={output_dir}/{test_name}/checkpoint/step-10 "
"PROMPT='What is the meaning of life?' "
f"./scripts/generate/run_llama_generate.sh --out > {output_dir}/{test_name}/generated_output.json"
)

result = _run_cmd(cmd)
logger.info(result.stdout)
if result.returncode != 0:
raise Exception(
f"Integration test failed, flavor : {test_flavor.test_descr}, command : {cmd}"
)


def run_tests(args):
integration_tests_flavors = build_test_list()
for config_file in os.listdir(args.config_dir):
if config_file.endswith(".toml"):
full_path = os.path.join(args.config_dir, config_file)
with open(full_path, "rb") as f:
config = tomllib.load(f)
is_integration_test = config["job"].get(
"use_for_integration_test", False
)
if is_integration_test:
for test_flavor in integration_tests_flavors[config_file]:
if args.test == "all" or test_flavor.test_name == args.test:
if args.ngpu < test_flavor.ngpu:
logger.info(
f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus,"
f" because --ngpu arg is {args.ngpu}"
)
else:
run_test(test_flavor, full_path, args.output_dir)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
parser.add_argument(
"--config_dir", default="./torchtitan/models/llama3/train_configs"
)
parser.add_argument(
"--test",
default="all",
help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
)
parser.add_argument("--ngpu", default=8, type=int)
args = parser.parse_args()

if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if os.listdir(args.output_dir):
raise RuntimeError("Please provide an empty output directory.")
run_tests(args)


if __name__ == "__main__":
main()
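For reference, run_tests only picks up train configs that opt in via the [job] table. A minimal sketch of the relevant entries in a config such as debug_model.toml (values are illustrative):

[job]
dump_folder = "./outputs"
use_for_integration_test = true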