From 4074d0c1dfdca14aece5621279e05af3a3f0221c Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 3 Jun 2025 23:45:19 -0500 Subject: [PATCH 01/17] Added support to run torchtitan tests on ROCm. --- .ci/docker/build.sh | 26 +++++++++++++++----- .ci/docker/ubuntu-cuda/Dockerfile | 41 +++++++++++++++++++++++++++++++ .ci/docker/ubuntu-rocm/Dockerfile | 16 ++++++++++++ 3 files changed, 77 insertions(+), 6 deletions(-) create mode 100644 .ci/docker/ubuntu-cuda/Dockerfile create mode 100644 .ci/docker/ubuntu-rocm/Dockerfile diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 34c1f0d45..5511acabf 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -12,15 +12,29 @@ shift echo "Building ${IMAGE_NAME} Docker image" +# set operating system OS=ubuntu -OS_VERSION=20.04 -CLANG_VERSION="" -PYTHON_VERSION=3.11 -MINICONDA_VERSION=24.3.0-0 + +# set Dockerfile +DOCKERFILE="${OS}/Dockerfile" +if [[ "$IMAGE_NAME" == *cuda* ]]; then + DOCKERFILE="${OS}-cuda/Dockerfile" +elif [[ "$IMAGE_NAME" == *rocm* ]]; then + DOCKERFILE="${OS}-rocm/Dockerfile" +fi case "${IMAGE_NAME}" in torchtitan-ubuntu-20.04-clang12) + OS_VERSION=20.04 CLANG_VERSION=12 + PYTHON_VERSION=3.11 + MINICONDA_VERSION=24.3.0-0 + ;; + torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3) + OS_VERSION=22.04 + CLANG_VERSION=19 + PYTHON_VERSION=3.10 + MINICONDA_VERSION=25.3.1-0 ;; *) echo "Invalid image name ${IMAGE_NAME}" @@ -34,7 +48,7 @@ docker build \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - --shm-size=1g \ - -f "${OS}"/Dockerfile \ + -f $(dirname ${DOCKERFILE})/Dockerfile \ "$@" \ . + diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile new file mode 100644 index 000000000..39e4d8ec5 --- /dev/null +++ b/.ci/docker/ubuntu-cuda/Dockerfile @@ -0,0 +1,41 @@ +ARG OS_VERSION + +FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} + +ARG OS_VERSION + +ENV DEBIAN_FRONTEND noninteractive + +# Install common dependencies +COPY ./common/install_base.sh install_base.sh +RUN bash ./install_base.sh && rm install_base.sh + +# Install clang +ARG CLANG_VERSION +COPY ./common/install_clang.sh install_clang.sh +RUN bash ./install_clang.sh && rm install_clang.sh + +# Install gcc +ARG GCC_VERSION +COPY ./common/install_gcc.sh install_gcc.sh +RUN bash ./install_gcc.sh && rm install_gcc.sh + +# Setup user +COPY ./common/install_user.sh install_user.sh +RUN bash ./install_user.sh && rm install_user.sh + +# Install conda and other dependencies +ARG MINICONDA_VERSION +ARG PYTHON_VERSION +ENV PYTHON_VERSION=$PYTHON_VERSION +ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH +COPY requirements-dev.txt /opt/conda/ +COPY requirements.txt /opt/conda/ +COPY requirements-flux.txt /opt/conda/ +COPY conda-env-ci.txt /opt/conda/ +COPY ./common/install_conda.sh install_conda.sh +COPY ./common/utils.sh utils.sh +RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt + +USER ci-user +CMD ["bash"] diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile new file mode 100644 index 000000000..6f292dfdd --- /dev/null +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -0,0 +1,16 @@ +# base image +FROM rocm/pytorch-nightly:latest + +# args +ARG OS_VERSION +ARG CLANG_VERSION +ARG GCC_VERSION +ARG MINICONDA_VERSION +ARG PYTHON_VERSION + +# install dependencies +COPY requirements.txt requirements.txt +RUN pip install -r ./requirements.txt + +CMD ["bash"] + From 340478a047b95359aefe51eaf6f6cc0c7a9661cc Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 4 Jun 2025 22:53:57 -0500 Subject: [PATCH 02/17] Added rocm ci support for integration_test_h100. --- .../integration_test_8gpu_h100_rocm.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/integration_test_8gpu_h100_rocm.yaml diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_h100_rocm.yaml new file mode 100644 index 000000000..3debb1760 --- /dev/null +++ b/.github/workflows/integration_test_8gpu_h100_rocm.yaml @@ -0,0 +1,37 @@ +name: 8 GPU Integration Test at H100 + +on: + push: + branches: [ main ] + pull_request: + schedule: + # Runs every 6 hours + - cron: '0 */6 * * *' +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + build-test: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + repository: pytorch/torchtitan + upload-artifact: outputs + script: | + set -eux + + USE_CPP=0 python -m pip install --pre torchao + + mkdir artifacts-to-be-uploaded + python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8 + From 51427a7aa7ff3b1ae7787cdaa38fd33e20b787e6 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Sat, 7 Jun 2025 00:40:28 -0500 Subject: [PATCH 03/17] Fixed a bug in build script. Removed ubuntu-cuda folder, instead using ubuntu folder for cuda Dockerfile. --- .ci/docker/build.sh | 4 +- .ci/docker/ubuntu-cuda/Dockerfile | 41 ------------------- ...m.yaml => integration_test_8gpu_rocm.yaml} | 8 ++-- 3 files changed, 4 insertions(+), 49 deletions(-) delete mode 100644 .ci/docker/ubuntu-cuda/Dockerfile rename .github/workflows/{integration_test_8gpu_h100_rocm.yaml => integration_test_8gpu_rocm.yaml} (70%) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 5511acabf..a1aafe3d3 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -17,9 +17,7 @@ OS=ubuntu # set Dockerfile DOCKERFILE="${OS}/Dockerfile" -if [[ "$IMAGE_NAME" == *cuda* ]]; then - DOCKERFILE="${OS}-cuda/Dockerfile" -elif [[ "$IMAGE_NAME" == *rocm* ]]; then +if [[ "$IMAGE_NAME" == *rocm* ]]; then DOCKERFILE="${OS}-rocm/Dockerfile" fi diff --git a/.ci/docker/ubuntu-cuda/Dockerfile b/.ci/docker/ubuntu-cuda/Dockerfile deleted file mode 100644 index 39e4d8ec5..000000000 --- a/.ci/docker/ubuntu-cuda/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -ARG OS_VERSION - -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} - -ARG OS_VERSION - -ENV DEBIAN_FRONTEND noninteractive - -# Install common dependencies -COPY ./common/install_base.sh install_base.sh -RUN bash ./install_base.sh && rm install_base.sh - -# Install clang -ARG CLANG_VERSION -COPY ./common/install_clang.sh install_clang.sh -RUN bash ./install_clang.sh && rm install_clang.sh - -# Install gcc -ARG GCC_VERSION -COPY ./common/install_gcc.sh install_gcc.sh -RUN bash ./install_gcc.sh && rm install_gcc.sh - -# Setup user -COPY ./common/install_user.sh install_user.sh -RUN bash ./install_user.sh && rm install_user.sh - -# Install conda and other dependencies -ARG MINICONDA_VERSION -ARG PYTHON_VERSION -ENV PYTHON_VERSION=$PYTHON_VERSION -ENV PATH /opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH -COPY requirements-dev.txt /opt/conda/ -COPY requirements.txt /opt/conda/ -COPY requirements-flux.txt /opt/conda/ -COPY conda-env-ci.txt /opt/conda/ -COPY ./common/install_conda.sh install_conda.sh -COPY ./common/utils.sh utils.sh -RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/requirements-dev.txt /opt/conda/requirements.txt /opt/conda/requirements-flux.txt /opt/conda/conda-env-ci.txt - -USER ci-user -CMD ["bash"] diff --git a/.github/workflows/integration_test_8gpu_h100_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml similarity index 70% rename from .github/workflows/integration_test_8gpu_h100_rocm.yaml rename to .github/workflows/integration_test_8gpu_rocm.yaml index 3debb1760..bb64ad528 100644 --- a/.github/workflows/integration_test_8gpu_h100_rocm.yaml +++ b/.github/workflows/integration_test_8gpu_rocm.yaml @@ -1,4 +1,4 @@ -name: 8 GPU Integration Test at H100 +name: 8 GPU Integration Test on: push: @@ -17,13 +17,11 @@ defaults: jobs: build-test: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.rocm.gpu.mi300.8 gpu-arch-type: rocm gpu-arch-version: "6.4" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 repository: pytorch/torchtitan upload-artifact: outputs @@ -33,5 +31,5 @@ jobs: USE_CPP=0 python -m pip install --pre torchao mkdir artifacts-to-be-uploaded - python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8 + python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 From 2848d518f3c2aef5e89822850c16979ce65db6f4 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 10 Jun 2025 20:07:10 -0500 Subject: [PATCH 04/17] Included test in integration_tests.py after rebase. --- tests/integration_tests.py | 76 ++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/tests/integration_tests.py b/tests/integration_tests.py index 3ccbc1890..e89a6c374 100755 --- a/tests/integration_tests.py +++ b/tests/integration_tests.py @@ -20,6 +20,8 @@ except ModuleNotFoundError: import tomli as tomllib +test_with_rocm = os.getenv("TEST_WITH_ROCM", "0") + @dataclass class OverrideDefinitions: @@ -139,28 +141,34 @@ def build_test_list(): "Checkpoint Integration Test - Save Model Weights Only bf16", "last_save_model_weights_only_bf16", ), - OverrideDefinitions( - [ + ] + # check test_with_rocm + if test_with_rocm != "1": + integration_tests_flavors["debug_model.toml"].extend([ + OverrideDefinitions( [ - "--parallelism.pipeline_parallel_degree 4", - "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", + [ + "--parallelism.pipeline_parallel_degree 4", + "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", + ], ], - ], - "PP looped zero bubble test", - "pp_looped_zero_bubble", - ngpu=4, - ), - OverrideDefinitions( - [ + "PP looped zero bubble test", + "pp_looped_zero_bubble", + ngpu=4, + ), + OverrideDefinitions( [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", + [ + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", + ], ], - ], - "PP zero bubble test (v shaped)", - "pp_zbv", - ngpu=2, - ), + "PP zero bubble test (v shaped)", + "pp_zbv", + ngpu=2, + ), + ]) + integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ [ @@ -272,18 +280,24 @@ def build_test_list(): "pp_looped_1f1b", ngpu=4, ), - OverrideDefinitions( - [ - [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", - "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", - ], - ], - "PP with custom pipeline schedule loaded from CSV file", - "pp_custom_csv", - ngpu=2, - ), + ]) + # check test_with_rocm + if test_with_rocm != "1": + integration_tests_flavors["debug_model.toml"].extend( + OverrideDefinitions( + [ + [ + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", + "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", + ], + ], + "PP with custom pipeline schedule loaded from CSV file", + "pp_custom_csv", + ngpu=2, + ), + ) + integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ [ @@ -509,7 +523,7 @@ def build_test_list(): "gradient_accumulation", ngpu=2, ), - ] + ]) return integration_tests_flavors From cb13ad44722613a2cff045d2d700c57e7b0fadc1 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Fri, 13 Jun 2025 13:51:04 -0500 Subject: [PATCH 05/17] Modified docker-builds.yml to build rocm docker image for torchtitan. --- .github/workflows/docker-builds.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 11ff5390c..75d5082c5 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -22,13 +22,16 @@ concurrency: jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge] + runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8] timeout-minutes: 240 strategy: fail-fast: false matrix: include: - - docker-image-name: torchtitan-ubuntu-20.04-clang12 + - docker-image-name: [ + torchtitan-ubuntu-20.04-clang12, + torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + ] env: DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }} steps: From de9bdcce016c8c247392f715932b195864e976b4 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 18 Jun 2025 13:04:59 -0500 Subject: [PATCH 06/17] Fixed runner for cuda and rocm images in docker-builds.yml. --- .github/workflows/docker-builds.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 75d5082c5..4289a07ab 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -22,16 +22,16 @@ concurrency: jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge, linux.rocm.gpu.mi300.8] - timeout-minutes: 240 strategy: fail-fast: false matrix: include: - - docker-image-name: [ - torchtitan-ubuntu-20.04-clang12, - torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - ] + - docker-image-name: torchtitan-ubuntu-20.04-clang12 + runner: [self-hosted, linux.2xlarge] + - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + runner: linux.rocm.gpu.mi300.8 + runs-on: ${{ matrix.runner }} + timeout-minutes: 240 env: DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }} steps: From f634f000a9d8dc52cf88fd46eb33e80bae5d589b Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 19 Jun 2025 01:01:43 -0500 Subject: [PATCH 07/17] Added TEST_WITH_ROCM environment variable for running tests on rocm. Fixed error in integration_tests.py. Fixed lint errors. --- .ci/docker/ubuntu-rocm/Dockerfile | 1 - .github/workflows/integration_test_8gpu_rocm.yaml | 3 +-- tests/integration_tests.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index 6f292dfdd..ae944279c 100644 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -13,4 +13,3 @@ COPY requirements.txt requirements.txt RUN pip install -r ./requirements.txt CMD ["bash"] - diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml index bb64ad528..c5069164d 100644 --- a/.github/workflows/integration_test_8gpu_rocm.yaml +++ b/.github/workflows/integration_test_8gpu_rocm.yaml @@ -31,5 +31,4 @@ jobs: USE_CPP=0 python -m pip install --pre torchao mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 - + python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests.py b/tests/integration_tests.py index e89a6c374..ce894aea3 100755 --- a/tests/integration_tests.py +++ b/tests/integration_tests.py @@ -283,7 +283,7 @@ def build_test_list(): ]) # check test_with_rocm if test_with_rocm != "1": - integration_tests_flavors["debug_model.toml"].extend( + integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ [ @@ -296,7 +296,7 @@ def build_test_list(): "pp_custom_csv", ngpu=2, ), - ) + ]) integration_tests_flavors["debug_model.toml"].extend([ OverrideDefinitions( [ From 87a5a5982feda275e6e059ab0e7e8499adb5d970 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 24 Jun 2025 13:38:01 -0500 Subject: [PATCH 08/17] Refactored integration_tests.py with skip tests for ROCm. --- tests/integration_tests.py | 85 +++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 42 deletions(-) mode change 100755 => 100644 tests/integration_tests.py diff --git a/tests/integration_tests.py b/tests/integration_tests.py old mode 100755 new mode 100644 index ce894aea3..b10b17953 --- a/tests/integration_tests.py +++ b/tests/integration_tests.py @@ -20,7 +20,15 @@ except ModuleNotFoundError: import tomli as tomllib -test_with_rocm = os.getenv("TEST_WITH_ROCM", "0") +# tests skipped for ROCm +skip_for_rocm_test_list = [ + "pp_looped_zero_bubble", + "pp_zbv", + "pp_custom_csv", + "last_save_model_weights_only_bf16", + "last_save_model_weights_only_fp32", +] +TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1" @dataclass @@ -141,34 +149,28 @@ def build_test_list(): "Checkpoint Integration Test - Save Model Weights Only bf16", "last_save_model_weights_only_bf16", ), - ] - # check test_with_rocm - if test_with_rocm != "1": - integration_tests_flavors["debug_model.toml"].extend([ - OverrideDefinitions( + OverrideDefinitions( + [ [ - [ - "--parallelism.pipeline_parallel_degree 4", - "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", - ], + "--parallelism.pipeline_parallel_degree 4", + "--parallelism.pipeline_parallel_schedule InterleavedZeroBubble", ], - "PP looped zero bubble test", - "pp_looped_zero_bubble", - ngpu=4, - ), - OverrideDefinitions( + ], + "PP looped zero bubble test", + "pp_looped_zero_bubble", + ngpu=4, + ), + OverrideDefinitions( + [ [ - [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", - ], + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule ZBVZeroBubble", ], - "PP zero bubble test (v shaped)", - "pp_zbv", - ngpu=2, - ), - ]) - integration_tests_flavors["debug_model.toml"].extend([ + ], + "PP zero bubble test (v shaped)", + "pp_zbv", + ngpu=2, + ), OverrideDefinitions( [ [ @@ -280,24 +282,18 @@ def build_test_list(): "pp_looped_1f1b", ngpu=4, ), - ]) - # check test_with_rocm - if test_with_rocm != "1": - integration_tests_flavors["debug_model.toml"].extend([ - OverrideDefinitions( + OverrideDefinitions( + [ [ - [ - "--parallelism.pipeline_parallel_degree 2", - "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", - "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", - ], + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule PipelineScheduleMulti", + "--parallelism.pipeline_parallel_schedule_csv ./tests/assets/custom_schedule.csv", ], - "PP with custom pipeline schedule loaded from CSV file", - "pp_custom_csv", - ngpu=2, - ), - ]) - integration_tests_flavors["debug_model.toml"].extend([ + ], + "PP with custom pipeline schedule loaded from CSV file", + "pp_custom_csv", + ngpu=2, + ), OverrideDefinitions( [ [ @@ -523,7 +519,7 @@ def build_test_list(): "gradient_accumulation", ngpu=2, ), - ]) + ] return integration_tests_flavors @@ -582,6 +578,11 @@ def run_tests(args): ) if is_integration_test: for test_flavor in integration_tests_flavors[config_file]: + if ( + TEST_WITH_ROCM + and test_flavor.test_name in skip_for_rocm_test_list + ): + continue if args.test == "all" or test_flavor.test_name == args.test: if args.ngpu < test_flavor.ngpu: logger.info( From d748586ab794318b6223a6cb1787f87ff417b6b8 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Sun, 29 Jun 2025 02:14:23 -0500 Subject: [PATCH 09/17] Changed runner to i-0962598bd0e8298b3 for building ROCm docker image. --- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 4289a07ab..bfe2fac82 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -29,7 +29,7 @@ jobs: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - runner: linux.rocm.gpu.mi300.8 + runner: i-0962598bd0e8298b3 runs-on: ${{ matrix.runner }} timeout-minutes: 240 env: From 66eba9f37b4cc0c57f1f12084e7c60a81fea442c Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 30 Jun 2025 10:59:25 -0500 Subject: [PATCH 10/17] Changed runner to linux.12xlarge for building ROCm docker image. --- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index bfe2fac82..be12cceaa 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -29,7 +29,7 @@ jobs: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - runner: i-0962598bd0e8298b3 + runner: [linux.12xlarge] runs-on: ${{ matrix.runner }} timeout-minutes: 240 env: From 2d317c3ae8d2e5fdf2ca544274ce4c92b3655ccc Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Mon, 30 Jun 2025 11:21:03 -0500 Subject: [PATCH 11/17] Changed runner to linux.2xlarge for building ROCm docker image. --- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index be12cceaa..84afe8bd2 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -29,7 +29,7 @@ jobs: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - runner: [linux.12xlarge] + runner: [linux.2xlarge] runs-on: ${{ matrix.runner }} timeout-minutes: 240 env: From 18025adf8adaf574efdfccba522aa0a1f3b39f8b Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 3 Jul 2025 00:14:42 -0500 Subject: [PATCH 12/17] Added support to use single Dockerfile for both cuda and rocm. Using single workflow file to run integration tests including h100 for both cuda and rocm. Need different name for integration_test_8gpu_h100.yaml as we are also running it for rocm. Fixed file permission for integration_tests.py. --- .ci/docker/build.sh | 24 ++++++------- .ci/docker/ubuntu-rocm/Dockerfile | 15 -------- .ci/docker/ubuntu/Dockerfile | 4 +-- .github/workflows/integration_test_8gpu.yaml | 35 ++++++++++++++----- .../workflows/integration_test_8gpu_h100.yaml | 31 +++++++++++----- .../workflows/integration_test_8gpu_rocm.yaml | 34 ------------------ tests/integration_tests.py | 0 7 files changed, 61 insertions(+), 82 deletions(-) delete mode 100644 .ci/docker/ubuntu-rocm/Dockerfile delete mode 100644 .github/workflows/integration_test_8gpu_rocm.yaml mode change 100644 => 100755 tests/integration_tests.py diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index a1aafe3d3..597b2ee5c 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -12,27 +12,21 @@ shift echo "Building ${IMAGE_NAME} Docker image" -# set operating system OS=ubuntu - -# set Dockerfile -DOCKERFILE="${OS}/Dockerfile" -if [[ "$IMAGE_NAME" == *rocm* ]]; then - DOCKERFILE="${OS}-rocm/Dockerfile" -fi +CLANG_VERSION="" +PYTHON_VERSION=3.11 +MINICONDA_VERSION=24.3.0-0 case "${IMAGE_NAME}" in torchtitan-ubuntu-20.04-clang12) OS_VERSION=20.04 CLANG_VERSION=12 - PYTHON_VERSION=3.11 - MINICONDA_VERSION=24.3.0-0 + BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} ;; - torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3) + torchtitan-rocm-ubuntu-22.04-clang12) OS_VERSION=22.04 - CLANG_VERSION=19 - PYTHON_VERSION=3.10 - MINICONDA_VERSION=25.3.1-0 + CLANG_VERSION=12 + BASE_IMAGE=rocm/dev-ubuntu-${OS_VERSION}:latest ;; *) echo "Invalid image name ${IMAGE_NAME}" @@ -42,11 +36,13 @@ esac docker build \ --no-cache \ --progress=plain \ + --build-arg "BASE_IMAGE=${BASE_IMAGE}" \ --build-arg "OS_VERSION=${OS_VERSION}" \ --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ - -f $(dirname ${DOCKERFILE})/Dockerfile \ + --shm-size=1g \ + -f "${OS}"/Dockerfile \ "$@" \ . diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile deleted file mode 100644 index ae944279c..000000000 --- a/.ci/docker/ubuntu-rocm/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -# base image -FROM rocm/pytorch-nightly:latest - -# args -ARG OS_VERSION -ARG CLANG_VERSION -ARG GCC_VERSION -ARG MINICONDA_VERSION -ARG PYTHON_VERSION - -# install dependencies -COPY requirements.txt requirements.txt -RUN pip install -r ./requirements.txt - -CMD ["bash"] diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 39e4d8ec5..5d10c01b7 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -1,6 +1,6 @@ -ARG OS_VERSION +ARG BASE_IMAGE -FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu${OS_VERSION} +FROM ${BASE_IMAGE} ARG OS_VERSION diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index a2469a913..5ba4be2d4 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -23,13 +23,30 @@ defaults: jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + strategy: + matrix: + include: + - name: cuda + runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/cu126 + is-rocm: 0 + - name: rocm + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + docker-image: torchtitan-rocm-ubuntu-22.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/rocm6.4 + is-rocm: 1 with: - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs script: | @@ -41,9 +58,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} mkdir artifacts-to-be-uploaded - python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 + TEST_WITH_ROCM=${{ matrix.is-rocm }} python ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 813669748..48147f72b 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -24,13 +24,28 @@ defaults: jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + matrix: + include: + - name: cuda + runner: linux.aws.h100.8 + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/cu126 + - name: rocm + runner: linux.rocm.gpu.mi300.8 + gpu-arch-type: rocm + gpu-arch-version: "6.4" + docker-image: torchtitan-rocm-ubuntu-22.04-clang12 + index-url: https://download.pytorch.org/whl/nightly/rocm6.4 with: - runner: linux.aws.h100.8 - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs script: | @@ -42,9 +57,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} - USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} mkdir artifacts-to-be-uploaded python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8 diff --git a/.github/workflows/integration_test_8gpu_rocm.yaml b/.github/workflows/integration_test_8gpu_rocm.yaml deleted file mode 100644 index c5069164d..000000000 --- a/.github/workflows/integration_test_8gpu_rocm.yaml +++ /dev/null @@ -1,34 +0,0 @@ -name: 8 GPU Integration Test - -on: - push: - branches: [ main ] - pull_request: - schedule: - # Runs every 6 hours - - cron: '0 */6 * * *' -concurrency: - group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -l -eo pipefail {0} - -jobs: - build-test: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - with: - runner: linux.rocm.gpu.mi300.8 - gpu-arch-type: rocm - gpu-arch-version: "6.4" - docker-image: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 - repository: pytorch/torchtitan - upload-artifact: outputs - script: | - set -eux - - USE_CPP=0 python -m pip install --pre torchao - - mkdir artifacts-to-be-uploaded - python TEST_WITH_ROCM=1 ./tests/integration_tests.py artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/integration_tests.py b/tests/integration_tests.py old mode 100644 new mode 100755 From 724e202c14fbc120af8f8c933d8c3eba3706798c Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 3 Jul 2025 00:35:08 -0500 Subject: [PATCH 13/17] Changed rocm docker image name in docker-builds.yml. --- .github/workflows/docker-builds.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 84afe8bd2..d5f52824d 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -28,7 +28,7 @@ jobs: include: - docker-image-name: torchtitan-ubuntu-20.04-clang12 runner: [self-hosted, linux.2xlarge] - - docker-image-name: torchtitan-rocm-pytorch-nightly-ubuntu-22.04-clang19-py3 + - docker-image-name: torchtitan-rocm-ubuntu-22.04-clang12 runner: [linux.2xlarge] runs-on: ${{ matrix.runner }} timeout-minutes: 240 From 15a9554102d3b2e43f03c95c26092e4b24028dc8 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 9 Jul 2025 00:54:14 -0500 Subject: [PATCH 14/17] Reverted the changes to integration_test_8gpu_h100.yaml. --- .../workflows/integration_test_8gpu_h100.yaml | 31 +++++-------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index 48147f72b..813669748 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -24,28 +24,13 @@ defaults: jobs: build-test: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main - strategy: - matrix: - include: - - name: cuda - runner: linux.aws.h100.8 - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/cu126 - - name: rocm - runner: linux.rocm.gpu.mi300.8 - gpu-arch-type: rocm - gpu-arch-version: "6.4" - docker-image: torchtitan-rocm-ubuntu-22.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/rocm6.4 with: - runner: ${{ matrix.runner }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} - docker-image: ${{ matrix.docker-image }} + runner: linux.aws.h100.8 + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). + docker-image: torchtitan-ubuntu-20.04-clang12 repository: pytorch/torchtitan upload-artifact: outputs script: | @@ -57,9 +42,9 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 - USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 mkdir artifacts-to-be-uploaded python ./tests/integration_tests_h100.py artifacts-to-be-uploaded --ngpu 8 From cb528bc6a23a6ef222141c4f72b6c41ccb681589 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 16 Jul 2025 13:47:13 -0500 Subject: [PATCH 15/17] Empty dummy commit. From 66e5c95d8092e4e68b3442933332413054e12a78 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 16 Jul 2025 20:30:01 -0500 Subject: [PATCH 16/17] Increased the timeout to 45 minutes to override timeout used in linux_job_v2.yml for integration_test_8gpu.yaml. --- .github/workflows/integration_test_8gpu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration_test_8gpu.yaml b/.github/workflows/integration_test_8gpu.yaml index 5ba4be2d4..8173e1225 100644 --- a/.github/workflows/integration_test_8gpu.yaml +++ b/.github/workflows/integration_test_8gpu.yaml @@ -49,6 +49,7 @@ jobs: docker-image: ${{ matrix.docker-image }} repository: pytorch/torchtitan upload-artifact: outputs + timeout: 45 script: | set -eux From efd11a8b42b7aa064187bd5d101999359ef88dd9 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Thu, 17 Jul 2025 13:29:35 -0500 Subject: [PATCH 17/17] Empty dummy commit.