-
Notifications
You must be signed in to change notification settings - Fork 483
Add AMD GPU node for integration test #1241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d47f51f
ec83247
7cd90e3
2bfcf35
b130aee
e26285c
9e8e997
c9e733a
a9aa64d
41e3021
cf8a9f0
47d9d61
7a4cc68
ae636c5
e4235c9
687fdf4
a781bb3
6fefa81
9404612
98a74d2
d59f978
f352e77
9214618
c3728b6
02e141b
6e7c6dd
0e645b2
ab49ddb
237b955
f4c53d7
496ee04
bc43677
cbbe7ac
e243a54
f1830ea
d279713
79c5253
a532866
d841f29
9e1ca9e
470a7fa
ece8810
7223212
342a0d3
fc280f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
# GitHub Actions workflow: runs the AMD (ROCm) integration test suite on an
# 8x MI300 runner, on pushes to main, on PRs, and on a 6-hour schedule.
name: 8 AMD GPU Integration Test

on:
  push:
    branches: [ main ]
  pull_request:
  schedule:
    # Runs every 6 hours
    - cron: '0 */6 * * *'

concurrency:
  # NOTE(review): group is prefixed "unit-test" although this is an integration
  # workflow — presumably copied from the CUDA unit-test yml; confirm intended.
  # On main each run gets a unique group (run_number) so runs never cancel each
  # other; on PR branches the ref is reused so a new push cancels the old run.
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    # Login shell (-l) so conda init applies; -eo pipefail to fail fast.
    shell: bash -l -eo pipefail {0}

jobs:
  build-test:
    # Reusable Linux job from pytorch/test-infra; provisions the ROCm runner.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.rocm.gpu.mi300.8
      gpu-arch-type: rocm
      gpu-arch-version: "6.3"
      upload-artifact: outputs
      use-custom-docker-registry: false
      script: |
        set -eux
        # The generic Linux job chooses to use base env, not the one setup by the image
        conda install -c conda-forge jq -y
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        pip config --user set global.progress_bar off
        pip install -r requirements.txt
        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
        # NOTE(review): torchao is installed from the CUDA (cu126) nightly index
        # while torch comes from rocm6.3 — verify this is intentional for ROCm.
        USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
        # There's perrmission issue in uploading files in artifacts-to-be-uploaded on AMD nodes, thus skip it by creating and storing
        # results in generated-artifacts.
        mkdir generated-artifacts
        python ./tests/integration_tests_amd.py generated-artifacts --ngpu 8
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the BSD-style license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
import argparse | ||
import logging | ||
import os | ||
import subprocess | ||
from collections import defaultdict | ||
from dataclasses import dataclass | ||
from typing import Sequence | ||
|
||
logging.basicConfig(level=logging.INFO) | ||
logger = logging.getLogger(__name__) | ||
|
||
try: | ||
import tomllib | ||
except ModuleNotFoundError: | ||
import tomli as tomllib | ||
|
||
|
||
@dataclass
class OverrideDefinitions:
    """
    This class is used to define the override definitions for the integration tests.

    Each instance describes one test flavor: a sequence of training-command
    invocations, each with its own set of CLI override arguments, applied on
    top of a root config file.
    """

    # Outer sequence: one entry per command to run (e.g. save then load);
    # inner sequence: the extra CLI args for that command.
    # BUGFIX: the previous default `tuple(tuple(" "))` evaluated to `(' ',)` —
    # a tuple holding a single space *string*, not a sequence of sequences of
    # str as annotated. `((),)` is one invocation with no override args.
    override_args: Sequence[Sequence[str]] = ((),)
    # Human-readable description, used as the repr in log messages.
    test_descr: str = "default"
    # Machine name; used for the dump folder and for `--test` selection.
    test_name: str = "default"
    # Number of GPUs this flavor requires; flavors needing more than the
    # `--ngpu` CLI value are skipped.
    ngpu: int = 4

    def __repr__(self):
        return self.test_descr
|
||
|
||
def build_test_list():
    """
    Return a mapping from root config file name to the list of
    OverrideDefinitions flavors to run against that config.

    TODO: the 8x AMD GPU runner currently only supports TP, DP, and CP tests;
    HSDP, PP, TP+CP, and their composability tests are not supported yet.
    """
    debug_model_flavors = [
        OverrideDefinitions(
            [
                [
                    "--parallelism.context_parallel_degree 2",
                    "--parallelism.context_parallel_rotate_method='allgather'",
                ]
            ],
            "DP+CP(allgather)",
            "dp_cp_allgather",
        ),
        OverrideDefinitions(
            [
                [
                    "--training.compile",
                    "--parallelism.tensor_parallel_degree 2",
                    "--parallelism.enable_async_tensor_parallel",
                ],
            ],
            "DP+TP async+ compile",
            "dp_tp_async_compile",
        ),
        OverrideDefinitions(
            [
                [
                    "--model.converters float8",
                    "--float8.enable_fsdp_float8_all_gather",
                    "--float8.precompute_float8_dynamic_scale_for_fsdp",
                    "--float8.force_recompute_fp8_weight_in_bwd",
                ],
            ],
            "Float8 test",
            "float8",
        ),
        OverrideDefinitions(
            [
                [
                    "--parallelism.tensor_parallel_degree 4",
                    "--parallelism.context_parallel_degree 2",
                ]
            ],
            "TP+CP",
            "tp_cp",
            ngpu=8,
        ),
    ]

    flavors = defaultdict(list)
    flavors["debug_model.toml"] = debug_model_flavors
    return flavors
|
||
|
||
def _run_cmd(cmd): | ||
return subprocess.run([cmd], text=True, shell=True) | ||
|
||
|
||
def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str) -> None:
    """Run every command of one test flavor, raising on the first failure.

    Args:
        test_flavor: flavor whose ``override_args`` entries each become one
            training invocation.
        full_path: path to the root .toml config file.
        output_dir: directory under which per-test dump folders are created.

    Raises:
        Exception: if any invoked command exits with a nonzero return code.
    """
    # run_test supports sequence of tests.
    test_name = test_flavor.test_name
    dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
    # Log from all ranks, e.g. "0,1,2,3" for ngpu=4.
    all_ranks = ",".join(map(str, range(test_flavor.ngpu)))

    for idx, override_arg in enumerate(test_flavor.override_args):
        cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh"
        # dump compile trace for debugging purpose
        cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd
        if test_name == "fsdp2_memory_estimation":
            # Memory-estimation flavor uses a dedicated script instead of
            # run_train.sh (note: this discards the TORCH_TRACE prefix above).
            cmd = (
                f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
                "./scripts/estimate/run_memory_estimation.sh"
            )
        cmd += " " + dump_folder_arg
        if override_arg:
            cmd += " " + " ".join(override_arg)
        logger.info(
            f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
        )

        # save checkpoint (idx == 0) and load it for generation (idx == 1)
        if test_name == "test_generate" and idx == 1:
            # Second invocation replaces the training command entirely: load
            # the step-10 checkpoint written by the first invocation and
            # generate from a fixed prompt.
            cmd = (
                f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "
                f"CHECKPOINT_DIR={output_dir}/{test_name}/checkpoint/step-10 "
                "PROMPT='What is the meaning of life?' "
                f"./scripts/generate/run_llama_generate.sh --out > {output_dir}/{test_name}/generated_output.json"
            )

        result = _run_cmd(cmd)
        # NOTE(review): result.stdout is None unless _run_cmd captures output —
        # confirm _run_cmd passes stdout=subprocess.PIPE.
        logger.info(result.stdout)
        if result.returncode != 0:
            raise Exception(
                f"Integration test failed, flavor : {test_flavor.test_descr}, command : {cmd}"
            )
|
||
|
||
def run_tests(args):
    """Discover integration-test configs under ``args.config_dir`` and run
    every matching flavor, honoring the ``--test`` and ``--ngpu`` filters."""
    flavors_by_config = build_test_list()

    for entry in os.listdir(args.config_dir):
        if not entry.endswith(".toml"):
            continue

        config_path = os.path.join(args.config_dir, entry)
        with open(config_path, "rb") as fp:
            config = tomllib.load(fp)

        # Only configs that opt in via [job] use_for_integration_test = true.
        if not config["job"].get("use_for_integration_test", False):
            continue

        for flavor in flavors_by_config[entry]:
            selected = args.test == "all" or flavor.test_name == args.test
            if not selected:
                continue
            if args.ngpu < flavor.ngpu:
                logger.info(
                    f"Skipping test {flavor.test_name} that requires {flavor.ngpu} gpus,"
                    f" because --ngpu arg is {args.ngpu}"
                )
            else:
                run_test(flavor, config_path, args.output_dir)
|
||
|
||
def main():
    """CLI entry point: parse arguments, prepare an empty output directory,
    and dispatch to run_tests."""
    cli = argparse.ArgumentParser()
    cli.add_argument("output_dir")
    cli.add_argument(
        "--config_dir", default="./torchtitan/models/llama3/train_configs"
    )
    cli.add_argument(
        "--test",
        default="all",
        help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
    )
    cli.add_argument("--ngpu", default=8, type=int)
    args = cli.parse_args()

    # Create the output directory if needed, but refuse to reuse a non-empty
    # one so stale artifacts can't leak into this run's results.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    run_tests(args)


if __name__ == "__main__":
    main()
Uh oh!
There was an error while loading. Please reload this page.