From 5e960d874d24e416f25b3a6553eeaab9b451ff9f Mon Sep 17 00:00:00 2001
From: dggaytan
Date: Wed, 2 Jul 2025 13:07:03 -0700
Subject: [PATCH] Adding torch accelerator and requirements file to FSDP2 example

Signed-off-by: dggaytan
---
 distributed/FSDP2/README.md                | 10 ++++----
 distributed/FSDP2/{train.py => example.py} | 27 ++++++++++++++++++----
 distributed/FSDP2/requirements.txt         |  2 ++
 distributed/FSDP2/run_example.sh           | 11 +++++++++
 run_distributed_examples.sh                |  4 ++++
 5 files changed, 46 insertions(+), 8 deletions(-)
 rename distributed/FSDP2/{train.py => example.py} (75%)
 create mode 100644 distributed/FSDP2/requirements.txt
 create mode 100644 distributed/FSDP2/run_example.sh

diff --git a/distributed/FSDP2/README.md b/distributed/FSDP2/README.md
index 71528644e5..50a6dff9b4 100644
--- a/distributed/FSDP2/README.md
+++ b/distributed/FSDP2/README.md
@@ -1,25 +1,27 @@
 ## FSDP2
 To run FSDP2 on transformer model:
+
 ```
 cd distributed/FSDP2
-torchrun --nproc_per_node 2 train.py
+pip install -r requirements.txt
+torchrun --nproc_per_node 2 example.py
 ```
 * For 1st time, it creates a "checkpoints" folder and saves state dicts there
 * For 2nd time, it loads from previous checkpoints
 
 To enable explicit prefetching
 ```
-torchrun --nproc_per_node 2 train.py --explicit-prefetch
+torchrun --nproc_per_node 2 example.py --explicit-prefetch
 ```
 
 To enable mixed precision
 ```
-torchrun --nproc_per_node 2 train.py --mixed-precision
+torchrun --nproc_per_node 2 example.py --mixed-precision
 ```
 
 To showcase DCP API
 ```
-torchrun --nproc_per_node 2 train.py --dcp-api
+torchrun --nproc_per_node 2 example.py --dcp-api
 ```
 
 ## Ensure you are running a recent version of PyTorch:
diff --git a/distributed/FSDP2/train.py b/distributed/FSDP2/example.py
similarity index 75%
rename from distributed/FSDP2/train.py
rename to distributed/FSDP2/example.py
index 94c5dc8a49..abe0120a1d 100644
--- a/distributed/FSDP2/train.py
+++ b/distributed/FSDP2/example.py
@@ -7,6 +7,11 @@
 from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy
 from utils import inspect_mixed_precision, inspect_model
 
+def verify_min_gpu_count(min_gpus: int = 2) -> bool:
+    """ verification that we have at least 2 gpus to run dist examples """
+    has_gpu = torch.accelerator.is_available()
+    gpu_count = torch.accelerator.device_count()
+    return has_gpu and gpu_count >= min_gpus
 
 def set_modules_to_forward_prefetch(model, num_to_forward_prefetch):
     for i, layer in enumerate(model.layers):
@@ -29,10 +34,23 @@ def set_modules_to_backward_prefetch(model, num_to_backward_prefetch):
 
 
 def main(args):
+    _min_gpu_count = 2
+    if not verify_min_gpu_count(min_gpus=_min_gpu_count):
+        print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.")
+        exit()
     rank = int(os.environ["LOCAL_RANK"])
-    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
-    torch.distributed.init_process_group(backend="nccl", device_id=device)
+    if torch.accelerator.is_available():
+        device_type = torch.accelerator.current_accelerator()
+        device = torch.device(f"{device_type}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+    else:
+        device = torch.device("cpu")
+        print(f"Running on device {device}")
+
+    backend = torch.distributed.get_default_backend_for_device(device)
+    torch.distributed.init_process_group(backend=backend, device_id=device)
+
     torch.manual_seed(0)
     vocab_size = 1024
     batch_size = 32
@@ -64,7 +82,7 @@ def main(args):
 
     checkpointer = Checkpointer("checkpoints", dcp_api=args.dcp_api)
     if checkpointer.last_training_time is None:
-        model.to_empty(device="cuda")
+        model.to_empty(device=device)
         model.reset_parameters()
     else:
         checkpointer.load_model(model)
@@ -96,4 +114,5 @@
     parser.add_argument("--mixed-precision", action="store_true", default=False)
     parser.add_argument("--dcp-api", action="store_true", default=False)
     args = parser.parse_args()
+
     main(args)
diff --git a/distributed/FSDP2/requirements.txt b/distributed/FSDP2/requirements.txt
new file mode 100644
index 0000000000..ab9294415c
--- /dev/null
+++ b/distributed/FSDP2/requirements.txt
@@ -0,0 +1,2 @@
+torch>=2.7
+numpy
diff --git a/distributed/FSDP2/run_example.sh b/distributed/FSDP2/run_example.sh
new file mode 100644
index 0000000000..b12dcfe110
--- /dev/null
+++ b/distributed/FSDP2/run_example.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# bash run_example.sh {file_to_run.py} {num_gpus}
+# where file_to_run = example to run. Default = 'example.py'
+# num_gpus = num local gpus to use (must be at least 2). Default = 4
+
+# samples to run include:
+# example.py
+
+echo "Launching ${1:-example.py} with ${2:-4} gpus"
+torchrun --nnodes=1 --nproc_per_node=${2:-4} ${1:-example.py}
+
diff --git a/run_distributed_examples.sh b/run_distributed_examples.sh
index e1f579c072..d482ca19ba 100755
--- a/run_distributed_examples.sh
+++ b/run_distributed_examples.sh
@@ -50,6 +50,10 @@ function distributed_tensor_parallelism() {
     uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
 }
 
+function distributed_FSDP2() {
+    uv run bash run_example.sh example.py || error "FSDP2 example failed"
+}
+
 function distributed_ddp() {
     uv run main.py || error "ddp example failed"
 }
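Note on the device-handling change above: the patch replaces the hard-coded CUDA/NCCL setup with the device-agnostic `torch.accelerator` API. The following is a minimal standalone sketch of that pattern outside the diff context, assuming a `torchrun` launch that sets `LOCAL_RANK` and PyTorch >= 2.7 (for `torch.accelerator.set_device_index` and `torch.distributed.get_default_backend_for_device`); it is an illustration, not part of the patch.

```python
import os

import torch
import torch.distributed as dist

# torchrun sets LOCAL_RANK for each worker process
rank = int(os.environ["LOCAL_RANK"])

if torch.accelerator.is_available():
    # current_accelerator() returns the accelerator device type (e.g. cuda, xpu)
    device_type = torch.accelerator.current_accelerator()
    device = torch.device(f"{device_type}:{rank}")
    torch.accelerator.set_device_index(rank)  # bind this process to its local device
else:
    device = torch.device("cpu")

# Pick the matching backend (e.g. nccl, xccl, gloo) for the chosen device
backend = dist.get_default_backend_for_device(device)
dist.init_process_group(backend=backend, device_id=device)
```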