From 5e960d874d24e416f25b3a6553eeaab9b451ff9f Mon Sep 17 00:00:00 2001
From: dggaytan
Date: Wed, 2 Jul 2025 13:07:03 -0700
Subject: [PATCH] Adding torch accelerator and requirements file to FSDP2 example

Signed-off-by: dggaytan
---
 distributed/FSDP2/README.md                | 10 ++++----
 distributed/FSDP2/{train.py => example.py} | 27 ++++++++++++++++++----
 distributed/FSDP2/requirements.txt         |  2 ++
 distributed/FSDP2/run_example.sh           | 11 +++++++++
 run_distributed_examples.sh                |  4 ++++
 5 files changed, 46 insertions(+), 8 deletions(-)
 rename distributed/FSDP2/{train.py => example.py} (75%)
 create mode 100644 distributed/FSDP2/requirements.txt
 create mode 100644 distributed/FSDP2/run_example.sh

diff --git a/distributed/FSDP2/README.md b/distributed/FSDP2/README.md
index 71528644e5..50a6dff9b4 100644
--- a/distributed/FSDP2/README.md
+++ b/distributed/FSDP2/README.md
@@ -1,25 +1,27 @@
 ## FSDP2
 To run FSDP2 on transformer model:
+
 ```
 cd distributed/FSDP2
-torchrun --nproc_per_node 2 train.py
+pip install -r requirements.txt
+torchrun --nproc_per_node 2 example.py
 ```
 * For 1st time, it creates a "checkpoints" folder and saves state dicts there
 * For 2nd time, it loads from previous checkpoints
 
 To enable explicit prefetching
 ```
-torchrun --nproc_per_node 2 train.py --explicit-prefetch
+torchrun --nproc_per_node 2 example.py --explicit-prefetch
 ```
 
 To enable mixed precision
 ```
-torchrun --nproc_per_node 2 train.py --mixed-precision
+torchrun --nproc_per_node 2 example.py --mixed-precision
 ```
 
 To showcase DCP API
 ```
-torchrun --nproc_per_node 2 train.py --dcp-api
+torchrun --nproc_per_node 2 example.py --dcp-api
 ```
 
 ## Ensure you are running a recent version of PyTorch:
diff --git a/distributed/FSDP2/train.py b/distributed/FSDP2/example.py
similarity index 75%
rename from distributed/FSDP2/train.py
rename to distributed/FSDP2/example.py
index 94c5dc8a49..abe0120a1d 100644
--- a/distributed/FSDP2/train.py
+++ b/distributed/FSDP2/example.py
@@ -7,6 +7,11 @@
 from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy
 from utils import inspect_mixed_precision, inspect_model
 
+def verify_min_gpu_count(min_gpus: int = 2) -> bool:
+    """ verification that we have at least 2 gpus to run dist examples """
+    has_gpu = torch.accelerator.is_available()
+    gpu_count = torch.accelerator.device_count()
+    return has_gpu and gpu_count >= min_gpus
 
 def set_modules_to_forward_prefetch(model, num_to_forward_prefetch):
     for i, layer in enumerate(model.layers):
@@ -29,10 +34,23 @@ def set_modules_to_backward_prefetch(model, num_to_backward_prefetch):
 
 
 def main(args):
+    _min_gpu_count = 2
+    if not verify_min_gpu_count(min_gpus=_min_gpu_count):
+        print(f"Unable to locate sufficient {_min_gpu_count} gpus to run this example. Exiting.")
+        exit()
     rank = int(os.environ["LOCAL_RANK"])
-    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
-    torch.distributed.init_process_group(backend="nccl", device_id=device)
+    if torch.accelerator.is_available():
+        device_type = torch.accelerator.current_accelerator()
+        device = torch.device(f"{device_type}:{rank}")
+        torch.accelerator.set_device_index(rank)
+        print(f"Running on rank {rank} on device {device}")
+    else:
+        device = torch.device("cpu")
+        print(f"Running on device {device}")
+
+    backend = torch.distributed.get_default_backend_for_device(device)
+    torch.distributed.init_process_group(backend=backend, device_id=device)
+
     torch.manual_seed(0)
     vocab_size = 1024
     batch_size = 32
@@ -64,7 +82,7 @@ def main(args):
 
     checkpointer = Checkpointer("checkpoints", dcp_api=args.dcp_api)
     if checkpointer.last_training_time is None:
-        model.to_empty(device="cuda")
+        model.to_empty(device=device)
         model.reset_parameters()
     else:
         checkpointer.load_model(model)
@@ -96,4 +114,5 @@
     parser.add_argument("--mixed-precision", action="store_true", default=False)
     parser.add_argument("--dcp-api", action="store_true", default=False)
     args = parser.parse_args()
+
     main(args)
diff --git a/distributed/FSDP2/requirements.txt b/distributed/FSDP2/requirements.txt
new file mode 100644
index 0000000000..ab9294415c
--- /dev/null
+++ b/distributed/FSDP2/requirements.txt
@@ -0,0 +1,2 @@
+torch>=2.7
+numpy
diff --git a/distributed/FSDP2/run_example.sh b/distributed/FSDP2/run_example.sh
new file mode 100644
index 0000000000..b12dcfe110
--- /dev/null
+++ b/distributed/FSDP2/run_example.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# bash run_example.sh {file_to_run.py} {num_gpus}
+# where file_to_run = example to run. Default = 'example.py'
+# num_gpus = num local gpus to use (must be at least 2). Default = 4
+
+# samples to run include:
+# example.py
+
+echo "Launching ${1:-example.py} with ${2:-4} gpus"
+torchrun --nnodes=1 --nproc_per_node=${2:-4} ${1:-example.py}
+
diff --git a/run_distributed_examples.sh b/run_distributed_examples.sh
index e1f579c072..d482ca19ba 100755
--- a/run_distributed_examples.sh
+++ b/run_distributed_examples.sh
@@ -50,6 +50,10 @@ function distributed_tensor_parallelism() {
     uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
 }
 
+function distributed_FSDP2() {
+    uv run bash run_example.sh example.py || error "FSDP2 example failed"
+}
+
 function distributed_ddp() {
     uv run main.py || error "ddp example failed"
 }
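Note on the device-handling change above: the patch replaces the hard-coded CUDA/NCCL setup with the device-agnostic `torch.accelerator` API. The following is a minimal standalone sketch of that pattern outside the diff context, assuming a `torchrun` launch that sets `LOCAL_RANK` and PyTorch >= 2.7 (for `torch.accelerator.set_device_index` and `torch.distributed.get_default_backend_for_device`); it is an illustration, not part of the patch.

```python
import os

import torch
import torch.distributed as dist

# torchrun sets LOCAL_RANK for each worker process
rank = int(os.environ["LOCAL_RANK"])

if torch.accelerator.is_available():
    # current_accelerator() returns the accelerator device type (e.g. cuda, xpu)
    device_type = torch.accelerator.current_accelerator()
    device = torch.device(f"{device_type}:{rank}")
    torch.accelerator.set_device_index(rank)  # bind this process to its local device
else:
    device = torch.device("cpu")

# Pick the matching backend (e.g. nccl, xccl, gloo) for the chosen device
backend = dist.get_default_backend_for_device(device)
dist.init_process_group(backend=backend, device_id=device)
```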