1 parent 8c1555d commit 2174f26
distributed/tensor_parallelism/fsdp_tp_example.py
@@ -49,7 +49,7 @@
 from llama2_model import Transformer, ModelArgs

 from torch.distributed.device_mesh import init_device_mesh
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import fully_shard
 from torch.distributed._tensor import Shard, Replicate
 from torch.distributed.tensor.parallel import (
     parallelize_module,
@@ -146,7 +146,7 @@
 )

 # Init FSDP using the dp device mesh
-sharded_model = FSDP(model, device_mesh=dp_mesh, use_orig_params=True)
+sharded_model = fully_shard(model, mesh=dp_mesh)

 rank_log(_rank, logger, f"Model after parallelization {sharded_model=}\n")
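For context, below is a minimal sketch of how the fully_shard (FSDP2) entry point that this commit migrates to is typically used, assuming a torchrun launch. The toy model, mesh shape, and hyperparameters are illustrative stand-ins and are not taken from the example script, which builds a 2D (dp, tp) mesh and combines fully_shard with tensor parallelism.

import os
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import fully_shard

def main():
    # Assumes a torchrun launch, e.g. `torchrun --nproc_per_node=4 sketch.py`.
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    world_size = int(os.environ["WORLD_SIZE"])

    # 1D data-parallel mesh over all ranks; the full example instead slices a
    # "dp" sub-mesh out of a 2D (dp, tp) mesh and applies tensor parallelism
    # on the other dimension.
    dp_mesh = init_device_mesh("cuda", (world_size,), mesh_dim_names=("dp",))

    # Toy model standing in for the llama2 Transformer used in the example.
    model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16)).cuda()

    # fully_shard shards parameters in place over the mesh and returns the same
    # module; FSDP1 flags such as use_orig_params are no longer needed because
    # FSDP2 keeps the original parameters (as DTensors) by design.
    sharded_model = fully_shard(model, mesh=dp_mesh)

    optim = torch.optim.AdamW(sharded_model.parameters(), lr=1e-3)
    out = sharded_model(torch.randn(4, 16, device="cuda"))
    out.sum().backward()
    optim.step()

if __name__ == "__main__":
    main()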