
Commit 3eb3af1

xieyangxu authored and facebook-github-bot committed
enable multi-node in external launcher mode (vllm-project#29833)
Summary: Pull Request resolved: vllm-project#29833 Differential Revision: D88115795
1 parent 5d91d2b commit 3eb3af1
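
For context, external-launcher mode hands process startup to an outside launcher such as torchrun instead of letting vLLM spawn workers itself, and this commit lifts the single-node restriction on that mode. The sketch below shows how a two-node run might be driven once the change is in; the model name, node and GPU counts, rendezvous endpoint, and the assumption that nothing beyond distributed_executor_backend needs to be set are illustrative placeholders, not details taken from this commit.

# Hypothetical multi-node launch through an external launcher (torchrun).
# The same command is run on every node, with --node-rank adjusted per node:
#
#   torchrun --nnodes=2 --nproc-per-node=8 --node-rank=<0|1> \
#       --rdzv-backend=c10d --rdzv-endpoint=<master-ip>:29500 \
#       run_inference.py
#
# run_inference.py (sketch):
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",                          # placeholder model
    tensor_parallel_size=16,                            # spans both nodes in this sketch
    distributed_executor_backend="external_launcher",   # ranks/env come from torchrun
)

outputs = llm.generate(["Hello, world!"], SamplingParams(max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)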

File tree

2 files changed: +22, -19 lines


vllm/config/parallel.py

Lines changed: 2 additions & 2 deletions

@@ -593,10 +593,10 @@ def __post_init__(self) -> None:
                 "max_parallel_loading_workers is currently "
                 "not supported and will be ignored."
             )
-        if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
+        if self.distributed_executor_backend not in ("mp", "uni", "external_launcher") and self.nnodes > 1:
             raise ValueError(
                 "nnodes > 1 can only be set when distributed executor "
-                "backend is mp or uni."
+                "backend is mp, uni or external_launcher."
             )

     @property
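
Read on its own, the new condition is just a wider membership test. Below is a minimal standalone sketch of the check; the helper name _validate_nnodes is invented for illustration, since the real check lives inline in ParallelConfig.__post_init__.

# Sketch of the updated validation (hypothetical helper; the real check
# is inline in ParallelConfig.__post_init__).
def _validate_nnodes(distributed_executor_backend: str | None, nnodes: int) -> None:
    # Multi-node (nnodes > 1) is now accepted for the external_launcher
    # backend in addition to "mp" and "uni".
    if distributed_executor_backend not in ("mp", "uni", "external_launcher") and nnodes > 1:
        raise ValueError(
            "nnodes > 1 can only be set when distributed executor "
            "backend is mp, uni or external_launcher."
        )

# After this commit, the external_launcher case passes validation:
_validate_nnodes("external_launcher", nnodes=2)   # no error
# _validate_nnodes("ray", nnodes=2)               # would still raise ValueError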

vllm/distributed/parallel_state.py

Lines changed: 20 additions & 17 deletions

@@ -1169,33 +1169,36 @@ def init_distributed_environment(
     from vllm.config import get_current_vllm_config

     config = get_current_vllm_config()
-    if config is not None and config.parallel_config.nnodes > 1:
-        parallel_config = config.parallel_config
-        ip = parallel_config.master_addr
-        rank = parallel_config.data_parallel_rank * world_size + rank
-        world_size = parallel_config.world_size_across_dp
-        port = parallel_config.master_port
-        distributed_init_method = get_distributed_init_method(ip, port)
-    elif (
+    if (
         config is not None
-        and config.parallel_config.data_parallel_size > 1
         and config.parallel_config.distributed_executor_backend != "external_launcher"
+        and (
+            config.parallel_config.nnodes > 1
+            or config.parallel_config.data_parallel_size > 1
+        )
     ):
         parallel_config = config.parallel_config
         # adjust to take into account data parallelism
         # offset the rank by the data parallel rank
         rank = parallel_config.data_parallel_rank * world_size + rank
         # adjust the world size to take into account data parallelism
         world_size = parallel_config.world_size_across_dp
-        ip = parallel_config.data_parallel_master_ip
-        port = parallel_config.get_next_dp_init_port()
+
+        # Use appropriate IP and port based on configuration
+        if parallel_config.nnodes > 1:
+            ip = parallel_config.master_addr
+            port = parallel_config.master_port
+        else:
+            ip = parallel_config.data_parallel_master_ip
+            port = parallel_config.get_next_dp_init_port()
+        logger.debug(
+            "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
+            world_size,
+            rank,
+            distributed_init_method,
+        )
+
         distributed_init_method = get_distributed_init_method(ip, port)
-        logger.debug(
-            "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
-            world_size,
-            rank,
-            distributed_init_method,
-        )
     if not torch.distributed.is_initialized():
         logger.info(
             "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
