
Commit 3eb3af1

xieyangxu authored and facebook-github-bot committed
enable multi-node in external launcher mode (vllm-project#29833)
Summary: Pull Request resolved: vllm-project#29833 Differential Revision: D88115795
1 parent 5d91d2b commit 3eb3af1
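
For context, external-launcher mode hands process startup to an outside launcher such as torchrun instead of letting vLLM spawn workers itself, and this commit lifts the single-node restriction on that mode. The sketch below shows how a two-node run might be driven once the change is in; the model name, node and GPU counts, rendezvous endpoint, and the assumption that nothing beyond distributed_executor_backend needs to be set are illustrative placeholders, not details taken from this commit.

# Hypothetical multi-node launch through an external launcher (torchrun).
# The same command is run on every node, with --node-rank adjusted per node:
#
#   torchrun --nnodes=2 --nproc-per-node=8 --node-rank=<0|1> \
#       --rdzv-backend=c10d --rdzv-endpoint=<master-ip>:29500 \
#       run_inference.py
#
# run_inference.py (sketch):
from vllm import LLM, SamplingParams

llm = LLM(
    model="facebook/opt-125m",                          # placeholder model
    tensor_parallel_size=16,                            # spans both nodes in this sketch
    distributed_executor_backend="external_launcher",   # ranks/env come from torchrun
)

outputs = llm.generate(["Hello, world!"], SamplingParams(max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)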

File tree

2 files changed: +22, -19 lines


vllm/config/parallel.py

Lines changed: 2 additions & 2 deletions

@@ -593,10 +593,10 @@ def __post_init__(self) -> None:
                 "max_parallel_loading_workers is currently "
                 "not supported and will be ignored."
             )
-        if self.distributed_executor_backend not in ("mp", "uni") and self.nnodes > 1:
+        if self.distributed_executor_backend not in ("mp", "uni", "external_launcher") and self.nnodes > 1:
             raise ValueError(
                 "nnodes > 1 can only be set when distributed executor "
-                "backend is mp or uni."
+                "backend is mp, uni or external_launcher."
             )

     @property
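
Read on its own, the new condition is just a wider membership test. Below is a minimal standalone sketch of the check; the helper name _validate_nnodes is invented for illustration, since the real check lives inline in ParallelConfig.__post_init__.

# Sketch of the updated validation (hypothetical helper; the real check
# is inline in ParallelConfig.__post_init__).
def _validate_nnodes(distributed_executor_backend: str | None, nnodes: int) -> None:
    # Multi-node (nnodes > 1) is now accepted for the external_launcher
    # backend in addition to "mp" and "uni".
    if distributed_executor_backend not in ("mp", "uni", "external_launcher") and nnodes > 1:
        raise ValueError(
            "nnodes > 1 can only be set when distributed executor "
            "backend is mp, uni or external_launcher."
        )

# After this commit, the external_launcher case passes validation:
_validate_nnodes("external_launcher", nnodes=2)   # no error
# _validate_nnodes("ray", nnodes=2)               # would still raise ValueError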

vllm/distributed/parallel_state.py

Lines changed: 20 additions & 17 deletions

@@ -1169,33 +1169,36 @@ def init_distributed_environment(
     from vllm.config import get_current_vllm_config

     config = get_current_vllm_config()
-    if config is not None and config.parallel_config.nnodes > 1:
-        parallel_config = config.parallel_config
-        ip = parallel_config.master_addr
-        rank = parallel_config.data_parallel_rank * world_size + rank
-        world_size = parallel_config.world_size_across_dp
-        port = parallel_config.master_port
-        distributed_init_method = get_distributed_init_method(ip, port)
-    elif (
+    if (
         config is not None
-        and config.parallel_config.data_parallel_size > 1
         and config.parallel_config.distributed_executor_backend != "external_launcher"
+        and (
+            config.parallel_config.nnodes > 1
+            or config.parallel_config.data_parallel_size > 1
+        )
     ):
         parallel_config = config.parallel_config
         # adjust to take into account data parallelism
         # offset the rank by the data parallel rank
         rank = parallel_config.data_parallel_rank * world_size + rank
         # adjust the world size to take into account data parallelism
         world_size = parallel_config.world_size_across_dp
-        ip = parallel_config.data_parallel_master_ip
-        port = parallel_config.get_next_dp_init_port()
+
+        # Use appropriate IP and port based on configuration
+        if parallel_config.nnodes > 1:
+            ip = parallel_config.master_addr
+            port = parallel_config.master_port
+        else:
+            ip = parallel_config.data_parallel_master_ip
+            port = parallel_config.get_next_dp_init_port()
+        logger.debug(
+            "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
+            world_size,
+            rank,
+            distributed_init_method,
+        )
+
         distributed_init_method = get_distributed_init_method(ip, port)
-        logger.debug(
-            "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
-            world_size,
-            rank,
-            distributed_init_method,
-        )
     if not torch.distributed.is_initialized():
         logger.info(
             "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
