Skip to content

Commit d69a737

Browse files
authored
create multiple outer optimizers for diloco (#1407)
Summary: Enable creating a separate outer optimizer for each parameter fragment for streaming DiLoCo.
1 parent f062d48 commit d69a737

File tree

1 file changed

+8
-4
lines changed
  • torchtitan/components

1 file changed

+8
-4
lines changed

torchtitan/components/ft.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -116,15 +116,19 @@ def maybe_semi_sync_training(
116116
# Create the outer optimizer based on the inner optimizer parameters.
117117
params = [group["params"] for group in optimizer.param_groups]
118118
params = [param for sublist in params for param in sublist]
119-
outer_optimizer = torch.optim.SGD(
120-
params, lr=0.7, momentum=0.9, nesterov=True
121-
)
119+
outer_optimizers = []
120+
for model in model_parts:
121+
params = [p for p in model.parameters() if p.requires_grad]
122+
outer_optimizer = torch.optim.SGD(
123+
params, lr=0.7, momentum=0.9, nesterov=True
124+
)
125+
outer_optimizers.append(outer_optimizer)
122126

123127
return local_sgd.DiLoCo(
124128
manager=ft_manager._manager,
125129
model_fragments=model_parts,
126130
inner_optimizer=optimizer,
127-
outer_optimizer=outer_optimizer,
131+
outer_optimizer=outer_optimizers,
128132
sync_every=ft_config.sync_steps,
129133
should_quantize=ft_config.should_quantize,
130134
fragment_sync_delay=ft_config.fragment_sync_delay,

0 commit comments

Comments
 (0)