
Commit b84794c

tsunghsienlee authored and facebook-github-bot committed

Update README.md according to GraftingConfig merge (#243)

Summary: Pull Request resolved: #243. Due to #242, `README.md` needs to be updated accordingly.

Reviewed By: wz337

Differential Revision: D81278578

fbshipit-source-id: 27db512c63481f30f4e4b0daad38b8a7518a9ae3
1 parent 20705ff commit b84794c
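For readers updating their own call sites along with this documentation change, a minimal sketch of the rename that #242 introduced and this commit propagates to the README. It uses only class names that appear in the diff below and assumes the constructor arguments themselves are unchanged, which is consistent with every hunk in this commit.

```python
# Hypothetical migration map, inferred from this commit's diff: old grafting config
# names on the left, the names the updated README now imports on the right.
GRAFTING_CONFIG_RENAMES = {
    "SGDGraftingConfig": "SGDPreconditionerConfig",
    "AdaGradGraftingConfig": "AdaGradPreconditionerConfig",
    "AdamGraftingConfig": "AdamPreconditionerConfig",
    "RMSpropGraftingConfig": "RMSpropPreconditionerConfig",
}
```

For example, `from distributed_shampoo import AdamGraftingConfig` becomes `from distributed_shampoo import AdamPreconditionerConfig`, as the hunks below show.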

File tree: 2 files changed (+25 / -29 lines)


distributed_shampoo/README.md

Lines changed: 24 additions & 28 deletions
@@ -26,11 +26,7 @@ Ganesh Ajjanagadde (Meta), Rohan Anil (Google), Adnan Aziz (Meta), Pavan Balaji
 
 Key distinctives of this implementation include:
 - Homogeneous multi-node multi-GPU support in PyTorch.
-- Learning rate grafting [3]. Our version of grafting only grafts the second moment/diagonal preconditioner. Momentum/first moment updates are performed separate from grafting. Supports the methods:
-  - SGD
-  - Adagrad
-  - RMSprop
-  - Adam
+- Learning rate grafting [3]. Our version of grafting only grafts the second moment/diagonal preconditioner. Momentum/first moment updates are performed separate from grafting.
 - Supports both normal and AdamW (decoupled) weight decay.
 - Incorporates exponential moving averaging (with or without bias correction) to the estimate the first moment (akin to Adam).
 - Incorporates momentum and Nesterov acceleration.
@@ -91,7 +87,7 @@ optimizer = SGD(
 we would instead use:
 ```python
 import torch
-from distributed_shampoo import DistributedShampoo, SGDGraftingConfig
+from distributed_shampoo import DistributedShampoo, SGDPreconditionerConfig
 
 model = instantiate_model()
 
@@ -104,7 +100,7 @@ optimizer = DistributedShampoo(
     weight_decay=1e-05,
     max_preconditioner_dim=8192,
     precondition_frequency=100,
-    grafting_config=SGDGraftingConfig(),
+    grafting_config=SGDPreconditionerConfig(),
 )
 ```
 
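Stitching the two hunks above back together, a hedged sketch of the full updated SGD-grafting example; `instantiate_model()` is the README's own placeholder helper, and the `lr` value is an assumption rather than a line shown in this diff.

```python
import torch

from distributed_shampoo import DistributedShampoo, SGDPreconditionerConfig

model = instantiate_model()  # placeholder helper used throughout the README examples

optimizer = DistributedShampoo(
    model.parameters(),
    lr=0.01,  # assumed value; reuse the learning rate tuned for plain SGD
    weight_decay=1e-05,
    max_preconditioner_dim=8192,
    precondition_frequency=100,
    # Grafts the SGD step size onto the Shampoo search direction.
    grafting_config=SGDPreconditionerConfig(),
)
```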
@@ -129,7 +125,7 @@ optimizer = Adam(
 we would instead use:
 ```python
 import torch
-from distributed_shampoo import AdamGraftingConfig, DistributedShampoo
+from distributed_shampoo import AdamPreconditionerConfig, DistributedShampoo
 
 model = instantiate_model()
 
@@ -142,7 +138,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=False,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-08,
     ),
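For the Adam example, only the class name changes; a minimal sketch of the renamed config with the `beta2`/`epsilon` values shown in the hunk above, which match `torch.optim.Adam`'s defaults.

```python
from distributed_shampoo import AdamPreconditionerConfig

# Second-moment hyperparameters copied from the diff above (torch.optim.Adam defaults).
adam_grafting = AdamPreconditionerConfig(beta2=0.999, epsilon=1e-08)
```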
@@ -168,7 +164,7 @@ optimizer = Adagrad(
 we would instead use:
 ```python
 import torch
-from distributed_shampoo import AdaGradGraftingConfig, DistributedShampoo
+from distributed_shampoo import AdaGradPreconditionerConfig, DistributedShampoo
 
 model = instantiate_model()
 
@@ -181,7 +177,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=False,
-    grafting_config=AdaGradGraftingConfig(
+    grafting_config=AdaGradPreconditionerConfig(
         epsilon=1e-10,
     ),
 )
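Likewise for Adagrad: a minimal sketch with the `epsilon` value from the hunk above, which matches `torch.optim.Adagrad`'s default.

```python
from distributed_shampoo import AdaGradPreconditionerConfig

# epsilon copied from the diff above (torch.optim.Adagrad default).
adagrad_grafting = AdaGradPreconditionerConfig(epsilon=1e-10)
```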
@@ -207,7 +203,7 @@ optimizer = AdamW(
 we would instead use:
 ```python
 import torch
-from distributed_shampoo import AdamGraftingConfig, DistributedShampoo
+from distributed_shampoo import AdamPreconditionerConfig, DistributedShampoo
 
 model = instantiate_model()
 
@@ -220,7 +216,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=True,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-08,
     ),
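The AdamW example reuses the same renamed Adam-style config; per the diff, the only difference from the Adam case is `use_decoupled_weight_decay=True` on the optimizer itself. A small illustrative sketch collecting the two relevant keyword arguments:

```python
from distributed_shampoo import AdamPreconditionerConfig

# Illustrative only: these two keyword arguments would be passed to DistributedShampoo
# (alongside lr, weight_decay, etc.) to emulate AdamW rather than Adam grafting.
adamw_kwargs = dict(
    use_decoupled_weight_decay=True,  # AdamW-style (decoupled) weight decay
    grafting_config=AdamPreconditionerConfig(beta2=0.999, epsilon=1e-08),
)
```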
@@ -308,8 +304,8 @@ optimizer = DistributedShampoo(
         {
             "params": other_params,
             "lr": 3e-4,
-            "start_preconditioning_step", math.inf,
-            "grafting_config": AdamGraftingConfig(
+            "start_preconditioning_step": math.inf,
+            "grafting_config": AdamPreconditionerConfig(
                 beta2=0.95,
                 epsilon=1e-10,
             ),
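Beyond the rename, this hunk also fixes a genuine typo in the old example (a comma where a colon belongs after `"start_preconditioning_step"`). A hedged, self-contained sketch of the per-parameter-group pattern it documents, with values taken from the diff; `instantiate_model()` and `other_params` are placeholders, and the top-level `lr` is an assumption. Setting `start_preconditioning_step` to `math.inf` keeps that group on the grafted Adam-style update only.

```python
import math

from distributed_shampoo import AdamPreconditionerConfig, DistributedShampoo

model = instantiate_model()              # placeholder helper from the README examples
other_params = list(model.parameters())  # stand-in for the parameter group in the diff

optimizer = DistributedShampoo(
    [
        {
            "params": other_params,
            "lr": 3e-4,
            # math.inf means Shampoo preconditioning never starts for this group,
            # so it is effectively optimized by the grafted method alone.
            "start_preconditioning_step": math.inf,
            "grafting_config": AdamPreconditionerConfig(
                beta2=0.95,
                epsilon=1e-10,
            ),
        },
    ],
    lr=0.01,  # assumed default for groups that do not override it
)
```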
@@ -343,7 +339,7 @@ import torch
 import torch.distributed as dist
 
 from distributed_shampoo import (
-    AdamGraftingConfig,
+    AdamPreconditionerConfig,
     DDPDistributedConfig,
     DistributedShampoo,
 )
@@ -376,7 +372,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=True,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-12,
     ),
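For the DDP example, only the grafting import changes. A hedged sketch of the two config objects involved; `DDPDistributedConfig()` is constructed with defaults on the assumption that all of its fields are optional, since this diff does not show its arguments.

```python
from distributed_shampoo import AdamPreconditionerConfig, DDPDistributedConfig

grafting_config = AdamPreconditionerConfig(beta2=0.999, epsilon=1e-12)
distributed_config = DDPDistributedConfig()  # assumed default construction; see the README for its fields

# Both objects are then passed to DistributedShampoo via grafting_config= and distributed_config=.
```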
@@ -404,7 +400,7 @@ import torch.distributed as dist
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 
 from distributed_shampoo import (
-    AdamGraftingConfig,
+    AdamPreconditionerConfig,
     compile_fsdp_parameter_metadata,
     DistributedShampoo,
     FSDPDistributedConfig,
@@ -434,7 +430,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=True,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-12,
     ),
@@ -456,7 +452,7 @@ import torch.distributed as dist
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy
 
 from distributed_shampoo import (
-    AdamGraftingConfig,
+    AdamPreconditionerConfig,
     compile_fsdp_parameter_metadata,
     DistributedShampoo,
     HSDPDistributedConfig,
@@ -493,7 +489,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=True,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-12,
     ),
@@ -519,7 +515,7 @@ import torch.distributed as dist
 from torch.distributed.fsdp import fully_shard
 
 from distributed_shampoo import (
-    AdamGraftingConfig,
+    AdamPreconditionerConfig,
     DistributedShampoo,
     FullyShardDistributedConfig,
 )
@@ -548,7 +544,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=True,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-12,
     ),
@@ -570,7 +566,7 @@ import torch.distributed as dist
 from torch.distributed.fsdp import fully_shard
 
 from distributed_shampoo import (
-    AdamGraftingConfig,
+    AdamPreconditionerConfig,
     DistributedShampoo,
     HybridShardDistributedConfig,
 )
@@ -606,7 +602,7 @@ optimizer = DistributedShampoo(
     max_preconditioner_dim=8192,
     precondition_frequency=100,
     use_decoupled_weight_decay=True,
-    grafting_config=AdamGraftingConfig(
+    grafting_config=AdamPreconditionerConfig(
         beta2=0.999,
         epsilon=1e-12,
     ),
@@ -678,7 +674,7 @@ With the inclusion of learning rate grafting, we can extract a good learning rat
     momentum=0.9,
     weight_decay=0.01,
     max_preconditioner_dim=4096,
-    grafting_config=SGDGraftingConfig(),
+    grafting_config=SGDPreconditionerConfig(),
 )
 ```
 
@@ -699,7 +695,7 @@ With the inclusion of learning rate grafting, we can extract a good learning rat
     momentum=0.9,
     weight_decay=0.01,
     precondition_frequency=100,
-    grafting_config=SGDGraftingConfig(),
+    grafting_config=SGDPreconditionerConfig(),
 )
 ```
 
@@ -718,7 +714,7 @@ With the inclusion of learning rate grafting, we can extract a good learning rat
     momentum=0.9,
     weight_decay=0.01,
     start_preconditioning_step=300,
-    grafting_config=SGDGraftingConfig(),
+    grafting_config=SGDPreconditionerConfig(),
 )
 ```
 
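The three hunks above each vary one knob of the grafting-based tuning recipe (`max_preconditioner_dim`, `precondition_frequency`, `start_preconditioning_step`). A hedged sketch combining them with the values shown in the diff; `lr` and `instantiate_model()` are assumptions, not lines from this commit.

```python
from distributed_shampoo import DistributedShampoo, SGDPreconditionerConfig

model = instantiate_model()  # placeholder helper from the README examples

optimizer = DistributedShampoo(
    model.parameters(),
    lr=0.01,                         # assumed; reuse the learning rate tuned for SGD
    momentum=0.9,
    weight_decay=0.01,
    max_preconditioner_dim=4096,     # cap block size to bound preconditioner cost
    precondition_frequency=100,      # recompute the amortized root inverse every 100 steps
    start_preconditioning_step=300,  # run the grafted SGD update alone for the first steps
    grafting_config=SGDPreconditionerConfig(),
)
```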
distributed_shampoo/preconditioner/README.md

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ optimizer = DistributedShampoo(
     precondition_frequency=50,
     use_bias_correction=True,
     use_decoupled_weight_decay=True,
-    grafting_config=RMSpropGraftingConfig(beta2=0.95, epsilon=1e-8),
+    grafting_config=RMSpropPreconditionerConfig(beta2=0.95, epsilon=1e-8),
     preconditioner_config=RootInvShampooPreconditionerConfig(
         amortized_computation_config=EigenConfig(
             max_iterations=1000,
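Finally, a hedged sketch of the configs this hunk touches, using only the argument names visible in the diff; the import path for `RootInvShampooPreconditionerConfig` and `EigenConfig` is assumed to match the rest of the preconditioner README, and all other fields are left at their defaults.

```python
# Import path assumed for illustration; the preconditioner README shows the exact imports.
from distributed_shampoo import (
    EigenConfig,
    RMSpropPreconditionerConfig,
    RootInvShampooPreconditionerConfig,
)

grafting_config = RMSpropPreconditionerConfig(beta2=0.95, epsilon=1e-8)
preconditioner_config = RootInvShampooPreconditionerConfig(
    amortized_computation_config=EigenConfig(
        max_iterations=1000,  # value shown in the diff; other EigenConfig fields left at defaults
    ),
)
```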
