39 changes: 31 additions & 8 deletions benchmarks/H100x1.md
@@ -1,4 +1,4 @@
-# Benchmarks on 1 x RTX PRO 6000
+# Benchmarks on 1 x H100

All commands expect the following environment variable for common arguments:
````shell
@@ -48,14 +48,14 @@ export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/ti

### BF16
```shell
-./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16
+./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16 --lmhead-chunks=4

-# [T] step 0 [ 0.1%] | time: 9256 ms | norm 16.093142 | loss 12.243517 | tps 56643 | sol 54.5%
-# [T] step 1 [ 0.2%] | time: 8843 ms | norm 20.712574 | loss 11.035850 | tps 59288 | sol 57.1%
-# [T] step 2 [ 0.3%] | time: 8849 ms | norm 12.315584 | loss 9.896231 | tps 59248 | sol 57.0%
-# [T] step 3 [ 0.5%] | time: 8848 ms | norm 11.510052 | loss 9.289831 | tps 59254 | sol 57.0%
-# [T] step 4 [ 0.6%] | time: 8868 ms | norm 12.428946 | loss 8.962239 | tps 59121 | sol 56.9%
-# [T] step 5 [ 0.7%] | time: 8871 ms | norm 11.770356 | loss 8.688566 | tps 59101 | sol 56.9%
+# [T] step 0 [ 0.1%] | time: 9400 ms | norm 16.093142 | loss 12.243517 | tps 55775 | sol 53.7%
+# [T] step 1 [ 0.2%] | time: 8898 ms | norm 20.712574 | loss 11.035850 | tps 58922 | sol 56.7%
+# [T] step 2 [ 0.3%] | time: 8906 ms | norm 12.315584 | loss 9.896231 | tps 58869 | sol 56.7%
+# [T] step 3 [ 0.5%] | time: 8905 ms | norm 11.510052 | loss 9.289831 | tps 58875 | sol 56.7%
+# [T] step 4 [ 0.6%] | time: 8914 ms | norm 12.428946 | loss 8.962239 | tps 58816 | sol 56.6%
+# [T] step 5 [ 0.7%] | time: 8930 ms | norm 11.770356 | loss 8.688566 | tps 58710 | sol 56.5%

```

@@ -138,3 +138,26 @@ On the RTX Pro 6000, it appears that `--use-zero-copy` is slower than `memcpy`-b
# [T] step 4 [ 0.6%] | time: 82745 ms | norm 177.452179 | loss 17.235472 | tps 6336 | sol 54.7%
# [T] step 5 [ 0.7%] | time: 82939 ms | norm 211.570663 | loss 15.318270 | tps 6321 | sol 54.6%
```


## Profiling
Here are some useful commands for profiling several kernels in the 0.5B model:
```bash
export LAUNCH="--set full --launch-count 1 --import-source yes ./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16 --steps=2"

ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 49 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 24 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 392 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1640 --export "quantize-rms-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name reduce_abs_max_kernel --launch-skip 360 --export "absmax-att-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 192 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1643 --export "quantize-swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name fused_classifier_kernel5 --launch-skip 8 --export "fused-classifier.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_backward_kernel10 --launch-skip 392 --export "rmsnorm-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name transpose_kernel --launch-skip 2113 --export "tp-swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_backward_kernel1 --launch-skip 192 --export "swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1825 --export "quantize-swiglu-bwd.ncu-rep" ${LAUNCH}
```
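Each of these writes a `.ncu-rep` report that can be inspected later without re-running training. A minimal sketch of how to look at one, using standard Nsight Compute tooling rather than anything repo-specific:

```bash
# Summarize a saved report on the command line.
ncu --import rmsnorm-fwd.ncu-rep --page details

# Or open it in the Nsight Compute GUI; because the reports were collected
# with --import-source yes, source-level analysis works offline.
ncu-ui rmsnorm-fwd.ncu-rep
```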
146 changes: 146 additions & 0 deletions benchmarks/H100x8.md
@@ -0,0 +1,146 @@
# Benchmarks on 8 x H100

All commands expect the following environment variable for common arguments:
````shell
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=8 --use-cuda-graphs"
````

## Model size: 0.5B
### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=64 --grad-accumulation=1

# [T] step 0 [ 0.1%] | time: 1145 ms | norm 15.316421 | loss 12.093399 | tps 457k | sol 11.6%
# [T] step 1 [ 0.2%] | time: 401 ms | norm 17.256222 | loss 11.458490 | tps 1307k | sol 33.2%
# [T] step 2 [ 0.3%] | time: 400 ms | norm 12.026175 | loss 10.896434 | tps 1310k | sol 33.2%
# [T] step 3 [ 0.5%] | time: 413 ms | norm 9.225351 | loss 10.504494 | tps 1269k | sol 32.2%
# [T] step 4 [ 0.6%] | time: 401 ms | norm 7.615700 | loss 10.228767 | tps 1307k | sol 33.2%
# [T] step 5 [ 0.7%] | time: 408 ms | norm 6.622881 | loss 10.014042 | tps 1285k | sol 32.6%

```
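For reference, the throughput lines are consistent with tokens per step = batch-size × grad-accumulation × seq-length × gpus: here 64 × 1 × 1024 × 8 = 524,288 tokens, and 524,288 tokens / 0.401 s ≈ 1307k tps, matching step 1. The same check works for the other configurations below.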

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2

# [T] step 0 [ 0.1%] | time: 1212 ms | norm 15.403462 | loss 12.091154 | tps 432k | sol 16.8%
# [T] step 1 [ 0.2%] | time: 438 ms | norm 17.303522 | loss 11.454803 | tps 1197k | sol 46.6%
# [T] step 2 [ 0.3%] | time: 446 ms | norm 11.993380 | loss 10.893306 | tps 1175k | sol 45.7%
# [T] step 3 [ 0.5%] | time: 462 ms | norm 9.201453 | loss 10.500762 | tps 1134k | sol 44.2%
# [T] step 4 [ 0.6%] | time: 442 ms | norm 7.628780 | loss 10.224569 | tps 1186k | sol 46.2%
# [T] step 5 [ 0.7%] | time: 465 ms | norm 6.640370 | loss 10.008899 | tps 1127k | sol 43.9%
```

## Model size: 1.5B

### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=2

# [T] step 0 [ 0.1%] | time: 1627 ms | norm 15.454363 | loss 12.257751 | tps 322k | sol 22.7%
# [T] step 1 [ 0.2%] | time: 929 ms | norm 21.251320 | loss 11.091139 | tps 564k | sol 39.8%
# [T] step 2 [ 0.3%] | time: 930 ms | norm 12.894230 | loss 9.959479 | tps 563k | sol 39.8%
# [T] step 3 [ 0.5%] | time: 927 ms | norm 11.386432 | loss 9.326994 | tps 565k | sol 39.9%
# [T] step 4 [ 0.6%] | time: 929 ms | norm 13.417350 | loss 9.038084 | tps 564k | sol 39.8%
# [T] step 5 [ 0.7%] | time: 928 ms | norm 8.967434 | loss 8.724013 | tps 564k | sol 39.9%

```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2 --lmhead-chunks=4

# [T] step 0 [ 0.1%] | time: 1913 ms | norm 15.604229 | loss 12.256866 | tps 274k | sol 32.9%
# [T] step 1 [ 0.2%] | time: 1148 ms | norm 21.527639 | loss 11.080661 | tps 456k | sol 54.9%
# [T] step 2 [ 0.3%] | time: 1143 ms | norm 12.888025 | loss 9.946535 | tps 458k | sol 55.1%
# [T] step 3 [ 0.5%] | time: 1146 ms | norm 11.464658 | loss 9.315798 | tps 457k | sol 55.0%
# [T] step 4 [ 0.6%] | time: 1145 ms | norm 13.239096 | loss 9.021889 | tps 457k | sol 55.0%
# [T] step 5 [ 0.7%] | time: 1148 ms | norm 8.618310 | loss 8.705520 | tps 456k | sol 54.9%
```

## Model size: 3B
### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4

# [T] step 0 [ 0.1%] | time: 2416 ms | norm 16.745432 | loss 12.349982 | tps 217k | sol 29.1%
# [T] step 1 [ 0.2%] | time: 1666 ms | norm 29.795336 | loss 10.758265 | tps 314k | sol 42.3%
# [T] step 2 [ 0.3%] | time: 1678 ms | norm 169.535019 | loss 11.278725 | tps 312k | sol 42.0%
# [T] step 3 [ 0.5%] | time: 1675 ms | norm 63.851013 | loss 9.781277 | tps 313k | sol 42.0%
# [T] step 4 [ 0.6%] | time: 1675 ms | norm 59.026623 | loss 9.587938 | tps 313k | sol 42.0%
# [T] step 5 [ 0.7%] | time: 1674 ms | norm 20.821075 | loss 9.484455 | tps 313k | sol 42.1%

```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4

# [T] step 0 [ 0.1%] | time: 2896 ms | norm 16.827234 | loss 12.350925 | tps 181k | sol 43.4%
# [T] step 1 [ 0.2%] | time: 2184 ms | norm 29.811665 | loss 10.748442 | tps 240k | sol 57.5%
# [T] step 2 [ 0.3%] | time: 2185 ms | norm 167.829391 | loss 11.228219 | tps 239k | sol 57.5%
# [T] step 3 [ 0.5%] | time: 2185 ms | norm 63.280231 | loss 9.765936 | tps 239k | sol 57.5%
# [T] step 4 [ 0.6%] | time: 2185 ms | norm 60.825737 | loss 9.600989 | tps 239k | sol 57.5%
# [T] step 5 [ 0.7%] | time: 2182 ms | norm 22.035967 | loss 9.491116 | tps 240k | sol 57.6%
```

## Model size: 7B

### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=4 --attn-bwd-chunks=4 --recompute-swiglu --recompute-norm --offload-residual

# [T] step 0 [ 0.1%] | time: 4018 ms | norm 12.677011 | loss 12.664733 | tps 130k | sol 38.7%
# [T] step 1 [ 0.2%] | time: 3308 ms | norm 24.092297 | loss 10.755530 | tps 158k | sol 46.9%
# [T] step 2 [ 0.3%] | time: 3313 ms | norm 204.764511 | loss 15.885994 | tps 158k | sol 46.9%
# [T] step 3 [ 0.5%] | time: 3307 ms | norm 111.660988 | loss 14.926527 | tps 158k | sol 47.0%
# [T] step 4 [ 0.6%] | time: 3292 ms | norm 170.910950 | loss 14.207653 | tps 159k | sol 47.2%
# [T] step 5 [ 0.7%] | time: 3292 ms | norm 114.922325 | loss 13.319270 | tps 159k | sol 47.2%
```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=8 \
--lmhead-chunks=2 --attn-bwd-chunks=2

# [T] step 0 [ 0.1%] | time: 5379 ms | norm 12.686002 | loss 12.665464 | tps 97469 | sol 53.0%
# [T] step 1 [ 0.2%] | time: 4680 ms | norm 23.989368 | loss 10.748158 | tps 112k | sol 60.9%
# [T] step 2 [ 0.3%] | time: 4673 ms | norm 205.335083 | loss 15.946478 | tps 112k | sol 61.0%
# [T] step 3 [ 0.5%] | time: 4679 ms | norm 113.668663 | loss 14.997133 | tps 112k | sol 60.9%
# [T] step 4 [ 0.6%] | time: 4669 ms | norm 171.156616 | loss 14.274625 | tps 112k | sol 61.0%
# [T] step 5 [ 0.7%] | time: 4649 ms | norm 113.712280 | loss 13.265002 | tps 112k | sol 61.3%

```

## Model size: 14B
### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=2 --attn-bwd-chunks=2 --recompute-ffn --recompute-norm --offload-residual \
--shard-weights --persistent-quants

# [T] step 0 [ 0.1%] | time: 7868 ms | norm 18.289650 | loss 12.961184 | tps 66635 | sol 38.6%
# [T] step 1 [ 0.2%] | time: 7130 ms | norm 176.818008 | loss 11.843331 | tps 73532 | sol 42.6%
# [T] step 2 [ 0.3%] | time: 7141 ms | norm 52.666096 | loss 11.854759 | tps 73419 | sol 42.5%
# [T] step 3 [ 0.5%] | time: 7138 ms | norm 195.560516 | loss 20.536478 | tps 73450 | sol 42.5%
# [T] step 4 [ 0.6%] | time: 7131 ms | norm 147.371887 | loss 19.192728 | tps 73522 | sol 42.5%
# [T] step 5 [ 0.7%] | time: 7120 ms | norm 170.223404 | loss 17.295954 | tps 73635 | sol 42.6%
```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=2 --attn-bwd-chunks=2 --recompute-ffn --recompute-norm --offload-residual \
--shard-weights

# [T] step 0 [ 0.1%] | time: 11415 ms | norm 18.312731 | loss 12.960577 | tps 45929 | sol 49.6%
# [T] step 1 [ 0.2%] | time: 10704 ms | norm 179.659027 | loss 11.870799 | tps 48980 | sol 52.9%
# [T] step 2 [ 0.3%] | time: 10742 ms | norm 53.101696 | loss 11.826601 | tps 48807 | sol 52.7%
# [T] step 3 [ 0.5%] | time: 10762 ms | norm 197.853302 | loss 20.660141 | tps 48716 | sol 52.6%
# [T] step 4 [ 0.6%] | time: 10724 ms | norm 156.049545 | loss 19.513166 | tps 48889 | sol 52.8%
# [T] step 5 [ 0.7%] | time: 10729 ms | norm 170.124725 | loss 18.176239 | tps 48866 | sol 52.7%
```
2 changes: 1 addition & 1 deletion benchmarks/Pro6000x4.md
@@ -1,4 +1,4 @@
-# Benchmarks on 1xRTX4090
+# Benchmarks on 4 x RTX PRO 6000

All commands expect the following environment variable for common arguments:
````shell
2 changes: 1 addition & 1 deletion benchmarks/Pro6000x8.md
@@ -1,4 +1,4 @@
-# Benchmarks on 1xRTX4090
+# Benchmarks on 8 x RTX PRO 6000

All commands expect the following environment variable for common arguments:
````shell
19 changes: 14 additions & 5 deletions src/models/llama_gradients.cpp
@@ -141,25 +141,34 @@ sLLamaBlockWeights<TensorShard>& LLamaGradientsUnsharded::get_block_shard(int la

void LLamaGradientsUnsharded::notify_embeddings(cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
-   scatter_reduce(mFullGradient.NonBlocks.Embeddings, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       NvtxRange r{"notify_embeddings"};
+       scatter_reduce(mFullGradient.NonBlocks.Embeddings, stream, mGradEvent, comm);
+   }
}

void LLamaGradientsUnsharded::notify_lmhead(cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
    if(mFullGradient.NonBlocks.LMHead.Data == mFullGradient.NonBlocks.Embeddings.Data) return; // sync lmhead with embeddings
-   NvtxRange r{"notify_lmhead"};
-   scatter_reduce(mFullGradient.NonBlocks.LMHead, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       NvtxRange r{"notify_lmhead"};
+       scatter_reduce(mFullGradient.NonBlocks.LMHead, stream, mGradEvent, comm);
+   }
}

void LLamaGradientsUnsharded::notify_lnf_w(cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
-   scatter_reduce(mFullGradient.NonBlocks.LNF_w, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       scatter_reduce(mFullGradient.NonBlocks.LNF_w, stream, mGradEvent, comm);
+   }
}

void LLamaGradientsUnsharded::notify_block(int layer_idx, cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
    auto& dw = mFullGradient.Blocks[layer_idx];
-   scatter_reduce(layer_idx, dw, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       scatter_reduce(layer_idx, dw, stream, mGradEvent, comm);
+   }
}

// ---------------------------------------------------------------------------------------------------------------------
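The change above guards each gradient notification so that `scatter_reduce` is skipped entirely on single-GPU runs. A minimal sketch of the pattern, with hypothetical `TensorShard` and `NCCLCommunicator` declarations standing in for the repo's actual types:

```cpp
#include <cuda_runtime.h>

// Hypothetical stand-ins for the repo's types; only the shape of the guard
// matters here, not these declarations.
struct TensorShard;
struct NCCLCommunicator {
    int world_size() const;  // number of ranks in the communicator
};
void scatter_reduce(TensorShard& grad, cudaStream_t stream, NCCLCommunicator& comm);

// Skip the collective when only one rank exists: a reduce-scatter over a
// single rank is a no-op, yet the call would still pay NCCL launch and
// synchronization overhead on every step.
void notify_gradient(TensorShard& grad, cudaStream_t stream, NCCLCommunicator& comm) {
    if (comm.world_size() == 1) return;  // single GPU: gradient is already final
    scatter_reduce(grad, stream, comm);  // multi-GPU: reduce across ranks
}
```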