39 changes: 31 additions & 8 deletions benchmarks/H100x1.md
@@ -1,4 +1,4 @@
-# Benchmarks on 1 x RTX PRO 6000
+# Benchmarks on 1 x H100

All commands expect the following environment variable for common arguments:
````shell
@@ -48,14 +48,14 @@ export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/ti

### BF16
```shell
-./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16
+./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16 --lmhead-chunks=4

-# [T] step 0 [ 0.1%] | time: 9256 ms | norm 16.093142 | loss 12.243517 | tps 56643 | sol 54.5%
-# [T] step 1 [ 0.2%] | time: 8843 ms | norm 20.712574 | loss 11.035850 | tps 59288 | sol 57.1%
-# [T] step 2 [ 0.3%] | time: 8849 ms | norm 12.315584 | loss 9.896231 | tps 59248 | sol 57.0%
-# [T] step 3 [ 0.5%] | time: 8848 ms | norm 11.510052 | loss 9.289831 | tps 59254 | sol 57.0%
-# [T] step 4 [ 0.6%] | time: 8868 ms | norm 12.428946 | loss 8.962239 | tps 59121 | sol 56.9%
-# [T] step 5 [ 0.7%] | time: 8871 ms | norm 11.770356 | loss 8.688566 | tps 59101 | sol 56.9%
+# [T] step 0 [ 0.1%] | time: 9400 ms | norm 16.093142 | loss 12.243517 | tps 55775 | sol 53.7%
+# [T] step 1 [ 0.2%] | time: 8898 ms | norm 20.712574 | loss 11.035850 | tps 58922 | sol 56.7%
+# [T] step 2 [ 0.3%] | time: 8906 ms | norm 12.315584 | loss 9.896231 | tps 58869 | sol 56.7%
+# [T] step 3 [ 0.5%] | time: 8905 ms | norm 11.510052 | loss 9.289831 | tps 58875 | sol 56.7%
+# [T] step 4 [ 0.6%] | time: 8914 ms | norm 12.428946 | loss 8.962239 | tps 58816 | sol 56.6%
+# [T] step 5 [ 0.7%] | time: 8930 ms | norm 11.770356 | loss 8.688566 | tps 58710 | sol 56.5%

```

@@ -138,3 +138,26 @@ On the RTX Pro 6000, it appears that `--use-zero-copy` is slower than `memcpy`-b
# [T] step 4 [ 0.6%] | time: 82745 ms | norm 177.452179 | loss 17.235472 | tps 6336 | sol 54.7%
# [T] step 5 [ 0.7%] | time: 82939 ms | norm 211.570663 | loss 15.318270 | tps 6321 | sol 54.6%
```


## Profiling
Here are some useful commands for profiling several kernels in the 0.5B model:
```bash
export LAUNCH="--set full --launch-count 1 --import-source yes ./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16 --steps=2"

ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 49 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 24 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 392 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1640 --export "quantize-rms-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name reduce_abs_max_kernel --launch-skip 360 --export "absmax-att-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 192 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1643 --export "quantize-swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name fused_classifier_kernel5 --launch-skip 8 --export "fused-classifier.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_backward_kernel10 --launch-skip 392 --export "rmsnorm-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name transpose_kernel --launch-skip 2113 --export "tp-swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_backward_kernel1 --launch-skip 192 --export "swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1825 --export "quantize-swiglu-bwd.ncu-rep" ${LAUNCH}
```
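Each of these writes a `.ncu-rep` report that can be inspected later without re-running training. A minimal sketch of how to look at one, using standard Nsight Compute tooling rather than anything repo-specific:

```bash
# Summarize a saved report on the command line.
ncu --import rmsnorm-fwd.ncu-rep --page details

# Or open it in the Nsight Compute GUI; because the reports were collected
# with --import-source yes, source-level analysis works offline.
ncu-ui rmsnorm-fwd.ncu-rep
```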
146 changes: 146 additions & 0 deletions benchmarks/H100x8.md
@@ -0,0 +1,146 @@
# Benchmarks on 8 x H100

All commands expect the following environment variable for common arguments:
````shell
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=8 --use-cuda-graphs"
````

## Model size: 0.5B
### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=64 --grad-accumulation=1

# [T] step 0 [ 0.1%] | time: 1145 ms | norm 15.316421 | loss 12.093399 | tps 457k | sol 11.6%
# [T] step 1 [ 0.2%] | time: 401 ms | norm 17.256222 | loss 11.458490 | tps 1307k | sol 33.2%
# [T] step 2 [ 0.3%] | time: 400 ms | norm 12.026175 | loss 10.896434 | tps 1310k | sol 33.2%
# [T] step 3 [ 0.5%] | time: 413 ms | norm 9.225351 | loss 10.504494 | tps 1269k | sol 32.2%
# [T] step 4 [ 0.6%] | time: 401 ms | norm 7.615700 | loss 10.228767 | tps 1307k | sol 33.2%
# [T] step 5 [ 0.7%] | time: 408 ms | norm 6.622881 | loss 10.014042 | tps 1285k | sol 32.6%

```
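For reference, the throughput lines are consistent with tokens per step = batch-size × grad-accumulation × seq-length × gpus: here 64 × 1 × 1024 × 8 = 524,288 tokens, and 524,288 tokens / 0.401 s ≈ 1307k tps, matching step 1. The same check works for the other configurations below.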

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2

# [T] step 0 [ 0.1%] | time: 1212 ms | norm 15.403462 | loss 12.091154 | tps 432k | sol 16.8%
# [T] step 1 [ 0.2%] | time: 438 ms | norm 17.303522 | loss 11.454803 | tps 1197k | sol 46.6%
# [T] step 2 [ 0.3%] | time: 446 ms | norm 11.993380 | loss 10.893306 | tps 1175k | sol 45.7%
# [T] step 3 [ 0.5%] | time: 462 ms | norm 9.201453 | loss 10.500762 | tps 1134k | sol 44.2%
# [T] step 4 [ 0.6%] | time: 442 ms | norm 7.628780 | loss 10.224569 | tps 1186k | sol 46.2%
# [T] step 5 [ 0.7%] | time: 465 ms | norm 6.640370 | loss 10.008899 | tps 1127k | sol 43.9%
```

## Model size: 1.5B

### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=2

# [T] step 0 [ 0.1%] | time: 1627 ms | norm 15.454363 | loss 12.257751 | tps 322k | sol 22.7%
# [T] step 1 [ 0.2%] | time: 929 ms | norm 21.251320 | loss 11.091139 | tps 564k | sol 39.8%
# [T] step 2 [ 0.3%] | time: 930 ms | norm 12.894230 | loss 9.959479 | tps 563k | sol 39.8%
# [T] step 3 [ 0.5%] | time: 927 ms | norm 11.386432 | loss 9.326994 | tps 565k | sol 39.9%
# [T] step 4 [ 0.6%] | time: 929 ms | norm 13.417350 | loss 9.038084 | tps 564k | sol 39.8%
# [T] step 5 [ 0.7%] | time: 928 ms | norm 8.967434 | loss 8.724013 | tps 564k | sol 39.9%

```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2 --lmhead-chunks=4

# [T] step 0 [ 0.1%] | time: 1913 ms | norm 15.604229 | loss 12.256866 | tps 274k | sol 32.9%
# [T] step 1 [ 0.2%] | time: 1148 ms | norm 21.527639 | loss 11.080661 | tps 456k | sol 54.9%
# [T] step 2 [ 0.3%] | time: 1143 ms | norm 12.888025 | loss 9.946535 | tps 458k | sol 55.1%
# [T] step 3 [ 0.5%] | time: 1146 ms | norm 11.464658 | loss 9.315798 | tps 457k | sol 55.0%
# [T] step 4 [ 0.6%] | time: 1145 ms | norm 13.239096 | loss 9.021889 | tps 457k | sol 55.0%
# [T] step 5 [ 0.7%] | time: 1148 ms | norm 8.618310 | loss 8.705520 | tps 456k | sol 54.9%
```

## Model size: 3B
### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4

# [T] step 0 [ 0.1%] | time: 2416 ms | norm 16.745432 | loss 12.349982 | tps 217k | sol 29.1%
# [T] step 1 [ 0.2%] | time: 1666 ms | norm 29.795336 | loss 10.758265 | tps 314k | sol 42.3%
# [T] step 2 [ 0.3%] | time: 1678 ms | norm 169.535019 | loss 11.278725 | tps 312k | sol 42.0%
# [T] step 3 [ 0.5%] | time: 1675 ms | norm 63.851013 | loss 9.781277 | tps 313k | sol 42.0%
# [T] step 4 [ 0.6%] | time: 1675 ms | norm 59.026623 | loss 9.587938 | tps 313k | sol 42.0%
# [T] step 5 [ 0.7%] | time: 1674 ms | norm 20.821075 | loss 9.484455 | tps 313k | sol 42.1%

```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4

# [T] step 0 [ 0.1%] | time: 2896 ms | norm 16.827234 | loss 12.350925 | tps 181k | sol 43.4%
# [T] step 1 [ 0.2%] | time: 2184 ms | norm 29.811665 | loss 10.748442 | tps 240k | sol 57.5%
# [T] step 2 [ 0.3%] | time: 2185 ms | norm 167.829391 | loss 11.228219 | tps 239k | sol 57.5%
# [T] step 3 [ 0.5%] | time: 2185 ms | norm 63.280231 | loss 9.765936 | tps 239k | sol 57.5%
# [T] step 4 [ 0.6%] | time: 2185 ms | norm 60.825737 | loss 9.600989 | tps 239k | sol 57.5%
# [T] step 5 [ 0.7%] | time: 2182 ms | norm 22.035967 | loss 9.491116 | tps 240k | sol 57.6%
```

## Model size: 7B

### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=4 --attn-bwd-chunks=4 --recompute-swiglu --recompute-norm --offload-residual

# [T] step 0 [ 0.1%] | time: 4018 ms | norm 12.677011 | loss 12.664733 | tps 130k | sol 38.7%
# [T] step 1 [ 0.2%] | time: 3308 ms | norm 24.092297 | loss 10.755530 | tps 158k | sol 46.9%
# [T] step 2 [ 0.3%] | time: 3313 ms | norm 204.764511 | loss 15.885994 | tps 158k | sol 46.9%
# [T] step 3 [ 0.5%] | time: 3307 ms | norm 111.660988 | loss 14.926527 | tps 158k | sol 47.0%
# [T] step 4 [ 0.6%] | time: 3292 ms | norm 170.910950 | loss 14.207653 | tps 159k | sol 47.2%
# [T] step 5 [ 0.7%] | time: 3292 ms | norm 114.922325 | loss 13.319270 | tps 159k | sol 47.2%
```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=8 \
--lmhead-chunks=2 --attn-bwd-chunks=2

# [T] step 0 [ 0.1%] | time: 5379 ms | norm 12.686002 | loss 12.665464 | tps 97469 | sol 53.0%
# [T] step 1 [ 0.2%] | time: 4680 ms | norm 23.989368 | loss 10.748158 | tps 112k | sol 60.9%
# [T] step 2 [ 0.3%] | time: 4673 ms | norm 205.335083 | loss 15.946478 | tps 112k | sol 61.0%
# [T] step 3 [ 0.5%] | time: 4679 ms | norm 113.668663 | loss 14.997133 | tps 112k | sol 60.9%
# [T] step 4 [ 0.6%] | time: 4669 ms | norm 171.156616 | loss 14.274625 | tps 112k | sol 61.0%
# [T] step 5 [ 0.7%] | time: 4649 ms | norm 113.712280 | loss 13.265002 | tps 112k | sol 61.3%

```

## Model size: 14B
### FP8
```shell
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=2 --attn-bwd-chunks=2 --recompute-ffn --recompute-norm --offload-residual \
--shard-weights --persistent-quants

# [T] step 0 [ 0.1%] | time: 7868 ms | norm 18.289650 | loss 12.961184 | tps 66635 | sol 38.6%
# [T] step 1 [ 0.2%] | time: 7130 ms | norm 176.818008 | loss 11.843331 | tps 73532 | sol 42.6%
# [T] step 2 [ 0.3%] | time: 7141 ms | norm 52.666096 | loss 11.854759 | tps 73419 | sol 42.5%
# [T] step 3 [ 0.5%] | time: 7138 ms | norm 195.560516 | loss 20.536478 | tps 73450 | sol 42.5%
# [T] step 4 [ 0.6%] | time: 7131 ms | norm 147.371887 | loss 19.192728 | tps 73522 | sol 42.5%
# [T] step 5 [ 0.7%] | time: 7120 ms | norm 170.223404 | loss 17.295954 | tps 73635 | sol 42.6%
```

### BF16
```shell
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=2 --attn-bwd-chunks=2 --recompute-ffn --recompute-norm --offload-residual \
--shard-weights

# [T] step 0 [ 0.1%] | time: 11415 ms | norm 18.312731 | loss 12.960577 | tps 45929 | sol 49.6%
# [T] step 1 [ 0.2%] | time: 10704 ms | norm 179.659027 | loss 11.870799 | tps 48980 | sol 52.9%
# [T] step 2 [ 0.3%] | time: 10742 ms | norm 53.101696 | loss 11.826601 | tps 48807 | sol 52.7%
# [T] step 3 [ 0.5%] | time: 10762 ms | norm 197.853302 | loss 20.660141 | tps 48716 | sol 52.6%
# [T] step 4 [ 0.6%] | time: 10724 ms | norm 156.049545 | loss 19.513166 | tps 48889 | sol 52.8%
# [T] step 5 [ 0.7%] | time: 10729 ms | norm 170.124725 | loss 18.176239 | tps 48866 | sol 52.7%
```
2 changes: 1 addition & 1 deletion benchmarks/Pro6000x4.md
@@ -1,4 +1,4 @@
-# Benchmarks on 1xRTX4090
+# Benchmarks on 4 x RTX PRO 6000

All commands expect the following environment variable for common arguments:
````shell
2 changes: 1 addition & 1 deletion benchmarks/Pro6000x8.md
@@ -1,4 +1,4 @@
-# Benchmarks on 1xRTX4090
+# Benchmarks on 8 x RTX PRO 6000

All commands expect the following environment variable for common arguments:
````shell
19 changes: 14 additions & 5 deletions src/models/llama_gradients.cpp
@@ -141,25 +141,34 @@ sLLamaBlockWeights<TensorShard>& LLamaGradientsUnsharded::get_block_shard(int la

void LLamaGradientsUnsharded::notify_embeddings(cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
-   scatter_reduce(mFullGradient.NonBlocks.Embeddings, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       NvtxRange r{"notify_embeddings"};
+       scatter_reduce(mFullGradient.NonBlocks.Embeddings, stream, mGradEvent, comm);
+   }
}

void LLamaGradientsUnsharded::notify_lmhead(cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
    if(mFullGradient.NonBlocks.LMHead.Data == mFullGradient.NonBlocks.Embeddings.Data) return; // sync lmhead with embeddings
-   NvtxRange r{"notify_lmhead"};
-   scatter_reduce(mFullGradient.NonBlocks.LMHead, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       NvtxRange r{"notify_lmhead"};
+       scatter_reduce(mFullGradient.NonBlocks.LMHead, stream, mGradEvent, comm);
+   }
}

void LLamaGradientsUnsharded::notify_lnf_w(cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
-   scatter_reduce(mFullGradient.NonBlocks.LNF_w, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       scatter_reduce(mFullGradient.NonBlocks.LNF_w, stream, mGradEvent, comm);
+   }
}

void LLamaGradientsUnsharded::notify_block(int layer_idx, cudaStream_t stream, NCCLCommunicator& comm) {
    if(!mIsLastMicroStep) return;
    auto& dw = mFullGradient.Blocks[layer_idx];
-   scatter_reduce(layer_idx, dw, stream, mGradEvent, comm);
+   if (comm.world_size() != 1) {
+       scatter_reduce(layer_idx, dw, stream, mGradEvent, comm);
+   }
}

// ---------------------------------------------------------------------------------------------------------------------
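The change above guards each gradient notification so that `scatter_reduce` is skipped entirely on single-GPU runs. A minimal sketch of the pattern, with hypothetical `TensorShard` and `NCCLCommunicator` declarations standing in for the repo's actual types:

```cpp
#include <cuda_runtime.h>

// Hypothetical stand-ins for the repo's types; only the shape of the guard
// matters here, not these declarations.
struct TensorShard;
struct NCCLCommunicator {
    int world_size() const;  // number of ranks in the communicator
};
void scatter_reduce(TensorShard& grad, cudaStream_t stream, NCCLCommunicator& comm);

// Skip the collective when only one rank exists: a reduce-scatter over a
// single rank is a no-op, yet the call would still pay NCCL launch and
// synchronization overhead on every step.
void notify_gradient(TensorShard& grad, cudaStream_t stream, NCCLCommunicator& comm) {
    if (comm.world_size() == 1) return;  // single GPU: gradient is already final
    scatter_reduce(grad, stream, comm);  // multi-GPU: reduce across ranks
}
```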