diff --git a/benchmarks/H100x1.md b/benchmarks/H100x1.md index 9074c22..6ca78b5 100644 --- a/benchmarks/H100x1.md +++ b/benchmarks/H100x1.md @@ -1,4 +1,4 @@ -# Benchmarks on 1 x RTX PRO 6000 +# Benchmarks on 1 x H100 All commands expect the following environment variable for common arguments: ````shell @@ -48,14 +48,14 @@ export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/ti ### BF16 ```shell -./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16 +./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16 --lmhead-chunks=4 -# [T] step 0 [ 0.1%] | time: 9256 ms | norm 16.093142 | loss 12.243517 | tps 56643 | sol 54.5% -# [T] step 1 [ 0.2%] | time: 8843 ms | norm 20.712574 | loss 11.035850 | tps 59288 | sol 57.1% -# [T] step 2 [ 0.3%] | time: 8849 ms | norm 12.315584 | loss 9.896231 | tps 59248 | sol 57.0% -# [T] step 3 [ 0.5%] | time: 8848 ms | norm 11.510052 | loss 9.289831 | tps 59254 | sol 57.0% -# [T] step 4 [ 0.6%] | time: 8868 ms | norm 12.428946 | loss 8.962239 | tps 59121 | sol 56.9% -# [T] step 5 [ 0.7%] | time: 8871 ms | norm 11.770356 | loss 8.688566 | tps 59101 | sol 56.9% +# [T] step 0 [ 0.1%] | time: 9400 ms | norm 16.093142 | loss 12.243517 | tps 55775 | sol 53.7% +# [T] step 1 [ 0.2%] | time: 8898 ms | norm 20.712574 | loss 11.035850 | tps 58922 | sol 56.7% +# [T] step 2 [ 0.3%] | time: 8906 ms | norm 12.315584 | loss 9.896231 | tps 58869 | sol 56.7% +# [T] step 3 [ 0.5%] | time: 8905 ms | norm 11.510052 | loss 9.289831 | tps 58875 | sol 56.7% +# [T] step 4 [ 0.6%] | time: 8914 ms | norm 12.428946 | loss 8.962239 | tps 58816 | sol 56.6% +# [T] step 5 [ 0.7%] | time: 8930 ms | norm 11.770356 | loss 8.688566 | tps 58710 | sol 56.5% ``` @@ -138,3 +138,24 @@ On the RTX Pro 6000, it appears that `--use-zero-copy` is slower than `memcpy`-b # [T] step 4 [ 0.6%] | time: 82745 ms | norm 177.452179 | loss 17.235472 | tps 6336 | sol 54.7% # [T]
step 5 [ 0.7%] | time: 82939 ms | norm 211.570663 | loss 15.318270 | tps 6321 | sol 54.6% ``` + + +## Profiling +Here are some useful commands for profiling several kernels in the 0.5B model: +```bash +export LAUNCH="--set full --launch-count 1 --import-source yes \"./build/train\" ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16 --steps=2" + +ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 49 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 24 --export "swiglu-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 392 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1640 --export "quantize-rms-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name reduce_abs_max_kernel --launch-skip 360 --export "absmax-att-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 192 --export "swiglu-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1643 --export "quantize-swiglu-fwd.ncu-rep" ${LAUNCH} +ncu --kernel-name fused_classifier_kernel5 --launch-skip 8 --export "fused-classifier.ncu-rep" ${LAUNCH} +ncu --kernel-name rmsnorm_backward_kernel10 --launch-skip 392 --export "rmsnorm-bwd.ncu-rep" ${LAUNCH} +ncu --kernel-name transpose_kernel --launch-skip 2113 --export "tp-swiglu-bwd.ncu-rep" ${LAUNCH} +ncu --kernel-name swiglu_backward_kernel1 --launch-skip 192 --export "swiglu-bwd.ncu-rep" ${LAUNCH} +ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1825 --export "quantize-swiglu-bwd.ncu-rep" ${LAUNCH} +``` diff --git a/benchmarks/H100x8.md
b/benchmarks/H100x8.md new file mode 100644 index 0000000..391bc78 --- /dev/null +++ b/benchmarks/H100x8.md @@ -0,0 +1,146 @@ +# Benchmarks on 8 x H100 + +All commands expect the following environment variable for common arguments: +````shell +export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \ + --ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \ + --opt-v-dtype=bf16 --gpus=8 --use-cuda-graphs" +```` + +## Model size: 0.5B +### FP8 +```shell +./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=64 --grad-accumulation=1 + +# [T] step 0 [ 0.1%] | time: 1145 ms | norm 15.316421 | loss 12.093399 | tps 457k | sol 11.6% +# [T] step 1 [ 0.2%] | time: 401 ms | norm 17.256222 | loss 11.458490 | tps 1307k | sol 33.2% +# [T] step 2 [ 0.3%] | time: 400 ms | norm 12.026175 | loss 10.896434 | tps 1310k | sol 33.2% +# [T] step 3 [ 0.5%] | time: 413 ms | norm 9.225351 | loss 10.504494 | tps 1269k | sol 32.2% +# [T] step 4 [ 0.6%] | time: 401 ms | norm 7.615700 | loss 10.228767 | tps 1307k | sol 33.2% +# [T] step 5 [ 0.7%] | time: 408 ms | norm 6.622881 | loss 10.014042 | tps 1285k | sol 32.6% + +``` + +### BF16 +```shell +./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2 + +# [T] step 0 [ 0.1%] | time: 1212 ms | norm 15.403462 | loss 12.091154 | tps 432k | sol 16.8% +# [T] step 1 [ 0.2%] | time: 438 ms | norm 17.303522 | loss 11.454803 | tps 1197k | sol 46.6% +# [T] step 2 [ 0.3%] | time: 446 ms | norm 11.993380 | loss 10.893306 | tps 1175k | sol 45.7% +# [T] step 3 [ 0.5%] | time: 462 ms | norm 9.201453 | loss 10.500762 | tps 1134k | sol 44.2% +# [T] step 4 [ 0.6%] | time: 442 ms | norm 7.628780 | loss 10.224569 | tps 1186k | sol 46.2% +# [T] step 5 [ 0.7%] | time: 465 ms | norm 6.640370 | loss 10.008899 | tps 1127k | sol 43.9% +``` + +## Model size: 1.5B + +### FP8 +```shell +./build/train ${ARGS} 
--model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=2 + +# [T] step 0 [ 0.1%] | time: 1627 ms | norm 15.454363 | loss 12.257751 | tps 322k | sol 22.7% +# [T] step 1 [ 0.2%] | time: 929 ms | norm 21.251320 | loss 11.091139 | tps 564k | sol 39.8% +# [T] step 2 [ 0.3%] | time: 930 ms | norm 12.894230 | loss 9.959479 | tps 563k | sol 39.8% +# [T] step 3 [ 0.5%] | time: 927 ms | norm 11.386432 | loss 9.326994 | tps 565k | sol 39.9% +# [T] step 4 [ 0.6%] | time: 929 ms | norm 13.417350 | loss 9.038084 | tps 564k | sol 39.8% +# [T] step 5 [ 0.7%] | time: 928 ms | norm 8.967434 | loss 8.724013 | tps 564k | sol 39.9% + + +``` + +### BF16 +```shell +./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2 --lmhead-chunks=4 + +# [T] step 0 [ 0.1%] | time: 1913 ms | norm 15.604229 | loss 12.256866 | tps 274k | sol 32.9% +# [T] step 1 [ 0.2%] | time: 1148 ms | norm 21.527639 | loss 11.080661 | tps 456k | sol 54.9% +# [T] step 2 [ 0.3%] | time: 1143 ms | norm 12.888025 | loss 9.946535 | tps 458k | sol 55.1% +# [T] step 3 [ 0.5%] | time: 1146 ms | norm 11.464658 | loss 9.315798 | tps 457k | sol 55.0% +# [T] step 4 [ 0.6%] | time: 1145 ms | norm 13.239096 | loss 9.021889 | tps 457k | sol 55.0% +# [T] step 5 [ 0.7%] | time: 1148 ms | norm 8.618310 | loss 8.705520 | tps 456k | sol 54.9% +``` + +## Model size: 3B +### FP8 +```shell +./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 + +# [T] step 0 [ 0.1%] | time: 2416 ms | norm 16.745432 | loss 12.349982 | tps 217k | sol 29.1% +# [T] step 1 [ 0.2%] | time: 1666 ms | norm 29.795336 | loss 10.758265 | tps 314k | sol 42.3% +# [T] step 2 [ 0.3%] | time: 1678 ms | norm 169.535019 | loss 11.278725 | tps 312k | sol 42.0% +# [T] step 3 [ 0.5%] | time: 1675 ms | norm 63.851013 | loss 9.781277 | tps 313k | sol 42.0% +# [T] step 4 [ 0.6%] | time: 1675 ms | norm 59.026623 | loss 9.587938 | tps 313k | sol 42.0% +# [T] step 5 [ 
0.7%] | time: 1674 ms | norm 20.821075 | loss 9.484455 | tps 313k | sol 42.1% + +``` + +### BF16 +```shell +./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4 + +# [T] step 0 [ 0.1%] | time: 2896 ms | norm 16.827234 | loss 12.350925 | tps 181k | sol 43.4% +# [T] step 1 [ 0.2%] | time: 2184 ms | norm 29.811665 | loss 10.748442 | tps 240k | sol 57.5% +# [T] step 2 [ 0.3%] | time: 2185 ms | norm 167.829391 | loss 11.228219 | tps 239k | sol 57.5% +# [T] step 3 [ 0.5%] | time: 2185 ms | norm 63.280231 | loss 9.765936 | tps 239k | sol 57.5% +# [T] step 4 [ 0.6%] | time: 2185 ms | norm 60.825737 | loss 9.600989 | tps 239k | sol 57.5% +# [T] step 5 [ 0.7%] | time: 2182 ms | norm 22.035967 | loss 9.491116 | tps 240k | sol 57.6% +``` + +## Model size: 7B + +### FP8: +```shell +./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \ + --lmhead-chunks=4 --attn-bwd-chunks=4 --recompute-swiglu --recompute-norm --offload-residual + +# [T] step 0 [ 0.1%] | time: 4018 ms | norm 12.677011 | loss 12.664733 | tps 130k | sol 38.7% +# [T] step 1 [ 0.2%] | time: 3308 ms | norm 24.092297 | loss 10.755530 | tps 158k | sol 46.9% +# [T] step 2 [ 0.3%] | time: 3313 ms | norm 204.764511 | loss 15.885994 | tps 158k | sol 46.9% +# [T] step 3 [ 0.5%] | time: 3307 ms | norm 111.660988 | loss 14.926527 | tps 158k | sol 47.0% +# [T] step 4 [ 0.6%] | time: 3292 ms | norm 170.910950 | loss 14.207653 | tps 159k | sol 47.2% +# [T] step 5 [ 0.7%] | time: 3292 ms | norm 114.922325 | loss 13.319270 | tps 159k | sol 47.2% +``` + +### BF16: +```shell +./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=8 \ + --lmhead-chunks=2 --attn-bwd-chunks=2 + +# [T] step 0 [ 0.1%] | time: 5379 ms | norm 12.686002 | loss 12.665464 | tps 97469 | sol 53.0% +# [T] step 1 [ 0.2%] | time: 4680 ms | norm 23.989368 | loss 10.748158 | tps 112k | sol 60.9% +# [T] step 2 [ 0.3%] | time: 4673 ms | 
norm 205.335083 | loss 15.946478 | tps 112k | sol 61.0% +# [T] step 3 [ 0.5%] | time: 4679 ms | norm 113.668663 | loss 14.997133 | tps 112k | sol 60.9% +# [T] step 4 [ 0.6%] | time: 4669 ms | norm 171.156616 | loss 14.274625 | tps 112k | sol 61.0% +# [T] step 5 [ 0.7%] | time: 4649 ms | norm 113.712280 | loss 13.265002 | tps 112k | sol 61.3% + +``` + +## Model size: 14B +### FP8: +```shell +./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \ + --lmhead-chunks=2 --attn-bwd-chunks=2 --recompute-ffn --recompute-norm --offload-residual \ + --shard-weights --persistent-quants + +# [T] step 0 [ 0.1%] | time: 7868 ms | norm 18.289650 | loss 12.961184 | tps 66635 | sol 38.6% +# [T] step 1 [ 0.2%] | time: 7130 ms | norm 176.818008 | loss 11.843331 | tps 73532 | sol 42.6% +# [T] step 2 [ 0.3%] | time: 7141 ms | norm 52.666096 | loss 11.854759 | tps 73419 | sol 42.5% +# [T] step 3 [ 0.5%] | time: 7138 ms | norm 195.560516 | loss 20.536478 | tps 73450 | sol 42.5% +# [T] step 4 [ 0.6%] | time: 7131 ms | norm 147.371887 | loss 19.192728 | tps 73522 | sol 42.5% +# [T] step 5 [ 0.7%] | time: 7120 ms | norm 170.223404 | loss 17.295954 | tps 73635 | sol 42.6% +``` + +### BF16: +```shell +./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4 \ + --lmhead-chunks=2 --attn-bwd-chunks=2 --recompute-ffn --recompute-norm --offload-residual \ + --shard-weights + +# [T] step 0 [ 0.1%] | time: 11415 ms | norm 18.312731 | loss 12.960577 | tps 45929 | sol 49.6% +# [T] step 1 [ 0.2%] | time: 10704 ms | norm 179.659027 | loss 11.870799 | tps 48980 | sol 52.9% +# [T] step 2 [ 0.3%] | time: 10742 ms | norm 53.101696 | loss 11.826601 | tps 48807 | sol 52.7% +# [T] step 3 [ 0.5%] | time: 10762 ms | norm 197.853302 | loss 20.660141 | tps 48716 | sol 52.6% +# [T] step 4 [ 0.6%] | time: 10724 ms | norm 156.049545 | loss 19.513166 | tps 48889 | sol 52.8% +# [T] step 5 [ 0.7%] | time: 10729 ms | norm 170.124725 
| loss 18.176239 | tps 48866 | sol 52.7% +``` diff --git a/benchmarks/Pro6000x4.md b/benchmarks/Pro6000x4.md index b21485d..40788da 100644 --- a/benchmarks/Pro6000x4.md +++ b/benchmarks/Pro6000x4.md @@ -1,4 +1,4 @@ -# Benchmarks on 1xRTX4090 +# Benchmarks on 4 x RTX PRO 6000 All commands expect the following environment variable for common arguments: ````shell diff --git a/benchmarks/Pro6000x8.md b/benchmarks/Pro6000x8.md index 3dedaeb..df4084a 100644 --- a/benchmarks/Pro6000x8.md +++ b/benchmarks/Pro6000x8.md @@ -1,4 +1,4 @@ -# Benchmarks on 1xRTX4090 +# Benchmarks on 8 x RTX PRO 6000 All commands expect the following environment variable for common arguments: ````shell diff --git a/src/models/llama_gradients.cpp b/src/models/llama_gradients.cpp index fffb4e0..684d786 100644 --- a/src/models/llama_gradients.cpp +++ b/src/models/llama_gradients.cpp @@ -141,25 +141,34 @@ sLLamaBlockWeights& LLamaGradientsUnsharded::get_block_shard(int la void LLamaGradientsUnsharded::notify_embeddings(cudaStream_t stream, NCCLCommunicator& comm) { if(!mIsLastMicroStep) return; - scatter_reduce(mFullGradient.NonBlocks.Embeddings, stream, mGradEvent, comm); + if (comm.world_size() != 1) { + NvtxRange r{"notify_embeddings"}; + scatter_reduce(mFullGradient.NonBlocks.Embeddings, stream, mGradEvent, comm); + } } void LLamaGradientsUnsharded::notify_lmhead(cudaStream_t stream, NCCLCommunicator& comm) { if(!mIsLastMicroStep) return; if(mFullGradient.NonBlocks.LMHead.Data == mFullGradient.NonBlocks.Embeddings.Data) return; // sync lmhead with embeddings - NvtxRange r{"notify_lmhead"}; - scatter_reduce(mFullGradient.NonBlocks.LMHead, stream, mGradEvent, comm); + if (comm.world_size() != 1) { + NvtxRange r{"notify_lmhead"}; + scatter_reduce(mFullGradient.NonBlocks.LMHead, stream, mGradEvent, comm); + } } void LLamaGradientsUnsharded::notify_lnf_w(cudaStream_t stream, NCCLCommunicator& comm) { if(!mIsLastMicroStep) return; - scatter_reduce(mFullGradient.NonBlocks.LNF_w, stream, mGradEvent, 
comm); + if (comm.world_size() != 1) { + scatter_reduce(mFullGradient.NonBlocks.LNF_w, stream, mGradEvent, comm); + } } void LLamaGradientsUnsharded::notify_block(int layer_idx, cudaStream_t stream, NCCLCommunicator& comm) { if(!mIsLastMicroStep) return; auto& dw = mFullGradient.Blocks[layer_idx]; - scatter_reduce(layer_idx, dw, stream, mGradEvent, comm); + if (comm.world_size() != 1) { + scatter_reduce(layer_idx, dw, stream, mGradEvent, comm); + } } // ---------------------------------------------------------------------------------------------------------------------