# Workflow "refactorings" — run #197 (captured from the GitHub Actions
# "Workflow file for this run" view; web-page chrome removed).
name: Build Wheels

# Build on manual dispatch and on pushes/PRs to the main development branches.
on:
  workflow_dispatch:
  push:
    branches:
      - master
      - release-*
      - dev
  pull_request:
    branches:
      - master
      - dev
jobs:
  # Builds one manylinux wheel per CUDA version, inside the matching NVIDIA
  # CUDA devel container, then repairs it with auditwheel (excluding the CUDA
  # libraries that are provided at runtime by the pip nvidia-* packages).
  wheel:
    name: Build wheel (CUDA ${{ matrix.cuda_version }})
    runs-on: ubuntu-24.04
    container:
      image: ${{ matrix.cuda_image }}
    strategy:
      matrix:
        include:
          - cuda_version: "12.8"
            cuda_image: "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
            wheel_tag: "cu128"
            # Semicolon-separated GPU arch list, consumed via CUDAARCHS below.
            archs: "89;90;100;120"
            # Runtime pip dependencies patched into the wheel (see the
            # add_cuda_deps.py step below).
            runtime_deps: |
              nvidia-cuda-runtime-cu12>=12.8
              nvidia-cudnn-cu12>=9.0
              nvidia-cufile-cu12>=1.10
              nvidia-cublas-cu12>=12.8
              nvidia-nccl-cu12>=2.0
          - cuda_version: "13.0"
            cuda_image: "nvidia/cuda:13.0.1-cudnn-devel-ubuntu22.04"
            wheel_tag: "cu130"
            archs: "89;90;100f;120f"
            runtime_deps: |
              cuda-toolkit[cudart,cublas,cufile]>=13.0
              nvidia-cudnn-cu13>=9.0
              nvidia-nccl-cu13>=2.0
    env:
      CMAKE_GENERATOR: Ninja
      CMAKE_C_COMPILER: gcc-12
      CMAKE_CXX_COMPILER: g++-12
      CMAKE_C_COMPILER_LAUNCHER: ccache
      CMAKE_CXX_COMPILER_LAUNCHER: ccache
      CMAKE_CUDA_COMPILER_LAUNCHER: ccache
      CUDAARCHS: ${{ matrix.archs }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Install deps
        run: apt update && apt install -y git g++-12
      - name: ccache
        # NOTE(review): the capture showed "hendrikmuhs/[email protected]" —
        # GitHub's e-mail obfuscation of a "name@version" ref. Restored to
        # ccache-action@v1.2; confirm the version pin against the original.
        uses: hendrikmuhs/ccache-action@v1.2
        with:
          key: wheel-${{ matrix.wheel_tag }}
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v7
      # we cannot statically declare the cuda dependencies in pyproject.toml
      # so we have to patch them here
      - name: Add CUDA runtime dependencies
        run: uv run --no-project --with tomlkit python3 .github/scripts/add_cuda_deps.py "${{ matrix.runtime_deps }}" "${{ matrix.wheel_tag }}"
      - name: Build wheel
        run: uv build --wheel
      - name: Repair wheel with auditwheel
        run: |
          uv run --no-project --with auditwheel --with patchelf auditwheel repair dist/*.whl -w wheelhouse/ --exclude libcuda.so.1 --exclude libcudart.so.12 --exclude libcudart.so.13 --exclude libcudnn.so.9 --exclude libcufile.so.0 --exclude libnccl.so.2 --exclude libcublasLt.so.12 --exclude libcublasLt.so.13 --exclude libnvidia-ml.so.1
          rm dist/*-linux_*.whl # Remove non-repaired wheel
      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
          name: wheel-${{ matrix.wheel_tag }}
          path: wheelhouse/pyllmq*.whl
deploy-modal:
name: Deploy to Modal
needs: wheel
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Download wheel artifact
uses: actions/download-artifact@v4
with:
name: wheel-cu128
path: wheelhouse/
- name: Install Modal
run: pip install modal
- name: Set Modal token
run: modal token set --token-id ${{ secrets.MODAL_TOKEN_ID }} --token-secret ${{ secrets.MODAL_TOKEN_SECRET }}
- name: Deploy to Modal
run: modal deploy scripts/modal_test_app.py
test-modal-recompute:
# These tests verify that recomputation options do not change the results at all
name: Recompute - ${{ matrix.recompute.name }} - ${{ matrix.dtype.name }}
needs: deploy-modal
runs-on: ubuntu-latest
strategy:
fail-fast: false
max-parallel: 3
matrix:
recompute:
- name: "Determinism test"
args: ""
- name: "Full Block"
args: "--recompute-block --offload-residual --use-cuda-graphs"
- name: "Blockwise"
args: "--recompute-att --recompute-ffn"
- name: "Non-linearities"
args: "--recompute-swiglu --recompute-norm"
- name: "Offload Opt"
args: "--offload-opt-m --offload-opt-v --offload-master"
- name: "Offload Gradient"
args: "--shard-gradients --offload-grads"
# While not strictly a recomputation, chunked attention should be bitwise identical, too
- name: "Chunked attention"
args: "--recompute-att --attn-bwd-chunks=2"
dtype:
- name: "BF16"
args: "--matmul-dtype=bf16"
- name: "FP8"
args: "--matmul-dtype=e4m3"
steps:
- name: Checkout code
uses: actions/checkout@v4
# Note: No need to download wheel again, it's already in the deployed image
- name: Install Modal
run: pip install modal
- name: Set Modal token
run: modal token set --token-id ${{ secrets.MODAL_TOKEN_ID }} --token-secret ${{ secrets.MODAL_TOKEN_SECRET }}
- name: Run test on Modal
run: python3 scripts/modal_test_ci.py recompute ${{ matrix.recompute.args }} ${{ matrix.dtype.args }}
test-modal-fixed:
# These tests run a few steps and compare the resulting losses and norms against a known, fixed reference
# A failure in these tests doesn't necessarily mean that the code is broken, but it indicates that
# the changes need to be *carefully* reviewed, and tested end-to-end, before the reference is updated.
name: Test fixed - ${{ matrix.config.name }}
needs: deploy-modal
runs-on: ubuntu-latest
strategy:
fail-fast: false
max-parallel: 3
matrix:
config:
- name: "LLMQ BF16"
args: "fixed bf16"
- name: "LLMQ FP8"
args: "fixed e4m3"
- name: "LLMQ E5M2"
args: "fixed e5m2"
- name: "Torch BF16 ga=1"
args: "torch-step --grad-accum 1 --model-dtype bf16 --matmul-dtype bf16"
- name: "Torch BF16 ga=4"
args: "torch-step --grad-accum 4 --model-dtype bf16 --matmul-dtype bf16"
- name: "Torch FP32"
args: "torch-step --grad-accum 2 --model-dtype fp32 --matmul-dtype fp32"
- name: "Torch AMP"
args: "torch-step --grad-accum 2 --model-dtype fp32 --matmul-dtype bf16"
- name: "Torch Chunking"
args: "torch-step --grad-accum 4 --model-dtype bf16 --matmul-dtype bf16 --lmhead-chunks 4"
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install Modal
run: pip install modal
- name: Set Modal token
run: modal token set --token-id ${{ secrets.MODAL_TOKEN_ID }} --token-secret ${{ secrets.MODAL_TOKEN_SECRET }}
- name: Run test on Modal
run: python3 scripts/modal_test_ci.py ${{ matrix.config.args }}
release:
if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/heads/release-') || startsWith(github.ref, 'refs/tags/')
needs:
- test-modal-recompute
- test-modal-fixed
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Download all wheels
uses: actions/download-artifact@v4
with:
pattern: wheel-*
path: wheelhouse/
merge-multiple: true
- name: Create Release
uses: softprops/action-gh-release@v1
with:
tag_name: ${{ github.ref_name }}
files: wheelhouse/*.whl
generate_release_notes: true
draft: true