instructlab · ktdreyer · Jul 3, 2025
diff --git a/.github/actions/run-e2e/action.yml b/.github/actions/run-e2e/action.yml
@@ -0,0 +1,230 @@
+name: "Run e2e tests"
+description: "Runs e2e tests"
+inputs:  # every input declared below is required
+  python-version:
+    description: >-
+      Python version to use. Must be in the form of "3.xx".
+    required: true
+  gh-token:
+    description: >-
+      GitHub token to use for authentication.
+    required: true
+  hf-token:
+    description: >-
+      Hugging Face token to use for authentication.
+    required: true
+  openai-api-key:
+    description: >-
+      OpenAI API key to use for authentication.
+    required: true
+  son-of-jeeves-discord-webhook:
+    description: >-
+      Son of Jeeves webhook (Discord).
+    required: true
+runs:
+  using: "composite"
+  steps:
+    - name: Install Packages
+      shell: bash
+      run: |
+        cat /etc/os-release
+        mkdir -p "${TMPDIR:-/tmp}"  # TMPDIR is not set by this action; fall back to /tmp so an unset value cannot become `mkdir -p ""`
+        sudo dnf install -y gcc gcc-c++ make git python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
+
+    - name: Checkout instructlab/instructlab
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        repository: "instructlab/instructlab"
+        path: "instructlab"
+        # fetch-depth 0 = full history; background: https://github.com/actions/checkout/issues/249
+        fetch-depth: 0
+
+    - name: Determine if pr_or_branch is a PR number
+      id: check_pr
+      shell: bash
+      env:
+        # Handed over via env instead of being inlined into the script, so the value cannot inject shell code.
+        PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set
+      run: |
+        is_pr=false
+        if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then is_pr=true; fi  # digits only => treated as a PR number
+        echo "is_pr=$is_pr" >> "$GITHUB_OUTPUT"
+        echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
+
+    - name: Check if gh cli is installed
+      id: gh_cli
+      shell: bash
+      run: |
+        # Report through the gh_cli_installed step output whether `gh`
+        # already resolves as a command in this shell.
+        gh_cli_installed=false
+        if command -v gh > /dev/null 2>&1; then gh_cli_installed=true; fi
+        echo "gh_cli_installed=${gh_cli_installed}" >> "$GITHUB_OUTPUT"
+
+    - name: Install gh CLI
+      shell: bash
+      if: steps.gh_cli.outputs.gh_cli_installed == 'false'  # skipped when gh was already found
+      run: |
+        sudo dnf install -y 'dnf-command(config-manager)'
+        sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+        sudo dnf install -y --repo gh-cli gh
+
+    - name: test gh CLI
+      shell: bash
+      run: |
+        gh --version  # prints the gh version; a missing or broken gh makes this step fail
+
+    - name: set default repo
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh repo set-default "${{ github.server_url }}/${{ github.repository }}"
+
+    - name: Add comment to PR
+      if: steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+
+    - name: Fetch and checkout PR
+      if: steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr checkout "${{ steps.check_pr.outputs.pr_or_branch }}"  # digits only here: is_pr is 'true' solely for ^[0-9]+$
+
+    - name: Checkout branch
+      if: steps.check_pr.outputs.is_pr == 'false'
+      working-directory: ./training
+      shell: bash
+      env: {TARGET_BRANCH: "${{ steps.check_pr.outputs.pr_or_branch }}"}  # via env, not inlined: an arbitrary branch name cannot inject shell code
+      run: git checkout "$TARGET_BRANCH"
+
+    - name: Install ilab
+      shell: bash
+      working-directory: ./instructlab
+      run: |
+        PYTHON="python${{ inputs.python-version }}" ./scripts/install-ilab-with-cuda.sh
+
+    - name: Update instructlab-training library
+      working-directory: ./training
+      shell: bash
+      run: |
+        . ../instructlab/venv/bin/activate
+
+        # Patch out our own pin from the ilab repo constraints file
+        ilab_constraints=../instructlab/constraints-dev.txt
+        sed -i '/instructlab-training==/d' "$ilab_constraints"
+
+        # Since we reuse the virtual environment prepared using ilab
+        # constraints, we should stick to the same constraints when
+        # installing latest training.
+        #
+        # FIXME: this is not ideal; a proper fix would require decoupling the
+        # two repos in CI: either by removing the job completely and relying
+        # on "sdk" (no ilab) test runs; or by preparing a separate
+        # constraints file that would consider both the requirements files
+        # for the training library AND for the ilab - so that they are
+        # consistent.
+        pip_install=(pip install -c "$ilab_constraints")
+        "${pip_install[@]}" .
+        "${pip_install[@]}" ".[cuda]"  # quoted: unquoted [cuda] is a glob that would match files named .c/.u/.d/.a
+
+    - name: Check disk before tests
+      shell: bash
+      if: always()  # report disk usage even when an earlier step failed
+      run: |
+        df -h
+
+    - name: Run e2e test
+      working-directory: ./instructlab
+      env:
+        HF_TOKEN: ${{ inputs.hf-token }}
+        OPENAI_API_KEY: ${{ inputs.openai-api-key }}
+      shell: bash
+      run: |
+        . venv/bin/activate
+
+        # set preserve to true so we can retain the logs
+        ./scripts/e2e-ci.sh -lp
+
+        # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+        # and that it is written into a directory created by `mktemp -d`, so search /tmp for the rank-0 logs.
+        # List them oldest-first by mtime (assumes phase 1 is written before phase 2) instead of in find's arbitrary order.
+        log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' | sort -n | cut -d' ' -f2-)
+        phase_num=1
+        while IFS= read -r log_file; do
+          [[ -n "${log_file}" ]] || continue  # the here-string yields one empty line when nothing was found
+          mv "${log_file}" "phase-$((phase_num++))-training-log.jsonl"
+        done <<< "${log_files}"
+
+    - name: Check disk after tests
+      shell: bash
+      if: always()  # report disk usage even when the tests failed
+      run: |
+        df -h
+
+    - name: Upload training logs Phase 1
+      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+      with:
+        name: phase-1-training-log.jsonl
+        path: ./instructlab/phase-1-training-log.jsonl
+        retention-days: 1
+        overwrite: true
+
+    - name: Upload training logs Phase 2
+      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+      with:
+        name: phase-2-training-log.jsonl
+        path: ./instructlab/phase-2-training-log.jsonl
+        retention-days: 1
+        overwrite: true
+
+    - name: Add comment to PR if the workflow failed
+      if: failure() && steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+
+    - name: Add comment to PR if the workflow succeeded
+      if: success() && steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" --body "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+
+    - name: Send Discord notification for failure
+      uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
+      if: failure() && steps.check_pr.outputs.is_pr == 'false'
+      with:
+        webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
+        title: "e2e-nvidia-l40s-x4"
+        status: ${{ job.status }}
+        color: 0xCB2431 # Red color for failure
+        description: |
+          Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
+          Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
+
+    - name: Send Discord notification for success
+      uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
+      if: success() && steps.check_pr.outputs.is_pr == 'false'
+      with:
+        webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
+        title: "e2e-nvidia-l40s-x4"
+        status: ${{ job.status }}
+        color: 0x28A745 # Green color for success
+        description: |
+          Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
+          Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
diff --git a/.github/actions/run-smoke/action.yml b/.github/actions/run-smoke/action.yml
@@ -0,0 +1,77 @@
+name: "Run smoke tests"
+description: "Runs smoke tests"
+inputs:  # the single input is required
+  python-version:
+    description: >-
+      Python version to use. Must be in the form of "3.xx".
+    required: true
+runs:
+  using: "composite"
+  steps:
+    - name: Install packages
+      shell: bash
+      run: |
+        cat /etc/os-release  # show which OS image the job runs on
+        sudo dnf install -y gcc gcc-c++ make git-core "python${{ inputs.python-version }}" "python${{ inputs.python-version }}-devel"
+
+    - name: "Verify cuda environment is setup"
+      shell: bash
+      run: |
+        export CUDA_HOME="/usr/local/cuda"
+        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"  # no leading ':' (an empty entry means the current directory) when the variable was unset
+        export PATH="${PATH}:${CUDA_HOME}/bin"
+        nvidia-smi
+
+    # The venv is created at $GITHUB_WORKSPACE/venv.
+    # Only Tox is installed into it, because Tox does the other virtual environment management.
+    - name: "Setup Python virtual environment"
+      shell: bash
+      run: |
+        python${{ inputs.python-version }} -m venv --upgrade-deps venv
+        source venv/bin/activate
+        pip install -c constraints-dev.txt tox
+
+    # flash-attn's setup.py imports torch without declaring it as a build
+    # requirement, so pip attempts to install flash-attn before torch is
+    # available. Pre-install torch and the other unlisted build dependencies
+    # to work around that upstream bug. See:
+    # https://github.com/Dao-AILab/flash-attention/pull/958
+    - name: "Install torch and other unlisted build dependencies for flash-attn"
+      shell: bash
+      run: |
+        . venv/bin/activate
+        # The list is taken from the pull request linked above
+        pip install -c constraints-dev.txt torch packaging setuptools wheel psutil ninja
+
+    - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
+      shell: bash
+      run: |
+        source venv/bin/activate
+        pip install -c constraints-dev.txt tox-current-env  # same constraints file as every other pip install in this action
+
+    - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
+      shell: bash
+      run: |
+        . venv/bin/activate
+        # Write the py3-smoke dependency list to deps.txt, then install it and this project under the shared constraints.
+        tox -e py3-smoke --print-deps-to-file=./deps.txt
+        pip install -c constraints-dev.txt -r ./deps.txt --no-build-isolation
+        pip install -c constraints-dev.txt .
+
+    - name: "Show disk utilization BEFORE tests"
+      if: always()  # runs even when an earlier step failed
+      shell: bash
+      run: |
+        df -h
+
+    - name: "Run smoke tests with Tox and Pytest"
+      shell: bash
+      run: |
+        . venv/bin/activate  # the venv prepared by the earlier steps
+        tox --current-env -e py3-smoke
+
+    - name: "Show disk utilization AFTER tests"
+      if: always()  # runs even when the tests failed
+      shell: bash
+      run: |
+        df -h
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -13,9 +13,3 @@ updates:
     directory: "/.github/workflows"
     schedule:
       interval: "daily"
-
-  # Maintain dependencies for Python scripts
-  - package-ecosystem: "pip"
-    directory: "/"
-    schedule:
-      interval: "daily"
diff --git a/.github/mergify.yml b/.github/mergify.yml
@@ -11,7 +11,6 @@ pull_request_rules:
     - label!=hold
     - label!=do-not-merge
     - label!=needs-rebase
-    - check-success=DCO
 
     # The files conditions regex should match the globs in workflow files
     # If workflow configuration files in .github/ are changed, the actionlint check must pass

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
@@ -29,11 +29,6 @@ jobs:
   actionlint:
     runs-on: ubuntu-latest
     steps:
-      - name: "Harden Runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-        with:
-          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
       - name: "Checkout"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:

diff --git a/.github/workflows/constraints-update.yml b/.github/workflows/constraints-update.yml
@@ -0,0 +1,35 @@
+# Aligned with: https://github.com/instructlab/dev-docs/pull/198
+name: Update constraints-dev.txt
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 3 * * 1"  # Every Monday at 03:00 UTC
+
+jobs:
+  update-constraints:
+    permissions:
+      pull-requests: write
+      contents: write
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Checkout "update-constraints" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: instructlab/ci-actions
+          # pinned to a commit: no tag includes https://github.com/instructlab/ci-actions/pull/26 yet
+          ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main
+          path: ci-actions
+          sparse-checkout: |
+            actions/update-constraints
+
+      - name: Update constraints
+        id: update-constraints
+        uses: ./ci-actions/actions/update-constraints
+        with:
+          gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -32,15 +32,11 @@ jobs:
   markdown-lint:
     runs-on: ubuntu-latest
     steps:
-      - name: "Harden Runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-        with:
-          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: "Check Markdown documents"
-        uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0
+        uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0
         with:
           globs: '**/*.md'