Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 230 additions & 0 deletions .github/actions/run-e2e/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
# Composite action that installs ilab, swaps in the training library under
# test, runs the e2e script and reports the outcome (PR comment or Discord).
name: "Run e2e tests"
description: "Runs e2e tests"
inputs:
  python-version:
    # Appended to "python" to build both the dnf package names and the
    # interpreter name handed to the ilab install script.
    description: 'Python version to use. Must be in the form of "3.xx".'
    required: true
  gh-token:
    # Exported as GH_TOKEN for every step that calls the gh CLI.
    description: "GitHub token to use for authentication."
    required: true
  hf-token:
    # Exported as HF_TOKEN for the e2e run.
    description: "Hugging Face token to use for authentication."
    required: true
  openai-api-key:
    # Exported as OPENAI_API_KEY for the e2e run.
    description: "OpenAI API key to use for authentication."
    required: true
  son-of-jeeves-discord-webhook:
    # Only used by the Discord notification steps (non-PR runs).
    description: "Son of Jeeves webhook (Discord)."
    required: true
runs:
using: "composite"
steps:
# Prints the OS release for debugging, makes sure the temp directory exists
# and installs the compiler toolchain, git and the requested Python.
- name: Install Packages
  shell: bash
  run: |
    cat /etc/os-release
    # TMPDIR is not set by this action; if the caller does not export it,
    # `mkdir -p ""` fails and aborts the step, so fall back to /tmp.
    mkdir -p "${TMPDIR:-/tmp}"
    sudo dnf install -y gcc gcc-c++ make git python${{ inputs.python-version }} python${{ inputs.python-version }}-devel

# Clones the instructlab/instructlab repository into ./instructlab.
- name: Checkout instructlab/instructlab
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    # Fetch the complete history instead of a shallow clone; see
    # https://github.com/actions/checkout/issues/249
    fetch-depth: 0
    path: 'instructlab'
    repository: 'instructlab/instructlab'

# Classifies the workflow_dispatch input `pr_or_branch` as either a PR number
# (digits only) or a branch name.
# Outputs:
#   is_pr        - 'true' when the value consists solely of digits
#   pr_or_branch - the value itself ('main' when the input is not set)
- name: Determine if pr_or_branch is a PR number
  id: check_pr
  shell: bash
  env:
    # Handed over through the environment instead of being interpolated into
    # the script text, so a crafted input cannot be executed as shell code.
    # Defaults to 'main' if not set.
    PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
  run: |
    if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
      echo "is_pr=true" >> "$GITHUB_OUTPUT"
    else
      echo "is_pr=false" >> "$GITHUB_OUTPUT"
    fi
    echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

# Probes PATH for the gh binary and records the result in the
# `gh_cli_installed` output ('true' / 'false').
- name: Check if gh cli is installed
  id: gh_cli
  shell: bash
  run: |
    found=false
    if command -v gh &> /dev/null; then
      found=true
    fi
    echo "gh_cli_installed=${found}" >> "$GITHUB_OUTPUT"

# Runs only when the probe step reported that gh is missing: registers the
# upstream gh-cli rpm repository and installs gh from it.
- name: Install gh CLI
  if: steps.gh_cli.outputs.gh_cli_installed == 'false'
  shell: bash
  run: |
    # config-manager is a dnf plugin and has to be present first
    sudo dnf install -y 'dnf-command(config-manager)'
    sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
    sudo dnf install -y --repo gh-cli gh

# Sanity check: the step fails if the gh binary cannot be executed.
- name: test gh CLI
  run: gh --version
  shell: bash

# Sets the repository running this workflow as the gh default repo for the
# ./training checkout.
- name: set default repo
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh repo set-default "${{ github.server_url }}/${{ github.repository }}"

# PR runs only: leaves a comment on the PR linking to this workflow run.
# On this path pr_or_branch has already been matched against ^[0-9]+$.
- name: Add comment to PR
  if: steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" \
      --body "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"

# PR runs only: checks the PR out inside ./training via the gh CLI.
# On this path pr_or_branch has already been matched against ^[0-9]+$.
- name: Fetch and checkout PR
  if: steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr checkout "${{ steps.check_pr.outputs.pr_or_branch }}"

# Branch runs only: switches the ./training checkout to the requested branch.
- name: Checkout branch
  if: steps.check_pr.outputs.is_pr == 'false'
  working-directory: ./training
  shell: bash
  env:
    # On this path the value is an arbitrary, unvalidated string taken from
    # the workflow input. Pass it through the environment and quote it so it
    # is always treated as a single ref name and never as shell code.
    TARGET_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
  run: |
    git checkout "$TARGET_BRANCH"

# Runs the ilab repository's CUDA install script, telling it which Python
# interpreter to use through the PYTHON environment variable.
- name: Install ilab
  shell: bash
  working-directory: ./instructlab
  env:
    PYTHON: "python${{ inputs.python-version }}"
  run: |
    ./scripts/install-ilab-with-cuda.sh

# Replaces the instructlab-training package inside ilab's virtualenv with the
# checkout in ./training, installed under ilab's own constraints file.
- name: Update instructlab-training library
  working-directory: ./training
  shell: bash
  run: |
    . ../instructlab/venv/bin/activate

    # Patch out our own pin from the ilab repo constraints file
    ilab_constraints=../instructlab/constraints-dev.txt
    sed -i '/instructlab-training==/d' "$ilab_constraints"

    # Since we reuse the virtual environment prepared using ilab
    # constraints, we should stick to the same constraints when
    # installing latest training.
    #
    # FIXME: this is not ideal; a proper fix would require decoupling the
    # two repos in CI: either by removing the job completely and relying
    # on "sdk" (no ilab) test runs; or by preparing a separate
    # constraints file that would consider both the requirements files
    # for the training library AND for the ilab - so that they are
    # consistent.
    #
    # $pip_install is expanded unquoted on purpose: it holds a command plus
    # its arguments and relies on word splitting.
    pip_install="pip install -c $ilab_constraints"
    $pip_install .
    # Quoted: an unquoted `.[cuda]` is a glob pattern and would be rewritten
    # by bash if a file such as `.c` or `.a` existed in this directory.
    $pip_install ".[cuda]"

# Reports filesystem usage; `always()` keeps it running even when an earlier
# step has failed.
- name: Check disk before tests
  if: always()
  run: df -h
  shell: bash

# Runs the ilab e2e script inside the prepared virtualenv, then collects the
# rank-0 training metric logs from /tmp into ./instructlab as
# phase-<N>-training-log.jsonl so that later steps can upload them.
- name: Run e2e test
  working-directory: ./instructlab
  env:
    HF_TOKEN: ${{ inputs.hf-token }}
    OPENAI_API_KEY: ${{ inputs.openai-api-key }}
  shell: bash
  run: |
    . venv/bin/activate

    # set preserve to true so we can retain the logs
    ./scripts/e2e-ci.sh -lp

    # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
    # and we know that it will be written into a directory created by `mktemp -d`.
    # Given this information, we can use the following command to find the file.
    #
    # `find` lists matches in directory-traversal order, which says nothing
    # about which phase produced which file. Sort by modification time
    # (oldest first) so the numbering follows the order the logs were
    # written in. NOTE(review): assumes the phases run one after another, so
    # that the phase-1 log is last modified before the phase-2 log.
    log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' | sort -n | cut -d ' ' -f 2-)
    phase_num=1;
    for log_file in $log_files; do
      mv "${log_file}" phase-${phase_num}-training-log.jsonl
      ((phase_num++))
    done

# Reports filesystem usage once the tests are done; `always()` keeps it
# running even when the e2e step has failed.
- name: Check disk after tests
  if: always()
  run: df -h
  shell: bash

# Publishes the phase-1 log collected by the e2e step as a one-day artifact,
# replacing any artifact that already carries the same name.
- name: Upload training logs Phase 1
  uses: actions/upload-artifact@v4
  with:
    path: ./instructlab/phase-1-training-log.jsonl
    name: phase-1-training-log.jsonl
    overwrite: true
    retention-days: 1

# Same as above, for the phase-2 log.
- name: Upload training logs Phase 2
  uses: actions/upload-artifact@v4
  with:
    path: ./instructlab/phase-2-training-log.jsonl
    name: phase-2-training-log.jsonl
    overwrite: true
    retention-days: 1

# PR runs only: report the final outcome back on the PR. The two steps are
# mutually exclusive (failure() vs success()).
- name: Add comment to PR if the workflow failed
  if: failure() && steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" \
      --body "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."

- name: Add comment to PR if the workflow succeeded
  if: success() && steps.check_pr.outputs.is_pr == 'true'
  shell: bash
  working-directory: ./training
  env:
    GH_TOKEN: ${{ inputs.gh-token }}
  run: |
    gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" \
      --body "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"

# Branch (non-PR) runs only: post the final outcome to the Discord webhook.
# The two steps are mutually exclusive (failure() vs success()).
- name: Send Discord notification for failure
  if: failure() && steps.check_pr.outputs.is_pr == 'false'
  uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
  with:
    title: "e2e-nvidia-l40s-x4"
    status: ${{ job.status }}
    color: 0xCB2431 # Red color for failure
    webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
    description: |
      Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
      Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.

- name: Send Discord notification for success
  if: success() && steps.check_pr.outputs.is_pr == 'false'
  uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
  with:
    title: "e2e-nvidia-l40s-x4"
    status: ${{ job.status }}
    color: 0x28A745 # Green color for success
    webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
    description: |
      Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
      Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
77 changes: 77 additions & 0 deletions .github/actions/run-smoke/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Composite action that builds a virtualenv, pre-installs the build and test
# dependencies into it and runs the py3-smoke Tox environment.
name: "Run smoke tests"
description: "Runs smoke tests"
inputs:
  python-version:
    # Appended to "python" to build both the dnf package names and the
    # interpreter used to create the virtualenv.
    description: 'Python version to use. Must be in the form of "3.xx".'
    required: true
runs:
using: "composite"
steps:
# Prints the OS release for debugging and installs the compiler toolchain,
# git and the requested Python interpreter with its headers.
- name: "Install packages"
  run: |
    cat /etc/os-release
    sudo dnf install -y \
      gcc gcc-c++ make git-core \
      python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
  shell: bash

# Adds the CUDA toolkit locations to the search paths of this step's shell
# and runs nvidia-smi; the step fails if nvidia-smi does.
- name: "Verify cuda environment is setup"
  shell: bash
  run: |
    export CUDA_HOME="/usr/local/cuda"
    export PATH="${PATH}:${CUDA_HOME}/bin"
    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
    nvidia-smi

# The virtualenv lives in $GITHUB_WORKSPACE/venv.
# Tox is the only package installed here because Tox takes care of the rest
# of the virtual environment management.
- name: "Setup Python virtual environment"
  run: |
    python${{ inputs.python-version }} -m venv --upgrade-deps venv
    source venv/bin/activate
    pip install -c constraints-dev.txt tox
  shell: bash

# flash-attn's setup.py imports torch, but torch is not declared as a build
# requirement, so pip tries to build flash-attn before torch is available.
# Work around it by installing torch (and the other undeclared build
# dependencies) up front. See:
# https://github.com/Dao-AILab/flash-attention/pull/958
- name: "Install torch and other unlisted build dependencies for flash-attn"
  run: |
    . venv/bin/activate
    # The list is taken from the pull request linked above
    pip install -c constraints-dev.txt \
      torch packaging setuptools wheel psutil ninja
  shell: bash

# Installs the tox-current-env plugin, which provides the `--current-env` and
# `--print-deps-to-file` options used by the later steps.
- name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
  shell: bash
  run: |
    source venv/bin/activate
    # Apply the same constraints file as every other pip invocation in this
    # action, so this install cannot pull in or upgrade packages to versions
    # outside the pinned set.
    pip install tox-current-env -c constraints-dev.txt

# Asks Tox for the dependency list of the py3-smoke environment, then installs
# that list and the project itself into the current venv under the
# constraints file.
- name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
  run: |
    . venv/bin/activate
    tox -e py3-smoke --print-deps-to-file=./deps.txt
    # No build isolation, so builds see the packages already in the venv
    pip install -c constraints-dev.txt -r ./deps.txt --no-build-isolation
    pip install -c constraints-dev.txt .
  shell: bash

# Reports filesystem usage; `always()` keeps it running even when an earlier
# step has failed.
- name: "Show disk utilization BEFORE tests"
  if: always()
  run: df -h
  shell: bash

# Runs the py3-smoke Tox environment against the already-populated venv
# (`--current-env`).
- name: "Run smoke tests with Tox and Pytest"
  run: |
    . venv/bin/activate
    tox --current-env -e py3-smoke
  shell: bash

# Reports filesystem usage once the tests are done; `always()` keeps it
# running even when the test step has failed.
- name: "Show disk utilization AFTER tests"
  if: always()
  run: df -h
  shell: bash
6 changes: 0 additions & 6 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,3 @@ updates:
directory: "/.github/workflows"
schedule:
interval: "daily"

# Maintain dependencies for Python scripts
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "daily"
1 change: 0 additions & 1 deletion .github/mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pull_request_rules:
- label!=hold
- label!=do-not-merge
- label!=needs-rebase
- check-success=DCO

# The files conditions regex should match the globs in workflow files
# If workflow configuration files in .github/ are changed, the actionlint check must pass
Expand Down
5 changes: 0 additions & 5 deletions .github/workflows/actionlint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,6 @@ jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Harden Runner"
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
with:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
Expand Down
35 changes: 35 additions & 0 deletions .github/workflows/constraints-update.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Aligned with: https://github.com/instructlab/dev-docs/pull/198
name: Update constraints-dev.txt

on:
  # Manual trigger
  workflow_dispatch:
  schedule:
    # Every Monday at 03:00 UTC
    - cron: "0 3 * * 1"

jobs:
  update-constraints:
    permissions:
      contents: write
      pull-requests: write
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      # Only the update-constraints action directory is checked out, into
      # ./ci-actions, so that it can be referenced as a local action below.
      - name: Checkout "update-constraints" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          path: ci-actions
          repository: instructlab/ci-actions
          # no tag that includes https://github.com/instructlab/ci-actions/pull/26, yet
          ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main
          sparse-checkout: |
            actions/update-constraints

      - name: Update constraints
        id: update-constraints
        uses: ./ci-actions/actions/update-constraints
        with:
          gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}

6 changes: 1 addition & 5 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,11 @@ jobs:
markdown-lint:
runs-on: ubuntu-latest
steps:
- name: "Harden Runner"
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
with:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check Markdown documents"
uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0
uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0
with:
globs: '**/*.md'
Loading
Loading