diff --git a/.github/actions/run-e2e/action.yml b/.github/actions/run-e2e/action.yml new file mode 100644 index 00000000..37aaaa5b --- /dev/null +++ b/.github/actions/run-e2e/action.yml @@ -0,0 +1,232 @@ +name: 'Run e2e tests' +description: 'Runs e2e tests' +inputs: + python-version: + required: true + description: >- + Python version to use. Must be in the form of "3.xx". + gh-token: + required: true + description: >- + GitHub token to use for authentication. + hf-token: + required: true + description: >- + Hugging Face token to use for authentication. + openai-api-key: + required: true + description: >- + OpenAI API key to use for authentication. + son-of-jeeves-discord-webhook: + required: true + description: >- + Son of Jeeves webhook (Discord). +runs: + using: "composite" + steps: + - name: Install Packages + shell: bash + run: | + cat /etc/os-release + mkdir -p "${TMPDIR}" + sudo dnf install -y gcc gcc-c++ make git python${{ inputs.python-version }} python${{ inputs.python-version }}-devel + + - name: Checkout instructlab/instructlab + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/instructlab" + path: "instructlab" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Determine if pr_or_branch is a PR number + id: check_pr + shell: bash + env: + PR_OR_BRANCH_INPUT: ${{ github.event.inputs.pr_or_branch || 'main' }} + run: | + PR_OR_BRANCH="${PR_OR_BRANCH_INPUT}" # Read from the environment (never spliced into the script text) to avoid script injection; defaults to 'main' if not set + if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then + echo "is_pr=true" >> "$GITHUB_OUTPUT" + else + echo "is_pr=false" >> "$GITHUB_OUTPUT" + fi + echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" + + - name: Check if gh cli is installed + id: gh_cli + shell: bash + run: | + if command -v gh &> /dev/null ; then + echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" + else + echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Install gh CLI + if: steps.gh_cli.outputs.gh_cli_installed == 'false' + shell: bash + run: | + sudo dnf 
install 'dnf-command(config-manager)' -y + sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo + sudo dnf install gh --repo gh-cli -y + + - name: test gh CLI + shell: bash + run: | + gh --version + + - name: set default repo + working-directory: ./training + shell: bash + run: | + gh repo set-default ${{ github.server_url }}/${{ github.repository }} + env: + GH_TOKEN: ${{ inputs.gh-token }} + + - name: Add comment to PR + if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + shell: bash + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" + env: + GH_TOKEN: ${{ inputs.gh-token }} + + - name: Fetch and checkout PR + if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + shell: bash + run: | + gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} + env: + GH_TOKEN: ${{ inputs.gh-token }} + + - name: Checkout branch + if: steps.check_pr.outputs.is_pr == 'false' + working-directory: ./training + shell: bash + run: | + git checkout ${{ steps.check_pr.outputs.pr_or_branch }} + + - name: Install ilab + working-directory: ./instructlab + shell: bash + run: | + PYTHON=python${{ inputs.python-version }} ./scripts/install-ilab-with-cuda.sh + + - name: Update instructlab-training library + working-directory: ./training + shell: bash + run: | + . ../instructlab/venv/bin/activate + + # Patch out our own pin from the ilab repo constraints file + ilab_constraints=../instructlab/constraints-dev.txt + sed -i '/instructlab-training==/d' $ilab_constraints + + # Since we reuse the virtual environment prepared using ilab + # constraints, we should stick to the same constraints when + # installing latest training. 
+ # + # FIX: this is not ideal; a proper fix would require decoupling the + # two repos in CI: either by removing the job completely and relying + # on "sdk" (no ilab) test runs; or by preparing a separate + # constraints file that would consider both the requirements files + # for the training library AND for the ilab - so that they are + # consistent. + pip_install="pip install -c $ilab_constraints" + $pip_install . + $pip_install .[cuda] + + - name: Check disk before tests + if: always() + shell: bash + run: | + df -h + + - name: Run e2e test + working-directory: ./instructlab + env: + HF_TOKEN: ${{ inputs.hf-token }} + OPENAI_API_KEY: ${{ inputs.openai-api-key }} + shell: bash + run: | + . venv/bin/activate + + # set preserve to true so we can retain the logs + ./scripts/e2e-ci.sh -lp + + # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python + # and we know that it will be written into a directory created by `mktemp -d`. 
+ # Given this information, we can use the following command to find the file: + log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl") + phase_num=1; + for log_file in $log_files; do + mv "${log_file}" phase-${phase_num}-training-log.jsonl + ((phase_num++)) + done + + - name: Check disk after tests + if: always() + shell: bash + run: | + df -h + + - name: Upload training logs Phase 1 + uses: actions/upload-artifact@v4 + with: + name: phase-1-training-log.jsonl + path: ./instructlab/phase-1-training-log.jsonl + retention-days: 1 + overwrite: true + + - name: Upload training logs Phase 2 + uses: actions/upload-artifact@v4 + with: + name: phase-2-training-log.jsonl + path: ./instructlab/phase-2-training-log.jsonl + retention-days: 1 + overwrite: true + + - name: Add comment to PR if the workflow failed + if: failure() && steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + shell: bash + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." + env: + GH_TOKEN: ${{ inputs.gh-token }} + + - name: Add comment to PR if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training + shell: bash + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" 
+ env: + GH_TOKEN: ${{ inputs.gh-token }} + + - name: Send Discord notification for failure + if: failure() && steps.check_pr.outputs.is_pr == 'false' + uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 + with: + webhook: ${{ inputs.son-of-jeeves-discord-webhook }} + status: ${{ job.status }} + title: "e2e-nvidia-l40s-x4" + description: | + Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌ + Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. + color: 0xCB2431 # Red color for failure + + - name: Send Discord notification for success + if: success() && steps.check_pr.outputs.is_pr == 'false' + uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 + with: + webhook: ${{ inputs.son-of-jeeves-discord-webhook }} + status: ${{ job.status }} + title: "e2e-nvidia-l40s-x4" + description: | + Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅ + Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. + color: 0x28A745 # Green color for success diff --git a/.github/actions/run-smoke/action.yml b/.github/actions/run-smoke/action.yml new file mode 100644 index 00000000..8c73603d --- /dev/null +++ b/.github/actions/run-smoke/action.yml @@ -0,0 +1,77 @@ +name: 'Run smoke tests' +description: 'Runs smoke tests' +inputs: + python-version: + required: true + description: >- + Python version to use. Must be in the form of "3.xx". 
+runs: + using: "composite" + steps: + - name: "Install packages" + shell: bash + run: | + cat /etc/os-release + sudo dnf install -y gcc gcc-c++ make git-core python${{ inputs.python-version }} python${{ inputs.python-version }}-devel + + - name: "Verify cuda environment is setup" + shell: bash + run: | + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64" + export PATH="${PATH}:${CUDA_HOME}/bin" + nvidia-smi + + # installs in $GITHUB_WORKSPACE/venv. + # only has to install Tox because Tox will do the other virtual environment management. + - name: "Setup Python virtual environment" + shell: bash + run: | + python${{ inputs.python-version }} -m venv --upgrade-deps venv + . venv/bin/activate + pip install tox -c constraints-dev.txt + + # flash-attn has a bug in the setup.py that causes pip to attempt + # installing it before torch is installed. This is a bug because their + # setup.py depends on importing the module, so it should have been listed + # in build_requires. Alas. See: + # https://github.com/Dao-AILab/flash-attention/pull/958 + - name: "Install torch and other unlisted build dependencies for flash-attn" + shell: bash + run: | + source venv/bin/activate + # The list is taken from the pull request linked above + pip install torch packaging setuptools wheel psutil ninja -c constraints-dev.txt + + - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies" + shell: bash + run: | + source venv/bin/activate + pip install tox-current-env + + - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps" + shell: bash + run: | + source venv/bin/activate + tox -e py3-smoke --print-deps-to-file=./deps.txt + pip_install="pip install -c constraints-dev.txt" + $pip_install -r ./deps.txt --no-build-isolation + $pip_install . 
+ + - name: "Show disk utilization BEFORE tests" + shell: bash + if: always() + run: | + df -h + + - name: "Run smoke tests with Tox and Pytest" + shell: bash + run: | + source venv/bin/activate + tox --current-env -e py3-smoke + + - name: "Show disk utilization AFTER tests" + shell: bash + if: always() + run: | + df -h diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 05b26f59..85c1d1b7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,9 +13,3 @@ updates: directory: "/.github/workflows" schedule: interval: "daily" - - # Maintain dependencies for Python scripts - - package-ecosystem: "pip" - directory: "/" - schedule: - interval: "daily" diff --git a/.github/mergify.yml b/.github/mergify.yml index 0c4a9f45..e82bca96 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -11,7 +11,6 @@ pull_request_rules: - label!=hold - label!=do-not-merge - label!=needs-rebase - - check-success=DCO # The files conditions regex should match the globs in workflow files # If workflow configuration files in .github/ are changed, the actionlint check must pass diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 4606dde9..e7b8ff24 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -29,11 +29,6 @@ jobs: actionlint: runs-on: ubuntu-latest steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - - name: "Checkout" uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: diff --git a/.github/workflows/constraints-update.yml b/.github/workflows/constraints-update.yml new file mode 100644 index 00000000..ef2f60cd --- /dev/null +++ b/.github/workflows/constraints-update.yml @@ -0,0 +1,35 @@ +# Aligned with: https://github.com/instructlab/dev-docs/pull/198 +name: Update constraints-dev.txt + +on: + 
schedule: + - cron: '0 3 * * 1' # Every Monday at 03:00 UTC + workflow_dispatch: + +jobs: + update-constraints: + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Checkout "update-constraints" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: instructlab/ci-actions + path: ci-actions + # no tag that includes https://github.com/instructlab/ci-actions/pull/26, yet + ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main + sparse-checkout: | + actions/update-constraints + + - name: Update constraints + id: update-constraints + uses: ./ci-actions/actions/update-constraints + with: + gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 6ed92091..a62b38de 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,15 +32,11 @@ jobs: markdown-lint: runs-on: ubuntu-latest steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - name: "Checkout" uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - name: "Check Markdown documents" - uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0 + uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0 with: globs: '**/*.md' diff --git a/.github/workflows/e2e-nvidia-l40s-x4-py312.yml b/.github/workflows/e2e-nvidia-l40s-x4-py312.yml new file mode 100644 index 00000000..e78dce4d --- /dev/null +++ b/.github/workflows/e2e-nvidia-l40s-x4-py312.yml @@ -0,0 +1,221 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: E2E (NVIDIA L40S x4) (python 3.12) + +on: + 
schedule: + - cron: '0 16 * * *' # Runs at 4PM UTC every day + workflow_dispatch: + inputs: + pr_or_branch: + description: 'pull request number or branch name' + required: true + default: 'main' + +env: + TMPDIR: /home/tmp + +jobs: + start-large-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} + ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} + ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} + steps: + - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: instructlab/ci-actions + # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents + path: ci-actions + ref: release-v0.1 + sparse-checkout: | + actions/launch-ec2-runner-with-fallback + + - name: Launch EC2 Runner with Fallback + id: launch-ec2-instance-with-fallback + uses: ./ci-actions/actions/launch-ec2-runner-with-fallback + env: + TMPDIR: "/tmp" + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + regions_config: > + [ + { + "region": "us-east-2", + "subnets": { + "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", + "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", + "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" + }, + { + "region": "us-east-1", + "subnets": { + "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", + "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", + "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", + "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", + "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", + "us-east-1f": "${{ 
vars.SUBNET_US_EAST_1F }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" + } + ] + try_spot_instance_first: false + ec2_instance_type: g6e.12xlarge + aws_resource_tags: > + [ + {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} + ] + + e2e-large-test: + needs: + - start-large-ec2-runner + runs-on: ${{ needs.start-large-ec2-runner.outputs.label }} + + permissions: + pull-requests: write + + steps: + - name: Checkout instructlab/training + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/training" + path: "training" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Run e2e tests + uses: ./training/.github/actions/run-e2e + with: + python-version: "3.12" # quoted so YAML keeps the version a string, not a float + gh-token: ${{ secrets.GITHUB_TOKEN }} + hf-token: ${{ secrets.HF_TOKEN }} + openai-api-key: ${{ secrets.OPENAI_API_KEY }} + son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} + + stop-large-ec2-runner: + needs: + - start-large-ec2-runner + - e2e-large-test + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} + + - name: Stop EC2 runner + uses: machulav/ec2-github-runner@8b37f736c69ba6af391e437447d3c07548478d78 # v2.4.0 + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-large-ec2-runner.outputs.label }} + ec2-instance-id: ${{ 
needs.start-large-ec2-runner.outputs.ec2-instance-id }} + + loss-graphs: + needs: + - stop-large-ec2-runner + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Download loss data Phase 1 + id: phase-1-download-logs + uses: actions/download-artifact@v4 + with: + name: phase-1-training-log.jsonl + path: downloaded-data + + - name: Download loss data Phase 2 + id: phase-2-download-logs + uses: actions/download-artifact@v4 + with: + name: phase-2-training-log.jsonl + path: downloaded-data + + - name: Checkout instructlab/training + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/training" + path: "training" + fetch-depth: 0 + + - name: Install dependencies + working-directory: ./training + run: | + python -m pip install --upgrade pip + pip install -r requirements-dev.txt -c constraints-dev.txt + + - name: Try to upload Phase 1 to s3 + id: phase-1-upload-s3 + continue-on-error: true + run: | + python training/scripts/create-loss-graph.py \ + --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \ + --output-file "./phase-1-test.md" \ + --phase "1" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${GITHUB_REF##*/}" \ + --head-sha "${{ github.sha }}" \ + --pr-number "${{ github.event.number }}" \ + --origin-repository "${{ github.repository }}" + + - name: Try to upload Phase 2 to s3 + id: phase-2-upload-s3 + continue-on-error: true + run: | + python training/scripts/create-loss-graph.py \ + --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \ + 
--output-file "./phase-2-test.md" \ + --phase "2" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${GITHUB_REF##*/}" \ + --head-sha "${{ github.sha }}" \ + --pr-number "${{ github.event.number }}" \ + --origin-repository "${{ github.repository }}" + + - name: Check Phase 1 S3 upload status for success + if: steps.phase-1-upload-s3.outcome == 'success' + run: | + echo "Uploaded Phase 1 loss graph to S3." + cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for success + if: steps.phase-2-upload-s3.outcome == 'success' + run: | + echo "Uploaded Phase 2 loss graph to S3." + cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 1 S3 upload status for failure + if: steps.phase-1-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for failure + if: steps.phase-2-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate." 
+ echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" diff --git a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml new file mode 100644 index 00000000..e1c8313b --- /dev/null +++ b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml @@ -0,0 +1,295 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: E2E (NVIDIA L40S x4) SDK Test + +on: + # manual trigger only: workflow_dispatch is the sole event configured here (no pull_request trigger or path filter) + workflow_dispatch: + inputs: + pr_or_branch: + description: 'pull request number or branch name' + required: true + default: 'main' + +env: + TMPDIR: /home/tmp + +jobs: + start-large-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} + ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} + ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} + steps: + - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: instructlab/ci-actions + # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents + path: ci-actions + ref: release-v0.1 + sparse-checkout: | + actions/launch-ec2-runner-with-fallback + + - name: Launch EC2 Runner with Fallback + id: launch-ec2-instance-with-fallback + uses: ./ci-actions/actions/launch-ec2-runner-with-fallback + env: + TMPDIR: "/tmp" + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + regions_config: > + [ + { + "region": "us-east-2", + "subnets": { + "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", + "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", + "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", + "security-group-id": "${{ 
vars.SECURITY_GROUP_ID_US_EAST_2 }}" + }, + { + "region": "us-east-1", + "subnets": { + "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", + "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", + "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", + "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", + "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", + "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" + } + ] + try_spot_instance_first: false + ec2_instance_type: g6e.12xlarge + aws_resource_tags: > + [ + {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} + ] + + e2e-medium-test: + needs: + - start-large-ec2-runner + runs-on: ${{ needs.start-large-ec2-runner.outputs.label }} + + permissions: + pull-requests: write + + steps: + - name: Install Packages + run: | + cat /etc/os-release + mkdir -p "${TMPDIR}" + sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Install dependent PRs if needed + uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Fetch and checkout PR + if: ${{ github.event_name == 'pull_request_target' }} + run: | + git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }} + git checkout pr-${{ github.event.number }} + + - name: Update instructlab-training library + run: | + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + export PATH="$PATH:$CUDA_HOME/bin" + nvidia-smi + 
python3.11 -m venv --upgrade-deps venv + . venv/bin/activate + pip install instructlab + pip install instructlab[cuda] + pip install vllm + python3.11 -m pip install packaging wheel setuptools-scm + pip install . + pip install .[cuda] + python3.11 -m pip uninstall -y flash-attn + python3.11 -m pip cache purge + python3.11 -m pip install ninja + MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation + + - name: Check disk before tests + run: | + df -h + + # TODO: switch to downloading a ds rather than generating one + # - name: Download SDG Dataset + # working-directory: ./training + # uses: actions/download-artifact@v4 + # with: + # name: sdg-dataset.jsonl + # path: dataset + + - name: Run e2e test + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + . venv/bin/activate + ls scripts + ls ./ + ./scripts/test-sdk.sh + + # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python + # and we know that it will be written into a directory created by `mktemp -d`. 
+ # Given this information, we can use the following command to find the file: + log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl") + phase_num=1; + for log_file in $log_files; do + mv "${log_file}" phase-${phase_num}-training-log.jsonl + ((phase_num++)) + done + + - name: Check disk after tests + run: | + df -h + + - name: Upload training logs Phase 1 + uses: actions/upload-artifact@v4 + with: + name: phase-1-training-log.jsonl + path: ./phase-1-training-log.jsonl + retention-days: 1 + overwrite: true + + - name: Upload training logs Phase 2 + uses: actions/upload-artifact@v4 + with: + name: phase-2-training-log.jsonl + path: ./phase-2-training-log.jsonl + retention-days: 1 + overwrite: true + + stop-large-ec2-runner: + needs: + - start-large-ec2-runner + - e2e-medium-test + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} # region the launch job actually used; the fallback launcher may pick us-east-2 or us-east-1 + + - name: Stop EC2 runner + uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9 + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-large-ec2-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} + + loss-graphs: + needs: + - stop-large-ec2-runner + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + # 
https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-dev.txt + + - name: Download loss data Phase 1 + id: phase-1-download-logs + uses: actions/download-artifact@v4 + with: + name: phase-1-training-log.jsonl + path: downloaded-data + + - name: Download loss data Phase 2 + id: phase-2-download-logs + uses: actions/download-artifact@v4 + with: + name: phase-2-training-log.jsonl + path: downloaded-data + + - name: Try to upload Phase 1 to s3 + id: phase-1-upload-s3 + continue-on-error: true + run: | + python ./scripts/create-loss-graph.py \ + --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \ + --output-file "./phase-1-test.md" \ + --phase "1" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${GITHUB_REF##*/}" \ + --head-sha "${{ github.sha }}" \ + --pr-number "${{ github.event.number }}" \ + --origin-repository "${{ github.repository }}" + + - name: Try to upload Phase 2 to s3 + id: phase-2-upload-s3 + continue-on-error: true + run: | + python ./scripts/create-loss-graph.py \ + --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \ + --output-file "./phase-2-test.md" \ + --phase "2" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${GITHUB_REF##*/}" \ + --head-sha "${{ github.sha }}" \ + --pr-number "${{ github.event.number }}" \ + --origin-repository "${{ github.repository }}" + + - name: Check Phase 1 S3 upload status for success + if: steps.phase-1-upload-s3.outcome == 'success' + run: | + echo "Uploaded Phase 1 loss graph to S3." 
+ cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for success + if: steps.phase-2-upload-s3.outcome == 'success' + run: | + echo "Uploaded Phase 2 loss graph to S3." + cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 1 S3 upload status for failure + if: steps.phase-1-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for failure + if: steps.phase-2-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 57d213b9..f464783c 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -name: E2E (NVIDIA L40S x4) +name: E2E (NVIDIA L40S x4) (python 3.11) on: schedule: @@ -19,34 +19,58 @@ jobs: start-large-ec2-runner: runs-on: ubuntu-latest outputs: - label: ${{ steps.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} + ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} + ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} steps: - - name: "Harden Runner" - # v2.10.1 - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf + - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - egress-policy: audit + 
repository: instructlab/ci-actions + # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents + path: ci-actions + ref: release-v0.1 + sparse-checkout: | + actions/launch-ec2-runner-with-fallback - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Start EC2 runner - id: start-ec2-runner - uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9 + - name: Launch EC2 Runner with Fallback + id: launch-ec2-instance-with-fallback + uses: ./ci-actions/actions/launch-ec2-runner-with-fallback + env: + TMPDIR: "/tmp" with: - mode: start - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - ec2-image-id: ${{ vars.AWS_EC2_AMI }} - ec2-instance-type: g6e.12xlarge - subnet-id: subnet-024298cefa3bedd61 - security-group-id: sg-06300447c4a5fbef3 - iam-role-name: instructlab-ci-runner - aws-resource-tags: > + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + regions_config: > + [ + { + "region": "us-east-2", + "subnets": { + "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", + "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", + "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" + }, + { + "region": "us-east-1", + "subnets": { + "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", + "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", + "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", + "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", + "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", + "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" 
+ }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" + } + ] + try_spot_instance_first: false + ec2_instance_type: g6e.12xlarge + aws_resource_tags: > [ {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, @@ -63,25 +87,6 @@ jobs: pull-requests: write steps: - - name: "Harden Runner" - # v2.10.1 - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf - with: - egress-policy: audit - - name: Install Packages - run: | - cat /etc/os-release - mkdir -p "${TMPDIR}" - sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel - - - name: Checkout instructlab/instructlab - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: "instructlab/instructlab" - path: "instructlab" - # https://github.com/actions/checkout/issues/249 - fetch-depth: 0 - - name: Checkout instructlab/training uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -90,203 +95,14 @@ jobs: # https://github.com/actions/checkout/issues/249 fetch-depth: 0 - - name: Determine if pr_or_branch is a PR number - id: check_pr - run: | - PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set - if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then - echo "is_pr=true" >> "$GITHUB_OUTPUT" - else - echo "is_pr=false" >> "$GITHUB_OUTPUT" - fi - echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" - - - name: Check if gh cli is installed - id: gh_cli - run: | - if command -v gh &> /dev/null ; then - echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" - else - echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" - fi - - - name: Install gh CLI - if: steps.gh_cli.outputs.gh_cli_installed == 'false' - run: | - sudo dnf install 'dnf-command(config-manager)' -y - sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - sudo dnf install 
gh --repo gh-cli -y - - - name: test gh CLI - run: | - gh --version - - - name: set default repo - working-directory: ./training - run: | - gh repo set-default ${{ github.server_url }}/${{ github.repository }} - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Add comment to PR - if: steps.check_pr.outputs.is_pr == 'true' - working-directory: ./training - run: | - gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Fetch and checkout PR - if: steps.check_pr.outputs.is_pr == 'true' - working-directory: ./training - run: | - gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Checkout branch - if: steps.check_pr.outputs.is_pr == 'false' - working-directory: ./training - run: | - git checkout ${{ steps.check_pr.outputs.pr_or_branch }} - - - name: Install ilab - working-directory: ./instructlab - run: | - export CUDA_HOME="/usr/local/cuda" - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" - export PATH="$PATH:$CUDA_HOME/bin" - python3.11 -m venv --upgrade-deps venv - . venv/bin/activate - nvidia-smi - python3.11 -m pip cache remove llama_cpp_python - - CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install . - - # https://github.com/instructlab/instructlab/issues/1821 - # install with Torch and build dependencies installed - python3.11 -m pip install packaging wheel setuptools-scm - python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt - - - name: Update instructlab-training library - working-directory: ./training - run: | - . ../instructlab/venv/bin/activate - pip install . 
- pip install .[cuda] - - - name: Check disk before tests - run: | - df -h - - - name: Run e2e test - working-directory: ./instructlab - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: | - . venv/bin/activate - - # set preserve to true so we can retain the logs - ./scripts/e2e-ci.sh -lp - - # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python - # and we know that it will be written into a directory created by `mktemp -d`. - # Given this information, we can use the following command to find the file: - log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl") - phase_num=1; - for log_file in $log_files; do - mv "${log_file}" phase-${phase_num}-training-log.jsonl - ((phase_num++)) - done - - - name: Check disk after tests - run: | - df -h - - - name: Upload training logs Phase 1 - uses: actions/upload-artifact@v4 + - name: Run e2e tests + uses: ./training/.github/actions/run-e2e with: - name: phase-1-training-log.jsonl - path: ./instructlab/phase-1-training-log.jsonl - retention-days: 1 - overwrite: true - - - name: Upload training logs Phase 2 - uses: actions/upload-artifact@v4 - with: - name: phase-2-training-log.jsonl - path: ./instructlab/phase-2-training-log.jsonl - retention-days: 1 - overwrite: true - - - name: Add comment to PR if the workflow failed - if: failure() && steps.check_pr.outputs.is_pr == 'true' - working-directory: ./training - run: | - gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." 
- env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Add comment to PR if the workflow succeeded - if: success() && steps.check_pr.outputs.is_pr == 'true' - working-directory: ./training - run: | - gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Post job results to Slack if the workflow failed - if: failure() && steps.check_pr.outputs.is_pr == 'false' - id: slack-report-failure - uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 - with: - token: ${{ secrets.SON_OF_JEEVES_TOKEN }} - method: chat.postMessage - payload: | - # Slack channel id, channel name, or user id to post message. - # See also: https://api.slack.com/methods/chat.postMessage#channels - # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. - channel: 'e2e-ci-results' - text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - - - name: Post job results to Slack if the workflow succeeded - if: success() && steps.check_pr.outputs.is_pr == 'false' - id: slack-report-success - uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0 - with: - token: ${{ secrets.SON_OF_JEEVES_TOKEN }} - method: chat.postMessage - payload: | - # Slack channel id, channel name, or user id to post message. - # See also: https://api.slack.com/methods/chat.postMessage#channels - # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. 
- channel: 'e2e-ci-results' - text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - - - name: Send Discord notification for failure - if: failure() && steps.check_pr.outputs.is_pr == 'false' - uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 - with: - webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} - status: ${{ job.status }} - title: "e2e-nvidia-l40s-x4" - description: | - Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌ - Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. - color: 0xCB2431 # Red color for failure - - - name: Send Discord notification for success - if: success() && steps.check_pr.outputs.is_pr == 'false' - uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 - with: - webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} - status: ${{ job.status }} - title: "e2e-nvidia-l40s-x4" - description: | - Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅ - Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 
- color: 0x28A745 # Green color for success + python-version: 3.11 + gh-token: ${{ secrets.GITHUB_TOKEN }} + hf-token: ${{ secrets.HF_TOKEN }} + openai-api-key: ${{ secrets.OPENAI_API_KEY }} + son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} stop-large-ec2-runner: needs: @@ -295,21 +111,15 @@ jobs: runs-on: ubuntu-latest if: ${{ always() }} steps: - - name: "Harden Runner" - # v2.10.1 - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf - with: - egress-policy: audit - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} + aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} - name: Stop EC2 runner - uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9 + uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 with: mode: stop github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} @@ -322,14 +132,8 @@ jobs: runs-on: ubuntu-latest if: ${{ always() }} steps: - - name: "Harden Runner" - # v2.10.1 - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf - with: - egress-policy: audit - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -360,7 +164,7 @@ jobs: working-directory: ./training run: | python -m pip install --upgrade pip - pip install -r requirements-dev.txt + pip install -r requirements-dev.txt -c 
constraints-dev.txt - name: Try to upload Phase 1 to s3 id: phase-1-upload-s3 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6327295e..2c8df16d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,6 +11,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements*.txt' + - 'constraints-dev.txt' - 'tox.ini' - '.pylintrc' - 'scripts/*.sh' # Used by this workflow @@ -23,6 +24,7 @@ on: - '**.py' - 'pyproject.toml' - 'requirements*.txt' + - 'constraints-dev.txt' - 'tox.ini' - '.pylintrc' - 'scripts/*.sh' # Used by this workflow @@ -57,11 +59,6 @@ jobs: commands: | tox -e mypy steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - - name: "Checkout" uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -69,7 +66,7 @@ jobs: fetch-depth: 0 - name: Setup Python 3.11 - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: 3.11 cache: pip @@ -80,7 +77,7 @@ jobs: - name: Install tox run: | python -m pip install --upgrade pip - python -m pip install tox tox-gh + python -m pip install tox tox-gh -c constraints-dev.txt - name: "${{ matrix.lint.name }}" run: | diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 7343ab1c..69a41135 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -36,11 +36,6 @@ jobs: name: Build and check packages runs-on: ubuntu-latest steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - - name: "Checkout" uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -65,11 +60,6 
@@ jobs: needs: build-package steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - - name: "Download build artifacts" uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: @@ -97,11 +87,6 @@ jobs: needs: build-package steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - - name: "Download build artifacts" uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: diff --git a/.github/workflows/smoke-py312.yaml b/.github/workflows/smoke-py312.yaml new file mode 100644 index 00000000..400146d1 --- /dev/null +++ b/.github/workflows/smoke-py312.yaml @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: "Run smoke tests via Tox::pytest (python 3.12)" +# These tests will be long running and require accelerated hardware. + +on: + workflow_dispatch: + inputs: + branch: + type: string + default: main + # using this rather than pull_request because this workflow + # needs to run in the context of the base branch (main) and + # access the repo's secrets to start the AWS instances. 
+ pull_request_target: + branches: + - main + - release-* + paths: + # note this should match the merging criteria in 'mergify.yml' + - "**.py" + - "tox.ini" + - "pyproject.toml" + - "requirements-dev.txt" + - "requirements-cuda.txt" + - "constraints-dev.txt" + +permissions: + contents: read + +defaults: + run: + shell: bash + +env: + ec2_runner_variant: "g6e.12xlarge" # 4x L40s + +jobs: + start-large-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} + ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} + ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} + steps: + - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: instructlab/ci-actions + # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents + path: ci-actions + ref: release-v0.1 + sparse-checkout: | + actions/launch-ec2-runner-with-fallback + + - name: Launch EC2 Runner with Fallback + id: launch-ec2-instance-with-fallback + uses: ./ci-actions/actions/launch-ec2-runner-with-fallback + env: + TMPDIR: "/tmp" + with: + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + regions_config: > + [ + { + "region": "us-east-2", + "subnets": { + "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", + "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", + "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" + }, + { + "region": "us-east-1", + "subnets": { + "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", + "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", + "us-east-1c": "${{ 
vars.SUBNET_US_EAST_1C }}", + "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", + "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", + "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" + } + ] + try_spot_instance_first: false + ec2_instance_type: g6e.12xlarge + aws_resource_tags: > + [ + {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} + ] + + run-smoke-tests: + needs: + - start-large-ec2-runner + runs-on: ${{needs.start-large-ec2-runner.outputs.label}} + # It is important that this job has no write permissions and has + # no access to any secrets. This part is where we are running + # untrusted code from PRs. + permissions: {} + steps: + - name: "Checkout code" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + ref: ${{inputs.branch}} + + - name: Run smoke tests + uses: ./.github/actions/run-smoke + with: + python-version: 3.12 + + stop-large-ec2-runner: + needs: + - start-large-ec2-runner + - run-smoke-tests + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: "Configure AWS credentials" + uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} + + - name: "Stop EC2 runner" + uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-large-ec2-runner.outputs.label }} + ec2-instance-id: ${{ 
needs.start-large-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml index 7818adfd..bedeeb3b 100644 --- a/.github/workflows/smoke.yaml +++ b/.github/workflows/smoke.yaml @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 -name: "Run smoke tests via Tox::pytest" +name: "Run smoke tests via Tox::pytest (python 3.11)" # These tests will be long running and require accelerated hardware. on: - workflow_dispatch: - inputs: - branch: - type: string - default: main + workflow_dispatch: {} # using this rather than pull_request because this workflow # needs to run in the context of the base branch (main) and # access the repo's secrets to start the AWS instances. @@ -23,7 +19,7 @@ on: - "pyproject.toml" - "requirements-dev.txt" - "requirements-cuda.txt" - - ".github/workflows/smoke.yaml" # This workflow + - "constraints-dev.txt" permissions: contents: read @@ -36,129 +32,112 @@ env: ec2_runner_variant: "g6e.12xlarge" # 4x L40s jobs: - start-ec2-runner: + start-large-ec2-runner: runs-on: ubuntu-latest outputs: - label: ${{ steps.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}} - + label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} + ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} + ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} steps: - - name: "Harden runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1 - with: - egress-policy: audit - - - name: "Configure AWS credentials" - uses: "aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722" # v4.1.0 + - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ 
secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} - - - name: "Start EC2 runner" - id: start-ec2-runner - uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9 + repository: instructlab/ci-actions + # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents + path: ci-actions + ref: release-v0.1 + sparse-checkout: | + actions/launch-ec2-runner-with-fallback + + - name: Launch EC2 Runner with Fallback + id: launch-ec2-instance-with-fallback + uses: ./ci-actions/actions/launch-ec2-runner-with-fallback + env: + TMPDIR: "/tmp" with: - mode: start - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - ec2-image-id: ${{ vars.AWS_EC2_AMI }} - ec2-instance-type: ${{ env.ec2_runner_variant }} - subnet-id: subnet-024298cefa3bedd61 - security-group-id: sg-06300447c4a5fbef3 - iam-role-name: instructlab-ci-runner - aws-resource-tags: > + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + regions_config: > + [ + { + "region": "us-east-2", + "subnets": { + "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", + "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", + "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" + }, + { + "region": "us-east-1", + "subnets": { + "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", + "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", + "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", + "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", + "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", + "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" + }, + "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", + "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" + } + ] + try_spot_instance_first: false + ec2_instance_type: g6e.12xlarge + 
aws_resource_tags: > [ - {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"}, - {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, - {"Key": "GitHubRef", "Value": "${{ github.ref }}"} + {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} ] run-smoke-tests: needs: - - start-ec2-runner - runs-on: ${{needs.start-ec2-runner.outputs.label}} + - start-large-ec2-runner + runs-on: ${{needs.start-large-ec2-runner.outputs.label}} # It is important that this job has no write permissions and has # no access to any secrets. This part is where we are running # untrusted code from PRs. permissions: {} steps: - - name: "Harden runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1 - with: - egress-policy: audit - - - name: "Install packages" - run: | - cat /etc/os-release - sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel - - - name: "Verify cuda environment is setup" - run: | - export CUDA_HOME="/usr/local/cuda" - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64" - export PATH="${PATH}:${CUDA_HOME}/bin" - nvidia-smi - - name: "Checkout code" uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - ref: ${{inputs.branch}} - - # installs in $GITHUB_WORKSPACE/venv. - # only has to install Tox because Tox will do the other virtual environment management. - - name: "Setup Python virtual environment" - run: | - python3.11 -m venv --upgrade-deps venv - . venv/bin/activate - pip install tox - - # flash-attn has a bug in the setup.py that causes pip to attempt - # installing it before torch is installed. 
This is a bug because their - # setup.py depends on importing the module, so it should have been listed - # in build_requires. Alas. - # See: https://github.com/Dao-AILab/flash-attention/pull/958 - - name: "Install torch before other dependencies" - run: | - source venv/bin/activate - pip install torch - - name: "Show disk utilization BEFORE tests" + - name: "Fetch and checkout PR" + # Needed because this workflow runs on pull_request_target which runs on the base branch (e.g. main) + if: ${{ github.event_name == 'pull_request_target'}} run: | - df -h + git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }} + git checkout pr-${{ github.event.number }} - - name: "Run smoke tests with Tox and Pytest" - run: | - source venv/bin/activate - tox -e py3-smoke - - - name: "Show disk utilization AFTER tests" - run: | - df -h + - name: Run smoke tests + uses: ./.github/actions/run-smoke + with: + python-version: 3.11 - stop-ec2-runner: + stop-large-ec2-runner: needs: - - start-ec2-runner + - start-large-ec2-runner - run-smoke-tests runs-on: ubuntu-latest if: ${{ always() }} steps: - - name: "Harden runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1 - with: - egress-policy: audit - - name: "Configure AWS credentials" - uses: "aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722" # v4.1.0 + uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} + aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} - name: "Stop EC2 runner" - uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9 + uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 with: mode: stop github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - 
label: ${{ needs.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} + label: ${{ needs.start-large-ec2-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml index 11499515..89b466dc 100644 --- a/.github/workflows/stale_bot.yml +++ b/.github/workflows/stale_bot.yml @@ -23,11 +23,6 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - - name: "Harden Runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1 - with: - egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - - name: "Stale Action" uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 with: diff --git a/.github/workflows/unit.yaml b/.github/workflows/unit.yaml index 64f02ca5..c317cafd 100644 --- a/.github/workflows/unit.yaml +++ b/.github/workflows/unit.yaml @@ -20,6 +20,7 @@ on: - "pyproject.toml" - "requirements.txt" - "requirements-dev.txt" + - "constraints-dev.txt" - ".github/workflows/unit.yaml" # This workflow concurrency: @@ -51,13 +52,8 @@ jobs: # untrusted code from PRs. permissions: {} steps: - - name: "Harden runner" - uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1 - with: - egress-policy: audit - - name: Setup Python ${{ matrix.python }} - uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: "${{ matrix.python }}" @@ -72,9 +68,10 @@ jobs: run: | python -m venv --upgrade-deps venv . venv/bin/activate - pip install tox + pip install tox -c constraints-dev.txt - name: "Show disk utilization BEFORE tests" + if: always() run: | df -h @@ -84,5 +81,6 @@ jobs: tox -e py3-unit - name: "Show disk utilization AFTER tests" + if: always() run: | df -h