diff --git a/.github/actions/run-e2e/action.yml b/.github/actions/run-e2e/action.yml
new file mode 100644
index 00000000..37aaaa5b
--- /dev/null
+++ b/.github/actions/run-e2e/action.yml
@@ -0,0 +1,230 @@
+name: "Run e2e tests"
+description: "Runs e2e tests"
+inputs:  # every input is required; the calling workflow forwards its secrets through these
+  python-version:
+    description: >-
+      Python version to use. Must be in the form of "3.xx".
+    required: true
+  gh-token:
+    description: >-
+      GitHub token to use for authentication.
+    required: true
+  hf-token:
+    description: >-
+      Hugging Face token to use for authentication.
+    required: true
+  openai-api-key:
+    description: >-
+      OpenAI API key to use for authentication.
+    required: true
+  son-of-jeeves-discord-webhook:
+    description: >-
+      Son of Jeeves webhook (Discord).
+    required: true
+runs:
+  using: composite
+  steps:
+    - name: Install Packages
+      run: |
+        cat /etc/os-release
+        mkdir -p "$TMPDIR"  # TMPDIR is not defined by this action; it comes from the caller's environment
+        sudo dnf install -y gcc gcc-c++ make git python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
+      shell: bash
+
+    - name: Checkout instructlab/instructlab
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        path: instructlab
+        repository: instructlab/instructlab
+        # fetch-depth 0 fetches all history; see https://github.com/actions/checkout/issues/249
+        fetch-depth: 0
+
+    - name: Determine if pr_or_branch is a PR number
+      id: check_pr
+      shell: bash
+      env:
+        # Passed through env instead of being inlined into the script, so the input can never be parsed as shell code; defaults to 'main' when unset.
+        PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
+      run: |
+        is_pr=false  # an all-digit value is treated as a PR number, anything else as a branch name
+        if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then is_pr=true; fi
+        echo "is_pr=${is_pr}" >> "$GITHUB_OUTPUT"
+        echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
+
+    - name: Check if gh cli is installed
+      id: gh_cli  # its gh_cli_installed output gates the "Install gh CLI" step
+      shell: bash
+      run: |
+        gh_present=false
+        if command -v gh > /dev/null 2>&1; then
+          gh_present=true
+        fi
+        echo "gh_cli_installed=${gh_present}" >> "$GITHUB_OUTPUT"
+
+    - name: Install gh CLI
+      shell: bash
+      if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+      run: |
+        sudo dnf install -y 'dnf-command(config-manager)'  # provides the `dnf config-manager` subcommand used next
+        sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+        sudo dnf install -y gh --repo gh-cli
+
+    - name: test gh CLI
+      run: |
+        gh --version
+      shell: bash
+
+    - name: set default repo
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+
+    - name: Add comment to PR
+      if: steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+
+    - name: Fetch and checkout PR
+      if: steps.check_pr.outputs.is_pr == 'true'  # pr_or_branch is all digits on this path
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+
+    - name: Checkout branch
+      if: steps.check_pr.outputs.is_pr == 'false'
+      working-directory: ./training
+      shell: bash
+      env: { TARGET_BRANCH: "${{ steps.check_pr.outputs.pr_or_branch }}" }  # via env so the branch name is never parsed as shell code
+      run: git checkout "${TARGET_BRANCH}"
+
+    - name: Install ilab
+      shell: bash
+      working-directory: ./instructlab  # the instructlab/instructlab checkout made earlier in this action
+      run: |
+        PYTHON=python${{ inputs.python-version }} ./scripts/install-ilab-with-cuda.sh
+
+    - name: Update instructlab-training library
+      working-directory: ./training
+      shell: bash
+      run: |
+        . ../instructlab/venv/bin/activate
+
+        # Patch out our own pin from the ilab repo constraints file
+        ilab_constraints=../instructlab/constraints-dev.txt
+        sed -i '/instructlab-training==/d' "$ilab_constraints"
+
+        # Since we reuse the virtual environment prepared using ilab constraints,
+        # we should stick to the same constraints when installing latest training.
+        #
+        # FIXME: this is not ideal; a proper fix would require decoupling the two
+        # repos in CI: either by removing the job completely and relying on "sdk"
+        # (no ilab) test runs; or by preparing a separate constraints file that
+        # considers the requirements files of both the training library AND ilab,
+        # so that they are consistent.
+        pip_install="pip install -c $ilab_constraints"
+        $pip_install .
+        # The extras spec is quoted: unquoted, bash treats .[cuda] as a glob and would
+        # rewrite it if a file named .a, .c, .d or .u exists in the working directory.
+        $pip_install '.[cuda]'
+
+    - name: Check disk before tests
+      shell: bash
+      if: always()  # report disk usage even when an earlier step failed
+      run: |
+        df -h
+
+    - name: Run e2e test
+      working-directory: ./instructlab
+      env:
+        HF_TOKEN: ${{ inputs.hf-token }}
+        OPENAI_API_KEY: ${{ inputs.openai-api-key }}
+      shell: bash
+      run: |
+        . venv/bin/activate
+
+        # set preserve to true so we can retain the logs
+        ./scripts/e2e-ci.sh -lp
+
+        # The rank-0 metrics file is named like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python and is
+        # written into a `mktemp -d` directory, which is created under ${TMPDIR} when that is set; search it as well as /tmp.
+        log_files=$(find /tmp/ "${TMPDIR:-/tmp/}" -name "training_params_and_metrics_global0.jsonl")
+        phase_num=1
+        for log_file in $log_files; do
+          [[ -e "${log_file}" ]] || continue  # same file reached through both search roots; already moved
+          mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
+          ((phase_num++))
+        done
+
+    - name: Check disk after tests
+      shell: bash
+      if: always()  # report disk usage even when the tests failed
+      run: |
+        df -h
+
+    - name: Upload training logs Phase 1
+      uses: actions/upload-artifact@v4  # NOTE(review): tag-pinned, unlike the SHA-pinned actions elsewhere in this file
+      with:
+        path: ./instructlab/phase-1-training-log.jsonl
+        name: phase-1-training-log.jsonl
+        overwrite: true
+        retention-days: 1
+
+    - name: Upload training logs Phase 2
+      uses: actions/upload-artifact@v4  # NOTE(review): tag-pinned, unlike the SHA-pinned actions elsewhere in this file
+      with:
+        path: ./instructlab/phase-2-training-log.jsonl
+        name: phase-2-training-log.jsonl
+        overwrite: true
+        retention-days: 1
+
+    - name: Add comment to PR if the workflow failed
+      if: failure() && steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+
+    - name: Add comment to PR if the workflow succeeded
+      if: success() && steps.check_pr.outputs.is_pr == 'true'
+      shell: bash
+      working-directory: ./training
+      env:
+        GH_TOKEN: ${{ inputs.gh-token }}
+      run: |
+        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+
+    - name: Send Discord notification for failure
+      uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
+      if: failure() && steps.check_pr.outputs.is_pr == 'false'  # branch runs only; PR runs get a PR comment instead
+      with:
+        title: 'e2e-nvidia-l40s-x4'
+        description: |
+          Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
+          Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
+        color: 0xCB2431 # Red color for failure
+        status: ${{ job.status }}
+        webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
+
+    - name: Send Discord notification for success
+      uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
+      if: success() && steps.check_pr.outputs.is_pr == 'false'  # branch runs only; PR runs get a PR comment instead
+      with:
+        title: 'e2e-nvidia-l40s-x4'
+        description: |
+          Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
+          Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
+        color: 0x28A745 # Green color for success
+        status: ${{ job.status }}
+        webhook: ${{ inputs.son-of-jeeves-discord-webhook }}
diff --git a/.github/actions/run-smoke/action.yml b/.github/actions/run-smoke/action.yml
new file mode 100644
index 00000000..8c73603d
--- /dev/null
+++ b/.github/actions/run-smoke/action.yml
@@ -0,0 +1,77 @@
+name: "Run smoke tests"
+description: "Runs smoke tests"
+inputs:
+  python-version:
+    description: >-
+      Python version to use. Must be in the form of "3.xx".
+    required: true
+runs:
+  using: composite
+  steps:
+    - name: "Install packages"
+      run: |
+        cat /etc/os-release
+        sudo dnf install -y gcc gcc-c++ make git-core python${{ inputs.python-version }} python${{ inputs.python-version }}-devel
+      shell: bash
+
+    - name: "Verify cuda environment is setup"
+      run: |
+        export CUDA_HOME=/usr/local/cuda
+        export PATH="${PATH}:${CUDA_HOME}/bin"
+        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
+        nvidia-smi
+      shell: bash
+
+    # The venv is created at $GITHUB_WORKSPACE/venv.
+    # Only Tox is installed here because Tox does the other virtual environment management.
+    - name: "Setup Python virtual environment"
+      run: |
+        python${{ inputs.python-version }} -m venv --upgrade-deps venv
+        source venv/bin/activate
+        pip install -c constraints-dev.txt tox
+      shell: bash
+
+    # flash-attn's setup.py imports torch, but torch is not declared in its
+    # build requirements, so pip attempts to build flash-attn before torch is
+    # installed. Until that is fixed upstream, the build dependencies are
+    # installed into the venv by hand first. See:
+    # https://github.com/Dao-AILab/flash-attention/pull/958
+    - name: "Install torch and other unlisted build dependencies for flash-attn"
+      run: |
+        . venv/bin/activate
+        # Package list taken from the pull request linked above
+        pip install -c constraints-dev.txt torch packaging setuptools wheel psutil ninja
+      shell: bash
+
+    - name: "Install tox-current-env to reuse the venv with pre-installed build dependencies"
+      shell: bash
+      run: |
+        source venv/bin/activate
+        pip install tox-current-env -c constraints-dev.txt  # constrained like every other install in this action
+
+    - name: "Install dependencies from tox.ini in the current venv, using current venv installed deps"
+      run: |
+        . venv/bin/activate
+        tox -e py3-smoke --print-deps-to-file=./deps.txt
+        # Install the deps tox listed, then this project itself, both under the dev constraints.
+        pip install -c constraints-dev.txt -r ./deps.txt --no-build-isolation
+        pip install -c constraints-dev.txt .
+      shell: bash
+
+    - name: "Show disk utilization BEFORE tests"
+      if: always()  # report disk usage even when an earlier step failed
+      shell: bash
+      run: |
+        df -h
+
+    - name: "Run smoke tests with Tox and Pytest"
+      run: |
+        . venv/bin/activate
+        tox --current-env -e py3-smoke
+      shell: bash
+
+    - name: "Show disk utilization AFTER tests"
+      if: always()  # report disk usage even when the tests failed
+      shell: bash
+      run: |
+        df -h
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 05b26f59..85c1d1b7 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -13,9 +13,3 @@ updates:
     directory: "/.github/workflows"
     schedule:
       interval: "daily"
-
-  # Maintain dependencies for Python scripts
-  - package-ecosystem: "pip"
-    directory: "/"
-    schedule:
-      interval: "daily"
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 0c4a9f45..e82bca96 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -11,7 +11,6 @@ pull_request_rules:
     - label!=hold
     - label!=do-not-merge
     - label!=needs-rebase
-    - check-success=DCO
 
     # The files conditions regex should match the globs in workflow files
     # If workflow configuration files in .github/ are changed, the actionlint check must pass
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index 4606dde9..e7b8ff24 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -29,11 +29,6 @@ jobs:
   actionlint:
     runs-on: ubuntu-latest
     steps:
-      - name: "Harden Runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-        with:
-          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
       - name: "Checkout"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
diff --git a/.github/workflows/constraints-update.yml b/.github/workflows/constraints-update.yml
new file mode 100644
index 00000000..ef2f60cd
--- /dev/null
+++ b/.github/workflows/constraints-update.yml
@@ -0,0 +1,35 @@
+# Aligned with: https://github.com/instructlab/dev-docs/pull/198
+name: Update constraints-dev.txt
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 3 * * 1"  # Every Monday at 03:00 UTC
+
+jobs:
+  update-constraints:
+    permissions:
+      contents: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Checkout "update-constraints" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          path: ci-actions
+          repository: instructlab/ci-actions
+          # Pinned to a commit: no tag includes https://github.com/instructlab/ci-actions/pull/26 yet
+          ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main
+          sparse-checkout: |
+            actions/update-constraints
+
+      - name: Update constraints
+        uses: ./ci-actions/actions/update-constraints
+        id: update-constraints
+        with:
+          gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 6ed92091..a62b38de 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -32,15 +32,11 @@ jobs:
   markdown-lint:
     runs-on: ubuntu-latest
     steps:
-      - name: "Harden Runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-        with:
-          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: "Check Markdown documents"
-        uses: DavidAnson/markdownlint-cli2-action@05f32210e84442804257b2a6f20b273450ec8265 # v19.1.0
+        uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0
         with:
           globs: '**/*.md'
diff --git a/.github/workflows/e2e-nvidia-l40s-x4-py312.yml b/.github/workflows/e2e-nvidia-l40s-x4-py312.yml
new file mode 100644
index 00000000..e78dce4d
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x4-py312.yml
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x4) (python 3.12)
+
+on:
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        default: "main"
+        description: "pull request number or branch name"
+        required: true
+  schedule:
+    - cron: "0 16 * * *" # Runs at 4PM UTC every day
+
+env:
+  TMPDIR: /home/tmp
+
+jobs:
+  start-large-ec2-runner:  # launches the EC2 instance whose runner label the e2e job uses in `runs-on`
+    runs-on: ubuntu-latest
+    outputs:  # read by downstream jobs through needs.start-large-ec2-runner.outputs.*
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
+    steps:
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: instructlab/ci-actions
+          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
+          path: ci-actions
+          ref: release-v0.1  # NOTE(review): a branch/tag name, not a commit SHA
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
+
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
+        env:
+          TMPDIR: "/tmp"  # overrides the workflow-level TMPDIR (/home/tmp) for this step only
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: g6e.12xlarge
+          aws_resource_tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  e2e-large-test:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Checkout instructlab/training
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          # fetch-depth 0 fetches all history; see https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Run e2e tests
+        uses: ./training/.github/actions/run-e2e
+        with:
+          python-version: "3.12"  # quoted: a bare 3.12 is a YAML float, and e.g. 3.10 would collapse to 3.1
+          gh-token: ${{ secrets.GITHUB_TOKEN }}
+          hf-token: ${{ secrets.HF_TOKEN }}
+          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
+          son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
+
+  stop-large-ec2-runner:
+    runs-on: ubuntu-latest
+    if: ${{ always() }}  # stop the instance even when the e2e job failed
+    needs:
+      - start-large-ec2-runner
+      - e2e-large-test
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
+        with:
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@8b37f736c69ba6af391e437447d3c07548478d78 # v2.4.0
+        with:
+          mode: stop
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+
+  loss-graphs:
+    runs-on: ubuntu-latest
+    if: ${{ always() }}  # attempt the graphs even when an upstream job failed
+    needs:
+      - stop-large-ec2-runner
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
+        with:
+          aws-region: ${{ vars.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      - name: Download loss data Phase 1
+        uses: actions/download-artifact@v4
+        id: phase-1-download-logs
+        with:
+          path: downloaded-data
+          name: phase-1-training-log.jsonl
+
+      - name: Download loss data Phase 2
+        uses: actions/download-artifact@v4
+        id: phase-2-download-logs
+        with:
+          path: downloaded-data
+          name: phase-2-training-log.jsonl
+
+      - name: Checkout instructlab/training
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          path: training
+          repository: instructlab/training
+          fetch-depth: 0
+
+      - name: Install dependencies
+        working-directory: ./training
+        run: |
+          python -m pip install --upgrade pip
+          pip install -c constraints-dev.txt -r requirements-dev.txt
+
+      - name: Try to upload Phase 1 to s3
+        continue-on-error: true  # a failed upload is reported by a later step instead of failing the job
+        id: phase-1-upload-s3
+        run: |
+          python training/scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+            --output-file "./phase-1-test.md" \
+            --phase "1" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${GITHUB_REF##*/}" \
+            --head-sha "${{ github.sha }}" \
+            --pr-number "${{ github.event.number }}" \
+            --origin-repository "${{ github.repository }}"
+
+      - name: Try to upload Phase 2 to s3
+        continue-on-error: true  # a failed upload is reported by a later step instead of failing the job
+        id: phase-2-upload-s3
+        run: |
+          python training/scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+            --output-file "./phase-2-test.md" \
+            --phase "2" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${GITHUB_REF##*/}" \
+            --head-sha "${{ github.sha }}" \
+            --pr-number "${{ github.event.number }}" \
+            --origin-repository "${{ github.repository }}"
+
+      - name: Check Phase 1 S3 upload status for success
+        if: steps.phase-1-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 1 loss graph to S3."
+          cat ./phase-1-test.md >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Check Phase 2 S3 upload status for success
+        if: steps.phase-2-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 2 loss graph to S3."
+          cat ./phase-2-test.md >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Check Phase 1 S3 upload status for failure
+        if: steps.phase-1-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Check Phase 2 S3 upload status for failure
+        if: steps.phase-2-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
new file mode 100644
index 00000000..e1c8313b
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
@@ -0,0 +1,295 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x4) SDK Test
+
+on:
+  # manual trigger only: no pull_request, push or schedule events are configured for this workflow
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        default: "main"
+        description: "pull request number or branch name"
+        required: true
+
+env:
+  TMPDIR: /home/tmp
+
+jobs:
+  start-large-ec2-runner:  # launches the EC2 instance whose runner label the e2e job uses in `runs-on`
+    runs-on: ubuntu-latest
+    outputs:  # exposed to downstream jobs through needs.start-large-ec2-runner.outputs.*
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
+    steps:
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: instructlab/ci-actions
+          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
+          path: ci-actions
+          ref: release-v0.1  # NOTE(review): a branch/tag name, not a commit SHA
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
+
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
+        env:
+          TMPDIR: "/tmp"  # overrides the workflow-level TMPDIR (/home/tmp) for this step only
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: g6e.12xlarge
+          aws_resource_tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  e2e-medium-test:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          mkdir -p "${TMPDIR}"
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # fetch-depth 0 fetches all history; see https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Install dependent PRs if needed
+        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: ${{ github.event_name == 'pull_request_target' }}  # NOTE(review): never true while workflow_dispatch is the only trigger
+        run: |
+          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
+          git checkout pr-${{ github.event.number }}
+
+      - name: Update instructlab-training library
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          nvidia-smi
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          pip install instructlab
+          pip install 'instructlab[cuda]'  # extras specs are quoted so bash never treats [cuda] as a glob
+          pip install vllm
+          python3.11 -m pip install packaging wheel setuptools-scm
+          pip install .
+          pip install '.[cuda]'
+          python3.11 -m pip uninstall -y flash-attn
+          python3.11 -m pip cache purge
+          python3.11 -m pip install ninja
+          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation
+
+      - name: Check disk before tests
+        run: |
+          df -h
+
+      # TODO: switch to downloading a ds rather than generating one
+      # - name: Download SDG Dataset
+      #   working-directory: ./training
+      #   uses: actions/download-artifact@v4
+      #   with:
+      #     name: sdg-dataset.jsonl
+      #     path: dataset
+
+      - name: Run e2e test
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          . venv/bin/activate
+          ls scripts
+          ls ./
+          ./scripts/test-sdk.sh
+
+          # The rank-0 metrics file is named like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python and is
+          # written into a `mktemp -d` directory, which is created under ${TMPDIR} when that is set; search it as well as /tmp.
+          log_files=$(find /tmp/ "${TMPDIR:-/tmp/}" -name "training_params_and_metrics_global0.jsonl")
+          phase_num=1
+          for log_file in $log_files; do
+              [[ -e "${log_file}" ]] || continue  # same file reached through both search roots; already moved
+              mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
+              ((phase_num++))
+          done
+
+      - name: Check disk after tests
+        run: |
+          df -h
+
+      - name: Upload training logs Phase 1
+        uses: actions/upload-artifact@v4
+        with:
+          name: phase-1-training-log.jsonl
+          path: ./phase-1-training-log.jsonl
+          retention-days: 1
+          overwrite: true
+
+      - name: Upload training logs Phase 2
+        uses: actions/upload-artifact@v4
+        with:
+          name: phase-2-training-log.jsonl
+          path: ./phase-2-training-log.jsonl
+          retention-days: 1
+          overwrite: true
+
+  stop-large-ec2-runner:
+    needs:
+      - start-large-ec2-runner
+      - e2e-medium-test
+    runs-on: ubuntu-latest
+    if: ${{ always() }}  # stop the instance even when the e2e job failed
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}  # region the launcher actually used (it can fall back)
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
+
+  loss-graphs:
+    needs:
+      - stop-large-ec2-runner
+    runs-on: ubuntu-latest
+    if: ${{ always() }}  # not skipped when an upstream job failed
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-dev.txt
+
+      - name: Download loss data Phase 1
+        id: phase-1-download-logs
+        uses: actions/download-artifact@v4
+        with:
+          name: phase-1-training-log.jsonl  # artifact name used by the "Upload training logs Phase 1" step
+          path: downloaded-data
+
+      - name: Download loss data Phase 2
+        id: phase-2-download-logs
+        uses: actions/download-artifact@v4
+        with:
+          name: phase-2-training-log.jsonl  # artifact name used by the "Upload training logs Phase 2" step
+          path: downloaded-data
+
+      - name: Try to upload Phase 1 to s3
+        id: phase-1-upload-s3
+        continue-on-error: true  # a failure here is reported by the "status" steps below via .outcome
+        run: |
+          python ./scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+            --output-file "./phase-1-test.md" \
+            --phase "1" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${GITHUB_REF##*/}" \
+            --head-sha "${{ github.sha }}" \
+            --pr-number "${{ github.event.number }}" \
+            --origin-repository "${{ github.repository }}"
+
+      - name: Try to upload Phase 2 to s3
+        id: phase-2-upload-s3
+        continue-on-error: true  # a failure here is reported by the "status" steps below via .outcome
+        run: |
+          python ./scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+            --output-file "./phase-2-test.md" \
+            --phase "2" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${GITHUB_REF##*/}" \
+            --head-sha "${{ github.sha }}" \
+            --pr-number "${{ github.event.number }}" \
+            --origin-repository "${{ github.repository }}"
+
+      - name: Check Phase 1 S3 upload status for success
+        if: steps.phase-1-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 1 loss graph to S3."
+          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 2 S3 upload status for success
+        if: steps.phase-2-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 2 loss graph to S3."
+          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 1 S3 upload status for failure
+        if: steps.phase-1-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 2 S3 upload status for failure
+        if: steps.phase-2-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 57d213b9..f464783c 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: E2E (NVIDIA L40S x4)
+name: E2E (NVIDIA L40S x4) (python 3.11)
 
 on:
   schedule:
@@ -19,34 +19,58 @@ jobs:
   start-large-ec2-runner:
     runs-on: ubuntu-latest
     outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
     steps:
-      - name: "Harden Runner"
-        # v2.10.1
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          egress-policy: audit
+          repository: instructlab/ci-actions
+          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
+          path: ci-actions
+          ref: release-v0.1
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
 
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
+        env:
+          TMPDIR: "/tmp"
         with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
-          ec2-instance-type: g6e.12xlarge
-          subnet-id: subnet-024298cefa3bedd61
-          security-group-id: sg-06300447c4a5fbef3
-          iam-role-name: instructlab-ci-runner
-          aws-resource-tags: >
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: g6e.12xlarge
+          aws_resource_tags: >
             [
               {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
               {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
@@ -63,25 +87,6 @@ jobs:
       pull-requests: write
 
     steps:
-      - name: "Harden Runner"
-        # v2.10.1
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
-        with:
-          egress-policy: audit
-      - name: Install Packages
-        run: |
-          cat /etc/os-release
-          mkdir -p "${TMPDIR}"
-          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
-
-      - name: Checkout instructlab/instructlab
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: "instructlab/instructlab"
-          path: "instructlab"
-          # https://github.com/actions/checkout/issues/249
-          fetch-depth: 0
-  
       - name: Checkout instructlab/training
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -90,203 +95,14 @@ jobs:
           # https://github.com/actions/checkout/issues/249
           fetch-depth: 0
 
-      - name: Determine if pr_or_branch is a PR number
-        id: check_pr
-        run: |
-          PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set
-          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
-            echo "is_pr=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "is_pr=false" >> "$GITHUB_OUTPUT"
-          fi
-          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
-
-      - name: Check if gh cli is installed
-        id: gh_cli
-        run: |
-          if command -v gh &> /dev/null ; then
-            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Install gh CLI
-        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
-        run: |
-          sudo dnf install 'dnf-command(config-manager)' -y
-          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
-          sudo dnf install gh --repo gh-cli -y
-
-      - name: test gh CLI
-        run: |
-          gh --version
-
-      - name: set default repo
-        working-directory: ./training
-        run: |
-          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Add comment to PR
-        if: steps.check_pr.outputs.is_pr == 'true'
-        working-directory: ./training
-        run: |
-          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Fetch and checkout PR
-        if: steps.check_pr.outputs.is_pr == 'true'
-        working-directory: ./training
-        run: |
-          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Checkout branch
-        if: steps.check_pr.outputs.is_pr == 'false'
-        working-directory: ./training
-        run: |
-          git checkout ${{ steps.check_pr.outputs.pr_or_branch }}
-
-      - name: Install ilab
-        working-directory: ./instructlab
-        run: |
-          export CUDA_HOME="/usr/local/cuda"
-          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
-          export PATH="$PATH:$CUDA_HOME/bin"
-          python3.11 -m venv --upgrade-deps venv
-          . venv/bin/activate
-          nvidia-smi
-          python3.11 -m pip cache remove llama_cpp_python
-
-          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
-
-          # https://github.com/instructlab/instructlab/issues/1821
-          # install with Torch and build dependencies installed
-          python3.11 -m pip install packaging wheel setuptools-scm
-          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
-
-      - name: Update instructlab-training library
-        working-directory: ./training
-        run: |
-          . ../instructlab/venv/bin/activate
-          pip install .
-          pip install .[cuda]
-
-      - name: Check disk before tests
-        run: |
-          df -h
-
-      - name: Run e2e test
-        working-directory: ./instructlab
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        run: |
-          . venv/bin/activate
-
-          # set preserve to true so we can retain the logs
-          ./scripts/e2e-ci.sh -lp
-
-          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
-          # and we know that it will be written into a directory created by `mktemp -d`. 
-          # Given this information, we can use the following command to find the file:
-          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
-          phase_num=1;
-          for log_file in $log_files; do
-              mv "${log_file}" phase-${phase_num}-training-log.jsonl
-              ((phase_num++))
-          done
-
-      - name: Check disk after tests
-        run: |
-          df -h
-
-      - name: Upload training logs Phase 1
-        uses: actions/upload-artifact@v4
+      - name: Run e2e tests
+        uses: ./training/.github/actions/run-e2e
         with:
-          name: phase-1-training-log.jsonl
-          path: ./instructlab/phase-1-training-log.jsonl
-          retention-days: 1
-          overwrite: true
-
-      - name: Upload training logs Phase 2
-        uses: actions/upload-artifact@v4
-        with:
-          name: phase-2-training-log.jsonl
-          path: ./instructlab/phase-2-training-log.jsonl
-          retention-days: 1
-          overwrite: true
-
-      - name: Add comment to PR if the workflow failed
-        if: failure() && steps.check_pr.outputs.is_pr == 'true'
-        working-directory: ./training
-        run: |
-          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Add comment to PR if the workflow succeeded
-        if: success() && steps.check_pr.outputs.is_pr == 'true'
-        working-directory: ./training
-        run: |
-          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Post job results to Slack if the workflow failed
-        if: failure() && steps.check_pr.outputs.is_pr == 'false'
-        id: slack-report-failure
-        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
-        with:
-          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
-          method: chat.postMessage
-          payload: |
-            # Slack channel id, channel name, or user id to post message.
-            # See also: https://api.slack.com/methods/chat.postMessage#channels
-            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
-            channel: 'e2e-ci-results'
-            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-
-      - name: Post job results to Slack if the workflow succeeded
-        if: success() && steps.check_pr.outputs.is_pr == 'false'
-        id: slack-report-success
-        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
-        with:
-          token: ${{ secrets.SON_OF_JEEVES_TOKEN }}
-          method: chat.postMessage
-          payload: |
-            # Slack channel id, channel name, or user id to post message.
-            # See also: https://api.slack.com/methods/chat.postMessage#channels
-            # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
-            channel: 'e2e-ci-results'
-            text: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-
-      - name: Send Discord notification for failure
-        if: failure() && steps.check_pr.outputs.is_pr == 'false'
-        uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
-        with:
-          webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
-          status: ${{ job.status }}
-          title: "e2e-nvidia-l40s-x4"
-          description: |
-            Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌
-            Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
-          color: 0xCB2431 # Red color for failure
-
-      - name: Send Discord notification for success
-        if: success() && steps.check_pr.outputs.is_pr == 'false'
-        uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3
-        with:
-          webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
-          status: ${{ job.status }}
-          title: "e2e-nvidia-l40s-x4"
-          description: |
-            Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅
-            Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
-          color: 0x28A745 # Green color for success
+          python-version: "3.11"  # quoted so YAML keeps the version a string instead of a float
+          gh-token: ${{ secrets.GITHUB_TOKEN }}
+          hf-token: ${{ secrets.HF_TOKEN }}
+          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
+          son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
 
   stop-large-ec2-runner:
     needs:
@@ -295,21 +111,15 @@ jobs:
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
-      - name: "Harden Runner"
-        # v2.10.1
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
-        with:
-          egress-policy: audit
-
       - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
 
       - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
+        uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
         with:
           mode: stop
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
@@ -322,14 +132,8 @@ jobs:
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
-      - name: "Harden Runner"
-        # v2.10.1
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
-        with:
-          egress-policy: audit
-
       - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -360,7 +164,7 @@ jobs:
         working-directory: ./training
         run: |
           python -m pip install --upgrade pip
-          pip install -r requirements-dev.txt
+          pip install -r requirements-dev.txt -c constraints-dev.txt
 
       - name: Try to upload Phase 1 to s3
         id: phase-1-upload-s3
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 6327295e..2c8df16d 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -11,6 +11,7 @@ on:
       - '**.py'
       - 'pyproject.toml'
       - 'requirements*.txt'
+      - 'constraints-dev.txt'
       - 'tox.ini'
       - '.pylintrc'
       - 'scripts/*.sh' # Used by this workflow
@@ -23,6 +24,7 @@ on:
       - '**.py'
       - 'pyproject.toml'
       - 'requirements*.txt'
+      - 'constraints-dev.txt'
       - 'tox.ini'
       - '.pylintrc'
       - 'scripts/*.sh' # Used by this workflow
@@ -57,11 +59,6 @@ jobs:
             commands: |
               tox -e mypy
     steps:
-      - name: "Harden Runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-        with:
-          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
       - name: "Checkout"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -69,7 +66,7 @@ jobs:
           fetch-depth: 0
 
       - name: Setup Python 3.11
-        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
           python-version: 3.11
           cache: pip
@@ -80,7 +77,7 @@ jobs:
       - name: Install tox
         run: |
           python -m pip install --upgrade pip
-          python -m pip install tox tox-gh
+          python -m pip install tox tox-gh -c constraints-dev.txt
 
       - name: "${{ matrix.lint.name }}"
         run: |
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index 7343ab1c..69a41135 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -36,11 +36,6 @@ jobs:
         name: Build and check packages
         runs-on: ubuntu-latest
         steps:
-            - name: "Harden Runner"
-              uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-              with:
-                  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
             - name: "Checkout"
               uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
               with:
@@ -65,11 +60,6 @@ jobs:
         needs: build-package
 
         steps:
-            - name: "Harden Runner"
-              uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-              with:
-                  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
             - name: "Download build artifacts"
               uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
               with:
@@ -97,11 +87,6 @@ jobs:
         needs: build-package
 
         steps:
-            - name: "Harden Runner"
-              uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-              with:
-                  egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
             - name: "Download build artifacts"
               uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
               with:
diff --git a/.github/workflows/smoke-py312.yaml b/.github/workflows/smoke-py312.yaml
new file mode 100644
index 00000000..400146d1
--- /dev/null
+++ b/.github/workflows/smoke-py312.yaml
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: "Run smoke tests via Tox::pytest (python 3.12)"
+# These tests will be long running and require accelerated hardware.
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        type: string
+        default: main
+  # using this rather than pull_request because this workflow
+  # needs to run in the context of the base branch (main) and
+  # access the repo's secrets to start the AWS instances.
+  pull_request_target:
+    branches:
+      - main
+      - release-*
+    paths:
+      # note this should match the merging criteria in 'mergify.yml'
+      - "**.py"
+      - "tox.ini"
+      - "pyproject.toml"
+      - "requirements-dev.txt"
+      - "requirements-cuda.txt"
+      - "constraints-dev.txt"
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  ec2_runner_variant: "g6e.12xlarge" # 4x L40s
+
+jobs:
+  start-large-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
+    steps:
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: instructlab/ci-actions
+          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
+          path: ci-actions
+          ref: release-v0.1  # NOTE(review): named ref, unlike the SHA-pinned third-party actions in this file
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
+
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback  # local path inside the "ci-actions" checkout above
+        env:
+          TMPDIR: "/tmp"
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: g6e.12xlarge  # NOTE(review): same value as the workflow-level env.ec2_runner_variant; keep in sync
+          aws_resource_tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  run-smoke-tests:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{needs.start-large-ec2-runner.outputs.label}}
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+    steps:
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{inputs.branch}}  # NOTE(review): only defined for workflow_dispatch; confirm the ref checked out on pull_request_target runs
+
+      - name: Run smoke tests
+        uses: ./.github/actions/run-smoke
+        with:
+          python-version: "3.12"  # quoted so YAML keeps the version a string instead of a float
+
+  stop-large-ec2-runner:
+    needs:
+      - start-large-ec2-runner
+      - run-smoke-tests
+    runs-on: ubuntu-latest
+    if: ${{ always() }}  # run even when a needed job failed or was cancelled, so the stop step below still executes
+    steps:
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}  # region reported by the launch job, not a fixed one
+
+      - name: "Stop EC2 runner"
+        uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
index 7818adfd..bedeeb3b 100644
--- a/.github/workflows/smoke.yaml
+++ b/.github/workflows/smoke.yaml
@@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: "Run smoke tests via Tox::pytest"
+name: "Run smoke tests via Tox::pytest (python 3.11)"
 # These tests will be long running and require accelerated hardware.
 
 on:
-  workflow_dispatch:
-    inputs:
-      branch:
-        type: string
-        default: main
+  workflow_dispatch: {}
   # using this rather than pull_request because this workflow
   # needs to run in the context of the base branch (main) and
   # access the repo's secrets to start the AWS instances.
@@ -23,7 +19,7 @@ on:
       - "pyproject.toml"
       - "requirements-dev.txt"
       - "requirements-cuda.txt"
-      - ".github/workflows/smoke.yaml" # This workflow
+      - "constraints-dev.txt"
 
 permissions:
   contents: read
@@ -36,129 +32,112 @@ env:
   ec2_runner_variant: "g6e.12xlarge" # 4x L40s
 
 jobs:
-  start-ec2-runner:
+  start-large-ec2-runner:
     runs-on: ubuntu-latest
     outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}}
-
+      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
+      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
+      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
     steps:
-      - name: "Harden runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1
-        with:
-          egress-policy: audit
-
-      - name: "Configure AWS credentials"
-        uses: "aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722" # v4.1.0
+      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: "Start EC2 runner"
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
+          repository: instructlab/ci-actions
+          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
+          path: ci-actions
+          ref: release-v0.1
+          sparse-checkout: |
+            actions/launch-ec2-runner-with-fallback
+
+      - name: Launch EC2 Runner with Fallback
+        id: launch-ec2-instance-with-fallback
+        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
+        env:
+          TMPDIR: "/tmp"
         with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
-          ec2-instance-type: ${{ env.ec2_runner_variant }}
-          subnet-id: subnet-024298cefa3bedd61
-          security-group-id: sg-06300447c4a5fbef3
-          iam-role-name: instructlab-ci-runner
-          aws-resource-tags: >
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          regions_config: >
+            [
+              {
+                "region": "us-east-2",
+                "subnets": {
+                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
+                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
+                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
+              },
+              {
+                "region": "us-east-1",
+                "subnets": {
+                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
+                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
+                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
+                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
+                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
+                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
+                },
+                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
+                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
+              }
+            ]
+          try_spot_instance_first: false
+          ec2_instance_type: ${{ env.ec2_runner_variant }}  # reuse the workflow-level env value instead of repeating the literal
+          aws_resource_tags: >
             [
-            {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
-            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
-            {"Key": "GitHubRef", "Value": "${{ github.ref }}"}
+              {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
             ]
 
   run-smoke-tests:
     needs:
-      - start-ec2-runner
-    runs-on: ${{needs.start-ec2-runner.outputs.label}}
+      - start-large-ec2-runner
+    runs-on: ${{needs.start-large-ec2-runner.outputs.label}}
     # It is important that this job has no write permissions and has
     # no access to any secrets. This part is where we are running
     # untrusted code from PRs.
     permissions: {}
     steps:
-      - name: "Harden runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1
-        with:
-          egress-policy: audit
-
-      - name: "Install packages"
-        run: |
-          cat /etc/os-release
-          sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
-
-      - name: "Verify cuda environment is setup"
-        run: |
-          export CUDA_HOME="/usr/local/cuda"
-          export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
-          export PATH="${PATH}:${CUDA_HOME}/bin"
-          nvidia-smi
-
       - name: "Checkout code"
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
-          ref: ${{inputs.branch}}
-
-      # installs in $GITHUB_WORKSPACE/venv.
-      # only has to install Tox because Tox will do the other virtual environment management.
-      - name: "Setup Python virtual environment"
-        run: |
-          python3.11 -m venv --upgrade-deps venv
-          . venv/bin/activate
-          pip install tox
-
-      # flash-attn has a bug in the setup.py that causes pip to attempt
-      # installing it before torch is installed. This is a bug because their
-      # setup.py depends on importing the module, so it should have been listed
-      # in build_requires. Alas.
-      # See: https://github.com/Dao-AILab/flash-attention/pull/958
-      - name: "Install torch before other dependencies"
-        run: |
-          source venv/bin/activate
-          pip install torch
 
-      - name: "Show disk utilization BEFORE tests"
+      - name: "Fetch and checkout PR"
+        # pull_request_target runs in the context of the base branch (e.g. main), so the PR head must be fetched and checked out explicitly.
+        if: ${{ github.event_name == 'pull_request_target'}}  # only for PR-triggered runs
         run: |
-          df -h
+          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
+          git checkout pr-${{ github.event.number }}
 
-      - name: "Run smoke tests with Tox and Pytest"
-        run: |
-          source venv/bin/activate
-          tox -e py3-smoke
-
-      - name: "Show disk utilization AFTER tests"
-        run: |
-          df -h
+      - name: Run smoke tests
+        uses: ./.github/actions/run-smoke
+        with:
+          python-version: "3.11"  # quoted: an unquoted 3.x is a YAML float (3.10 would become 3.1)
 
-  stop-ec2-runner:
+  stop-large-ec2-runner:
     needs:
-      - start-ec2-runner
+      - start-large-ec2-runner
       - run-smoke-tests
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
-      - name: "Harden runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1
-        with:
-          egress-policy: audit
-
       - name: "Configure AWS credentials"
-        uses: "aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722" # v4.1.0
+        uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
+          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}  # use the region the start job reported for the launched instance
 
       - name: "Stop EC2 runner"
-        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
+        uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
         with:
           mode: stop
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-ec2-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml
index 11499515..89b466dc 100644
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@@ -23,11 +23,6 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - name: "Harden Runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.11.1
-        with:
-          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
-
       - name: "Stale Action"
         uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
         with:
diff --git a/.github/workflows/unit.yaml b/.github/workflows/unit.yaml
index 64f02ca5..c317cafd 100644
--- a/.github/workflows/unit.yaml
+++ b/.github/workflows/unit.yaml
@@ -20,6 +20,7 @@ on:
       - "pyproject.toml"
       - "requirements.txt"
       - "requirements-dev.txt"
+      - "constraints-dev.txt"
       - ".github/workflows/unit.yaml" # This workflow
 
 concurrency:
@@ -51,13 +52,8 @@ jobs:
     # untrusted code from PRs.
     permissions: {}
     steps:
-      - name: "Harden runner"
-        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf # v2.10.1
-        with:
-          egress-policy: audit
-
       - name: Setup Python ${{ matrix.python }}
-        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
           python-version: "${{ matrix.python }}"
 
@@ -72,9 +68,10 @@ jobs:
         run: |
           python -m venv --upgrade-deps venv
           . venv/bin/activate
-          pip install tox
+          pip install tox -c constraints-dev.txt
 
       - name: "Show disk utilization BEFORE tests"
+        if: always()
         run: |
           df -h
 
@@ -84,5 +81,6 @@ jobs:
           tox -e py3-unit
 
       - name: "Show disk utilization AFTER tests"
+        if: always()
         run: |
           df -h