diff --git a/.github/workflows/auto-label.yml b/.github/workflows/auto-label.yml deleted file mode 100644 index 5feff289..00000000 --- a/.github/workflows/auto-label.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: Auto labeling -on: - issues: - types: [opened] - pull_request: - types: [opened] - -permissions: - contents: read - issues: write - pull-requests: write - -jobs: - # Label ISSUES using Renato66/auto-label - label-issues: - if: github.event_name == 'issues' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - sparse-checkout: | - .github/workflows/auto-label.json5 - sparse-checkout-cone-mode: false - - uses: Renato66/auto-label@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - # Add ISSUES to ROCm Project #91 so they land in Todo - add-issues-to-project: - if: github.event_name == 'issues' - runs-on: ubuntu-latest - steps: - - uses: actions/add-to-project@v1.0.2 - with: - project-url: https://github.com/orgs/ROCm/projects/91 - github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} - - # PRs: label so the project rule moves them to In Progress - label-prs: - if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork - runs-on: ubuntu-latest - steps: - - name: Add iris + in-progress labels to PR - uses: actions/github-script@v7 - with: - script: | - await github.rest.issues.addLabels({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.payload.pull_request.number, - labels: ["iris", "in-progress"] - }) - diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml new file mode 100644 index 00000000..87e84d75 --- /dev/null +++ b/.github/workflows/docker-build-push.yml @@ -0,0 +1,98 @@ +name: Build and Push Docker Image + +on: + push: + branches: + - muhaawad/docker-images + workflow_dispatch: + inputs: + triton_commit: + description: 'Triton commit SHA to use' + required: false + default: 'aafec417bded34db6308f5b3d6023daefae43905' + type: string + +# This workflow always runs, even with [skip ci] or [ci skip] in commit message + +env: + DOCKERHUB_USERNAME: muhaawad + IMAGE_NAME: iris-dev + +jobs: + build-test: + runs-on: [self-hosted, mi3008x] + permissions: + contents: read + packages: write + timeout-minutes: 60 + + strategy: + fail-fast: false + matrix: + include: + - dockerfile: ./docker/Dockerfile.rocm6.4 + base-name: "rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1" + - dockerfile: ./docker/Dockerfile.rocm7.0 + base-name: "rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0" + - dockerfile: ./docker/Dockerfile.rocm7.1 + base-name: "rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.8.0" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set Triton SHA + id: triton + run: | + TRITON_COMMIT="${{ inputs.triton_commit || 'aafec417bded34db6308f5b3d6023daefae43905' }}" + TRITON_SHORT="${TRITON_COMMIT:0:7}" + echo "short_sha=${TRITON_SHORT}" >> $GITHUB_OUTPUT + echo "full_sha=${TRITON_COMMIT}" >> $GITHUB_OUTPUT + + - name: Build Docker image + run: | + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" + echo "Building ${IMAGE_TAG}..." + docker build \ + -f ${{ matrix.dockerfile }} \ + -t ${IMAGE_TAG} \ + --build-arg TRITON_COMMIT=${{ steps.triton.outputs.full_sha }} \ + ./docker + echo "✅ Build complete!" + +# - name: Run validation tests +# run: | +# set -e +# +# IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" +# +# echo "::group::Running validation tests" +# bash .github/scripts/container_exec.sh --image "${IMAGE_TAG}" " +# set -e +# pip install -e . +# +# echo '=== Running external validation test ===' +# wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py +# python test_iris_distributed.py +# +# echo '=== Running external gluon validation test ===' +# wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py +# python test_iris_gluon_distributed.py +# " +# echo "::endgroup::" +# +# echo "✅ All validation tests passed for ${{ matrix.base-name }}!" + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ env.DOCKERHUB_USERNAME }} + password: ${{ secrets.IRIS_DOCKERHUB_TOKEN }} + + - name: Push Docker image + run: | + IMAGE_TAG="${{ env.DOCKERHUB_USERNAME }}/${{ env.IMAGE_NAME }}:${{ matrix.base-name }}_triton_${{ steps.triton.outputs.short_sha }}" + echo "Pushing ${IMAGE_TAG} to Docker Hub..." + docker push ${IMAGE_TAG} + echo "✅ Successfully pushed ${{ matrix.base-name }}!" + diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 4e7dd3da..00000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: Build and Deploy Documentation - -on: - push: - branches: [ main ] - paths: - - 'docs/**' - - 'iris/**' - - 'examples/**' - - '.github/workflows/docs.yml' - pull_request: - branches: [ main ] - paths: - - 'docs/**' - - 'iris/**' - - 'examples/**' - - '.github/workflows/docs.yml' - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: "pages-${{ github.ref }}" - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - ref: ${{ github.ref }} - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y build-essential - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install -r docs/sphinx/requirements.txt - - - name: Build documentation - run: | - cd docs - chmod +x build_docs.sh - ./build_docs.sh - - - name: Upload documentation artifact - uses: actions/upload-artifact@v4 - with: - name: documentation - path: docs/_build/html - retention-days: 30 - - - name: Upload artifact for GitHub Pages - uses: actions/upload-pages-artifact@v3 - with: - path: docs/_build/html - - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - needs: build - if: github.ref == 'refs/heads/main' && github.event_name == 'push' - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml deleted file mode 100644 index 2cac214c..00000000 --- a/.github/workflows/iris-external-validation-test.yml +++ /dev/null @@ -1,95 +0,0 @@ -name: Iris External Validation Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 90 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - external-validation-test: - name: External Validation Test - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run External Validation Test - run: | - set -e - - echo "::group::Running external validation test" - bash .github/scripts/container_exec.sh " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py - python test_iris_distributed.py - " - echo "::endgroup::" - - echo "✅ External validation test passed!" - - external-gluon-validation-test: - name: External Gluon Validation Test - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run External Gluon Validation Test - run: | - set -e - - echo "::group::Running external gluon validation test" - bash .github/scripts/container_exec.sh --gpus "0,1" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py - python test_iris_gluon_distributed.py - " - echo "::endgroup::" - - echo "✅ External gluon validation test passed!" \ No newline at end of file diff --git a/.github/workflows/iris-performance-regression-test.yml b/.github/workflows/iris-performance-regression-test.yml deleted file mode 100644 index fa017886..00000000 --- a/.github/workflows/iris-performance-regression-test.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: Iris Performance Regression Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 20 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - performance-test: - name: ${{ matrix.example_name }} - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - strategy: - fail-fast: false - matrix: - # Performance baselines measured on AMD Instinct MI325X (8 GPUs) - include: - # Disabled https://github.com/ROCm/iris/issues/238 - #- example_name: "GEMM All-Scatter WG Specialization" - # example_path: "10_gemm_all_scatter_wg_specialization" - # tflops_threshold: 1600 # Actual: ~2182 TFLOPs - # benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256" - - - example_name: "GEMM All-Scatter" - example_path: "07_gemm_all_scatter" - tflops_threshold: 1000 # Actual: ~1407 TFLOPs - benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 256 --BLK_N 64 --BLK_K 64 --gsize_m 6 --gemm_sms 256" - - - example_name: "GEMM All-Scatter Producer-Consumer" - example_path: "11_gemm_all_scatter_producer_consumer" - tflops_threshold: 1600 # Actual: ~2190 TFLOPs - benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256 --comm_sms 48" - - - example_name: "GEMM All-Scatter Bulk Synchronous" - example_path: "12_gemm_all_scatter_bulk_synchronous" - tflops_threshold: 900 # Actual: ~1262 TFLOPs - benchmark_args: "-m 16384 -n 16384 -k 16384 --BLK_M 128 --BLK_N 128 --BLK_K 64 --gsize_m 6 --gemm_sms 256" - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run ${{ matrix.example_name }} Benchmark (8 ranks) - run: | - set -e - - echo "::group::Running performance benchmark" - bash .github/scripts/run_perf_benchmark.sh \ - "${{ matrix.example_path }}" \ - "${{ matrix.tflops_threshold }}" \ - ${{ matrix.benchmark_args }} - echo "::endgroup::" - diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml deleted file mode 100644 index 739af005..00000000 --- a/.github/workflows/iris-pip-install-test.yml +++ /dev/null @@ -1,167 +0,0 @@ -name: Iris Pip Install Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 90 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - - test-1-2-4-ranks: - name: Pip Install Test 1/2/4 Ranks (Parallel) - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run pip install tests for 1, 2, 4 ranks in parallel - run: | - set -e - - # Run tests in parallel with different GPU assignments - # Note: Each test gets 2+ GPUs even if it only uses some of them. - # This allows tests like test_empty_device_handling to verify that - # allocating on a different device correctly raises an error. - - echo "::group::Starting parallel tests" - echo "Starting 1-rank test on GPUs 0,1..." - bash .github/scripts/container_exec.sh --gpus "0,1" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 1 ranks\" - python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 1 ranks\" - python tests/run_tests_distributed.py --num_ranks 1 \"\$test_file\" -v --tb=short --durations=10 - done - " & - PID1=$! - - echo "Starting 2-rank test on GPUs 2,3..." - bash .github/scripts/container_exec.sh --gpus "2,3" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 2 ranks\" - python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 2 ranks\" - python tests/run_tests_distributed.py --num_ranks 2 \"\$test_file\" -v --tb=short --durations=10 - done - " & - PID2=$! - - echo "Starting 4-rank test on GPUs 4,5,6,7..." - bash .github/scripts/container_exec.sh --gpus "4,5,6,7" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 4 ranks\" - python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 4 ranks\" - python tests/run_tests_distributed.py --num_ranks 4 \"\$test_file\" -v --tb=short --durations=10 - done - " & - PID4=$! - echo "::endgroup::" - - # Wait for all parallel tests and track failures - echo "::group::Waiting for parallel tests to complete" - FAIL=0 - FAILED_TESTS="" - - wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; } - wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; } - wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; } - echo "::endgroup::" - - if [ $FAIL -eq 1 ]; then - echo "::error::Parallel tests failed:$FAILED_TESTS" - exit 1 - fi - - echo "✅ All parallel tests (1, 2, 4 ranks) passed!" - - test-8-ranks: - name: Pip Install Test 8 Ranks - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 30 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 8-rank pip install test - run: | - set -e - - echo "::group::Running 8-rank test on all GPUs" - bash .github/scripts/container_exec.sh --gpus "0,1,2,3,4,5,6,7" " - set -e - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - pip install -e . - for test_file in tests/examples/test_*.py; do - echo \"Testing: \$test_file with 8 ranks\" - python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10 - done - for test_file in tests/unittests/test_*.py; do - echo \"Testing: \$test_file with 8 ranks\" - python tests/run_tests_distributed.py --num_ranks 8 \"\$test_file\" -v --tb=short --durations=10 - done - " - echo "::endgroup::" - - echo "✅ 8-rank test passed!" diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml deleted file mode 100644 index 63950f95..00000000 --- a/.github/workflows/iris-tests-apptainer.yml +++ /dev/null @@ -1,114 +0,0 @@ -name: Iris Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - build-container-image: - runs-on: [self-hosted, mi3008x] - timeout-minutes: 90 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Apptainer (if not available) - run: | - if ! command -v apptainer &> /dev/null && ! command -v docker &> /dev/null; then - echo "Neither Apptainer nor Docker found, installing Apptainer..." - apt-get update && apt-get install -y software-properties-common - add-apt-repository -y ppa:apptainer/ppa - apt-get update && apt-get install -y apptainer - else - echo "Container runtime already available" - fi - - - name: Build Iris container - run: | - # Use the universal container build script - bash .github/scripts/container_build.sh - test-1-2-4-ranks: - name: Test 1/2/4 Ranks (Parallel) - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 20 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 1, 2, 4 rank tests in parallel - run: | - set -e - - # Run tests in parallel with different GPU assignments - # Note: Each test gets 2+ GPUs even if it only uses some of them. - # This allows tests like test_empty_device_handling to verify that - # allocating on a different device correctly raises an error. - - echo "::group::Starting parallel tests" - echo "Starting 1-rank test on GPUs 0,1..." - bash .github/scripts/run_tests.sh 1 "0,1" & - PID1=$! - - echo "Starting 2-rank test on GPUs 2,3..." - bash .github/scripts/run_tests.sh 2 "2,3" & - PID2=$! - - echo "Starting 4-rank test on GPUs 4,5,6,7..." - bash .github/scripts/run_tests.sh 4 "4,5,6,7" & - PID4=$! - echo "::endgroup::" - - # Wait for all parallel tests and track failures - echo "::group::Waiting for parallel tests to complete" - FAIL=0 - FAILED_TESTS="" - - wait $PID1 || { echo "::error::1-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 1-rank"; FAIL=1; } - wait $PID2 || { echo "::error::2-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 2-rank"; FAIL=1; } - wait $PID4 || { echo "::error::4-rank test FAILED"; FAILED_TESTS="$FAILED_TESTS 4-rank"; FAIL=1; } - echo "::endgroup::" - - if [ $FAIL -eq 1 ]; then - echo "::error::Parallel tests failed:$FAILED_TESTS" - exit 1 - fi - - echo "✅ All parallel tests (1, 2, 4 ranks) passed!" - - test-8-ranks: - name: Test 8 Ranks - needs: build-container-image - runs-on: [self-hosted, mi3008x] - timeout-minutes: 15 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Cleanup lingering ports before tests - run: | - bash .github/scripts/cleanup_ports.sh - - - name: Run 8-rank test - run: | - set -e - - echo "::group::Running 8-rank test on all GPUs" - bash .github/scripts/run_tests.sh 8 "0,1,2,3,4,5,6,7" - echo "::endgroup::" - - echo "✅ 8-rank test passed!" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml deleted file mode 100644 index 2b6d1dbb..00000000 --- a/.github/workflows/lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. - -name: Lint & Auto-Fix - -on: [push, pull_request] - -jobs: - lint: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Install Ruff - run: pip install ruff - - - name: Run Ruff and auto-fix issues - run: ruff check . --fix - - - name: Run Ruff formatter - run: ruff format . - - - name: Commit and push fixes - run: | - git config --global user.name "github-actions[bot]" - git config --global user.email "github-actions[bot]@users.noreply.github.com" - git add . - git commit -m "Apply Ruff auto-fixes" || echo "No changes to commit" - git push - continue-on-error: true \ No newline at end of file diff --git a/docker/Dockerfile.rocm6.4 b/docker/Dockerfile.rocm6.4 new file mode 100644 index 00000000..bf201dcc --- /dev/null +++ b/docker/Dockerfile.rocm6.4 @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +FROM rocm/pytorch:rocm6.4.4_ubuntu24.04_py3.12_pytorch_release_2.7.1 + +# Use bash shell for RUN commands +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV TRITON_PATH=/opt/triton \ + ROCM_PATH=/opt/rocm \ + OMPI_MCA_mtl="^ofi" \ + OMPI_MCA_pml="ob1" + +ENV LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH \ + PATH="$ROCM_PATH/bin:$PATH" + +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 + +# Install system packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git wget ninja-build cmake python3-pip python3-dev build-essential && \ + rm -rf /var/lib/apt/lists/* + +# Install Python packages with pip +RUN pip3 install --upgrade pip && \ + pip3 install wheel jupyter + +# Clone and install Triton +ARG TRITON_COMMIT=aafec417bded34db6308f5b3d6023daefae43905 +WORKDIR $TRITON_PATH +RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH +RUN git checkout ${TRITON_COMMIT} +RUN pip3 install -e . +ENV PYTHONPATH=$TRITON_PATH + +WORKDIR /workspace + diff --git a/docker/Dockerfile.rocm7.0 b/docker/Dockerfile.rocm7.0 new file mode 100644 index 00000000..b679d14c --- /dev/null +++ b/docker/Dockerfile.rocm7.0 @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.8.0 + +# Use bash shell for RUN commands +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV TRITON_PATH=/opt/triton \ + ROCM_PATH=/opt/rocm \ + OMPI_MCA_mtl="^ofi" \ + OMPI_MCA_pml="ob1" + +ENV LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH \ + PATH="$ROCM_PATH/bin:$PATH" + +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 + +# Install system packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git wget ninja-build cmake python3-pip python3-dev build-essential && \ + rm -rf /var/lib/apt/lists/* + +# Install Python packages with pip +RUN pip3 install --upgrade pip && \ + pip3 install wheel jupyter + +# Clone and install Triton +ARG TRITON_COMMIT=aafec417bded34db6308f5b3d6023daefae43905 +WORKDIR $TRITON_PATH +RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH +RUN git checkout ${TRITON_COMMIT} +RUN pip3 install -e . +ENV PYTHONPATH=$TRITON_PATH + +WORKDIR /workspace + +# Make Python site-packages writable by all users for development +RUN chmod -R a+w /opt/venv/lib/python3.12/site-packages/ + diff --git a/docker/Dockerfile.rocm7.1 b/docker/Dockerfile.rocm7.1 new file mode 100644 index 00000000..9c6f7666 --- /dev/null +++ b/docker/Dockerfile.rocm7.1 @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +FROM rocm/pytorch:rocm7.1_ubuntu24.04_py3.12_pytorch_release_2.8.0 + +# Use bash shell for RUN commands +SHELL ["/bin/bash", "-c"] + +# Set environment variables +ENV TRITON_PATH=/opt/triton \ + ROCM_PATH=/opt/rocm \ + OMPI_MCA_mtl="^ofi" \ + OMPI_MCA_pml="ob1" + +ENV LD_LIBRARY_PATH=$ROCM_PATH/lib:$LD_LIBRARY_PATH \ + PATH="$ROCM_PATH/bin:$PATH" + +ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \ + OMPI_ALLOW_RUN_AS_ROOT=1 + +# Install system packages +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + git wget ninja-build cmake python3-pip python3-dev build-essential && \ + rm -rf /var/lib/apt/lists/* + +# Install Python packages with pip +RUN pip3 install --upgrade pip && \ + pip3 install wheel jupyter + +# Clone and install Triton +ARG TRITON_COMMIT=aafec417bded34db6308f5b3d6023daefae43905 +WORKDIR $TRITON_PATH +RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH +RUN git checkout ${TRITON_COMMIT} +RUN pip3 install -e . +ENV PYTHONPATH=$TRITON_PATH + +WORKDIR /workspace + +# Make Python site-packages writable by all users for development +RUN chmod -R a+w /opt/venv/lib/python3.12/site-packages/ + diff --git a/pyproject.toml b/pyproject.toml index cc8757f5..33e7f6b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ requires-python = ">=3.8" dependencies = [ "numpy", "requests", - "ruff", ] [project.urls] @@ -33,6 +32,7 @@ dev = [ "pytest", "black", "mypy", + "ruff", ] [tool.setuptools]