From ca6fe293318bc3be9e67bedc507a50533e3f6538 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 17 Sep 2025 15:48:02 -0500 Subject: [PATCH 01/14] Add pip install test workflow with Apptainer and rank matrix --- .github/workflows/iris-pip-install-test.yml | 87 +++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 .github/workflows/iris-pip-install-test.yml diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml new file mode 100644 index 0000000..778a02a --- /dev/null +++ b/.github/workflows/iris-pip-install-test.yml @@ -0,0 +1,87 @@ +name: Iris Pip Install Test + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + build-apptainer-image: + runs-on: [self-hosted, mi3008x] + timeout-minutes: 90 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Apptainer + run: | + apt-get update && apt-get install -y software-properties-common + add-apt-repository -y ppa:apptainer/ppa + apt-get update && apt-get install -y apptainer + + - name: Build Iris Apptainer container + run: | + # Create persistent Apptainer directory + mkdir -p ~/apptainer + + # Build Apptainer image from definition file (only if it doesn't exist) + if [ ! -f ~/apptainer/iris-dev.sif ]; then + echo "Building new Apptainer image..." 
+ apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def + else + echo "Using existing Apptainer image" + fi + + pip-install-test: + name: ${{ matrix.ranks }}-rank Pip Install Test + needs: build-apptainer-image + runs-on: [self-hosted, mi3008x] + timeout-minutes: 30 + strategy: + matrix: + ranks: [1, 2, 4, 8] + max-parallel: 1 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Run Pip Install Test with Apptainer + run: | + apptainer exec ~/apptainer/iris-dev.sif bash -c " + set -e # Exit on any error + + # Setup Python + python3 -m pip install --upgrade pip + pip install pytest + + # Uninstall any existing Iris installations + echo 'Uninstalling any existing Iris packages...' + pip uninstall -y Iris iris || echo 'No existing Iris packages found or uninstall failed' + + # Install iris from the current repository + pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} + + # Run examples tests one at a time using distributed wrapper + echo 'Running examples tests one at a time...' + for test_file in tests/examples/test_*.py; do + echo \"Testing: \$test_file with ${{ matrix.ranks }} ranks\" + python tests/run_tests_distributed.py --num_ranks ${{ matrix.ranks }} \"\$test_file\" -v --tb=short + done + + # Run unit tests one at a time using distributed wrapper + echo 'Running unit tests one at a time...' 
+ for test_file in tests/unittests/test_*.py; do + echo \"Testing: \$test_file with ${{ matrix.ranks }} ranks\" + python tests/run_tests_distributed.py --num_ranks ${{ matrix.ranks }} \"\$test_file\" -v --tb=short + done + " From 6a64cb9f8b3b92aec41b0974cde60d5a09636525 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Wed, 17 Sep 2025 15:48:38 -0500 Subject: [PATCH 02/14] Make sure we exit on failures --- tests/run_tests_distributed.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/run_tests_distributed.py b/tests/run_tests_distributed.py index 8c4af8e..d202878 100755 --- a/tests/run_tests_distributed.py +++ b/tests/run_tests_distributed.py @@ -44,6 +44,9 @@ def _distributed_worker(rank, world_size, test_file, pytest_args): try: # Run pytest directly in this process exit_code = pytest.main([test_file] + pytest_args) + # If tests failed, exit with the failure code + if exit_code != 0: + sys.exit(exit_code) return exit_code finally: # Restore original argv @@ -82,8 +85,20 @@ def main(): print(f"args={args}, test_file={test_file}, pytest_args={pytest_args}") # Run all tests within a single distributed process group - mp.spawn(_distributed_worker, args=(num_ranks, test_file, pytest_args), nprocs=num_ranks, join=True) + try: + mp.spawn( + _distributed_worker, + args=(num_ranks, test_file, pytest_args), + nprocs=num_ranks, + join=True, + ) + except SystemExit as e: + # Catch sys.exit() from worker and return same exit code + sys.exit(e.code if isinstance(e.code, int) else 1) + except Exception: + # Any other unhandled exception = failure + sys.exit(1) if __name__ == "__main__": - main() + main() \ No newline at end of file From 75621460dc2d0683da916d2b48a28d5e50e429e2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Sep 2025 20:49:09 +0000 Subject: [PATCH 03/14] Apply Ruff auto-fixes --- tests/run_tests_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/run_tests_distributed.py b/tests/run_tests_distributed.py index d202878..abf3296 100755 --- a/tests/run_tests_distributed.py +++ b/tests/run_tests_distributed.py @@ -101,4 +101,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 618f9fe2233eb724d6af9ce577176041c745489a Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 08:06:34 -0700 Subject: [PATCH 04/14] Add simple external test --- .../iris-external-validation-test.yml | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 .github/workflows/iris-external-validation-test.yml diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml new file mode 100644 index 0000000..d79b7ae --- /dev/null +++ b/.github/workflows/iris-external-validation-test.yml @@ -0,0 +1,152 @@ +name: Iris External Validation Test + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + build-apptainer-image: + runs-on: [self-hosted, mi3008x] + timeout-minutes: 90 + + steps: + - name: Setup Apptainer + run: | + apt-get update && apt-get install -y software-properties-common + add-apt-repository -y ppa:apptainer/ppa + apt-get update && apt-get install -y apptainer + + - name: Build Iris Apptainer container + run: | + # Create persistent Apptainer directory + mkdir -p ~/apptainer + + # Build Apptainer image from definition file (only if it doesn't exist) + if [ ! -f ~/apptainer/iris-dev.sif ]; then + echo "Building new Apptainer image..." 
+ apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def + else + echo "Using existing Apptainer image" + fi + + external-validation-test: + name: External Validation Test + needs: build-apptainer-image + runs-on: [self-hosted, mi3008x] + timeout-minutes: 30 + + steps: + - name: Run External Validation Test with Apptainer + run: | + apptainer exec ~/apptainer/iris-dev.sif bash -c " + set -e # Exit on any error + + # Setup Python + python3 -m pip install --upgrade pip + + # Uninstall any existing Iris installations + echo 'Uninstalling any existing Iris packages...' + pip uninstall -y Iris iris || echo 'No existing Iris packages found or uninstall failed' + + # Install iris from the current repository + echo 'Installing iris from current repository...' + pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} + + # Create test script + cat > test_iris_distributed.py << 'EOF' +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import triton +import triton.language as tl +import iris + +# Device-side APIs +@triton.jit +def kernel(buffer, buffer_size: tl.constexpr, block_size: tl.constexpr, heap_bases_ptr): + # Compute start index of this block + pid = tl.program_id(0) + block_start = pid * block_size + offsets = block_start + tl.arange(0, block_size) + + # Guard for out-of-bounds accesses + mask = offsets < buffer_size + + # Store 1 in the target buffer at each offset + source_rank = 0 + target_rank = 1 + iris.store(buffer + offsets, 1, + source_rank, target_rank, + heap_bases_ptr, mask=mask) + +def _worker(rank, world_size): + # Torch distributed initialization + device_id = rank % torch.cuda.device_count() + dist.init_process_group( + backend=\"nccl\", + rank=rank, + world_size=world_size, + init_method=\"tcp://127.0.0.1:29500\", + device_id=torch.device(f\"cuda:{device_id}\") + ) + + # Iris initialization + heap_size = 2**30 # 1GiB symmetric heap for inter-GPU communication + iris_ctx = 
iris.iris(heap_size) + cur_rank = iris_ctx.get_rank() + + # Iris tensor allocation + buffer_size = 4096 # 4K elements buffer + buffer = iris_ctx.zeros(buffer_size, device=\"cuda\", dtype=torch.float32) + + # Launch the kernel on rank 0 + block_size = 1024 + grid = lambda meta: (triton.cdiv(buffer_size, meta[\"block_size\"]),) + source_rank = 0 + if cur_rank == source_rank: + kernel[grid]( + buffer, + buffer_size, + block_size, + iris_ctx.get_heap_bases(), + ) + + # Synchronize all ranks + iris_ctx.barrier() + + # Validation: Check that the data was correctly stored + if cur_rank == target_rank: + # Convert iris tensor to torch tensor for validation + buffer_torch = buffer.to_torch() + expected_values = torch.ones(buffer_size, device=f\"cuda:{device_id}\", dtype=torch.float32) + + # Check if all values are 1 (as expected from the store operation) + if torch.allclose(buffer_torch, expected_values, atol=1e-6): + print(f\"✓ Rank {cur_rank}: Validation passed - all values are 1\") + else: + print(f\"✗ Rank {cur_rank}: Validation failed - values don't match expected\") + print(f\" Expected: all 1s, Got: {buffer_torch[:10]}... (showing first 10 values)\") + raise AssertionError(f\"Validation failed on rank {cur_rank}\") + else: + print(f\"✓ Rank {cur_rank}: Skipping validation (not target rank)\") + + dist.destroy_process_group() + +if __name__ == \"__main__\": + world_size = 2 # Using two ranks + mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True) +EOF + + # Run the external validation test + echo 'Running iris external validation test...' + python test_iris_distributed.py + + echo 'External validation test completed successfully!' 
+ " From 19c837d766685cf8ebb0462933206f3f25ef4d8e Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 08:15:07 -0700 Subject: [PATCH 05/14] Indent the yml --- .../iris-external-validation-test.yml | 164 +++++++++--------- 1 file changed, 81 insertions(+), 83 deletions(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index d79b7ae..2609e19 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -61,92 +61,90 @@ jobs: # Create test script cat > test_iris_distributed.py << 'EOF' -import torch -import torch.distributed as dist -import torch.multiprocessing as mp -import triton -import triton.language as tl -import iris - -# Device-side APIs -@triton.jit -def kernel(buffer, buffer_size: tl.constexpr, block_size: tl.constexpr, heap_bases_ptr): - # Compute start index of this block - pid = tl.program_id(0) - block_start = pid * block_size - offsets = block_start + tl.arange(0, block_size) - - # Guard for out-of-bounds accesses - mask = offsets < buffer_size - - # Store 1 in the target buffer at each offset - source_rank = 0 - target_rank = 1 - iris.store(buffer + offsets, 1, - source_rank, target_rank, - heap_bases_ptr, mask=mask) - -def _worker(rank, world_size): - # Torch distributed initialization - device_id = rank % torch.cuda.device_count() - dist.init_process_group( - backend=\"nccl\", - rank=rank, - world_size=world_size, - init_method=\"tcp://127.0.0.1:29500\", - device_id=torch.device(f\"cuda:{device_id}\") - ) - - # Iris initialization - heap_size = 2**30 # 1GiB symmetric heap for inter-GPU communication - iris_ctx = iris.iris(heap_size) - cur_rank = iris_ctx.get_rank() - - # Iris tensor allocation - buffer_size = 4096 # 4K elements buffer - buffer = iris_ctx.zeros(buffer_size, device=\"cuda\", dtype=torch.float32) - - # Launch the kernel on rank 0 - block_size = 1024 - grid = lambda meta: 
(triton.cdiv(buffer_size, meta[\"block_size\"]),) - source_rank = 0 - if cur_rank == source_rank: - kernel[grid]( - buffer, - buffer_size, - block_size, - iris_ctx.get_heap_bases(), - ) - - # Synchronize all ranks - iris_ctx.barrier() - - # Validation: Check that the data was correctly stored - if cur_rank == target_rank: - # Convert iris tensor to torch tensor for validation - buffer_torch = buffer.to_torch() - expected_values = torch.ones(buffer_size, device=f\"cuda:{device_id}\", dtype=torch.float32) - - # Check if all values are 1 (as expected from the store operation) - if torch.allclose(buffer_torch, expected_values, atol=1e-6): - print(f\"✓ Rank {cur_rank}: Validation passed - all values are 1\") - else: - print(f\"✗ Rank {cur_rank}: Validation failed - values don't match expected\") - print(f\" Expected: all 1s, Got: {buffer_torch[:10]}... (showing first 10 values)\") - raise AssertionError(f\"Validation failed on rank {cur_rank}\") - else: - print(f\"✓ Rank {cur_rank}: Skipping validation (not target rank)\") - - dist.destroy_process_group() - -if __name__ == \"__main__\": - world_size = 2 # Using two ranks - mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True) -EOF + import torch + import torch.distributed as dist + import torch.multiprocessing as mp + import triton + import triton.language as tl + import iris + + # Device-side APIs + @triton.jit + def kernel(buffer, buffer_size: tl.constexpr, block_size: tl.constexpr, heap_bases_ptr): + # Compute start index of this block + pid = tl.program_id(0) + block_start = pid * block_size + offsets = block_start + tl.arange(0, block_size) + + # Guard for out-of-bounds accesses + mask = offsets < buffer_size + + # Store 1 in the target buffer at each offset + source_rank = 0 + target_rank = 1 + iris.store(buffer + offsets, 1, source_rank, target_rank, heap_bases_ptr, mask=mask) + + def _worker(rank, world_size): + # Torch distributed initialization + device_id = rank % torch.cuda.device_count() + 
dist.init_process_group( + backend=\"nccl\", + rank=rank, + world_size=world_size, + init_method=\"tcp://127.0.0.1:29500\", + device_id=torch.device(f\"cuda:{device_id}\"), + ) + + # Iris initialization + heap_size = 2**30 # 1GiB symmetric heap for inter-GPU communication + iris_ctx = iris.iris(heap_size) + cur_rank = iris_ctx.get_rank() + + # Iris tensor allocation + buffer_size = 4096 # 4K elements buffer + buffer = iris_ctx.zeros(buffer_size, device=\"cuda\", dtype=torch.float32) + + # Launch the kernel on rank 0 + block_size = 1024 + grid = lambda meta: (triton.cdiv(buffer_size, meta[\"block_size\"]),) + source_rank = 0 + if cur_rank == source_rank: + kernel[grid]( + buffer, + buffer_size, + block_size, + iris_ctx.get_heap_bases(), + ) + + # Synchronize all ranks + iris_ctx.barrier() + + # Validation: Check that the data was correctly stored + if cur_rank == target_rank: + # Convert iris tensor to torch tensor for validation + buffer_torch = buffer.to_torch() + expected_values = torch.ones(buffer_size, device=f\"cuda:{device_id}\", dtype=torch.float32) + + # Check if all values are 1 (as expected from the store operation) + if torch.allclose(buffer_torch, expected_values, atol=1e-6): + print(f\"✓ Rank {cur_rank}: Validation passed - all values are 1\") + else: + print(f\"✗ Rank {cur_rank}: Validation failed - values don't match expected\") + print(f\" Expected: all 1s, Got: {buffer_torch[:10]}... (showing first 10 values)\") + raise AssertionError(f\"Validation failed on rank {cur_rank}\") + else: + print(f\"✓ Rank {cur_rank}: Skipping validation (not target rank)\") + + dist.destroy_process_group() + + if __name__ == \"__main__\": + world_size = 2 # Using two ranks + mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True) + EOF # Run the external validation test echo 'Running iris external validation test...' python test_iris_distributed.py echo 'External validation test completed successfully!' 
- " + " \ No newline at end of file From a8061a108ad02bcd5bb8141e1a64ffac1fa7b1ad Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 08:21:51 -0700 Subject: [PATCH 06/14] Get test from gist --- .../iris-external-validation-test.yml | 85 +------------------ 1 file changed, 3 insertions(+), 82 deletions(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index 2609e19..e44208a 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -59,88 +59,9 @@ jobs: echo 'Installing iris from current repository...' pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} - # Create test script - cat > test_iris_distributed.py << 'EOF' - import torch - import torch.distributed as dist - import torch.multiprocessing as mp - import triton - import triton.language as tl - import iris - - # Device-side APIs - @triton.jit - def kernel(buffer, buffer_size: tl.constexpr, block_size: tl.constexpr, heap_bases_ptr): - # Compute start index of this block - pid = tl.program_id(0) - block_start = pid * block_size - offsets = block_start + tl.arange(0, block_size) - - # Guard for out-of-bounds accesses - mask = offsets < buffer_size - - # Store 1 in the target buffer at each offset - source_rank = 0 - target_rank = 1 - iris.store(buffer + offsets, 1, source_rank, target_rank, heap_bases_ptr, mask=mask) - - def _worker(rank, world_size): - # Torch distributed initialization - device_id = rank % torch.cuda.device_count() - dist.init_process_group( - backend=\"nccl\", - rank=rank, - world_size=world_size, - init_method=\"tcp://127.0.0.1:29500\", - device_id=torch.device(f\"cuda:{device_id}\"), - ) - - # Iris initialization - heap_size = 2**30 # 1GiB symmetric heap for inter-GPU communication - iris_ctx = iris.iris(heap_size) - cur_rank = iris_ctx.get_rank() - - # Iris tensor allocation - buffer_size = 4096 # 4K 
elements buffer - buffer = iris_ctx.zeros(buffer_size, device=\"cuda\", dtype=torch.float32) - - # Launch the kernel on rank 0 - block_size = 1024 - grid = lambda meta: (triton.cdiv(buffer_size, meta[\"block_size\"]),) - source_rank = 0 - if cur_rank == source_rank: - kernel[grid]( - buffer, - buffer_size, - block_size, - iris_ctx.get_heap_bases(), - ) - - # Synchronize all ranks - iris_ctx.barrier() - - # Validation: Check that the data was correctly stored - if cur_rank == target_rank: - # Convert iris tensor to torch tensor for validation - buffer_torch = buffer.to_torch() - expected_values = torch.ones(buffer_size, device=f\"cuda:{device_id}\", dtype=torch.float32) - - # Check if all values are 1 (as expected from the store operation) - if torch.allclose(buffer_torch, expected_values, atol=1e-6): - print(f\"✓ Rank {cur_rank}: Validation passed - all values are 1\") - else: - print(f\"✗ Rank {cur_rank}: Validation failed - values don't match expected\") - print(f\" Expected: all 1s, Got: {buffer_torch[:10]}... (showing first 10 values)\") - raise AssertionError(f\"Validation failed on rank {cur_rank}\") - else: - print(f\"✓ Rank {cur_rank}: Skipping validation (not target rank)\") - - dist.destroy_process_group() - - if __name__ == \"__main__\": - world_size = 2 # Using two ranks - mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True) - EOF + # Download test script from gist + echo 'Downloading test script from gist...' + wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/67d2364368dc09d020ee73e8c4ef900f7201a791/test_iris_distributed.py # Run the external validation test echo 'Running iris external validation test...' 
From 2aba55800fab4c64bb277138fc694959c80d25ec Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 08:25:03 -0700 Subject: [PATCH 07/14] Fix bad char --- .github/workflows/iris-external-validation-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index e44208a..ad55b31 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -61,7 +61,7 @@ jobs: # Download test script from gist echo 'Downloading test script from gist...' - wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/67d2364368dc09d020ee73e8c4ef900f7201a791/test_iris_distributed.py + wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/c7fbe961a05cf884111761169a14e2532710c34c/test_iris_distributed.py # Run the external validation test echo 'Running iris external validation test...' 
From 2e8dd17fb52863e48ad43cad64673df2f2ce7ec2 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 16:58:37 -0700 Subject: [PATCH 08/14] Add dependencies --- .github/workflows/iris-pip-install-test.yml | 7 ++++++- .github/workflows/iris-tests-apptainer.yml | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index 778a02a..2aedaf7 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -12,7 +12,12 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: + external-validation-test: + uses: ./.github/workflows/iris-external-validation-test.yml + secrets: inherit + build-apptainer-image: + needs: external-validation-test runs-on: [self-hosted, mi3008x] timeout-minutes: 90 @@ -41,7 +46,7 @@ jobs: pip-install-test: name: ${{ matrix.ranks }}-rank Pip Install Test - needs: build-apptainer-image + needs: [external-validation-test, build-apptainer-image] runs-on: [self-hosted, mi3008x] timeout-minutes: 30 strategy: diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 5e2d9a8..be33c7c 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -12,7 +12,12 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: + external-validation-test: + uses: ./.github/workflows/iris-external-validation-test.yml + secrets: inherit + build-apptainer-image: + needs: external-validation-test runs-on: [self-hosted, mi3008x] timeout-minutes: 90 @@ -40,7 +45,7 @@ jobs: fi run-tests: name: ${{ matrix.ranks }}-rank Iris Test - needs: build-apptainer-image + needs: [external-validation-test, build-apptainer-image] runs-on: [self-hosted, mi3008x] timeout-minutes: 20 strategy: From 9964112b96f7cd38804cdba197cbe5efdc4689ae Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: 
Thu, 18 Sep 2025 17:01:28 -0700 Subject: [PATCH 09/14] Update link --- .github/workflows/iris-external-validation-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index ad55b31..e98ce2a 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -61,7 +61,7 @@ jobs: # Download test script from gist echo 'Downloading test script from gist...' - wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/c7fbe961a05cf884111761169a14e2532710c34c/test_iris_distributed.py + wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py # Run the external validation test echo 'Running iris external validation test...' From b9e49c6f9c463cecd411b720e1d94c3da6f0007a Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 17:06:00 -0700 Subject: [PATCH 10/14] Remove bad dependency --- .github/workflows/iris-pip-install-test.yml | 7 +------ .github/workflows/iris-tests-apptainer.yml | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index 2aedaf7..d5954fb 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -12,12 +12,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: - external-validation-test: - uses: ./.github/workflows/iris-external-validation-test.yml - secrets: inherit - build-apptainer-image: - needs: external-validation-test runs-on: [self-hosted, mi3008x] timeout-minutes: 90 @@ -46,7 +41,7 @@ jobs: pip-install-test: name: ${{ matrix.ranks }}-rank Pip Install Test - needs: 
[external-validation-test, build-apptainer-image] + needs: [build-apptainer-image] runs-on: [self-hosted, mi3008x] timeout-minutes: 30 strategy: diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index be33c7c..9e4ed3c 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -12,12 +12,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: - external-validation-test: - uses: ./.github/workflows/iris-external-validation-test.yml - secrets: inherit - build-apptainer-image: - needs: external-validation-test runs-on: [self-hosted, mi3008x] timeout-minutes: 90 @@ -45,7 +40,7 @@ jobs: fi run-tests: name: ${{ matrix.ranks }}-rank Iris Test - needs: [external-validation-test, build-apptainer-image] + needs: [build-apptainer-image] runs-on: [self-hosted, mi3008x] timeout-minutes: 20 strategy: From c964e36938a248733e08baac2a242f5d3b1f8c7b Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 17:25:34 -0700 Subject: [PATCH 11/14] Uninstall --- .github/workflows/iris-tests-apptainer.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 9e4ed3c..419cad8 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -57,6 +57,9 @@ jobs: apptainer exec ~/apptainer/iris-dev.sif bash -c " set -e # Exit on any error + # Uninstall any existing Iris installations + pip uninstall -y Iris iris + # Install iris first pip install -e . 
From 12ce2821babc76c38f51f36a91a663a33a521a8d Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 17:39:59 -0700 Subject: [PATCH 12/14] Always force reinstall --- .github/workflows/iris-external-validation-test.yml | 2 +- .github/workflows/iris-pip-install-test.yml | 2 +- .github/workflows/iris-tests-apptainer.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index e98ce2a..959d0bf 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -57,7 +57,7 @@ jobs: # Install iris from the current repository echo 'Installing iris from current repository...' - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} + pip install --force-reinstall git+https://github.com/${{ github.repository }}.git@${{ github.sha }} # Download test script from gist echo 'Downloading test script from gist...' diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index d5954fb..0690a89 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -69,7 +69,7 @@ jobs: pip uninstall -y Iris iris || echo 'No existing Iris packages found or uninstall failed' # Install iris from the current repository - pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }} + pip install --force-reinstall git+https://github.com/${{ github.repository }}.git@${{ github.sha }} # Run examples tests one at a time using distributed wrapper echo 'Running examples tests one at a time...' 
diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 419cad8..01e7cce 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -61,7 +61,7 @@ jobs: pip uninstall -y Iris iris # Install iris first - pip install -e . + pip install -e . --force-reinstall # Run examples tests one at a time using distributed wrapper echo 'Running examples tests one at a time...' From 76e86f162c3fc1f76a04ffbc4cf9f4c764246a9e Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 17:46:10 -0700 Subject: [PATCH 13/14] Add `--no-deps` --- .github/workflows/iris-external-validation-test.yml | 2 +- .github/workflows/iris-pip-install-test.yml | 2 +- .github/workflows/iris-tests-apptainer.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index 959d0bf..a638c1f 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -57,7 +57,7 @@ jobs: # Install iris from the current repository echo 'Installing iris from current repository...' - pip install --force-reinstall git+https://github.com/${{ github.repository }}.git@${{ github.sha }} + pip install --force-reinstall --no-deps git+https://github.com/${{ github.repository }}.git@${{ github.sha }} # Download test script from gist echo 'Downloading test script from gist...' 
diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index 0690a89..946f72c 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -69,7 +69,7 @@ jobs: pip uninstall -y Iris iris || echo 'No existing Iris packages found or uninstall failed' # Install iris from the current repository - pip install --force-reinstall git+https://github.com/${{ github.repository }}.git@${{ github.sha }} + pip install --force-reinstall --no-deps git+https://github.com/${{ github.repository }}.git@${{ github.sha }} # Run examples tests one at a time using distributed wrapper echo 'Running examples tests one at a time...' diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index 01e7cce..fd59017 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -61,7 +61,7 @@ jobs: pip uninstall -y Iris iris # Install iris first - pip install -e . --force-reinstall + pip install -e . --force-reinstall --no-deps # Run examples tests one at a time using distributed wrapper echo 'Running examples tests one at a time...' From e4e5214a4b0128c29b77ae81148ecb17157cb7c1 Mon Sep 17 00:00:00 2001 From: Muhammad Awad Date: Thu, 18 Sep 2025 17:52:06 -0700 Subject: [PATCH 14/14] More cleanup --- .github/workflows/iris-external-validation-test.yml | 3 ++- .github/workflows/iris-pip-install-test.yml | 3 ++- .github/workflows/iris-tests-apptainer.yml | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-external-validation-test.yml index a638c1f..296ce99 100644 --- a/.github/workflows/iris-external-validation-test.yml +++ b/.github/workflows/iris-external-validation-test.yml @@ -54,7 +54,8 @@ jobs: # Uninstall any existing Iris installations echo 'Uninstalling any existing Iris packages...' 
pip uninstall -y Iris iris || echo 'No existing Iris packages found or uninstall failed' - + rm -rf build dist *.egg-info + # Install iris from the current repository echo 'Installing iris from current repository...' pip install --force-reinstall --no-deps git+https://github.com/${{ github.repository }}.git@${{ github.sha }} diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-pip-install-test.yml index 946f72c..f724478 100644 --- a/.github/workflows/iris-pip-install-test.yml +++ b/.github/workflows/iris-pip-install-test.yml @@ -67,7 +67,8 @@ jobs: # Uninstall any existing Iris installations echo 'Uninstalling any existing Iris packages...' pip uninstall -y Iris iris || echo 'No existing Iris packages found or uninstall failed' - + rm -rf build dist *.egg-info + # Install iris from the current repository pip install --force-reinstall --no-deps git+https://github.com/${{ github.repository }}.git@${{ github.sha }} diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-apptainer.yml index fd59017..8fc1997 100644 --- a/.github/workflows/iris-tests-apptainer.yml +++ b/.github/workflows/iris-tests-apptainer.yml @@ -59,7 +59,8 @@ jobs: # Uninstall any existing Iris installations pip uninstall -y Iris iris - + rm -rf build dist *.egg-info + # Install iris first pip install -e . --force-reinstall --no-deps