ROCm
diff --git a/‎.github/workflows/auto-label.json5‎
Lines changed: 10 additions & 0 deletions b/‎.github/workflows/auto-label.json5‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎.github/workflows/auto-label.yml‎
Lines changed: 52 additions & 0 deletions b/‎.github/workflows/auto-label.yml‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎apptainer/iris.def‎
Lines changed: 1 addition & 1 deletion b/‎apptainer/iris.def‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apptainer/jupyter.sh‎
Lines changed: 0 additions & 22 deletions b/‎apptainer/jupyter.sh‎
Lines changed: 0 additions & 22 deletions
diff --git a/‎docker/Dockerfile‎
Lines changed: 1 addition & 1 deletion b/‎docker/Dockerfile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/07_gemm_all_scatter/gemm_all_scatter.py‎
Lines changed: 38 additions & 133 deletions b/‎examples/07_gemm_all_scatter/gemm_all_scatter.py‎
Lines changed: 38 additions & 133 deletions
@@ -0,0 +1,10 @@
+{
+  labelsSynonyms: {
+    feature: ["[Feature]"],
+    bug: ["[Issue]"],
+    documentation: ["[Documentation]"]
+  },
+  defaultLabels: ["iris"],   // always add iris
+  includeTitle: true,
+  ignoreComments: true
+}
@@ -0,0 +1,52 @@
+name: Auto labeling
+on:
+  issues:
+    types: [opened]
+  pull_request:
+    types: [opened]
+
+permissions:
+  contents: read
+  issues: write
+  pull-requests: write
+
+jobs:
+  # Label ISSUES using Renato66/auto-label
+  label-issues:
+    if: github.event_name == 'issues'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            .github/workflows/auto-label.json5
+          sparse-checkout-cone-mode: false
+      - uses: Renato66/auto-label@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+  # Add ISSUES to ROCm Project #91 so they land in Todo
+  add-issues-to-project:
+    if: github.event_name == 'issues'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/[email protected]
+        with:
+          project-url: https://github.com/orgs/ROCm/projects/91
+          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
+
+  # PRs: label so the project rule moves them to In Progress
+  label-prs:
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Add iris + in-progress labels to PR
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.issues.addLabels({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.payload.pull_request.number,
+              labels: ["iris", "in-progress"]
+            });
@@ -13,7 +13,7 @@ From: rocm/pytorch:rocm6.3.1_ubuntu22.04_py3.10_pytorch
     conda install -y -n py_3.10 -c conda-forge mpi4py openmpi jupyter ninja cmake wheel
     git clone https://github.com/triton-lang/triton.git \$TRITON_PATH
     cd \$TRITON_PATH
-    git checkout eb73b0373a7fb4cd2e563f68e3488a96525562eb
+    git checkout dd5823453bcc7973eabadb65f9d827c43281c434
     pip install -e .
     wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.3.1/rocprofiler-systems-install.py
     python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 6.3
 
@@ -37,7 +37,7 @@ RUN sudo pip3 install mpi4py
 # Clone and install Triton
 WORKDIR $TRITON_PATH
 RUN git clone https://github.com/triton-lang/triton.git $TRITON_PATH
-RUN git checkout eb73b0373a7fb4cd2e563f68e3488a96525562eb
+RUN git checkout dd5823453bcc7973eabadb65f9d827c43281c434
 RUN pip3 install -e .
 ENV PYTHONPATH=$TRITON_PATH
 
 
@@ -11,89 +11,6 @@
 import iris
 
 
-@triton.jit
-def tile_id_to_index_range(
-    tile_id,
-    M,
-    N,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-
-    group_id = tile_id // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-
-    tile_in_group = tile_id % num_pid_in_group
-    pid_m = first_pid_m + (tile_in_group % group_size_m)
-    pid_n = tile_in_group // group_size_m
-
-    rm_start = pid_m * BLOCK_SIZE_M
-    rn_start = pid_n * BLOCK_SIZE_N
-
-    # clamp to the maximum valid index (M-1, N-1)
-    max_m = M - 1
-    max_n = N - 1
-
-    # generate indices
-    rm = rm_start + tl.arange(0, BLOCK_SIZE_M)
-    rn = rn_start + tl.arange(0, BLOCK_SIZE_N)
-
-    rm = tl.minimum(rm, max_m)
-    rn = tl.minimum(rn, max_n)
-
-    # rm_mod = rm % M
-    # rm = tl.max_contiguous(tl.multiple_of(rm_mod, BLOCK_SIZE_M), BLOCK_SIZE_M)
-
-    return rm, rn, rm_start, rn_start
-
-
-@triton.jit
-def offset_for_tile(local_tile_id, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M, M_local, N_local):
-    rm, rn, rm_start, rn_start = tile_id_to_index_range(
-        local_tile_id, M_local, N_local, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M
-    )
-    c_mask = (rm[:, None] < M_local) & (rn[None, :] < N_local)
-    return rm, rn, c_mask, rm_start, rn_start
-
-
-@triton.jit
-def extract_submask_and_offset(
-    rm,
-    rn,
-    mask,
-    rm_start,
-    rn_start,
-    start_row,
-    start_col,
-    SUB_BLOCK_SIZE_M: tl.constexpr,
-    SUB_BLOCK_SIZE_N: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    stride_cm_local: tl.constexpr,
-    stride_cn_local: tl.constexpr,
-):
-    # Create indices for the sub-block
-    sub_rm = tl.arange(0, SUB_BLOCK_SIZE_M) + start_row
-    sub_rn = tl.arange(0, SUB_BLOCK_SIZE_N) + start_col
-
-    # Create a 2D grid of indices for the sub-block
-    sub_rm_2d = sub_rm[:, None]  # Shape: (SUB_BLOCK_SIZE_M, 1)
-    sub_rn_2d = sub_rn[None, :]  # Shape: (1, SUB_BLOCK_SIZE_N)
-
-    # Compute the sub-mask
-    sub_mask = (sub_rm_2d < BLOCK_SIZE_M) & (sub_rn_2d < BLOCK_SIZE_N)
-
-    # Compute the sub-offset relative to the start of the tile
-    sub_offset = ((rm_start + sub_rm_2d) * stride_cm_local) + ((rn_start + sub_rn_2d) * stride_cn_local)
-
-    return sub_mask, sub_offset
-
-
 @triton.jit()
 def persistent_gemm_all_scatter(
     A,
@@ -166,8 +83,8 @@ def persistent_gemm_all_scatter(
         A_BASE = A + rm[:, None] * stride_am + rk[None, :] * stride_ak
         B_BASE = B + rk[:, None] * stride_bk + rn[None, :] * stride_bn
 
-        tl.assume(pid_m > 0)
-        tl.assume(pid_n > 0)
+        tl.assume(pid_m >= 0)
+        tl.assume(pid_n >= 0)
 
         loop_k = tl.cdiv(K, BLOCK_SIZE_K)
         if not EVEN_K:
@@ -195,51 +112,39 @@ def persistent_gemm_all_scatter(
         # Accumulator registers with C results
         c = acc.to(C.type.element_ty)
 
-        rm, rn, mask, rm_start, rn_start = offset_for_tile(tile_id, BLOCK_SIZE_M, BLOCK_SIZE_N, GROUP_SIZE_M, M, N)
-
-        # Calculate the number of sub-tiles in each dimension
-        num_sub_tiles_m = tl.cdiv(BLOCK_SIZE_M, BLOCK_SIZE_M)
-        num_sub_tiles_n = tl.cdiv(BLOCK_SIZE_N, BLOCK_SIZE_N)
-        total_sub_tiles = num_sub_tiles_m * num_sub_tiles_n
-
-        for sub_tile_idx in range(0, total_sub_tiles):
-            # Calculate start_row and start_col for the current sub-tile
-            start_row = (sub_tile_idx // num_sub_tiles_n) * BLOCK_SIZE_M
-            start_col = (sub_tile_idx % num_sub_tiles_n) * BLOCK_SIZE_N
-
-            # Translate to global
-            sub_mask, global_offset = extract_submask_and_offset(
-                rm,
-                rn + cur_rank * N,
-                mask,
-                rm_start,
-                rn_start + cur_rank * N,
-                start_row,
-                start_col,
-                BLOCK_SIZE_M,
-                BLOCK_SIZE_N,
-                BLOCK_SIZE_M,
-                BLOCK_SIZE_N,
-                stride_cm_global,
-                stride_cn_global,
-            )
-
-            # Timestamp for GEMM before store
-            if COLLECT_TIMESTAMPS:
-                timestamp = read_realtime()
-                tl.atomic_max(mm_end_timestamp_ptr + tile_id, timestamp)
-
-            # Store data to the global result using puts
-            for remote_rank in range(world_size):
-                if remote_rank == cur_rank:
-                    # For the current rank, we can use store
-                    tl.store(c_global + global_offset, c, mask=sub_mask)
-                else:
-                    iris.store(
-                        c_global + global_offset,
-                        c,
-                        cur_rank,
-                        remote_rank,
-                        heap_bases,
-                        mask=sub_mask,
-                    )
+        rm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+        rn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+
+        # Add compiler hints
+        rm = tl.max_contiguous(tl.multiple_of(rm, BLOCK_SIZE_M), BLOCK_SIZE_M)
+        rn = tl.max_contiguous(tl.multiple_of(rn, BLOCK_SIZE_N), BLOCK_SIZE_N)
+
+        # Define the C-mask (BLOCK_SIZE_M, 1) x (1, BLOCK_SIZE_N)
+        sub_mask = (rm[:, None] < M) & (rn[None, :] < N)
+
+        # Calculate the "global" offset of C based on the rank.
+        # Note how the N-dimension is being multiplied by current rank.
+        # This is because each rank is computing a portion of the N-dimension
+        # locally and then scattering it to all other ranks to complete
+        # the global N-dimension.
+        global_offset = rm[:, None] * stride_cm_global + (rn[None, :] + cur_rank * N) * stride_cn_global
+
+        # Timestamp for GEMM before store
+        if COLLECT_TIMESTAMPS:
+            timestamp = read_realtime()
+            tl.atomic_max(mm_end_timestamp_ptr + tile_id, timestamp)
+
+        # Store data to the global result using puts
+        for remote_rank in range(world_size):
+            if remote_rank == cur_rank:
+                # For the current rank, we can use store
+                tl.store(c_global + global_offset, c, mask=sub_mask)
+            else:
+                iris.store(
+                    c_global + global_offset,
+                    c,
+                    cur_rank,
+                    remote_rank,
+                    heap_bases,
+                    mask=sub_mask,
+                )