From 18cc32b838dcc5da79d700e40b231ccc3a711243 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Fri, 4 Jul 2025 09:16:59 +0200
Subject: [PATCH 01/38] (fix) added cls pointer init

---
 sw/snRuntime/src/start.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index 9f6e4be7e..afeef20be 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -98,6 +98,8 @@ static inline void snrt_init_cls() {
         snrt_dma_memset((void*)ptr, 0, size);
         snrt_dma_wait_all();
     }
+    // Init the cls pointer
+    _cls_ptr = (cls_t*)snrt_cls_base_addr();
     snrt_cluster_hw_barrier();
 }
 #endif

From 14e37915044fbcf612f8a7d2c72366c8027b7d79 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Mon, 16 Jun 2025 11:27:41 +0200
Subject: [PATCH 02/38] (feat) HW support for narrow reduction

* Extend Snitch by adding CSR registers for the user field

* All collective operations will be forwarded to the (default) SoC port
  for further processing. Even if the crossbar is configured as mcast,
  it is currently not being used as such. The problem is reduction
  operations: the crossbar does not support such operations, and
  therefore all collective ops are forwarded to the SoC port (to be
  processed in the router there)

* Bump bender dependencies

(Cherry-picked from 6402d712b95791abf21ac38fa8045a54ddcb1029 of Lorenzo's fork)
---
 Bender.lock                                   |   4 +-
 Bender.yml                                    |   2 +-
 .../include/reqrsp_interface/typedef.svh      |  14 +-
 hw/snitch/src/riscv_instr.sv                  |   3 +-
 hw/snitch/src/snitch.sv                       |  27 ++-
 hw/snitch/src/snitch_lsu.sv                   |   4 +-
 hw/snitch/src/snitch_pkg.sv                   |   4 +-
 hw/snitch_cluster/src/snitch_cc.sv            |  22 +-
 hw/snitch_cluster/src/snitch_cluster.sv       | 224 ++++++++++++++----
 .../src/snitch_cluster_pkg.sv.tpl             |  34 ++-
 .../src/snitch_cluster_wrapper.sv.tpl         |   7 +-
 hw/snitch_cluster/src/snitch_fp_ss.sv         |   2 +-
 sw/snRuntime/src/sync.h                       |  34 ++-
 .../schema/snitch_cluster.schema.json         |  22 +-
 14 files changed, 321 insertions(+), 82 deletions(-)

diff --git a/Bender.lock b/Bender.lock
index 1dd9a5288..90359f15b 100644
--- a/Bender.lock
+++ b/Bender.lock
@@ -7,10 +7,10 @@ packages:
     dependencies:
     - common_cells
   axi:
-    revision: bec548fa2a9b18cbd7531105bb1fdf481ea8ad49
+    revision: d99625fe8fb6f253926e370b7989f81850afd21f
     version: null
     source:
-      Git: https://github.com/colluca/axi.git
+      Git: https://github.com/Lura518/axi.git
     dependencies:
     - common_cells
    - common_verification

diff --git a/Bender.yml b/Bender.yml
index 75dc4bd1b..8479f27ed 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -19,7 +19,7 @@ package:
     - Matheus Cavalcante

 dependencies:
-  axi: { git: https://github.com/colluca/axi, rev: multicast }
+  axi: { git: https://github.com/Lura518/axi, rev: reduction }
   axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 }
   common_cells: { git: https://github.com/pulp-platform/common_cells, rev: snitch }
   apb: { git: https://github.com/pulp-platform/apb.git, version: 0.2.2 }

diff --git a/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh b/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh
index 07a4179cc..db32d828e 100644
--- a/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh
+++ b/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh
@@ -10,13 +10,13 @@

 `define REQRSP_TYPEDEF_REQ_CHAN_T(__req_chan_t, __addr_t, __data_t, __strb_t) \
   typedef struct packed { \
-    __addr_t addr; \
-    __addr_t mask; \
-    logic write; \
-    reqrsp_pkg::amo_op_e amo; \
-    __data_t data; \
-    __strb_t strb; \
-    reqrsp_pkg::size_t size; \
+    __addr_t addr; \
+    logic [63:0] user; \
+    logic write; \
+
reqrsp_pkg::amo_op_e amo; \ + __data_t data; \ + __strb_t strb; \ + reqrsp_pkg::size_t size; \ } __req_chan_t; `define REQRSP_TYPEDEF_RSP_CHAN_T(__rsp_chan_t, __data_t) \ diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv index c76795b71..53ab0727d 100644 --- a/hw/snitch/src/riscv_instr.sv +++ b/hw/snitch/src/riscv_instr.sv @@ -1140,7 +1140,8 @@ package riscv_instr; localparam logic [11:0] CSR_FPMODE = 12'h7c1; localparam logic [11:0] CSR_BARRIER = 12'h7c2; localparam logic [11:0] CSR_SC = 12'h7c3; - localparam logic [11:0] CSR_MCAST = 12'h7c4; + localparam logic [11:0] CSR_USER_HIGH = 12'h7c4; + localparam logic [11:0] CSR_USER_LOW = 12'h7c5; localparam logic [11:0] CSR_HTIMEDELTAH = 12'h615; localparam logic [11:0] CSR_CYCLEH = 12'hc80; localparam logic [11:0] CSR_TIMEH = 12'hc81; diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index 2c879614f..48f20bab4 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -247,8 +247,10 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( logic csr_en; logic csr_dump; logic csr_stall_d, csr_stall_q; - // Multicast mask - logic [31:0] csr_mcast_d, csr_mcast_q; + + // User Field + logic [31:0] csr_user_high_d, csr_user_high_q; + logic [31:0] csr_user_low_d, csr_user_low_q; localparam logic M = 0; localparam logic S = 1; @@ -320,7 +322,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( end `FFAR(csr_stall_q, csr_stall_d, '0, clk_i, rst_i) - `FFAR(csr_mcast_q, csr_mcast_d, '0, clk_i, rst_i) + `FFAR(csr_user_high_q, csr_user_high_d, '0, clk_i, rst_i) + `FFAR(csr_user_low_q, csr_user_low_d, '0, clk_i, rst_i) typedef struct packed { fpnew_pkg::fmt_mode_t fmode; @@ -2358,7 +2361,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( dscratch_d = dscratch_q; csr_stall_d = csr_stall_q; - csr_mcast_d = csr_mcast_q; + csr_user_high_d = csr_user_high_q; + csr_user_low_d = csr_user_low_q; if (barrier_i) csr_stall_d = 1'b0; barrier_o = 1'b0; @@ -2585,10 +2589,15 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( barrier_o = 1'b1; csr_stall_d = 1'b1; end - // Multicast mask - CSR_MCAST: begin - csr_rvalue = csr_mcast_q; - csr_mcast_d = alu_result[31:0]; + // User field high + CSR_USER_HIGH: begin + csr_rvalue = csr_user_high_q; + csr_user_high_d = alu_result[31:0]; + end + // User field low + CSR_USER_LOW: begin + csr_rvalue = csr_user_low_q; + csr_user_low_d = alu_result[31:0]; end default: begin csr_rvalue = '0; @@ -2910,7 +2919,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( .lsu_qsize_i (ls_size), .lsu_qamo_i (ls_amo), .lsu_qrepd_i (1'b0), - .lsu_qmcast_i (addr_t'(csr_mcast_q)), + .lsu_quser_i ({csr_user_high_q, csr_user_low_q}), .lsu_qvalid_i (lsu_qvalid), .lsu_qready_o (lsu_qready), .lsu_pdata_o (ld_result), diff --git a/hw/snitch/src/snitch_lsu.sv b/hw/snitch/src/snitch_lsu.sv index 4ae44ad04..f66a6dda3 100644 --- a/hw/snitch/src/snitch_lsu.sv +++ b/hw/snitch/src/snitch_lsu.sv @@ -50,7 +50,7 @@ module snitch_lsu #( input logic [1:0] lsu_qsize_i, input reqrsp_pkg::amo_op_e lsu_qamo_i, input logic lsu_qrepd_i, // Whether this is a sequencer repetition - input addr_t lsu_qmcast_i, // Multicast mask + input logic [63:0] lsu_quser_i, // User field for the axi transmission input logic lsu_qvalid_i, output logic lsu_qready_o, // response channel @@ -254,7 +254,7 @@ module snitch_lsu #( assign data_req_o.q_valid = lsu_postcaq_qvalid & (lsu_qwrite_i | ~laq_full) & ~mem_full; assign data_req_o.q.write = lsu_qwrite_i; assign data_req_o.q.addr = 
lsu_qaddr_i;
-  assign data_req_o.q.mask = lsu_qmcast_i;
+  assign data_req_o.q.user = lsu_quser_i;
   assign data_req_o.q.amo = lsu_qamo_i;
   assign data_req_o.q.size = lsu_qsize_i;

diff --git a/hw/snitch/src/snitch_pkg.sv b/hw/snitch/src/snitch_pkg.sv
index 5bc04f790..0090dd755 100644
--- a/hw/snitch/src/snitch_pkg.sv
+++ b/hw/snitch/src/snitch_pkg.sv
@@ -128,8 +128,8 @@ package snitch_pkg;
   // Slaves on Cluster AXI Bus
   typedef enum integer {
     TCDM = 0,
-    ClusterPeripherals = 1,
-    SoC = 2,
+    SoC = 1,
+    ClusterPeripherals = 2,
     ExtSlave = 3
   } cluster_slave_e;

diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv
index 31584cd1c..6006d7098 100644
--- a/hw/snitch_cluster/src/snitch_cc.sv
+++ b/hw/snitch_cluster/src/snitch_cc.sv
@@ -66,6 +66,10 @@ module snitch_cc #(
   parameter bit Xfrep = 1,
   /// Has `SSR` support.
   parameter bit Xssr = 1,
+  /// Reroute collective operations (multicast + reduction) to the AXI crossbar unconditionally!
+  parameter bit ReRouteCollectiveOp = 0,
+  /// Width of the collective operation field
+  parameter int unsigned CollectiveWidth = 1,
   /// Has `COPIFT` support.
   parameter bit Xcopift = 1,
   /// Has `IPU` support.
@@ -608,6 +612,7 @@ module snitch_cc #(
   localparam int unsigned SelectWidth = cf_math_pkg::idx_width(2);
   typedef logic [SelectWidth-1:0] select_t;
   select_t slave_select;
+  select_t slave_select_coll_op;
   reqrsp_demux #(
     .NrPorts (2),
     .req_t (dreq_t),
     .rsp_t (drsp_t),
   ) i_reqrsp_demux (
     .clk_i,
     .rst_ni,
-    .slv_select_i (slave_select),
+    .slv_select_i (slave_select_coll_op),
     .slv_req_i (merged_dreq),
     .slv_rsp_o (merged_drsp),
     .mst_req_o ({data_tcdm_req, data_req_o}),
     .mst_rsp_i ({data_tcdm_rsp, data_rsp_i})
   );

+  // If we want to support collective operations (mcast + reduction) then all collective-op requests
+  // need to be passed to the SoC independent of the address map. The problem is that a multicast
+  // which targets its own address space needs to be forwarded to the AXI crossbar so that the
+  // rest of the SoC can be notified about the multicast too (the same goes for reductions)!
+  // If the multicast mask is set to 0 we have a unicast - everything else is a collective
+  // operation!
+  if (ReRouteCollectiveOp) begin
+    // Reconstruct the multicast mask from the user field
+    addr_t mcast_mask;
+    assign mcast_mask = addr_t'((merged_dreq.q.user >> CollectiveWidth) & ((1 << AddrWidth) - 1));
+    assign slave_select_coll_op = (mcast_mask != 0) ? '0 : slave_select;
+  end else begin
+    assign slave_select_coll_op = slave_select;
+  end
+
   typedef struct packed {
     int unsigned idx;
     logic [AddrWidth-1:0] base;

diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv
index 3a452427d..038cac302 100644
--- a/hw/snitch_cluster/src/snitch_cluster.sv
+++ b/hw/snitch_cluster/src/snitch_cluster.sv
@@ -40,6 +40,8 @@ module snitch_cluster
   parameter int unsigned WideUserWidth = 1,
   /// Width of the atomic ID to be used in a system.
   parameter int unsigned AtomicIdWidth = 1,
+  /// Width of the collective operation field
+  parameter int unsigned CollectiveWidth = 1,
   /// Boot Address from which to fetch the first instructions.
   /// Used if `AliasRegionEnable` or `IntBootromEnable` is not set.
   parameter logic [31:0] BootAddr = 32'h0,
@@ -79,7 +81,12 @@
   /// Enable virtual memory support.
   parameter bit VMSupport = 1,
   /// Enable multicast on DMA XBAR.
-  parameter bit EnableDMAMulticast = 0,
+  parameter bit EnableDmaMulticast = 0,
+  /// Enable multicast on the Narrow XBAR
+  parameter bit EnableMulticast = 0,
+  /// Cluster will forward any collective operation request to the SoC
+  /// independent of the address range. The SoC has to handle multicast loopbacks
+  parameter bit ReRouteCollectiveOp = 0,
   /// Per-core enabling of the standard `E` ISA reduced-register extension.
   parameter bit [NrCores-1:0] RVE = '0,
   /// Per-core enabling of the standard `F` ISA extensions.
@@ -198,6 +205,9 @@
   parameter type wide_out_resp_t = logic,
   parameter type wide_in_req_t = logic,
   parameter type wide_in_resp_t = logic,
+  /// The user field is passed as a type since its subfields are configuration dependent!
+  parameter type user_narrow_t = logic,
+  parameter type user_dma_t = logic,
   // TCDM Ports
   parameter type tcdm_dma_req_t = logic,
   parameter type tcdm_dma_rsp_t = logic,
@@ -325,6 +335,24 @@
   localparam int unsigned NrWideRules = (1 + AliasRegionEnable) * NrWideRuleIdcs;

   // AXI Configuration
+  localparam axi_pkg::xbar_cfg_t ClusterMcastXbarCfg = '{
+    NoSlvPorts: NrNarrowMasters,
+    NoMstPorts: NrSlaves,
+    MaxMstTrans: NarrowMaxMstTrans,
+    MaxSlvTrans: NarrowMaxSlvTrans,
+    FallThrough: 1'b0,
+    LatencyMode: NarrowXbarLatency,
+    PipelineStages: 0,
+    AxiIdWidthSlvPorts: NarrowIdWidthIn,
+    AxiIdUsedSlvPorts: NarrowIdWidthIn,
+    UniqueIds: 1'b0,
+    AxiAddrWidth: PhysicalAddrWidth,
+    AxiDataWidth: NarrowDataWidth,
+    NoAddrRules: NrRules,
+    NoMulticastRules: 1,
+    NoMulticastPorts: 2,
+    default: '0
+  };
   localparam axi_pkg::xbar_cfg_t ClusterXbarCfg = '{
     NoSlvPorts: NrNarrowMasters,
     NoMstPorts: NrSlaves,
@@ -407,14 +435,33 @@ module snitch_cluster
   typedef logic [NarrowIdWidthOut-1:0] id_slv_t;
   typedef logic [WideIdWidthIn-1:0] id_dma_mst_t;
   typedef logic [WideIdWidthOut-1:0] id_dma_slv_t;
+
+/*
   typedef logic [NarrowUserWidth-1:0] user_t;
+
+  // TODO (raroth): Consolidate these diverging AXI user-field definitions! We define the user mask in picobello in at least 3 places.
+  // -> mcast with selective atomics
+  typedef struct packed {
+    addr_t mcast;
+    logic [CollectiveWidth-1:0] collective;
+    logic [AtomicIdWidth-1:0] atomic;
+  } user_narrow_reduction_t;
+  // -> without mcast
+  typedef struct packed {
+    logic [AtomicIdWidth-1:0] atomic;
+  } // Needs to be as wide as the NarrowUserWidth
+
+  // Ask Lorenzo: can atomics be disabled? --> problem: how to handle the user_* assignments further down
  typedef struct packed {
    logic [WideUserWidth-1:0] mcast;
  } user_dma_t;
+*/

   typedef logic [TCDMMemAddrWidth-1:0] tcdm_mem_addr_t;
   typedef logic [TCDMAddrWidth-1:0] tcdm_addr_t;

+  typedef logic [CollectiveWidth-1:0] coll_type_t;
+
   // Struct replaced by logic array to workaround Questa optimization bug.
   // typedef struct packed {
   //   logic [CoreIDWidth-1:0] core_id;
@@ -423,8 +470,8 @@
   typedef logic [CoreIDWidth:0] tcdm_user_t;

   // Regbus peripherals.
-  `AXI_TYPEDEF_ALL(axi_mst, addr_t, id_mst_t, data_t, strb_t, user_t)
-  `AXI_TYPEDEF_ALL(axi_slv, addr_t, id_slv_t, data_t, strb_t, user_t)
+  `AXI_TYPEDEF_ALL(axi_mst, addr_t, id_mst_t, data_t, strb_t, user_narrow_t)
+  `AXI_TYPEDEF_ALL(axi_slv, addr_t, id_slv_t, data_t, strb_t, user_narrow_t)
   `AXI_TYPEDEF_ALL(axi_mst_dma, addr_t, id_dma_mst_t, data_dma_t, strb_dma_t, user_dma_t)
   `AXI_TYPEDEF_ALL(axi_slv_dma, addr_t, id_dma_slv_t, data_dma_t, strb_dma_t, user_dma_t)
@@ -1075,7 +1122,9 @@ module snitch_cluster
     .CaqTagWidth (CaqTagWidth),
     .DebugSupport (DebugSupport),
     .TCDMAliasEnable (AliasRegionEnable),
-    .TCDMAliasStart (TCDMAliasStart)
+    .TCDMAliasStart (TCDMAliasStart),
+    .ReRouteCollectiveOp (ReRouteCollectiveOp),
+    .CollectiveWidth (CollectiveWidth)
   ) i_snitch_cc (
     .clk_i,
     .clk_d2_i (clk_d2),
@@ -1222,11 +1271,29 @@ module snitch_cluster
   reqrsp_req_t core_to_axi_req;
   reqrsp_rsp_t core_to_axi_rsp;
-  user_t cluster_user;
+
+  // User field for the AXI transmission
+  // We encode the atomic ID and (if enabled) collective operations
+  user_narrow_t cluster_user;
+  addr_t mcast_mask;
+  coll_type_t collective_type;

   // Atomic ID, needs to be unique ID of cluster
   // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters)
-  assign cluster_user = (core_to_axi_req.q.mask << AtomicIdWidth) |
-                        ((hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1);
+  if (EnableMulticast) begin : AssignUserWithMCast
+    assign mcast_mask = addr_t'((core_to_axi_req.q.user >> CollectiveWidth) & ((1 << PhysicalAddrWidth) - 1));
+    assign collective_type = coll_type_t'(core_to_axi_req.q.user & ((1 << CollectiveWidth) - 1));
+    assign cluster_user = '{
+      mcast: mcast_mask,
+      collective: collective_type,
+      atomic: (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1,
+      default: '0
+    };
+  end else begin : AssignUserWithoutMCast
+    assign cluster_user = '{
+      atomic: (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1,
+      default: '0
+    };
+  end

   reqrsp_mux #(
     .NrPorts (NrCores),
@@ -1262,15 +1329,23 @@
     .axi_rsp_i (narrow_axi_mst_rsp[CoreReq])
   );

+  // Define the default port (SoC) for the mcast cluster xbar
+  xbar_rule_t cluster_mcast_xbar_default_port;
+  assign cluster_mcast_xbar_default_port = '{
+    idx: SoC,
+    start_addr: tcdm_start_address,
+    end_addr: tcdm_end_address
+  };
+
   logic [ClusterXbarCfg.NoSlvPorts-1:0][$clog2(ClusterXbarCfg.NoMstPorts)-1:0]
     cluster_xbar_default_port;
   xbar_rule_t [NrRules-1:0] cluster_xbar_rules;

   assign cluster_xbar_rules [NrRuleIdcs-1:0] = '{
     '{
-      idx: TCDM,
-      start_addr: tcdm_start_address,
-      end_addr: tcdm_end_address
+      idx: ExtSlave,
+      start_addr: ext_mem_start_address,
+      end_addr: ext_mem_end_address
     },
     '{
       idx: ClusterPeripherals,
       start_addr: cluster_periph_start_address,
       end_addr: cluster_periph_end_address
     },
     '{
-      idx: ExtSlave,
-      start_addr: ext_mem_start_address,
-      end_addr: ext_mem_end_address
+      idx: TCDM,
+      start_addr: tcdm_start_address,
+      end_addr: tcdm_end_address
     }
   };
   if (AliasRegionEnable) begin : gen_cluster_xbar_alias
     assign cluster_xbar_rules [NrRules-1:NrRuleIdcs] = '{
       '{
-        idx: TCDM,
-        start_addr: TCDMAliasStart,
-        end_addr: TCDMAliasEnd
+        idx: ExtSlave,
+        start_addr: ExtAliasStart,
+        end_addr: ExtAliasEnd
       },
       '{
         idx: ClusterPeripherals,
         start_addr: PeriphAliasStart,
         end_addr: PeriphAliasEnd
       },
       '{
-        idx: ExtSlave,
-        start_addr: ExtAliasStart,
-        end_addr: ExtAliasEnd
+        idx: TCDM,
+        start_addr: TCDMAliasStart,
+        end_addr: TCDMAliasEnd
       }
     };
   end

-  localparam bit [ClusterXbarCfg.NoSlvPorts-1:0]
ClusterEnableDefaultMstPort = '1;
-  axi_xbar #(
-    .Cfg (ClusterXbarCfg),
-    .slv_aw_chan_t (axi_mst_aw_chan_t),
-    .mst_aw_chan_t (axi_slv_aw_chan_t),
-    .w_chan_t (axi_mst_w_chan_t),
-    .slv_b_chan_t (axi_mst_b_chan_t),
-    .mst_b_chan_t (axi_slv_b_chan_t),
-    .slv_ar_chan_t (axi_mst_ar_chan_t),
-    .mst_ar_chan_t (axi_slv_ar_chan_t),
-    .slv_r_chan_t (axi_mst_r_chan_t),
-    .mst_r_chan_t (axi_slv_r_chan_t),
-    .slv_req_t (axi_mst_req_t),
-    .slv_resp_t (axi_mst_resp_t),
-    .mst_req_t (axi_slv_req_t),
-    .mst_resp_t (axi_slv_resp_t),
-    .rule_t (xbar_rule_t)
-  ) i_cluster_xbar (
-    .clk_i,
-    .rst_ni,
-    .test_i (1'b0),
-    .slv_ports_req_i (narrow_axi_mst_req),
-    .slv_ports_resp_o (narrow_axi_mst_rsp),
-    .mst_ports_req_o (narrow_axi_slv_req),
-    .mst_ports_resp_i (narrow_axi_slv_rsp),
-    .addr_map_i (cluster_xbar_rules),
-    .en_default_mst_port_i (ClusterEnableDefaultMstPort),
-    .default_mst_port_i (cluster_xbar_default_port)
-  );
-  assign cluster_xbar_default_port = '{default: SoC};

+  // Instantiate the narrow AXI xbar
+  if (EnableMulticast) begin : gen_narrow_mcast_axi_crossbar
+
+    // Define the collective connectivity matrix!
+    typedef bit [ClusterMcastXbarCfg.NoMstPorts-1:0] cluster_line_t;
+    typedef bit [ClusterMcastXbarCfg.NoSlvPorts-1:0][ClusterMcastXbarCfg.NoMstPorts-1:0] cluster_matrix_t;
+    // If we want to reroute collective operations, the only available collective operation port is
+    // the SoC port
+    localparam cluster_line_t ClusterlocalArray = (ReRouteCollectiveOp) ?
+      cluster_line_t'{SoC: 1'b1, default: 1'b0} : cluster_line_t'{default: 1'b1};
+    localparam cluster_matrix_t ClusterCollectivConnectivity =
+      cluster_matrix_t'{default: ClusterlocalArray};
+
+    // Set the default master port for all of the multicast crossbar's inputs
+    localparam bit [ClusterMcastXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = '1;
+
+    axi_mcast_xbar #(
+      .Cfg (ClusterMcastXbarCfg),
+      .CollectivOpsConnectivity (ClusterCollectivConnectivity),
+      .slv_aw_chan_t (axi_mst_aw_chan_t),
+      .mst_aw_chan_t (axi_slv_aw_chan_t),
+      .w_chan_t (axi_mst_w_chan_t),
+      .slv_b_chan_t (axi_mst_b_chan_t),
+      .mst_b_chan_t (axi_slv_b_chan_t),
+      .slv_ar_chan_t (axi_mst_ar_chan_t),
+      .mst_ar_chan_t (axi_slv_ar_chan_t),
+      .slv_r_chan_t (axi_mst_r_chan_t),
+      .mst_r_chan_t (axi_slv_r_chan_t),
+      .slv_req_t (axi_mst_req_t),
+      .slv_resp_t (axi_mst_resp_t),
+      .mst_req_t (axi_slv_req_t),
+      .mst_resp_t (axi_slv_resp_t),
+      .rule_t (xbar_rule_t)
+    ) i_cluster_mcast_xbar (
+      .clk_i,
+      .rst_ni,
+      .test_i (1'b0),
+      .slv_ports_req_i (narrow_axi_mst_req),
+      .slv_ports_resp_o (narrow_axi_mst_rsp),
+      .mst_ports_req_o (narrow_axi_slv_req),
+      .mst_ports_resp_i (narrow_axi_slv_rsp),
+      .addr_map_i (cluster_xbar_rules),
+      .en_default_mst_port_i (ClusterEnableDefaultMstPort),
+      .default_mst_port_i ({ClusterMcastXbarCfg.NoSlvPorts{cluster_mcast_xbar_default_port}})
+    );
+  end else begin : gen_narrow_axi_crossbar
+    // Set the default master port for all crossbar inputs
+    localparam bit [ClusterXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = '1;
+    axi_xbar #(
+      .Cfg (ClusterXbarCfg),
+      .slv_aw_chan_t (axi_mst_aw_chan_t),
+      .mst_aw_chan_t (axi_slv_aw_chan_t),
+      .w_chan_t (axi_mst_w_chan_t),
+      .slv_b_chan_t (axi_mst_b_chan_t),
+      .mst_b_chan_t (axi_slv_b_chan_t),
+      .slv_ar_chan_t (axi_mst_ar_chan_t),
+      .mst_ar_chan_t (axi_slv_ar_chan_t),
+      .slv_r_chan_t (axi_mst_r_chan_t),
+      .mst_r_chan_t (axi_slv_r_chan_t),
+      .slv_req_t (axi_mst_req_t),
+      .slv_resp_t (axi_mst_resp_t),
+      .mst_req_t (axi_slv_req_t),
+      .mst_resp_t (axi_slv_resp_t),
+      .rule_t (xbar_rule_t)
+    ) i_cluster_xbar (
+
.clk_i,
+      .rst_ni,
+      .test_i (1'b0),
+      .slv_ports_req_i (narrow_axi_mst_req),
+      .slv_ports_resp_o (narrow_axi_mst_rsp),
+      .mst_ports_req_o (narrow_axi_slv_req),
+      .mst_ports_resp_i (narrow_axi_slv_rsp),
+      .addr_map_i (cluster_xbar_rules),
+      .en_default_mst_port_i (ClusterEnableDefaultMstPort),
+      .default_mst_port_i (cluster_xbar_default_port)
+    );
+    assign cluster_xbar_default_port = '{default: SoC};
+  end

   // Optionally decouple the external narrow AXI slave port.
   axi_cut #(
@@ -1576,5 +1699,8 @@
       ~AliasRegionEnable || ((TCDMSizeNapotRounded - 1) & AliasRegionBase) == 0)
   // Make sure we only have one DMA in the system.
   `ASSERT_INIT(NumberDMA, $onehot0(Xdma))
+  // Verify that the width of the user field types matches the configured widths
+  `ASSERT_INIT(CheckNarrowUserFieldWidth, NarrowUserWidth == $bits(user_narrow_t));
+  `ASSERT_INIT(CheckWideUserFieldWidth, WideUserWidth == $bits(user_dma_t));

 endmodule

diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl
index 04fbfe890..3a368a6b0 100644
--- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl
+++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl
@@ -59,9 +59,14 @@ package ${cfg['cluster']['name']}_pkg;
   localparam int unsigned WideIdWidthIn = ${cfg['cluster']['dma_id_width_in']};
   localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn;

+  localparam int unsigned EnableDmaMulticast = ${int(cfg['cluster']['enable_dma_multicast'])};
+  localparam int unsigned EnableMulticast = ${int(cfg['cluster']['enable_multicast'])};
+  localparam int unsigned ReRouteCollectiveOp = ${int(cfg['cluster']['enable_reroute_collective'])};
+
   localparam int unsigned NarrowUserWidth = ${cfg['cluster']['user_width']};
   localparam int unsigned WideUserWidth = ${cfg['cluster']['dma_user_width']};
   localparam int unsigned AtomicIdWidth = ${cfg['cluster']['atomic_id_width']};
+  localparam int unsigned CollectiveWidth = ${cfg['cluster']['collective_width']};

   localparam int unsigned ICacheLineWidth [NrHives] = '{${icache_cfg('cacheline')}};
   localparam int unsigned ICacheLineCount [NrHives] = '{${icache_cfg('depth')}};
@@ -92,11 +97,32 @@ package ${cfg['cluster']['name']}_pkg;
   typedef logic [NarrowIdWidthOut-1:0] narrow_out_id_t;
   typedef logic [WideIdWidthIn-1:0] wide_in_id_t;
   typedef logic [WideIdWidthOut-1:0] wide_out_id_t;
-  typedef logic [NarrowUserWidth-1:0] user_t;
-  typedef logic [WideUserWidth-1:0] user_dma_t;

-  `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_t)
-  `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_t)
+
+// Generate the typedefs for the user fields with the required subfields depending
+// on the configuration
+% if cfg['cluster']['enable_multicast']:
+  typedef struct packed {
+    addr_t mcast;
+    logic [CollectiveWidth-1:0] collective;
+    logic [AtomicIdWidth-1:0] atomic;
+  } user_narrow_t;
+%else:
+  typedef struct packed {
+    logic [AtomicIdWidth-1:0] atomic;
+  } user_narrow_t;
+%endif
+
+// Will be extended when implementing collective operations on the wide DMA link
+% if cfg['cluster']['enable_dma_multicast']:
+  typedef struct packed {
+    addr_t mcast;
+  } user_dma_t;
+%else:
+  typedef logic [WideUserWidth-1:0] user_dma_t;
+%endif
+
+  `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_narrow_t)
+  `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_narrow_t)
   `AXI_TYPEDEF_ALL(wide_in, addr_t, wide_in_id_t, data_dma_t, strb_dma_t, user_dma_t)
   `AXI_TYPEDEF_ALL(wide_out, addr_t, wide_out_id_t,
data_dma_t, strb_dma_t, user_dma_t)

diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
index 3b9537401..79aeab8fe 100644
--- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
+++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl
@@ -75,6 +75,7 @@ module ${cfg['cluster']['name']}_wrapper (
     .NarrowUserWidth (${cfg['cluster']['name']}_pkg::NarrowUserWidth),
     .WideUserWidth (${cfg['cluster']['name']}_pkg::WideUserWidth),
     .AtomicIdWidth (${cfg['cluster']['name']}_pkg::AtomicIdWidth),
+    .CollectiveWidth (${cfg['cluster']['name']}_pkg::CollectiveWidth),
     .BootAddr (${to_sv_hex(cfg['cluster']['boot_addr'], 32)}),
     .IntBootromEnable (${int(cfg['cluster']['int_bootrom_enable'])}),
     .narrow_in_req_t (${cfg['cluster']['name']}_pkg::narrow_in_req_t),
@@ -85,6 +86,8 @@ module ${cfg['cluster']['name']}_wrapper (
     .wide_out_resp_t (${cfg['cluster']['name']}_pkg::wide_out_resp_t),
     .wide_in_req_t (${cfg['cluster']['name']}_pkg::wide_in_req_t),
     .wide_in_resp_t (${cfg['cluster']['name']}_pkg::wide_in_resp_t),
+    .user_narrow_t (${cfg['cluster']['name']}_pkg::user_narrow_t),
+    .user_dma_t (${cfg['cluster']['name']}_pkg::user_dma_t),
     .tcdm_dma_req_t (${cfg['cluster']['name']}_pkg::tcdm_dma_req_t),
     .tcdm_dma_rsp_t (${cfg['cluster']['name']}_pkg::tcdm_dma_rsp_t),
     .NrHives (${cfg['cluster']['nr_hives']}),
@@ -103,7 +106,9 @@ module ${cfg['cluster']['name']}_wrapper (
     .ICacheLineCount (${cfg['cluster']['name']}_pkg::ICacheLineCount),
     .ICacheWays (${cfg['cluster']['name']}_pkg::ICacheWays),
     .VMSupport (${int(cfg['cluster']['vm_support'])}),
-    .EnableDMAMulticast (${int(cfg['cluster']['enable_multicast'])}),
+    .EnableDmaMulticast (${cfg['cluster']['name']}_pkg::EnableDmaMulticast),
+    .EnableMulticast (${cfg['cluster']['name']}_pkg::EnableMulticast),
+    .ReRouteCollectiveOp (${cfg['cluster']['name']}_pkg::ReRouteCollectiveOp),
     .RVE (${core_isa('e')}),
     .RVF (${core_isa('f')}),
     .RVD (${core_isa('d')}),

diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv
index d0e49cc1d..345e1eb3a 100644
--- a/hw/snitch_cluster/src/snitch_fp_ss.sv
+++ b/hw/snitch_cluster/src/snitch_fp_ss.sv
@@ -2711,7 +2711,7 @@ module snitch_fp_ss import snitch_pkg::*; #(
     .lsu_qsize_i (ls_size),
     .lsu_qamo_i (reqrsp_pkg::AMONone),
     .lsu_qrepd_i (acc_req_repd_q),
-    .lsu_qmcast_i ('0),
+    .lsu_quser_i ('0),
     .lsu_qvalid_i (lsu_qvalid),
     .lsu_qready_o (lsu_qready),
     .lsu_pdata_o (ld_result),

diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index 2f9c4eef3..4386f3530 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -123,11 +123,41 @@ inline void snrt_cluster_hw_barrier() {
  * @note One core per cluster must invoke this function, or the calling cores
  * will stall indefinitely.
  */
-static inline void snrt_inter_cluster_barrier() {
+inline void snrt_inter_cluster_barrier() {
+#ifdef SNRT_SUPPORTS_MULTICAST
     // Everyone increments a shared counter
     uint32_t cnt =
         __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);

+    // All but the last cluster enter WFI, while the last cluster resets the
+    // counter for the next barrier and multicasts an interrupt to wake up the
+    // other clusters.
+    if (cnt == snrt_cluster_num()) {
+        _snrt_barrier.cnt = 0;
+
+        // Multicast cluster interrupt to every other cluster's core
+        // Note: we need to address another cluster's address space
+        // because the cluster XBAR has not been extended to support
+        // multicast yet.
We address the second cluster, if we are the
+        // first cluster, and the first otherwise.
+        uintptr_t addr = (uintptr_t)snrt_cluster_clint_set_ptr() - SNRT_CLUSTER_OFFSET * snrt_cluster_idx();
+        if (snrt_cluster_idx() == 0) addr += SNRT_CLUSTER_OFFSET;
+        snrt_enable_multicast(BCAST_MASK_ALL);
+        *((uint32_t *)addr) = 1 << snrt_cluster_core_idx();
+        snrt_disable_multicast();
+        // Clear interrupt for next barrier
+        snrt_int_clr_mcip();
+    } else {
+        snrt_wfi();
+        // Clear interrupt for next barrier
+        snrt_int_clr_mcip();
+    }
+#else
+    // Remember previous iteration
+    uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
+    uint32_t cnt =
+        __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
+
     // All but the last cluster enter WFI, while the last cluster resets the
     // counter for the next barrier and multicasts an interrupt to wake up the
     // other clusters.
@@ -140,6 +170,7 @@
         // Clear interrupt for next barrier
         snrt_int_clr_mcip();
     }
+#endif
 }

 /**
@@ -157,6 +188,7 @@ inline void snrt_global_barrier() {
     // Synchronize all DM cores in software
     if (snrt_is_dm_core()) {
         snrt_inter_cluster_barrier();
+
     }
     // Synchronize cores in a cluster with the HW barrier
     snrt_cluster_hw_barrier();

diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json
index 8b9c70437..5f6a766af 100644
--- a/util/clustergen/schema/snitch_cluster.schema.json
+++ b/util/clustergen/schema/snitch_cluster.schema.json
@@ -176,9 +176,29 @@
             "description": "Width of the cluster's atomics ID.",
             "default": 1
         },
+        "collectiv_width": {
+            "type": "number",
+            "description": "Width of the collective operation field",
+            "default": 6
+        },
         "enable_multicast": {
             "type": "boolean",
-            "description": "Whether to enable multicast in the cluster.",
+            "description": "Whether to enable multicast in the sw & hw for the cluster.",
+            "default": false
+        },
+        "enable_reduction": {
+            "type": "boolean",
+            "description": "Whether to enable reduction in the sw for the cluster.",
+            "default": false
+        },
+        "enable_dma_multicast": {
+            "type": "boolean",
+            "description": "Whether to enable the multicast-capable AXI crossbar in the Snitch cluster",
+            "default": false
+        },
+        "enable_reroute_collectiv": {
+            "type": "boolean",
+            "description": "Whether to reroute any collective operation request to the SoC port independent of the address",
             "default": false
         },
         "hart_base_id": {

From 35e16bd914f1376d7772c5cf9ebe4f55a1b31b7c Mon Sep 17 00:00:00 2001
From: Raphael
Date: Wed, 18 Jun 2025 15:59:07 +0200
Subject: [PATCH 03/38] (feat) SW support for narrow reduction

* New configuration for the narrow reduction

* Rename existing configuration options

* Integrate multicast/reduction into the global barrier functions

(Cherry-picked from c9ed65cd4da3f3fd43a5bc9c80803a3bd68ab7b6 of Lorenzo's fork)
---
 sw/snRuntime/api/sync_decls.h                 |   8 +-
 sw/snRuntime/src/dump.h                       |   2 +-
 sw/snRuntime/src/start.c                      |  16 +-
 sw/snRuntime/src/sync.c                       |   9 +-
 sw/snRuntime/src/sync.h                       | 170 +++++++++++++-----
 target/snitch_cluster/cfg/default.json        |  10 +-
 target/snitch_cluster/cfg/dma_mchan.json      |   2 +-
 target/snitch_cluster/cfg/reduction.hjson     | 150 ++++++++++++++++
 .../runtime/common/snitch_cluster_cfg.h.tpl   |  27 +++
 9 files changed, 339 insertions(+), 55 deletions(-)
 create mode 100644 target/snitch_cluster/cfg/reduction.hjson

diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h
index b574303da..40224cb26 100644
--- a/sw/snRuntime/api/sync_decls.h
+++
b/sw/snRuntime/api/sync_decls.h
@@ -31,6 +31,12 @@ inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);

 inline void snrt_wait_writeback(uint32_t val);

-inline void snrt_enable_multicast(uint32_t mask);
+inline void snrt_enable_multicast(uint64_t mask);

 inline void snrt_disable_multicast();
+
+inline void snrt_enable_reduction(uint64_t mask, uint32_t reduction);
+
+inline void snrt_disable_reduction();
+
+inline void snrt_set_user_field(uint64_t field);

diff --git a/sw/snRuntime/src/dump.h b/sw/snRuntime/src/dump.h
index 0de36219e..4786bf036 100644
--- a/sw/snRuntime/src/dump.h
+++ b/sw/snRuntime/src/dump.h
@@ -24,4 +24,4 @@
         asm volatile("csrw " #reg ", %0" ::"rK"(val)); \
     }

-#define DUMP(val) ({ asm volatile("csrw 0x7C4, %0" ::"rK"(val)); })
+#define DUMP(val) ({ asm volatile("csrw 0x7C6, %0" ::"rK"(val)); })

diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index afeef20be..8614faafd 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -70,11 +70,21 @@ static inline void snrt_init_bss() {

 #ifdef SNRT_WAKE_UP
 static inline void snrt_wake_up() {
+
+    // Cluster 0 / core 0 should wake up all other cores!
     if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) {
         snrt_wake_all((1 << snrt_cluster_core_num()) - 1);
-    } else {
-        snrt_int_clr_mcip();
-    }
+    }
+
+    // TODO (raroth): Hotfix!!! A race condition applies here!
+    // The problem is that the snrt_wake_all call is a multicast which targets all cores / clusters.
+    // If this delay is not inserted then the multicast will hit core 1 of cluster 0 at the exact time
+    // when the clear flag is reset but not yet read in the function "snrt_int_clr_mcip".
+    // The real solution would be a fence here!!!
+    snrt_cluster_hw_barrier();
+
+    // Clear the reset flag
+    snrt_int_clr_mcip();
 }
 #endif

diff --git a/sw/snRuntime/src/sync.c b/sw/snRuntime/src/sync.c
index ea915b109..1a333bf53 100644
--- a/sw/snRuntime/src/sync.c
+++ b/sw/snRuntime/src/sync.c
@@ -35,6 +35,13 @@ extern uint32_t snrt_global_all_to_all_reduction(uint32_t value);

 extern void snrt_wait_writeback(uint32_t val);

-extern void snrt_enable_multicast(uint32_t mask);
+extern void snrt_enable_multicast(uint64_t mask);

 extern void snrt_disable_multicast();
+
+extern void snrt_enable_reduction(uint64_t mask, uint32_t reduction);
+
+extern void snrt_disable_reduction();
+
+extern void snrt_set_user_field(uint64_t field);
+

diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index 4386f3530..f2fe82286 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -77,6 +77,15 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) {
 // Barrier functions
 //================================================================================

+/**
+ * @brief Wakes up all cores by writing into their respective CLINT variable.
+ *        Can only be called by a single core in the whole system!
+ * @note When multicast is enabled, the core mask is also sent to the calling core,
+ *       thereby setting the wake-up flag although that core is already awake.
+ *       As a consequence the function "snrt_int_clr_mcip()" needs to be called
+ *       even if the core was awake. For a simplified flow we copy this behaviour
+ *       in the non-multicast case even though it is not strictly necessary!
+ */
 inline void snrt_wake_all(uint32_t core_mask) {
 #ifdef SNRT_SUPPORTS_MULTICAST
     // Multicast cluster interrupt to every other cluster's core
@@ -99,6 +108,7 @@ inline void snrt_wake_all(uint32_t core_mask) {
         snrt_disable_multicast();
     }
 #else
+    // Loop to send a cluster interrupt to every other cluster's core
     for (int i = 0; i < snrt_cluster_num(); i++) {
         if (snrt_cluster_idx() != i) {
             snrt_cluster(i)->peripheral_reg.cl_clint_set.f.cl_clint_set =
@@ -120,57 +130,71 @@ inline void snrt_cluster_hw_barrier() {
 /**
  * @brief Synchronize one core from every cluster with the others.
  * @details Implemented as a software barrier.
- * @note One core per cluster must invoke this function, or the calling cores
+ * @note All cores per cluster must invoke this function, or the calling cores
  * will stall indefinitely.
  */
+
 inline void snrt_inter_cluster_barrier() {
-#ifdef SNRT_SUPPORTS_MULTICAST
-    // Everyone increments a shared counter
-    uint32_t cnt =
-        __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
-
-    // All but the last cluster enter WFI, while the last cluster resets the
-    // counter for the next barrier and multicasts an interrupt to wake up the
-    // other clusters.
-    if (cnt == snrt_cluster_num()) {
-        _snrt_barrier.cnt = 0;
-
-        // Multicast cluster interrupt to every other cluster's core
-        // Note: we need to address another cluster's address space
-        // because the cluster XBAR has not been extended to support
-        // multicast yet. We address the second cluster, if we are the
-        // first cluster, and the first otherwise.
-        uintptr_t addr = (uintptr_t)snrt_cluster_clint_set_ptr() - SNRT_CLUSTER_OFFSET * snrt_cluster_idx();
-        if (snrt_cluster_idx() == 0) addr += SNRT_CLUSTER_OFFSET;
-        snrt_enable_multicast(BCAST_MASK_ALL);
-        *((uint32_t *)addr) = 1 << snrt_cluster_core_idx();
-        snrt_disable_multicast();
-        // Clear interrupt for next barrier
-        snrt_int_clr_mcip();
+// First we need to reduce from all clusters together.
+// TODO raroth: Potentially, if we could track the B-response from the reduction, we could remove the multicast completely.
+// The downside is that we could not send the core into sleep and would have the cores spin on a memory fence!
+#ifdef SNRT_SUPPORTS_REDUCTION
+    // Only continue with the DMA cores - send the rest into sleep mode
+    if(snrt_is_dm_core()){
+        // Fetch the address for the reduction
+        cls_t * ctrl_red = cls();
+        void * addr = (void *) snrt_remote_l1_ptr(&(ctrl_red->reduction), snrt_cluster_idx(), 0);
+
+        // Clear the memory location of any previous reduction
+        if(snrt_cluster_idx() == 0){
+            *((uint32_t *) addr) = 0;
+        }
+
+        // Init the reduction
+        snrt_enable_reduction(SNRT_BROADCAST_MASK, SNRT_COLL_NARROW_BARRIER);
+        *((uint32_t *) addr) = 1;
+        snrt_disable_reduction();
+
+        // The DMA core of cluster 0 polls the reduction destination to find out whether the reduction has finished
+        if(snrt_cluster_idx() == 0){
+            while(*((volatile uint32_t *) addr) != 1);
+            // Wake all clusters
+            snrt_wake_all((1 << snrt_cluster_core_num()) - 1);
+        } else {
+            snrt_wfi();
+        }
     } else {
         snrt_wfi();
-        // Clear interrupt for next barrier
-        snrt_int_clr_mcip();
     }
 #else
-    // Remember previous iteration
-    uint32_t prev_barrier_iteration = _snrt_barrier.iteration;
-    uint32_t cnt =
-        __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
-
-    // All but the last cluster enter WFI, while the last cluster resets the
-    // counter for the next barrier and multicasts an interrupt to wake up the
-    // other clusters.
-    if (cnt == snrt_cluster_num()) {
-        _snrt_barrier.cnt = 0;
-        // Wake all clusters
-        snrt_wake_all(1 << snrt_cluster_core_idx());
+    // Only continue with the DMA cores - send the rest into sleep mode
+    if(snrt_is_dm_core()){
+        uint32_t cnt = __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
+
+        // All but the last cluster enter WFI, while the last cluster resets the
+        // counter for the next barrier and multicasts an interrupt to wake up the
+        // other clusters.
+        if (cnt == snrt_cluster_num()) {
+            _snrt_barrier.cnt = 0;
+            // Wake all clusters
+            snrt_wake_all((1 << snrt_cluster_core_num()) - 1);
+        } else {
+            snrt_wfi();
+        }
     } else {
         snrt_wfi();
-        // Clear interrupt for next barrier
-        snrt_int_clr_mcip();
     }
 #endif
+
+    // TODO (raroth): Hotfix!!! A race condition applies here!
+    // The problem is that the snrt_wake_all call is a multicast which targets all cores / clusters.
+    // If this delay is not inserted then the multicast will hit core 0 of cluster 0 at the exact time
+    // when the clear flag is reset but not yet read in the function "snrt_int_clr_mcip".
+    // The real solution would be a fence here!!!
+    snrt_cluster_hw_barrier();
+
+    // Clear the reset flag
+    snrt_int_clr_mcip();
 }

 /**
@@ -183,13 +207,12 @@
  * will stall indefinitely.
  */
 inline void snrt_global_barrier() {
+    // Synchronize cores in a cluster with the HW barrier
     snrt_cluster_hw_barrier();

-    // Synchronize all DM cores in software
-    if (snrt_is_dm_core()) {
-        snrt_inter_cluster_barrier();
-
-    }
+    // Synchronize all clusters
+    snrt_inter_cluster_barrier();
+
     // Synchronize cores in a cluster with the HW barrier
     snrt_cluster_hw_barrier();
 }

@@ -246,6 +269,9 @@ inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
                               __ATOMIC_RELAXED);
         snrt_inter_cluster_barrier();
         *cluster_result = _reduction_result;
+    } else {
+        // All cores need to invoke the barrier
+        snrt_inter_cluster_barrier();
     }
     snrt_cluster_hw_barrier();
     return *cluster_result;
@@ -342,9 +368,61 @@ inline void snrt_wait_writeback(uint32_t val) {
  *
  * @param mask Multicast mask value
 */
-inline void snrt_enable_multicast(uint32_t mask) { write_csr(0x7c4, mask); }
+inline void snrt_enable_multicast(uint64_t mask){
+    uint32_t user_high = (uint32_t) (mask >> (32 - SNRT_COLLECTIVE_WIDTH));
+    uint32_t user_low = (uint32_t) ((mask << SNRT_COLLECTIVE_WIDTH) | SNRT_COLL_MULTICAST);
+    write_csr(0x7c4, user_high);
+    write_csr(0x7c5, user_low);
+}

 /**
  * @brief Disable LSU multicast
 */
-inline void snrt_disable_multicast() { write_csr(0x7c4, 0); }
+inline void snrt_disable_multicast() {
+    write_csr(0x7c4, 0);
+    write_csr(0x7c5, 0);
+}
+
+//================================================================================
+// Reduction functions
+//================================================================================
+
+/**
+ * @brief Enable LSU reduction
+ * @details All stores performed after this call will be reductions
+ *
+ * @param mask Mask defines all involved members
+ * @param reduction Type of reduction operation
+ */
+inline void snrt_enable_reduction(uint64_t mask, uint32_t reduction) {
+    uint32_t user_high = (uint32_t) (mask >> (32 - SNRT_COLLECTIVE_WIDTH));
+    uint32_t user_low = (uint32_t) ((mask << SNRT_COLLECTIVE_WIDTH) | reduction);
+    write_csr(0x7c4, user_high);
+    write_csr(0x7c5, user_low);
+}
+
+/**
+ * @brief Disable LSU reduction
+ */
+inline void snrt_disable_reduction() {
+    write_csr(0x7c4, 0);
+    write_csr(0x7c5, 0);
+}
+
+//================================================================================
+// User functions
+//================================================================================
+
+/**
+ * @brief Enable LSU user field
+ * @details All stores performed after this call are equipped with the given user field
+ *
+ * @param field Defines the user field for the AXI transmission
+ */
+
+inline void snrt_set_user_field(uint64_t field){
+    uint32_t user_high = (uint32_t) (field >> 32);
+    uint32_t user_low = (uint32_t) (field);
+    write_csr(0x7c4, user_high);
+    write_csr(0x7c5, user_low);
+}
\ No newline at end of file

diff --git a/target/snitch_cluster/cfg/default.json b/target/snitch_cluster/cfg/default.json
index 7ccc78a24..28f8fa7ad 100644
--- a/target/snitch_cluster/cfg/default.json
+++ b/target/snitch_cluster/cfg/default.json
@@ -10,7 +10,7 @@
        addr_width: 48,
        data_width: 64,
        atomic_id_width: 5, // clog2(total number of clusters)
-       user_width: 53, // addr_width + atomic_id_width
+       user_width: 59, // addr_width + atomic_id_width + collective_width
        tcdm: {
            size: 128,
            banks: 32,
@@ -23,8 +23,14 @@
        dma_req_fifo_depth: 8,
        narrow_trans: 4,
        wide_trans: 32,
-       dma_user_width: 48,
+       // Collectiv operation on narrow interface
        enable_multicast: true,
+       enable_reduction: true,
+       // To support the multicast for the DMA
+       enable_DMA_multicast: false,
+       // dma_user_width: 48,
+       // Only enable if collectiv operation are handled in SoC
+       enable_reroute_collectiv: false,
        // We don't need Snitch debugging in Occamy
        enable_debug: false,
        // We don't need Snitch (core-internal) virtual memory support

diff --git a/target/snitch_cluster/cfg/dma_mchan.json b/target/snitch_cluster/cfg/dma_mchan.json
index e5aaae211..236ee21ba 100644
--- a/target/snitch_cluster/cfg/dma_mchan.json
+++ b/target/snitch_cluster/cfg/dma_mchan.json
@@ -25,7 +25,7 @@
        narrow_trans: 4,
        wide_trans: 32,
        dma_user_width: 48,
-       enable_multicast: true,
+       enable_DMA_multicast: true,
        // We don't need Snitch debugging in Occamy
        enable_debug: false,
        // We don't need Snitch (core-internal) virtual memory support

diff --git a/target/snitch_cluster/cfg/reduction.hjson b/target/snitch_cluster/cfg/reduction.hjson
new file mode 100644
index 000000000..8bdfcd073
--- /dev/null
+++ b/target/snitch_cluster/cfg/reduction.hjson
@@ -0,0 +1,150 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Cluster configuration for a simple testbench system.
+{
+    cluster: {
+        cluster_base_addr: 0x10000000,
+        cluster_base_offset: 0,
+        cluster_base_hartid: 0,
+        addr_width: 48,
+        data_width: 64,
+        atomic_id_width: 5, // clog2(total number of clusters)
+        collectiv_width: 6, // # bits used for the collective operation
+        user_width: 59, // addr_width + atomic_id_width + collectiv_width
+        tcdm: {
+            size: 128,
+            banks: 32,
+        },
+        cluster_periph_size: 64, // kB
+        zero_mem_size: 64, // kB
+        alias_region_enable: false,
+        dma_data_width: 512,
+        dma_axi_req_fifo_depth: 24,
+        dma_req_fifo_depth: 8,
+        narrow_trans: 4,
+        wide_trans: 32,
+        dma_user_width: 48,
+        // Enable the multicast and the reduction feature in the sw (Only Narrow IF - Not DMA calls)
+        enable_multicast: true,
+        enable_reduction: true,
+        // To support the multicast for the DMA
+        enable_DMA_multicast: true,
+        // Currently we need to reroute collective operations to the SoC IF independent of the dst address
+        enable_reroute_collectiv: true,
+        // We don't need Snitch debugging in Occamy
+        enable_debug: false,
+        // We don't need Snitch (core-internal) virtual memory support
+        vm_support: false,
+        // Memory configuration inputs
+        sram_cfg_expose: true,
+        sram_cfg_fields: {
+            ema: 3,
+            emaw: 2,
+            emas: 1
+        },
+        // Timing parameters
+        timing: {
+            lat_comp_fp32: 2,
+            lat_comp_fp64: 3,
+            lat_comp_fp16: 1,
+            lat_comp_fp16_alt: 1,
+            lat_comp_fp8: 1,
+            lat_comp_fp8_alt: 1,
+            lat_noncomp: 1,
+            lat_conv: 2,
+            lat_sdotp: 3,
+            fpu_pipe_config: "BEFORE",
+            narrow_xbar_latency: "CUT_ALL_PORTS",
+            wide_xbar_latency: "CUT_ALL_PORTS",
+            // Isolate the core.
+            register_core_req: true,
+            register_core_rsp: true,
+            register_offload_req: true,
+            register_offload_rsp: true,
+            register_fpu_req: true,
+            register_ext_narrow: false,
+            register_ext_wide: false
+        },
+        hives: [
+            // Hive 0
+            {
+                icache: {
+                    size: 8, // total instruction cache size in kByte
+                    ways: 2, // number of ways
+                    cacheline: 256 // word size in bits
+                },
+                cores: [
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/compute_core_template" },
+                    { $ref: "#/dma_core_template" },
+                ]
+            }
+        ]
+    },
+    dram: {
+        address: 0x80000000,
+        length: 0x80000000
+    },
+    peripherals: {
+        clint: {
+            address: 0xFFFF0000,
+            length: 0x1000
+        },
+    },
+    // Templates.
+    compute_core_template: {
+        isa: "rv32imafd",
+        xssr: true,
+        xfrep: true,
+        xdma: false,
+        xf16: true,
+        xf16alt: true,
+        xf8: true,
+        xf8alt: true,
+        xfdotp: true,
+        xfvec: true,
+        ssr_nr_credits: 4,
+        num_int_outstanding_loads: 1,
+        num_int_outstanding_mem: 4,
+        num_fp_outstanding_loads: 4,
+        num_fp_outstanding_mem: 4,
+        num_sequencer_instructions: 16,
+        num_dtlb_entries: 1,
+        num_itlb_entries: 1,
+        // SSSR configuration below
+        ssr_intersection: true,
+        ssr_intersection_triple: [0, 1, 2],
+        ssrs: [
+            {indirection: true}, // Master 0
+            {indirection: true}, // Master 1
+            {}, // Slave
+        ],
+    },
+    dma_core_template: {
+        isa: "rv32imafd",
+        xdma: true,
+        xssr: false,
+        xfrep: false,
+        xf16: false,
+        xf16alt: false,
+        xf8: false,
+        xf8alt: false,
+        xfdotp: false,
+        xfvec: false,
+        num_int_outstanding_loads: 1,
+        num_int_outstanding_mem: 4,
+        num_fp_outstanding_loads: 4,
+        num_fp_outstanding_mem: 4,
+        num_sequencer_instructions: 16,
+        num_dtlb_entries: 1,
+        num_itlb_entries: 1,
+    }
+}

diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl
index 37bc2dcad..566e9832b 100644
--- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl
+++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl
@@ -25,5 +25,32 @@
 #define SNRT_SUPPORTS_MULTICAST
 % endif

+% if cfg['cluster']['enable_reduction']:
+#define SNRT_SUPPORTS_REDUCTION
+% endif
+#define SNRT_COLLECTIVE_WIDTH ${cfg['cluster']['collective_width']}
+
+// OP Codes copied from floo_pkg definition
+#define SNRT_COLL_MULTICAST 16
+#define SNRT_COLL_PARALLEL_REDUCTION 32
+#define SNRT_COLL_OFFLOAD_REDUCTION 48
+
+// On the wide link - fp offload
+#define SNRT_COLL_WIDE_FPADD (4 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_WIDE_FPMUL (5 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_WIDE_FPMIN (6 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_WIDE_FPMAX (7 + SNRT_COLL_OFFLOAD_REDUCTION)
+
+// On the narrow link - integer offload
+#define SNRT_COLL_NARROW_ADD (8 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_NARROW_MUL (9 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_NARROW_MIN_INT (10 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_NARROW_MIN_UINT (14 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_NARROW_MAX_INT (11 + SNRT_COLL_OFFLOAD_REDUCTION)
+#define SNRT_COLL_NARROW_MAX_UINT (15 + SNRT_COLL_OFFLOAD_REDUCTION)
+
+// On the narrow link - parallel reduction
+#define SNRT_COLL_NARROW_BARRIER (2 + SNRT_COLL_PARALLEL_REDUCTION)
+
 // Software configuration
 #define SNRT_LOG2_STACK_SIZE 10

From 7f687fef37d70f0571c71e1bba9049e0068e4591 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Mon, 16 Jun 2025 09:46:50 +0200
Subject: [PATCH 04/38] (feat) HW support for wide reduction

* Extend the AXI xbar to offload collective operations to the SoC

* Add new iDMA opcode

(Cherry-picked from 0226903e9d892f6269994c872ea9dcb34eb7789d of Lorenzo's fork)
---
 hw/snitch/src/riscv_instr.sv            |  2 +-
 hw/snitch/src/snitch.sv                 | 11 +----
 hw/snitch_cluster/src/snitch_cluster.sv | 48 ++++++++-----------
 .../src/snitch_cluster_pkg.sv.tpl       |  1 +
 4 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv
index 53ab0727d..291736e91 100644
--- a/hw/snitch/src/riscv_instr.sv
+++ b/hw/snitch/src/riscv_instr.sv
@@ -327,7 +327,7 @@ package riscv_instr;
  localparam logic [31:0] DMSTAT = 32'b0000101?????00000000?????0101011;
  localparam logic [31:0] DMSTR = 32'b0000110??????????000000000101011;
localparam logic [31:0] DMREP = 32'b000011100000?????000000000101011;
-  localparam logic [31:0] DMMCAST = 32'b000100000000?????000000000101011;
+  localparam logic [31:0] DMUSER = 32'b0001000??????????000000000101011;
   localparam logic [31:0] FREP_O = 32'b????????????????????????10001011;
   localparam logic [31:0] IREP = 32'b?????????????????????????0111111;
   localparam logic [31:0] SCFGRI = 32'b????????????00000001?????0101011;

diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv
index 48f20bab4..c31c07c76 100644
--- a/hw/snitch/src/snitch.sv
+++ b/hw/snitch/src/snitch.sv
@@ -2155,6 +2155,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #(
       // DMA instructions
       DMSRC,
       DMDST,
+      DMUSER,
       DMSTR: begin
         if (Xdma) begin
           acc_qreq_o.addr = DMA_SS;
@@ -2224,16 +2225,6 @@
           illegal_inst = 1'b1;
         end
       end
-      DMMCAST: begin
-        if (Xdma) begin
-          acc_qreq_o.addr = DMA_SS;
-          opa_select = Reg;
-          acc_qvalid_o = valid_instr;
-          write_rd = 1'b0;
-        end else begin
-          illegal_inst = 1'b1;
-        end
-      end
       SCFGRI: begin
         if (Xssr) begin
           write_rd = 1'b0;

diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv
index 038cac302..dd9291457 100644
--- a/hw/snitch_cluster/src/snitch_cluster.sv
+++ b/hw/snitch_cluster/src/snitch_cluster.sv
@@ -436,27 +436,6 @@ module snitch_cluster
   typedef logic [WideIdWidthIn-1:0] id_dma_mst_t;
   typedef logic [WideIdWidthOut-1:0] id_dma_slv_t;

-/*
-  typedef logic [NarrowUserWidth-1:0] user_t;
-
-  // TODO (raroth): Consolidate these diverging AXI user-field definitions! We define the user mask in picobello in at least 3 places.
-  // -> mcast with selective atomics
-  typedef struct packed {
-    addr_t mcast;
-    logic [CollectiveWidth-1:0] collective;
-    logic [AtomicIdWidth-1:0] atomic;
-  } user_narrow_reduction_t;
-  // -> without mcast
-  typedef struct packed {
-    logic [AtomicIdWidth-1:0] atomic;
-  } // Needs to be as wide as the NarrowUserWidth
-
-  // Ask Lorenzo: can atomics be disabled? --> problem: how to handle the user_* assignments further down
-  typedef struct packed {
-    logic [WideUserWidth-1:0] mcast;
-  } user_dma_t;
-*/
-
   typedef logic [TCDMMemAddrWidth-1:0] tcdm_mem_addr_t;
   typedef logic [TCDMAddrWidth-1:0] tcdm_addr_t;

@@ -703,7 +682,6 @@
     .mst_resp_i (wide_axi_mst_rsp[SoCDMAIn])
   );
-
   logic [WideSlaveIdxBits-1:0] dma_xbar_default_port = SoCDMAOut;
   xbar_rule_t dma_xbar_default_port_rule;
   assign dma_xbar_default_port_rule = '{
@@ -736,10 +714,23 @@
     end
   end

-  localparam bit [DmaXbarCfg.NoSlvPorts-1:0] DMAEnableDefaultMstPort = '1;
-  if (EnableDMAMulticast) begin : gen_mcast_dma_xbar
+  if (EnableDmaMulticast) begin : gen_mcast_dma_xbar
+
+    // Define the collective connectivity matrix!
+    typedef bit [DmaMcastXbarCfg.NoMstPorts-1:0] dma_line_t;
+    typedef bit [DmaMcastXbarCfg.NoSlvPorts-1:0][DmaMcastXbarCfg.NoMstPorts-1:0] dma_matrix_t;
+    // If we want to reroute collective operations, the only available collective operation
+    // port is the SoC port
+    localparam dma_line_t DMAlocalArray = (ReRouteCollectiveOp) ?
dma_line_t'{SoCDMAOut: 1'b1, default: 1'b0} : dma_line_t'{default: 1'b1};
+    localparam dma_matrix_t DMACollectivConnectivity = dma_matrix_t'{default: DMAlocalArray};
+
+    // Set the default master port for all of the multicast crossbar's inputs
+    localparam bit [DmaMcastXbarCfg.NoSlvPorts-1:0] DmaEnableDefaultMstPort = '1;
+
     axi_mcast_xbar #(
       .Cfg (DmaMcastXbarCfg),
+      .CollectivOpsConnectivity (DMACollectivConnectivity),
       .ATOPs (0),
       .slv_aw_chan_t (axi_mst_dma_aw_chan_t),
       .mst_aw_chan_t (axi_slv_dma_aw_chan_t),
       .mst_ports_req_o (wide_axi_slv_req),
       .mst_ports_resp_i (wide_axi_slv_rsp),
       .addr_map_i (enabled_dma_xbar_rule),
-      .en_default_mst_port_i (DMAEnableDefaultMstPort),
-      .default_mst_port_i ({DmaXbarCfg.NoSlvPorts{dma_xbar_default_port_rule}})
+      .en_default_mst_port_i (DmaEnableDefaultMstPort),
+      .default_mst_port_i ({DmaMcastXbarCfg.NoSlvPorts{dma_xbar_default_port_rule}})
     );
   end else begin : gen_dma_xbar
+    // Set the default master port for all crossbar inputs
+    localparam bit [DmaXbarCfg.NoSlvPorts-1:0] DmaEnableDefaultMstPort = '1;
+
     axi_xbar #(
       .Cfg (DmaXbarCfg),
       .ATOPs (0),
       .mst_ports_req_o (wide_axi_slv_req),
       .mst_ports_resp_i (wide_axi_slv_rsp),
       .addr_map_i (enabled_dma_xbar_rule),
-      .en_default_mst_port_i (DMAEnableDefaultMstPort),
+      .en_default_mst_port_i (DmaEnableDefaultMstPort),
       .default_mst_port_i ({DmaXbarCfg.NoSlvPorts{dma_xbar_default_port}})
     );
   end

diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl
index 3a368a6b0..228457feb 100644
--- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl
+++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl
@@ -116,6 +116,7 @@ package ${cfg['cluster']['name']}_pkg;
 % if cfg['cluster']['enable_dma_multicast']:
   typedef struct packed {
     addr_t mcast;
+    logic [CollectiveWidth-1:0] collective;
   } user_dma_t;
 %else:
   typedef logic [WideUserWidth-1:0] user_dma_t;

From 69de9ce98a0d86123ca01aae3e5775cf44242b34 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Fri, 4 Jul 2025 14:03:54 +0200
Subject: [PATCH 05/38] (feat) SW support for wide reduction

* Extend the DMA driver to support collective operations by setting the
  user field of the AXI transactions

(Cherry-picked from 3af3efa4aa9685caa11b0abd85ed7d0d701bd633 of Lorenzo's fork)
---
 sw/snRuntime/src/dma.c |   6 ++-
 sw/snRuntime/src/dma.h | 103 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/sw/snRuntime/src/dma.c b/sw/snRuntime/src/dma.c
index a76cf91ef..b3c6c8c81 100644
--- a/sw/snRuntime/src/dma.c
+++ b/sw/snRuntime/src/dma.c
@@ -10,10 +10,14 @@ extern void snrt_dma_stop_tracking();

 extern void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len);

-extern void snrt_dma_enable_mcast(uint32_t mask);
+extern void snrt_dma_enable_mcast(uint64_t mask);

 extern void snrt_dma_disable_mcast();

+extern void snrt_dma_enable_collective(uint64_t coll_mask, uint32_t coll_op);
+
+extern void snrt_dma_disable_collective();
+
 extern snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src,
                                              size_t tile_idx,
                                              size_t tile_size, uint32_t prec);

diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h
index 94ac67b5c..5c1d4deae 100644
--- a/sw/snRuntime/src/dma.h
+++ b/sw/snRuntime/src/dma.h
@@ -62,51 +62,97 @@ static inline uint32_t snrt_dma_start_1d(volatile void *dst, volatile void *src,
     return snrt_dma_start_1d((uint64_t)dst, (uint64_t)src, size, channel);
 }

+
 /**
- * @brief Enable multicast for successive
transfers.
- * @param mask Multicast mask applied to successive transfers.
+ * @brief Enable collective operations for successive transfers.
+ * @param coll_mask The mask for the collective operation
+ * @param coll_op The collective operation type
+ * @details Subsequent DMA transfers will be collective operations
 */
-inline void snrt_dma_enable_mcast(uint32_t mask) {
-    asm volatile("dmuser %[mask], zero \n" : : [ mask ] "r"(mask));
+inline void snrt_dma_enable_collective(uint64_t coll_mask, uint32_t coll_op) {
+    // Prepare the collective components
+    uint32_t mask_coll_op = (1 << SNRT_COLLECTIVE_WIDTH) - 1;
+    uint32_t usr_lo = (((uint32_t) (coll_mask << SNRT_COLLECTIVE_WIDTH)) | (coll_op & mask_coll_op));
+    uint32_t usr_hi = (uint32_t) (coll_mask >> (32 - SNRT_COLLECTIVE_WIDTH));
+    // Set the user field
+    asm volatile("dmuser %[usr_lo], %[usr_hi] \n" : :
+        [ usr_lo ] "r"(usr_lo), [ usr_hi ] "r"(usr_hi));
 }

 /**
- * @brief Disable multicast for successive transfers.
- * @details Resets the multicast mask to zero.
+ * @brief Disable collective operations for successive transfers.
+ * @details Resets the collective component to zero.
 */
-inline void snrt_dma_disable_mcast() { asm volatile("dmuser zero, zero \n"); }
+inline void snrt_dma_disable_collective() {
+    asm volatile("dmuser zero, zero \n");
+}
+
+/**
+ * @brief Start an asynchronous collective 1D DMA transfer with 64-bit wide
+ *        pointers.
+ * @param coll_mask The mask for the collective operation
+ * @param coll_op The collective operation type
+ * @see snrt_dma_start_1d(uint64_t, uint64_t, size_t, uint32_t) for a
+ *      description of the other parameters.
+ */
+static inline uint32_t snrt_dma_start_1d_collective(uint64_t dst, uint64_t src,
+                                                    size_t size, uint64_t coll_mask,
+                                                    uint32_t coll_op,
+                                                    const uint32_t channel = 0) {
+    snrt_dma_enable_collective(coll_mask, coll_op);
+    uint32_t txid = snrt_dma_start_1d(dst, src, size, channel);
+    snrt_dma_disable_collective();
+    return txid;
+}

 /**
  * @brief Start an asynchronous multicast 1D DMA transfer with 64-bit wide
  * pointers.
- * @param mask Multicast mask applied on the destination address.
+ * @param mask The mask for the multicast operation
  * @see snrt_dma_start_1d(uint64_t, uint64_t, size_t, uint32_t) for a
  * description of the other parameters.
  */
 static inline uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src,
-                                               size_t size, uint32_t mask,
+                                               size_t size, uint64_t mask,
                                                const uint32_t channel = 0) {
-    snrt_dma_enable_mcast(mask);
+    snrt_dma_enable_collective(mask, SNRT_COLL_MULTICAST);
     uint32_t txid = snrt_dma_start_1d(dst, src, size, channel);
-    snrt_dma_disable_mcast();
+    snrt_dma_disable_collective();
     return txid;
 }

+/**
+ * @brief Start an asynchronous collective 1D DMA transfer using native-size
+ * pointers.
+ *
+ * This is a convenience overload of snrt_dma_start_1d_collective() using `void*`
+ * pointers.
+ *
+ * @see snrt_dma_start_1d_collective(uint64_t, uint64_t, size_t, uint64_t, uint32_t, uint32_t)
+ */
+static inline uint32_t snrt_dma_start_1d_collective(volatile void *dst,
+                                                    volatile void *src, size_t size,
+                                                    volatile void * coll_mask,
+                                                    uint32_t coll_op,
+                                                    const uint32_t channel = 0) {
+    return snrt_dma_start_1d_collective((uint64_t)dst, (uint64_t)src, size, (uint64_t)coll_mask, coll_op, channel);
+}
+
 /**
  * @brief Start an asynchronous multicast 1D DMA transfer using native-size
  * pointers.
  *
  * This is a convenience overload of snrt_dma_start_1d_mcast() using `void*`
 * pointers.
* - * @see snrt_dma_start_1d_mcast(uint64_t, uint64_t, size_t, uint32_t, uint32_t) + * @see snrt_dma_start_1d_mcast(uint64_t, uint64_t, size_t, uint64_t, uint32_t) */ static inline uint32_t snrt_dma_start_1d_mcast(volatile void *dst, volatile void *src, size_t size, - uint32_t mask, + volatile void * mask, const uint32_t channel = 0) { - return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, mask, - channel); + return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, (uint64_t)mask, channel); } /** @@ -291,13 +337,34 @@ inline snrt_dma_txid_t snrt_dma_mcast_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec, - uint32_t mcast) { + void *mcast) { size_t tile_nbytes = tile_size * prec; - return snrt_dma_start_1d_mcast((uintptr_t)dst, - (uintptr_t)src + tile_idx * tile_nbytes, + return snrt_dma_start_1d_mcast(dst, (void*) ((char*) src + (tile_idx * tile_nbytes)), tile_nbytes, mcast); } +/** + * @brief Load a tile of a 1D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_idx Index of the tile in the 1D array. + * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. + * @param coll_mask Multicast mask for collective operation applied on the destination address. + * @param coll_op Type of operation (Should only work for multicast) + */ +inline snrt_dma_txid_t snrt_dma_collective_load_1d_tile(void *dst, void *src, + size_t tile_idx, + size_t tile_size, + uint32_t prec, + void *coll_mask, + uint32_t coll_op) { + size_t tile_nbytes = tile_size * prec; + return snrt_dma_start_1d_collective((uintptr_t)dst, + (uintptr_t)src + tile_idx * tile_nbytes, + tile_nbytes, (uintptr_t)coll_mask, coll_op); +} + /** * @brief Transfer and reshape a 1D array into a 2D array. * @param dst Pointer to the destination array. 
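As a usage sketch (illustrative, not part of the patch): with the API above, the DM core can broadcast a buffer to a set of clusters and then combine per-cluster contributions with an offloaded reduction. The SNRT_COLL_* opcodes are assumed to come from the generated cluster configuration header in this series; snrt.h, the buffers, and the cluster mask are illustrative assumptions of this example.

    #include "snrt.h"

    // Sketch: broadcast local_buf into comm_buf of every cluster selected by
    // cluster_mask, then accumulate all clusters' contributions into comm_buf
    // with an offloaded wide FP add.
    static void broadcast_then_reduce(void *comm_buf, void *local_buf,
                                      size_t size, uint64_t cluster_mask) {
        if (snrt_is_dm_core()) {
            // Multicast: a single transfer lands in all selected clusters.
            snrt_dma_start_1d_mcast((uint64_t)(uintptr_t)comm_buf,
                                    (uint64_t)(uintptr_t)local_buf,
                                    size, cluster_mask);
            // Offloaded reduction: the interconnect combines the stores of
            // all participating clusters.
            snrt_dma_start_1d_collective((uint64_t)(uintptr_t)comm_buf,
                                         (uint64_t)(uintptr_t)local_buf,
                                         size, cluster_mask,
                                         SNRT_COLL_WIDE_FPADD);
            snrt_dma_wait_all();
        }
    }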
From a7bc2ec8a40bb858f32d057c6dfc038ccc66e3ae Mon Sep 17 00:00:00 2001 From: Raphael Date: Fri, 4 Jul 2025 09:36:14 +0200 Subject: [PATCH 06/38] (feat) fix all configurations to account for the features --- target/snitch_cluster/cfg/default.json | 16 +++++----- target/snitch_cluster/cfg/dma_mchan.json | 2 +- target/snitch_cluster/cfg/github-ci.json | 11 +++++-- target/snitch_cluster/cfg/reduction.hjson | 29 ++++++++++--------- .../schema/snitch_cluster.schema.json | 6 ++-- 5 files changed, 37 insertions(+), 27 deletions(-) diff --git a/target/snitch_cluster/cfg/default.json b/target/snitch_cluster/cfg/default.json index 28f8fa7ad..d4dd79301 100644 --- a/target/snitch_cluster/cfg/default.json +++ b/target/snitch_cluster/cfg/default.json @@ -10,7 +10,7 @@ addr_width: 48, data_width: 64, atomic_id_width: 5, // clog2(total number of clusters) - user_width: 59, // atomic_id_width + user_width: 5, // atomic_id_width + 6 + addr_width if we enable_multicast (not enable_dma_multicast) tcdm: { size: 128, banks: 32, @@ -23,14 +23,14 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - // Collectiv operation on narrow interface - enable_multicast: true, - enable_reduction: true, + // Collective operation on narrow interface + enable_multicast: false, + enable_reduction: false, // To support the multicast for the DMA - enable_DMA_multicast: false, - // dma_user_width: 48, - // Only enable if collectiv operation are handled in SoC - enable_reroute_collectiv: false, + enable_dma_multicast: false, + dma_user_width: 1, // 6 + addr_width if we enable_dma_multicast (not enable_multicast) + // Only enable if collective operation are handled in SoC + enable_reroute_collective: false, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/dma_mchan.json b/target/snitch_cluster/cfg/dma_mchan.json index 236ee21ba..01ee47cd0 100644 --- a/target/snitch_cluster/cfg/dma_mchan.json +++ b/target/snitch_cluster/cfg/dma_mchan.json @@ -25,7 +25,7 @@ narrow_trans: 4, wide_trans: 32, dma_user_width: 48, - enable_DMA_multicast: true, + enable_dma_multicast: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/github-ci.json b/target/snitch_cluster/cfg/github-ci.json index b95466f1c..496d79a53 100644 --- a/target/snitch_cluster/cfg/github-ci.json +++ b/target/snitch_cluster/cfg/github-ci.json @@ -10,7 +10,8 @@ addr_width: 48, data_width: 64, atomic_id_width: 5, // clog2(total number of clusters) - user_width: 53, // addr_width + atomic_id_width + collective_width: 6, // # bits used for the collective operation + user_width: 59, // addr_width + atomic_id_width + collective_width tcdm: { size: 128, banks: 32, @@ -23,8 +24,14 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - dma_user_width: 48, + // Collective operation on narrow interface enable_multicast: true, + enable_reduction: true, + // To support the multicast for the DMA + enable_dma_multicast: true, + dma_user_width: 54, // 6 + addr_width if we enable_dma_multicast (not enable_multicast) + // Only enable if collective operation are handled in SoC + enable_reroute_collective: false, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/reduction.hjson b/target/snitch_cluster/cfg/reduction.hjson index 
8bdfcd073..729ba89d2 100644
--- a/target/snitch_cluster/cfg/reduction.hjson
+++ b/target/snitch_cluster/cfg/reduction.hjson
@@ -11,8 +11,8 @@
         addr_width: 48,
         data_width: 64,
         atomic_id_width: 5, // clog2(total number of clusters)
-        collectiv_width: 6, // # bits used for the collective operation
-        user_width: 59, // addr_width + atomic_id_width + collectiv_width
+        collective_width: 6, // # bits used for the collective operation
+        user_width: 59, // addr_width + atomic_id_width + collective_width
         tcdm: {
             size: 128,
             banks: 32,
@@ -25,14 +25,14 @@
         dma_req_fifo_depth: 8,
         narrow_trans: 4,
         wide_trans: 32,
-        dma_user_width: 48,
+        dma_user_width: 54,
         // Enable the multicast and the reduction feature in the sw (Only Narrow IF - Not DMA calls)
        enable_multicast: true,
        enable_reduction: true,
        // To support the multicast for the DMA
-        enable_DMA_multicast: true,
-        // Currently we need to reroute collectiv operation to the SoC IF independent of the dst address
-        enable_reroute_collectiv: true,
+        enable_dma_multicast: true,
+        // Currently we need to reroute collective operation to the SoC IF independent of the dst address
+        enable_reroute_collective: true,
        // We don't need Snitch debugging in Occamy
        enable_debug: false,
        // We don't need Snitch (core-internal) virtual memory support
@@ -89,16 +89,19 @@
                }
            ]
        },
-    dram: {
-        address: 0x80000000,
-        length: 0x80000000
-    },
-    peripherals: {
-        clint: {
+    external_addr_regions: [
+        {
+            name: "dram",
+            address: 0x80000000,
+            length: 0x80000000,
+            cacheable: true
+        },
+        {
+            name: "clint",
             address: 0xFFFF0000,
             length: 0x1000
         },
-    },
+    ],
     // Templates.
     compute_core_template: {
         isa: "rv32imafd",
diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json
index 5f6a766af..372047cba 100644
--- a/util/clustergen/schema/snitch_cluster.schema.json
+++ b/util/clustergen/schema/snitch_cluster.schema.json
@@ -176,7 +176,7 @@
             "description": "Width of the cluster's atomics ID.",
             "default": 1
         },
-        "collectiv_width": {
+        "collective_width": {
             "type": "number",
             "description": "Width of the collective operation field",
             "default": 6
@@ -196,9 +196,9 @@
             "description": "Whether to enable the multicast capable axi-crossbar in the snitch cluster",
             "default": false
         },
-        "enable_reroute_collectiv": {
+        "enable_reroute_collective": {
             "type": "boolean",
-            "description": "Whether to reroute any collectiv operation request to the SoC port independent of the address",
+            "description": "Whether to reroute any collective operation request to the SoC port independent of the address",
             "default": false
         },
         "hart_base_id": {

From 1429a977fd9117d78d6d0ed16e474c07eaf31430 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Wed, 9 Jul 2025 14:37:25 +0200
Subject: [PATCH 07/38] (feat) introduce memory fences in the synchronization methods

---
 sw/snRuntime/src/start.c |  8 ++------
 sw/snRuntime/src/sync.h  | 26 ++++----------------------
 2 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index 8614faafd..2ab50f97d 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -76,12 +76,8 @@ static inline void snrt_wake_up() {
         snrt_wake_all((1 << snrt_cluster_core_num()) - 1);
     }

-    // TODO (raroth): Hotfix!!! Race condition applies here!
-    // The problem is the snrt_wake_all call is multicast which targets all cores / clusters.
- // If this delay is not inserted then the multicast will hit core 1 cluster 0 at the exact time - // where the clear flag is reset but not read in the function "snrt_int_clr_mcip". - // The real solution would be a fence here!!! - snrt_cluster_hw_barrier(); + // fence which wait until all memory operation are done (all cores are woken up) + fence(); // Clear the reset flag snrt_int_clr_mcip(); diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index f2fe82286..c50e7e4a7 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -135,11 +135,8 @@ inline void snrt_cluster_hw_barrier() { */ inline void snrt_inter_cluster_barrier() { -// First we need to reduce from all clusters together. -// TODO raroth: Potentially if we could track the B-Response from the reduction we could remove the multicast completly. -// The downside is that we could not send the core into sleep and would have the cores spin on a memory fence! #ifdef SNRT_SUPPORTS_REDUCTION - // Only continue with dma core's - send the rest into sleep mode + // Only continue with dma core's - send the rest into the next hw barrier if(snrt_is_dm_core()){ // fetch the address for the reduction cls_t * ctrl_red = cls(); @@ -155,16 +152,8 @@ inline void snrt_inter_cluster_barrier() { *((uint32_t *) addr) = 1; snrt_disable_reduction(); - // The dma core of cluster 0 should pull the reduction destination to find if we have finished th reduction - if(snrt_cluster_idx() == 0){ - while(*((volatile uint32_t *) addr) != 1); - // Wake all clusters - snrt_wake_all((1 << snrt_cluster_core_num()) - 1); - } else { - snrt_wfi(); - } - } else { - snrt_wfi(); + // fence to wait until the reduction is finished + fence(); } #else // Only continue with dma core's - send the rest into sleep mode @@ -184,17 +173,10 @@ inline void snrt_inter_cluster_barrier() { } else { snrt_wfi(); } -#endif - - // TODO (raroth): Hotfix!!! Race condition applies here! - // The problem is the snrt_wake_all call is multicast which targets all cores / clusters. - // If this delay is not inserted then the multicast will hit core 0 cluster 0 at the exact time - // where the clear flag is reset but not read in the function "snrt_int_clr_mcip". - // The real solution would be a fence here!!! 
-    snrt_cluster_hw_barrier();

     // Clear the reset flag
     snrt_int_clr_mcip();
+#endif
 }

From 5d112e972bbfddee0d04e03467ef729e2cf53791 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Wed, 9 Jul 2025 19:08:19 +0200
Subject: [PATCH 08/38] (misc)

* Bump axi bender version due to renaming on the axi side
* Rename remaining occurrences of collectiv to collective
* multicast rule / port fix
---
 Bender.lock                             |  2 +-
 hw/snitch_cluster/src/snitch_cluster.sv | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Bender.lock b/Bender.lock
index 90359f15b..2b7a97918 100644
--- a/Bender.lock
+++ b/Bender.lock
@@ -7,7 +7,7 @@ packages:
     dependencies:
     - common_cells
   axi:
-    revision: d99625fe8fb6f253926e370b7989f81850afd21f
+    revision: 9debffc1b1b2b4e4045ec10dfe6eb87a412eab95
     version: null
     source:
       Git: https://github.com/Lura518/axi.git
diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv
index dd9291457..93d424650 100644
--- a/hw/snitch_cluster/src/snitch_cluster.sv
+++ b/hw/snitch_cluster/src/snitch_cluster.sv
@@ -349,8 +349,8 @@ module snitch_cluster
     AxiAddrWidth: PhysicalAddrWidth,
     AxiDataWidth: NarrowDataWidth,
     NoAddrRules: NrRules,
-    NoMulticastRules: 1,
-    NoMulticastPorts: 2,
+    NoMulticastRules: 3,
+    NoMulticastPorts: 3,
     default: '0
   };
   localparam axi_pkg::xbar_cfg_t ClusterXbarCfg = '{
@@ -723,14 +723,14 @@ module snitch_cluster
     // port is the SoC port
     localparam dma_line_t DMAlocalArray = (ReRouteCollectiveOp) ?
         dma_line_t'{SoCDMAOut: 1'b1, default: 1'b0} : dma_line_t'{default: 1'b1};
-    localparam dma_matrix_t DMACollectivConnectivity = dma_matrix_t'{default: DMAlocalArray};
+    localparam dma_matrix_t DMACollectiveConnectivity = dma_matrix_t'{default: DMAlocalArray};

     // Set default master port for all multicast's crossbar input's
     localparam bit [DmaMcastXbarCfg.NoSlvPorts-1:0] DmaEnableDefaultMstPort = '1;

     axi_mcast_xbar #(
       .Cfg (DmaMcastXbarCfg),
-      .CollectivOpsConnectivity (DMACollectivConnectivity),
+      .CollectiveOpsConnectivity (DMACollectiveConnectivity),
       .ATOPs (0),
       .slv_aw_chan_t (axi_mst_dma_aw_chan_t),
       .mst_aw_chan_t (axi_slv_dma_aw_chan_t),
@@ -1328,7 +1328,7 @@ module snitch_cluster
   assign cluster_mcast_xbar_default_port = '{
     idx: SoC,
     start_addr: tcdm_start_address,
-    end_addr: tcdm_end_address
+    end_addr: ext_mem_end_address
   };

   logic [ClusterXbarCfg.NoSlvPorts-1:0][$clog2(ClusterXbarCfg.NoMstPorts)-1:0]
@@ -1382,7 +1382,7 @@ module snitch_cluster
     // the SoC port
     localparam cluster_line_t ClusterlocalArray = (ReRouteCollectiveOp) ?
         cluster_line_t'{SoC: 1'b1, default: 1'b0} : cluster_line_t'{default: 1'b1};
-    localparam cluster_matrix_t ClusterCollectivConnectivity =
+    localparam cluster_matrix_t ClusterCollectiveConnectivity =
         cluster_matrix_t'{default: ClusterlocalArray};

     // Set default master port for all multicast's crossbar input's
@@ -1390,7 +1390,7 @@ module snitch_cluster

     axi_mcast_xbar #(
       .Cfg (ClusterMcastXbarCfg),
-      .CollectivOpsConnectivity (ClusterCollectivConnectivity),
+      .CollectiveOpsConnectivity(ClusterCollectiveConnectivity),
       .slv_aw_chan_t (axi_mst_aw_chan_t),
       .mst_aw_chan_t (axi_slv_aw_chan_t),
       .w_chan_t (axi_mst_w_chan_t),

From 45e9399376e9270944770bd18db433bacfe7a9b0 Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Thu, 10 Jul 2025 08:25:55 +0200
Subject: [PATCH 09/38] Add USER CSRs and DMUSER instruction to riscv-opcodes

Also maps CSR_USER_LOW and CSR_USER_HIGH respectively to addresses 0x7C4
and 0x7C5 (instead of the opposite), to reflect LOW and HIGH names also
in the CSR address mapping.
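For reference, a sketch of how software drives this mapping (it mirrors the runtime helper added later in the series; the function name here is illustrative, and write_csr is the CSR write macro used elsewhere in the snRuntime):

    // Sketch: split a 64-bit AW user value across the two user CSRs.
    static inline void set_user_csrs(uint64_t field) {
        write_csr(0x7c4, (uint32_t)field);          // CSR_USER_LOW
        write_csr(0x7c5, (uint32_t)(field >> 32));  // CSR_USER_HIGH
    }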
---
 hw/snitch/src/riscv_instr.sv | 4 ++--
 sw/deps/riscv-opcodes        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv
index 291736e91..8e3332db2 100644
--- a/hw/snitch/src/riscv_instr.sv
+++ b/hw/snitch/src/riscv_instr.sv
@@ -1140,8 +1140,8 @@ package riscv_instr;
  localparam logic [11:0] CSR_FPMODE = 12'h7c1;
  localparam logic [11:0] CSR_BARRIER = 12'h7c2;
  localparam logic [11:0] CSR_SC = 12'h7c3;
-  localparam logic [11:0] CSR_USER_HIGH = 12'h7c4;
-  localparam logic [11:0] CSR_USER_LOW = 12'h7c5;
+  localparam logic [11:0] CSR_USER_LOW = 12'h7c4;
+  localparam logic [11:0] CSR_USER_HIGH = 12'h7c5;
  localparam logic [11:0] CSR_HTIMEDELTAH = 12'h615;
  localparam logic [11:0] CSR_CYCLEH = 12'hc80;
  localparam logic [11:0] CSR_TIMEH = 12'hc81;
diff --git a/sw/deps/riscv-opcodes b/sw/deps/riscv-opcodes
index 8874980e3..a1ba32d68 160000
--- a/sw/deps/riscv-opcodes
+++ b/sw/deps/riscv-opcodes
@@ -1 +1 @@
-Subproject commit 8874980e337ad757e983ab90622bb29b2fbb87b4
+Subproject commit a1ba32d68eec20a93ec212a7f3b186ed255bd778

From 0a3cde5a100d521365c1220ddb18b15eb1f6774c Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Thu, 10 Jul 2025 11:48:31 +0200
Subject: [PATCH 10/38] hw: Add user field to reqrsp interface

---
 .../include/reqrsp_interface/typedef.svh |  8 +--
 hw/reqrsp_interface/src/axi_to_reqrsp.sv |  2 +-
 hw/reqrsp_interface/src/reqrsp_cut.sv    | 11 +++-
 hw/reqrsp_interface/src/reqrsp_demux.sv  |  5 +-
 hw/reqrsp_interface/src/reqrsp_iso.sv    | 11 +++-
 hw/reqrsp_interface/src/reqrsp_mux.sv    | 11 +++-
 hw/reqrsp_interface/src/reqrsp_to_axi.sv | 18 ++----
 hw/snitch_cluster/src/snitch_cluster.sv  | 64 ++++++++++++-------
 hw/tcdm_interface/src/axi_to_tcdm.sv     |  5 +-
 hw/tcdm_interface/src/reqrsp_to_tcdm.sv  | 12 ++--
 10 files changed, 96 insertions(+), 51 deletions(-)

diff --git a/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh b/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh
index db32d828e..19f2e794d 100644
--- a/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh
+++ b/hw/reqrsp_interface/include/reqrsp_interface/typedef.svh
@@ -8,14 +8,14 @@
 `ifndef REQRSP_INTERFACE_TYPEDEF_SVH_
 `define REQRSP_INTERFACE_TYPEDEF_SVH_

-`define REQRSP_TYPEDEF_REQ_CHAN_T(__req_chan_t, __addr_t, __data_t, __strb_t) \
+`define REQRSP_TYPEDEF_REQ_CHAN_T(__req_chan_t, __addr_t, __data_t, __strb_t, __user_t) \
  typedef struct packed { \
    __addr_t addr; \
-    logic [63:0] user; \
    logic write; \
    reqrsp_pkg::amo_op_e amo; \
    __data_t data; \
    __strb_t strb; \
+    __user_t user; \
    reqrsp_pkg::size_t size; \
  } __req_chan_t;

@@ -39,8 +39,8 @@
    logic q_ready; \
  } __rsp_t;

-`define REQRSP_TYPEDEF_ALL(__name, __addr_t, __data_t, __strb_t) \
-  `REQRSP_TYPEDEF_REQ_CHAN_T(__name``_req_chan_t, __addr_t, __data_t, __strb_t) \
+`define REQRSP_TYPEDEF_ALL(__name, __addr_t, __data_t, __strb_t, __user_t) \
+  `REQRSP_TYPEDEF_REQ_CHAN_T(__name``_req_chan_t, __addr_t, __data_t, __strb_t, __user_t) \
  `REQRSP_TYPEDEF_RSP_CHAN_T(__name``_rsp_chan_t, __data_t) \
  `REQRSP_TYPEDEF_REQ_T(__name``_req_t, __name``_req_chan_t) \
  `REQRSP_TYPEDEF_RSP_T(__name``_rsp_t, __name``_rsp_chan_t)
diff --git a/hw/reqrsp_interface/src/axi_to_reqrsp.sv b/hw/reqrsp_interface/src/axi_to_reqrsp.sv
index d96cca556..cdafa9d8f 100644
--- a/hw/reqrsp_interface/src/axi_to_reqrsp.sv
+++ b/hw/reqrsp_interface/src/axi_to_reqrsp.sv
@@ -448,7 +448,7 @@ module axi_to_reqrsp_intf #(
  typedef logic [IdWidth-1:0] id_t;
  typedef logic [UserWidth-1:0] user_t;

-  `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t)
+ `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) `AXI_TYPEDEF_AW_CHAN_T(aw_chan_t, addr_t, id_t, user_t) `AXI_TYPEDEF_W_CHAN_T(w_chan_t, data_t, strb_t, user_t) diff --git a/hw/reqrsp_interface/src/reqrsp_cut.sv b/hw/reqrsp_interface/src/reqrsp_cut.sv index 2e804dec8..e221f8906 100644 --- a/hw/reqrsp_interface/src/reqrsp_cut.sv +++ b/hw/reqrsp_interface/src/reqrsp_cut.sv @@ -12,6 +12,8 @@ module reqrsp_cut #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Request type. parameter type req_t = logic, /// Response type. @@ -32,8 +34,9 @@ module reqrsp_cut #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) spill_register #( .T (reqrsp_req_chan_t), @@ -74,6 +77,8 @@ module reqrsp_cut_intf #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Bypass request channel. parameter bit BypassReq = 0, /// Bypass Response channel. @@ -88,8 +93,9 @@ module reqrsp_cut_intf #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) reqrsp_req_t reqrsp_slv_req, reqrsp_mst_req; reqrsp_rsp_t reqrsp_slv_rsp, reqrsp_mst_rsp; @@ -97,6 +103,7 @@ module reqrsp_cut_intf #( reqrsp_cut #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (UserWidth), .req_t (reqrsp_req_t), .rsp_t (reqrsp_rsp_t), .BypassReq (BypassReq), diff --git a/hw/reqrsp_interface/src/reqrsp_demux.sv b/hw/reqrsp_interface/src/reqrsp_demux.sv index ce14f9862..2d4fb0cf6 100644 --- a/hw/reqrsp_interface/src/reqrsp_demux.sv +++ b/hw/reqrsp_interface/src/reqrsp_demux.sv @@ -104,6 +104,8 @@ module reqrsp_demux_intf #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Amount of outstanding responses. Determines the FIFO size. parameter int unsigned RespDepth = 8, // Dependent parameters, DO NOT OVERRIDE! @@ -120,8 +122,9 @@ module reqrsp_demux_intf #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) reqrsp_req_t reqrsp_slv_req; reqrsp_rsp_t reqrsp_slv_rsp; diff --git a/hw/reqrsp_interface/src/reqrsp_iso.sv b/hw/reqrsp_interface/src/reqrsp_iso.sv index 997591615..82a456b88 100644 --- a/hw/reqrsp_interface/src/reqrsp_iso.sv +++ b/hw/reqrsp_interface/src/reqrsp_iso.sv @@ -14,6 +14,8 @@ module reqrsp_iso #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Request type. parameter type req_t = logic, /// Response type. 
@@ -43,8 +45,9 @@ module reqrsp_iso #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) isochronous_spill_register #( .T (reqrsp_req_chan_t), @@ -89,6 +92,8 @@ module reqrsp_iso_intf #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Bypass. parameter bit BypassReq = 0, parameter bit BypassRsp = 0 @@ -110,8 +115,9 @@ module reqrsp_iso_intf #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) reqrsp_req_t reqrsp_src_req, reqrsp_dst_req; reqrsp_rsp_t reqrsp_src_rsp, reqrsp_dst_rsp; @@ -119,6 +125,7 @@ module reqrsp_iso_intf #( reqrsp_iso #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (UserWidth), .req_t (reqrsp_req_t), .rsp_t (reqrsp_rsp_t), .BypassReq (BypassReq), diff --git a/hw/reqrsp_interface/src/reqrsp_mux.sv b/hw/reqrsp_interface/src/reqrsp_mux.sv index afab89bbb..9a828e7dd 100644 --- a/hw/reqrsp_interface/src/reqrsp_mux.sv +++ b/hw/reqrsp_interface/src/reqrsp_mux.sv @@ -14,6 +14,8 @@ module reqrsp_mux #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Request type. parameter type req_t = logic, /// Response type. @@ -39,8 +41,9 @@ module reqrsp_mux #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_REQ_CHAN_T(req_chan_t, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_REQ_CHAN_T(req_chan_t, addr_t, data_t, strb_t, user_t) localparam int unsigned LogNrPorts = cf_math_pkg::idx_width(NrPorts); @@ -159,6 +162,8 @@ module reqrsp_mux_intf #( parameter int unsigned AddrWidth = 0, /// Data width of the interface. parameter int unsigned DataWidth = 0, + /// User width of the interface. + parameter int unsigned UserWidth = 0, /// Amount of outstanding responses. Determines the FIFO size. parameter int unsigned RespDepth = 8, /// Cut timing paths on the request path. Incurs a cycle additional latency. 
@@ -175,8 +180,9 @@ module reqrsp_mux_intf #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) reqrsp_req_t [NrPorts-1:0] reqrsp_slv_req; reqrsp_rsp_t [NrPorts-1:0] reqrsp_slv_rsp; @@ -188,6 +194,7 @@ module reqrsp_mux_intf #( .NrPorts (NrPorts), .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (UserWidth), .req_t (reqrsp_req_t), .rsp_t (reqrsp_rsp_t), .RespDepth (RespDepth), diff --git a/hw/reqrsp_interface/src/reqrsp_to_axi.sv b/hw/reqrsp_interface/src/reqrsp_to_axi.sv index 55ad93c54..d3fa51150 100644 --- a/hw/reqrsp_interface/src/reqrsp_to_axi.sv +++ b/hw/reqrsp_interface/src/reqrsp_to_axi.sv @@ -48,7 +48,6 @@ module reqrsp_to_axi import reqrsp_pkg::*; #( parameter int unsigned ID = 0, /// Data width of bus, must be 32 or 64. parameter int unsigned DataWidth = 32'b0, - parameter int unsigned UserWidth = 32'b0, parameter type reqrsp_req_t = logic, parameter type reqrsp_rsp_t = logic, parameter type axi_req_t = logic, @@ -56,7 +55,6 @@ module reqrsp_to_axi import reqrsp_pkg::*; #( ) ( input logic clk_i, input logic rst_ni, - input logic [UserWidth-1:0] user_i, input reqrsp_req_t reqrsp_req_i, output reqrsp_rsp_t reqrsp_rsp_o, output axi_req_t axi_req_o, @@ -175,7 +173,7 @@ module reqrsp_to_axi import reqrsp_pkg::*; #( assign axi_req_o.ar.lock = (reqrsp_req_i.q.amo == AMOLR); assign axi_req_o.ar.cache = axi_pkg::CACHE_MODIFIABLE; assign axi_req_o.ar.id = $unsigned(ID); - assign axi_req_o.ar.user = user_i; + assign axi_req_o.ar.user = reqrsp_req_i.q.user; assign axi_req_o.ar_valid = q_valid_read; assign q_ready_read = axi_rsp_i.ar_ready; @@ -190,11 +188,11 @@ module reqrsp_to_axi import reqrsp_pkg::*; #( assign axi_req_o.aw.lock = (reqrsp_req_i.q.amo == AMOSC); assign axi_req_o.aw.cache = axi_pkg::CACHE_MODIFIABLE; assign axi_req_o.aw.id = $unsigned(ID); - assign axi_req_o.aw.user = user_i; + assign axi_req_o.aw.user = reqrsp_req_i.q.user; assign axi_req_o.w.data = write_data; assign axi_req_o.w.strb = reqrsp_req_i.q.strb; assign axi_req_o.w.last = 1'b1; - assign axi_req_o.w.user = user_i; + assign axi_req_o.w.user = reqrsp_req_i.q.user; // Both channels need to handshake (independently). stream_fork #( @@ -305,12 +303,11 @@ module reqrsp_to_axi_intf #( parameter int unsigned AddrWidth = 32'd0, /// AXI and REQRSP data width. parameter int unsigned DataWidth = 32'd0, - /// AXI user width. - parameter int unsigned AxiUserWidth = 32'd0 + /// AXI and REQRSP user width. 
+ parameter int unsigned UserWidth = 32'd0 ) ( input logic clk_i, input logic rst_ni, - input logic [AxiUserWidth-1:0] user_i, REQRSP_BUS reqrsp, AXI_BUS axi ); @@ -319,9 +316,9 @@ module reqrsp_to_axi_intf #( typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; typedef logic [AxiIdWidth-1:0] id_t; - typedef logic [AxiUserWidth-1:0] user_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) `AXI_TYPEDEF_AW_CHAN_T(aw_chan_t, addr_t, id_t, user_t) `AXI_TYPEDEF_W_CHAN_T(w_chan_t, data_t, strb_t, user_t) @@ -347,7 +344,6 @@ module reqrsp_to_axi_intf #( ) i_reqrsp_to_axi ( .clk_i, .rst_ni, - .user_i, .reqrsp_req_i (reqrsp_req), .reqrsp_rsp_o (reqrsp_rsp), .axi_req_o (axi_req), diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 93d424650..6ba1ca4c9 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -429,6 +429,7 @@ module snitch_cluster typedef logic [PhysicalAddrWidth-1:0] addr_t; typedef logic [NarrowDataWidth-1:0] data_t; typedef logic [NarrowDataWidth/8-1:0] strb_t; + typedef logic [63:0] user_t; typedef logic [WideDataWidth-1:0] data_dma_t; typedef logic [WideDataWidth/8-1:0] strb_dma_t; typedef logic [NarrowIdWidthIn-1:0] id_mst_t; @@ -458,7 +459,11 @@ module snitch_cluster `APB_TYPEDEF_ALL(apb, addr_t, data_t, strb_t) - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + // Reqrsp interface of the core has a 64b user field + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) + // Reqrsp interface in the cluster additionally contains the cluster ID + // (used for atomic operations) in the user field + `REQRSP_TYPEDEF_ALL(reqrsp_amo, addr_t, data_t, strb_t, user_narrow_t) `MEM_TYPEDEF_ALL(mem, tcdm_mem_addr_t, data_t, strb_t, tcdm_user_t) `MEM_TYPEDEF_ALL(mem_dma, tcdm_mem_addr_t, data_dma_t, strb_dma_t, logic) @@ -1235,7 +1240,6 @@ module snitch_cluster reqrsp_to_axi #( .DataWidth (NarrowDataWidth), - .UserWidth (NarrowUserWidth), .reqrsp_req_t (reqrsp_req_t), .reqrsp_rsp_t (reqrsp_rsp_t), .axi_req_t (axi_mst_req_t), @@ -1243,7 +1247,6 @@ module snitch_cluster ) i_reqrsp_to_axi_ptw ( .clk_i, .rst_ni, - .user_i ('0), .reqrsp_req_i (ptw_to_axi_req), .reqrsp_rsp_o (ptw_to_axi_rsp), .axi_req_o (narrow_axi_mst_req[PTW]), @@ -1266,11 +1269,29 @@ module snitch_cluster reqrsp_req_t core_to_axi_req; reqrsp_rsp_t core_to_axi_rsp; + reqrsp_mux #( + .NrPorts (NrCores), + .AddrWidth (PhysicalAddrWidth), + .DataWidth (NarrowDataWidth), + .req_t (reqrsp_req_t), + .rsp_t (reqrsp_rsp_t), + .RespDepth (2) + ) i_reqrsp_mux_core ( + .clk_i, + .rst_ni, + .slv_req_i (core_req), + .slv_rsp_o (core_rsp), + .mst_req_o (core_to_axi_req), + .mst_rsp_i (core_to_axi_rsp), + .idx_o (/*unused*/) + ); + // User field for the AXI transmission // We encode Atomics operation and (if enabled) collective operations user_narrow_t cluster_user; addr_t mcast_mask; coll_type_t collective_type; + // Atomic ID, needs to be unique ID of cluster // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters) if (EnableMulticast) begin : AssignUserWithMCast @@ -1289,26 +1310,24 @@ module snitch_cluster }; end - reqrsp_mux #( - .NrPorts (NrCores), - .AddrWidth (PhysicalAddrWidth), - .DataWidth (NarrowDataWidth), - .req_t (reqrsp_req_t), - .rsp_t (reqrsp_rsp_t), - .RespDepth (2) - ) i_reqrsp_mux_core ( - .clk_i, - .rst_ni, - .slv_req_i (core_req), - .slv_rsp_o (core_rsp), - .mst_req_o 
(core_to_axi_req), - .mst_rsp_i (core_to_axi_rsp), - .idx_o (/*unused*/) - ); + reqrsp_amo_req_t core_to_axi_amo_req; + reqrsp_amo_rsp_t core_to_axi_amo_rsp; + + always_comb begin + core_to_axi_amo_req.q.addr = core_to_axi_req.q.addr; + core_to_axi_amo_req.q.write = core_to_axi_req.q.write; + core_to_axi_amo_req.q.amo = core_to_axi_req.q.amo; + core_to_axi_amo_req.q.data = core_to_axi_req.q.data; + core_to_axi_amo_req.q.strb = core_to_axi_req.q.strb; + core_to_axi_amo_req.q.user = cluster_user; + core_to_axi_amo_req.q.size = core_to_axi_req.q.size; + core_to_axi_amo_req.q_valid = core_to_axi_req.q_valid; + core_to_axi_amo_req.p_ready = core_to_axi_req.p_ready; + core_to_axi_rsp = core_to_axi_amo_rsp; + end reqrsp_to_axi #( .DataWidth (NarrowDataWidth), - .UserWidth (NarrowUserWidth), .reqrsp_req_t (reqrsp_req_t), .reqrsp_rsp_t (reqrsp_rsp_t), .axi_req_t (axi_mst_req_t), @@ -1316,9 +1335,8 @@ module snitch_cluster ) i_reqrsp_to_axi_core ( .clk_i, .rst_ni, - .user_i (cluster_user), - .reqrsp_req_i (core_to_axi_req), - .reqrsp_rsp_o (core_to_axi_rsp), + .reqrsp_req_i (core_to_axi_amo_req), + .reqrsp_rsp_o (core_to_axi_amo_rsp), .axi_req_o (narrow_axi_mst_req[CoreReq]), .axi_rsp_i (narrow_axi_mst_rsp[CoreReq]) ); diff --git a/hw/tcdm_interface/src/axi_to_tcdm.sv b/hw/tcdm_interface/src/axi_to_tcdm.sv index c7502019a..4eeb18dbf 100644 --- a/hw/tcdm_interface/src/axi_to_tcdm.sv +++ b/hw/tcdm_interface/src/axi_to_tcdm.sv @@ -13,6 +13,7 @@ module axi_to_tcdm #( parameter type tcdm_rsp_t = logic, parameter int unsigned AddrWidth = 0, parameter int unsigned DataWidth = 0, + parameter int unsigned UserWidth = 0, parameter int unsigned IdWidth = 0, parameter int unsigned BufDepth = 1 ) ( @@ -27,8 +28,9 @@ module axi_to_tcdm #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) reqrsp_req_t reqrsp_req; reqrsp_rsp_t reqrsp_rsp; @@ -55,6 +57,7 @@ module axi_to_tcdm #( reqrsp_to_tcdm #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (UserWidth), .BufDepth (BufDepth), .reqrsp_req_t (reqrsp_req_t), .reqrsp_rsp_t (reqrsp_rsp_t), diff --git a/hw/tcdm_interface/src/reqrsp_to_tcdm.sv b/hw/tcdm_interface/src/reqrsp_to_tcdm.sv index 0e2c476fa..ca73c40dd 100644 --- a/hw/tcdm_interface/src/reqrsp_to_tcdm.sv +++ b/hw/tcdm_interface/src/reqrsp_to_tcdm.sv @@ -10,6 +10,7 @@ module reqrsp_to_tcdm #( parameter int unsigned AddrWidth = 0, parameter int unsigned DataWidth = 0, + parameter int unsigned UserWidth = 0, parameter int unsigned BufDepth = 2, parameter type reqrsp_req_t = logic, parameter type reqrsp_rsp_t = logic, @@ -27,8 +28,9 @@ module reqrsp_to_tcdm #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(rr, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(rr, addr_t, data_t, strb_t, user_t) rr_req_chan_t req; rr_rsp_chan_t rsp; @@ -58,7 +60,7 @@ module reqrsp_to_tcdm #( amo: req.amo, data: req.data, strb: req.strb, - user: '0 + user: req.user }; assign rsp = '{ @@ -77,7 +79,7 @@ endmodule module reqrsp_to_tcdm_intf #( parameter int unsigned AddrWidth = 0, parameter int unsigned DataWidth = 0, - parameter type user_t = logic, + parameter int unsigned UserWidth = 0, parameter int unsigned BufDepth = 2 ) ( input logic clk_i, @@ -89,8 +91,9 @@ module 
reqrsp_to_tcdm_intf #( typedef logic [AddrWidth-1:0] addr_t; typedef logic [DataWidth-1:0] data_t; typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [UserWidth-1:0] user_t; - `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, user_t) `TCDM_TYPEDEF_ALL(tcdm, addr_t, data_t, strb_t, user_t) reqrsp_req_t reqrsp_req; @@ -102,6 +105,7 @@ module reqrsp_to_tcdm_intf #( reqrsp_to_tcdm #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (UserWidth), .BufDepth (BufDepth), .reqrsp_req_t (reqrsp_req_t), .reqrsp_rsp_t (reqrsp_rsp_t), From 50e0eeb2e2393149bceb688490978856d1d3dfc1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 11:57:18 +0200 Subject: [PATCH 11/38] hw: Parameterize Snitch LSU user width --- hw/snitch/src/snitch.sv | 1 + hw/snitch/src/snitch_lsu.sv | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index c31c07c76..7e66f3006 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -2890,6 +2890,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( snitch_lsu #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (64), .dreq_t (dreq_t), .drsp_t (drsp_t), .tag_t (logic[RegWidth-1:0]), diff --git a/hw/snitch/src/snitch_lsu.sv b/hw/snitch/src/snitch_lsu.sv index f66a6dda3..8a0efa988 100644 --- a/hw/snitch/src/snitch_lsu.sv +++ b/hw/snitch/src/snitch_lsu.sv @@ -11,6 +11,7 @@ module snitch_lsu #( parameter int unsigned AddrWidth = 32, parameter int unsigned DataWidth = 32, + parameter int unsigned UserWidth = 0, /// Tag passed from input to output. All transactions are in-order. parameter type tag_t = logic [4:0], /// Number of outstanding memory transactions. 
@@ -37,7 +38,8 @@ module snitch_lsu #( parameter type drsp_t = logic, /// Derived parameter *Do not override* parameter type addr_t = logic [AddrWidth-1:0], - parameter type data_t = logic [DataWidth-1:0] + parameter type data_t = logic [DataWidth-1:0], + parameter type user_t = logic [UserWidth-1:0] ) ( input logic clk_i, input logic rst_i, @@ -50,7 +52,7 @@ module snitch_lsu #( input logic [1:0] lsu_qsize_i, input reqrsp_pkg::amo_op_e lsu_qamo_i, input logic lsu_qrepd_i, // Whether this is a sequencer repetition - input logic [63:0] lsu_quser_i, // User field for the axi transmission + input user_t lsu_quser_i, // User field for the axi transmission input logic lsu_qvalid_i, output logic lsu_qready_o, // response channel From 7278e2b9e8a6a3bf9a00ce49dc5984a08de82dd5 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 11:58:25 +0200 Subject: [PATCH 12/38] hw: Undo change in narrow XBAR's port index order --- hw/snitch/src/snitch_pkg.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/snitch/src/snitch_pkg.sv b/hw/snitch/src/snitch_pkg.sv index 0090dd755..5bc04f790 100644 --- a/hw/snitch/src/snitch_pkg.sv +++ b/hw/snitch/src/snitch_pkg.sv @@ -128,8 +128,8 @@ package snitch_pkg; // Slaves on Cluster AXI Bus typedef enum integer { TCDM = 0, - SoC = 1, - ClusterPeripherals = 2, + ClusterPeripherals = 1, + SoC = 2, ExtSlave = 3 } cluster_slave_e; From f9811d7882e267e24ef7790135fb95b1616e2ba1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 13:15:35 +0200 Subject: [PATCH 13/38] sw: Fix fence in start.c routine --- sw/snRuntime/src/start.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 2ab50f97d..a495d4350 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -71,13 +71,14 @@ static inline void snrt_init_bss() { #ifdef SNRT_WAKE_UP static inline void snrt_wake_up() { - // cluster 0 / core 0 should wake up all other cores! 
+ // Core 0 of cluster 0 wakes up all other cores if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) { snrt_wake_all((1 << snrt_cluster_core_num()) - 1); + snrt_fence(); } - - // fence which wait until all memory operation are done (all cores are woken up) - fence(); + + // Synchronize all cores + snrt_cluster_hw_barrier(); // Clear the reset flag snrt_int_clr_mcip(); From 515c3f2ffedb45c5604fe57cd78e95ac88ce5e6b Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 14:36:10 +0200 Subject: [PATCH 14/38] hw: Reuse default port configuration for both unicast and multicast XBARs --- hw/snitch_cluster/src/snitch_cluster.sv | 35 +++++++++++-------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 6ba1ca4c9..f28778000 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -687,17 +687,19 @@ module snitch_cluster .mst_resp_i (wide_axi_mst_rsp[SoCDMAIn]) ); - logic [WideSlaveIdxBits-1:0] dma_xbar_default_port = SoCDMAOut; + localparam bit DmaEnableDefaultMstPort = 1'b1; + logic [WideSlaveIdxBits-1:0] dma_xbar_default_port; xbar_rule_t dma_xbar_default_port_rule; + xbar_rule_t [5:0] dma_xbar_rules; + xbar_rule_t [DmaXbarCfg.NoAddrRules-1:0] enabled_dma_xbar_rule; + + assign dma_xbar_default_port = SoCDMAOut; assign dma_xbar_default_port_rule = '{ idx: dma_xbar_default_port, start_addr: tcdm_start_address, end_addr: zero_mem_end_address }; - xbar_rule_t [5:0] dma_xbar_rules; - xbar_rule_t [DmaXbarCfg.NoAddrRules-1:0] enabled_dma_xbar_rule; - assign dma_xbar_rules = '{ '{idx: BootRom, start_addr: BootRomAliasStart, end_addr: BootRomAliasEnd}, '{idx: ZeroMemory, start_addr: ZeroMemAliasStart, end_addr: ZeroMemAliasEnd}, @@ -724,15 +726,13 @@ module snitch_cluster // Define the collective connectivity matrix! typedef bit [DmaMcastXbarCfg.NoMstPorts-1:0] dma_line_t; typedef bit [DmaMcastXbarCfg.NoSlvPorts-1:0][DmaMcastXbarCfg.NoMstPorts-1:0] dma_matrix_t; + // If we want to reroute collective operation the only available collective operation // port is the SoC port localparam dma_line_t DMAlocalArray = (ReRouteCollectiveOp) ? 
dma_line_t'{SoCDMAOut: 1'b1, default: 1'b0} : dma_line_t'{default: 1'b1}; localparam dma_matrix_t DMACollectiveConnectivity = dma_matrix_t'{default: DMAlocalArray}; - // Set default master port for all multicast's crossbar input's - localparam bit [DmaMcastXbarCfg.NoSlvPorts-1:0] DmaEnableDefaultMstPort = '1; - axi_mcast_xbar #( .Cfg (DmaMcastXbarCfg), .CollectiveOpsConnectivity (DMACollectiveConnectivity), @@ -760,13 +760,10 @@ module snitch_cluster .mst_ports_req_o (wide_axi_slv_req), .mst_ports_resp_i (wide_axi_slv_rsp), .addr_map_i (enabled_dma_xbar_rule), - .en_default_mst_port_i (DmaEnableDefaultMstPort), + .en_default_mst_port_i ({DmaMcastXbarCfg.NoSlvPorts{DmaEnableDefaultMstPort}}), .default_mst_port_i ({DmaMcastXbarCfg.NoSlvPorts{dma_xbar_default_port_rule}}) ); end else begin : gen_dma_xbar - // Set default master port for all multicast's crossbar input's - localparam bit [DmaXbarCfg.NoSlvPorts-1:0] DmaEnableDefaultMstPort = '1; - axi_xbar #( .Cfg (DmaXbarCfg), .ATOPs (0), @@ -793,7 +790,7 @@ module snitch_cluster .mst_ports_req_o (wide_axi_slv_req), .mst_ports_resp_i (wide_axi_slv_rsp), .addr_map_i (enabled_dma_xbar_rule), - .en_default_mst_port_i (DmaEnableDefaultMstPort), + .en_default_mst_port_i ({DmaXbarCfg.NoSlvPorts{DmaEnableDefaultMstPort}}), .default_mst_port_i ({DmaXbarCfg.NoSlvPorts{dma_xbar_default_port}}) ); end @@ -1390,6 +1387,10 @@ module snitch_cluster }; end + // Set default master port for all multicast's crossbar input's + localparam bit [ClusterMcastXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = 1'b1; + assign cluster_xbar_default_port = '{default: SoC}; + // Instance the narrow axi xbar if (EnableMulticast) begin : gen_narrow_mcast_axi_crossbar @@ -1403,9 +1404,6 @@ module snitch_cluster localparam cluster_matrix_t ClusterCollectiveConnectivity = cluster_matrix_t'{default: ClusterlocalArray}; - // Set default master port for all multicast's crossbar input's - localparam bit [ClusterMcastXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = '1; - axi_mcast_xbar #( .Cfg (ClusterMcastXbarCfg), .CollectiveOpsConnectivity(ClusterCollectiveConnectivity), @@ -1432,12 +1430,10 @@ module snitch_cluster .mst_ports_req_o (narrow_axi_slv_req), .mst_ports_resp_i (narrow_axi_slv_rsp), .addr_map_i (cluster_xbar_rules), - .en_default_mst_port_i (ClusterEnableDefaultMstPort), + .en_default_mst_port_i ({ClusterMcastXbarCfg.NoSlvPorts{ClusterEnableDefaultMstPort}}), .default_mst_port_i ({ClusterMcastXbarCfg.NoSlvPorts{cluster_mcast_xbar_default_port}}) ); end else begin : gen_narrow_axi_crossbar - // Set default master port for all crossbar input's - localparam bit [ClusterXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = '1; axi_xbar #( .Cfg (ClusterXbarCfg), .slv_aw_chan_t (axi_mst_aw_chan_t), @@ -1463,10 +1459,9 @@ module snitch_cluster .mst_ports_req_o (narrow_axi_slv_req), .mst_ports_resp_i (narrow_axi_slv_rsp), .addr_map_i (cluster_xbar_rules), - .en_default_mst_port_i (ClusterEnableDefaultMstPort), + .en_default_mst_port_i ({ClusterXbarCfg.NoSlvPorts{ClusterEnableDefaultMstPort}}), .default_mst_port_i (cluster_xbar_default_port) ); - assign cluster_xbar_default_port = '{default: SoC}; end // Optionally decouple the external narrow AXI slave port. 
From 29836da48fcaf852023bc562d4a6c94f3782f9a3 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 14:58:34 +0200 Subject: [PATCH 15/38] hw: Make `select_t` an enum for readability and clean comment --- hw/snitch_cluster/src/snitch_cc.sv | 40 ++++++++++++++++-------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 6006d7098..21ab2b00e 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -607,12 +607,15 @@ module snitch_cc #( end // Decide whether to go to SoC or TCDM + + localparam int unsigned SelectWidth = cf_math_pkg::idx_width(2); + typedef enum logic [SelectWidth-1:0] {SelectTcdm = 1, SelectSoc = 0} select_t; + dreq_t data_tcdm_req; drsp_t data_tcdm_rsp; - localparam int unsigned SelectWidth = cf_math_pkg::idx_width(2); - typedef logic [SelectWidth-1:0] select_t; - select_t slave_select; - select_t slave_select_coll_op; + + select_t slave_select, slave_select_coll_op; + reqrsp_demux #( .NrPorts (2), .req_t (dreq_t), @@ -629,21 +632,6 @@ module snitch_cc #( .mst_rsp_i ({data_tcdm_rsp, data_rsp_i}) ); - // If we want to support collective operation (MCasst + Reduction) then all coll op request - // needs to be passed to the SoC independent of the address map. The problem is that a multicast - // which targets its own address space needs to be forwarded to the AXI crossbar so that the - // rest of the SoC can be notified about the multicast too (Same goes for Reduction)! - // If the .collect subfield is set to 0 we have a unicast - everything else is a collective - // operation! - if (ReRouteCollectiveOp) begin - // Reconstruct the multicast mask from the user field - addr_t mcast_mask; - assign mcast_mask = addr_t'((merged_dreq.q.user >> CollectiveWidth) & ((1 << AddrWidth) - 1)); - assign slave_select_coll_op = (mcast_mask != 0) ? '0 : slave_select; - end else begin - assign slave_select_coll_op = slave_select; - end - typedef struct packed { int unsigned idx; logic [AddrWidth-1:0] base; @@ -679,6 +667,20 @@ module snitch_cc #( .default_idx_i ('0) ); + // Collective communication operations are performed within the interconnect at the SoC + // level. However, requests destined to the TCDM never arrive at the SoC interconnect, + // as they are routed internally within the cluster. In order for collectives destined to + // the TCDM to work, we need to handle them differently, and always forward them to the + // SoC interconnect, which will reroute them back to the TCDM from outside the cluster. + if (ReRouteCollectiveOp) begin + // We use the collective mask, in the user field, to detect collective operations. + addr_t collective_mask; + assign collective_mask = addr_t'((merged_dreq.q.user >> CollectiveWidth) & ((1 << AddrWidth) - 1)); + assign slave_select_coll_op = (collective_mask != 0) ? 
SelectSoc : slave_select; + end else begin + assign slave_select_coll_op = slave_select; + end + tcdm_req_t core_tcdm_req; tcdm_rsp_t core_tcdm_rsp; From a922c02a18cf35b58d8b6c8c954e5cffcdba3cc4 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 15:00:12 +0200 Subject: [PATCH 16/38] hw: Follow generate block label naming convention --- hw/snitch_cluster/src/snitch_cluster.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index f28778000..8ec0ad258 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -1291,7 +1291,7 @@ module snitch_cluster // Atomic ID, needs to be unique ID of cluster // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters) - if (EnableMulticast) begin : AssignUserWithMCast + if (EnableMulticast) begin : gen_user_w_collective assign mcast_mask = addr_t'((core_to_axi_req.q.user >> CollectiveWidth) & ((1 << PhysicalAddrWidth) - 1)); assign collective_type = coll_type_t'(core_to_axi_req.q.user & ((1 << CollectiveWidth) - 1)); assign cluster_user = '{ @@ -1300,7 +1300,7 @@ module snitch_cluster atomic: (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1, default: '0 }; - end else begin : AssignUsesrWithoutMCast + end else begin : gen_user_wo_collective assign cluster_user = '{ atomic: (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1, default: '0 From a2fc663dfab1cdb0d334e5a5c3aabdbf683197b1 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 15:27:22 +0200 Subject: [PATCH 17/38] hw: Rename user fields and make code less verbose --- hw/snitch_cluster/src/snitch_cluster.sv | 26 +++++++++---------- .../src/snitch_cluster_pkg.sv.tpl | 12 ++++----- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 8ec0ad258..15bca9f83 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -440,7 +440,8 @@ module snitch_cluster typedef logic [TCDMMemAddrWidth-1:0] tcdm_mem_addr_t; typedef logic [TCDMAddrWidth-1:0] tcdm_addr_t; - typedef logic [CollectiveWidth-1:0] coll_type_t; + typedef logic [CollectiveWidth-1:0] collective_op_t; + typedef logic [AtomicIdWidth-1:0] atomic_id_t; // Struct replaced by logic array to workaround Questa optimization bug. 
// typedef struct packed { @@ -1284,25 +1285,24 @@ module snitch_cluster ); // User field for the AXI transmission - // We encode Atomics operation and (if enabled) collective operations - user_narrow_t cluster_user; - addr_t mcast_mask; - coll_type_t collective_type; + // Encodes the atomic ID and (if enabled) collective operation information + atomic_id_t atomic_id; + user_narrow_t cluster_user; // Atomic ID, needs to be unique ID of cluster // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters) - if (EnableMulticast) begin : gen_user_w_collective - assign mcast_mask = addr_t'((core_to_axi_req.q.user >> CollectiveWidth) & ((1 << PhysicalAddrWidth) - 1)); - assign collective_type = coll_type_t'(core_to_axi_req.q.user & ((1 << CollectiveWidth) - 1)); + assign atomic_id = (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1; + + if (EnableMulticast) begin : gen_user assign cluster_user = '{ - mcast: mcast_mask, - collective: collective_type, - atomic: (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1, + collective_mask: addr_t'(core_to_axi_req.q.user[CollectiveWidth+:PhysicalAddrWidth]), + collective_op: collective_op_t'(core_to_axi_req.q.user[0+:CollectiveWidth]), + atomic_id: atomic_id, default: '0 }; - end else begin : gen_user_wo_collective + end else begin : gen_user_no_collectives assign cluster_user = '{ - atomic: (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1, + atomic_id: atomic_id, default: '0 }; end diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl index 228457feb..edfcfb7c4 100644 --- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -102,21 +102,21 @@ package ${cfg['cluster']['name']}_pkg; // on the configuration % if cfg['cluster']['enable_multicast']: typedef struct packed { - addr_t mcast; - logic [CollectiveWidth-1:0] collective; - logic [AtomicIdWidth-1:0] atomic; + addr_t collective_mask; + logic [CollectiveWidth-1:0] collective_op; + logic [AtomicIdWidth-1:0] atomic_id; } user_narrow_t; %else: typedef struct packed { - logic [AtomicIdWidth-1:0] atomic; + logic [AtomicIdWidth-1:0] atomic_id; } user_narrow_t; %endif // Will be extended when implementing collective operation on the wide dma link % if cfg['cluster']['enable_dma_multicast']: typedef struct packed { - addr_t mcast; - logic [CollectiveWidth-1:0] collective; + addr_t collective_mask; + logic [CollectiveWidth-1:0] collective_op; } user_dma_t; %else: typedef logic [WideUserWidth-1:0] user_dma_t; From 60a8d6e98caa34ac92f1875da855ab881c0ff2c9 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 15:53:29 +0200 Subject: [PATCH 18/38] sw: Improve narrow collectives API --- sw/snRuntime/api/sync_decls.h | 31 +++++++++ sw/snRuntime/src/sync.h | 63 ++++++++++--------- .../runtime/common/snitch_cluster_cfg.h.tpl | 22 ------- 3 files changed, 66 insertions(+), 50 deletions(-) diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h index 40224cb26..d25b75420 100644 --- a/sw/snRuntime/api/sync_decls.h +++ b/sw/snRuntime/api/sync_decls.h @@ -11,6 +11,37 @@ typedef struct { uint32_t volatile iteration; } snrt_barrier_t; +typedef enum { + SNRT_REDUCTION_NONE = 0, + SNRT_REDUCTION_BARRIER = 2, + SNRT_REDUCTION_FADD = 4, + SNRT_REDUCTION_FMUL = 5, + SNRT_REDUCTION_FMIN = 6, + SNRT_REDUCTION_FMAX = 7, + SNRT_REDUCTION_ADD = 8, + SNRT_REDUCTION_MUL = 9, + SNRT_REDUCTION_MIN = 10, + SNRT_REDUCTION_MAX = 11, + SNRT_REDUCTION_MINU 
= 14, + SNRT_REDUCTION_MAXU = 15 +} snrt_reduction_opcode_t; + +typedef enum { + SNRT_COLLECTIVE_UNICAST = 0, + SNRT_COLLECTIVE_MULTICAST = 1, + SNRT_COLLECTIVE_PARALLEL_REDUCTION = 2, + SNRT_COLLECTIVE_OFFLOAD_REDUCTION = 3 +} snrt_collective_opcode_t; + +typedef union { + struct __attribute__((__packed__)) { + snrt_reduction_opcode_t reduction_opcode : SNRT_COLLECTIVE_WIDTH; + snrt_collective_opcode_t collective_opcode : 2; + uint64_t mask : (64 - SNRT_COLLECTIVE_WIDTH - 2); + } f; + uint64_t w; +} snrt_collective_op_t; + extern volatile uint32_t _snrt_mutex; extern volatile snrt_barrier_t _snrt_barrier; extern volatile uint32_t _reduction_result; diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index c50e7e4a7..affdf4929 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -152,8 +152,8 @@ inline void snrt_inter_cluster_barrier() { *((uint32_t *) addr) = 1; snrt_disable_reduction(); - // fence to wait until the reduction is finished - fence(); + // Fence to wait until the reduction is finished + snrt_fence(); } #else // Only continue with dma core's - send the rest into sleep mode @@ -351,19 +351,17 @@ inline void snrt_wait_writeback(uint32_t val) { * @param mask Multicast mask value */ inline void snrt_enable_multicast(uint64_t mask){ - uint32_t user_high = (uint32_t) (mask >> (32 - SNRT_COLLECTIVE_WIDTH)); - uint32_t user_low = (uint32_t) ((mask << SNRT_COLLECTIVE_WIDTH) | SNRT_COLL_MULTICAST); - write_csr(0x7c4, user_high); - write_csr(0x7c5, user_low); + snrt_collective_op_t op = { + .f.collective_opcode = SNRT_COLLECTIVE_MULTICAST, + .f.mask = mask, + }; + snrt_set_awuser(op.w); } /** * @brief Disable LSU multicast */ -inline void snrt_disable_multicast() { - write_csr(0x7c4, 0); - write_csr(0x7c5, 0); -} +inline void snrt_disable_multicast() { snrt_set_awuser(0); } //================================================================================ // Reduction functions @@ -374,37 +372,46 @@ inline void snrt_disable_multicast() { * @details All stores performed after this call will be reductions * * @param mask Mask defines all involved members - * @param reduction Type of reduction operation + * @param opcode Type of reduction operation */ -inline void snrt_enable_reduction(uint64_t mask, uint32_t reduction) { - uint32_t user_high = (uint32_t) (mask >> (32 - SNRT_COLLECTIVE_WIDTH)); - uint32_t user_low = (uint32_t) ((mask << SNRT_COLLECTIVE_WIDTH) | reduction); - write_csr(0x7c4, user_high); - write_csr(0x7c5, user_low); +inline void snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) { + snrt_collective_opcode_t coll_opcode; + + switch (opcode) { + case SNRT_COLL_NARROW_BARRIER: + coll_opcode = SNRT_COLLECTIVE_PARALLEL_REDUCTION; + break; + default: + coll_opcode = SNRT_COLLECTIVE_OFFLOAD_REDUCTION; + break; + } + + snrt_collective_op_t op = { + .f.reduction_opcode = opcode, + .f.collective_opcode = coll_opcode, + .f.mask = mask, + }; + snrt_set_awuser(op.w); } /** * @brief Disable LSU reduction */ -inline void snrt_disable_reduction() { - write_csr(0x7c4, 0); - write_csr(0x7c5, 0); -} +inline void snrt_disable_reduction() { snrt_set_awuser(0); } //================================================================================ // User functions //================================================================================ /** - * @brief Enable LSU user field - * @details All stores performed after this call equiped with given user field + * @brief Enable LSU AW user field + * @details All stores performed after this call are 
equipped with the given AW user field * - * @param field Defines the user field for the AXI transmission + * @param field Defines the AW user field for the AXI transfer */ - -inline void snrt_set_user_field(uint64_t field){ - uint32_t user_high = (uint32_t) (field >> 32); +inline void snrt_set_awuser(uint64_t field){ uint32_t user_low = (uint32_t) (field); - write_csr(0x7c4, user_high); - write_csr(0x7c5, user_low); + uint32_t user_high = (uint32_t) (field >> 32); + write_csr(0x7c4, user_low); + write_csr(0x7c5, user_high); } \ No newline at end of file diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl index 566e9832b..81f96ef45 100644 --- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl +++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl @@ -30,27 +30,5 @@ % endif #define SNRT_COLLECTIVE_WIDTH ${cfg['cluster']['collective_width']} -// OP Codes copied from floo_pkg definition -#define SNRT_COLL_MULTICAST 16 -#define SNRT_COLL_PARALLEL_REDUCTION 32 -#define SNRT_COLL_OFFLOAD_REDUCTION 48 - -// On the wide link - fp offload -#define SNRT_COLL_WIDE_FPADD (4 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_WIDE_FPMUL (5 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_WIDE_FPMIN (6 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_WIDE_FPMAX (7 + SNRT_COLL_OFFLOAD_REDUCTION) - -// On the narrow link - integer offload -#define SNRT_COLL_NARROW_ADD (8 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_NARROW_MUL (9 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_NARROW_MIN_INT (10 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_NARROW_MIN_UINT (14 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_NARROW_MAX_INT (11 + SNRT_COLL_OFFLOAD_REDUCTION) -#define SNRT_COLL_NARROW_MAX_UINT (15 + SNRT_COLL_OFFLOAD_REDUCTION) - -// On the narrow link - parallel reduction -#define SNRT_COLL_NARROW_BARRIER (2 + SNRT_COLL_PARALLEL_REDUCTION) - // Software configuration #define SNRT_LOG2_STACK_SIZE 10 From 2ebb2417d461242e0c1b4935a6f83bc39469750c Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 16:33:23 +0200 Subject: [PATCH 19/38] sw: Improve wide collectives API --- sw/snRuntime/src/dma.h | 146 +++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 55 deletions(-) diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 5c1d4deae..b34cc3738 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -62,85 +62,120 @@ static inline uint32_t snrt_dma_start_1d(volatile void *dst, volatile void *src, return snrt_dma_start_1d((uint64_t)dst, (uint64_t)src, size, channel); } +/** + * @brief Set AW user field of the DMA's AXI interface + * @details All DMA transfers performed after this call are equipped with the given AW user field + * + * @param field Defines the AW user field for the AXI transfer + */ +inline void snrt_dma_set_awuser(uint64_t field) { + uint32_t user_low = (uint32_t) (field); + uint32_t user_high = (uint32_t) (field >> 32); + asm volatile("dmuser %[user_low], %[user_high] \n" : : + [ user_low ] "r"(user_low), [ user_high ] "r"(user_high)); +} + +/** + * @brief Enable multicast for successive transfers + * @details All transfers performed after this call will be multicast to all + * addresses specified by the address and mask pair. 
+ * + * @param mask Multicast mask value + */ + inline void snrt_dma_enable_multicast(uint64_t mask) { + snrt_collective_op_t op = { + .f.collective_opcode = SNRT_COLLECTIVE_MULTICAST, + .f.mask = mask, + }; + snrt_dma_set_awuser(op.w); +} /** - * @brief Enable collective operations for successive transfers. - * @param coll_mask The mask for the collective operation - * @param coll_op operation type - * @details The next dma transfer will be a collective operation + * @brief Enable reduction operations for successive transfers + * @details All transfers performed after this call will be part of a reduction + * involving all masters identified by the mask. + * + * @param mask Mask defines all involved members + * @param opcode Type of reduction operation */ -inline void snrt_dma_enable_collective(uint64_t coll_mask, uint32_t coll_op) { - // Prepare the collective components - uint32_t mask_coll_op = (1 << SNRT_COLLECTIVE_WIDTH) - 1; - uint32_t usr_lo = (((uint32_t) (coll_mask << SNRT_COLLECTIVE_WIDTH)) | (coll_op & mask_coll_op)); - uint32_t usr_hi = (((uint32_t) (coll_mask >> (32-SNRT_COLLECTIVE_WIDTH))) & mask_coll_op); - // Set the user field - asm volatile("dmuser %[usr_lo], %[usr_hi] \n" : : - [ usr_lo ] "r"(usr_lo), [ usr_hi ] "r"(usr_hi)); +inline void snrt_dma_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) { + snrt_collective_op_t op = { + .f.reduction_opcode = opcode, + .f.collective_opcode = SNRT_COLLECTIVE_OFFLOAD_REDUCTION, + .f.mask = mask, + }; + snrt_dma_set_awuser(op.w); } /** - * @brief Disable multicast for successive transfers. - * @details Resets the collective component to zero. + * @brief Disable multicast for successive transfers + * @details Successive DMA transfers will be unicast transfers */ -inline void snrt_dma_disable_collective() { + inline void snrt_dma_disable_multicast() { asm volatile("dmuser zero, zero \n"); } /** - * @brief Start an asynchronous collective 1D DMA transfer with 64-bit wide + * @brief Disable reduction operations for successive transfers + * @details Successive DMA transfers will be unicast transfers + */ +inline void snrt_dma_disable_reduction() { + asm volatile("dmuser zero, zero \n"); +} + +/** + * @brief Start an asynchronous reduction 1D DMA transfer with 64-bit wide * pointers. - * @param coll_mask The mask for the collective operation - * @param coll_op operation type + * @param mask Mask defines all involved members + * @param opcode Reduction operation * @see snrt_dma_start_1d(uint64_t, uint64_t, size_t, uint32_t) for a * description of the other parameters. */ -static inline uint32_t snrt_dma_start_1d_collective(uint64_t dst, uint64_t src, - size_t size, uint64_t coll_mask, - uint32_t coll_op, - const uint32_t channel = 0) { - snrt_dma_enable_collective(coll_mask, coll_op); +static inline uint32_t snrt_dma_start_1d_reduction(uint64_t dst, uint64_t src, + size_t size, uint64_t mask, + snrt_reduction_opcode_t opcode, + const uint32_t channel = 0) { + snrt_dma_enable_reduction(mask, opcode); uint32_t txid = snrt_dma_start_1d(dst, src, size, channel); - snrt_dma_disable_collective(); + snrt_dma_disable_reduction(); return txid; } /** * @brief Start an asynchronous multicast 1D DMA transfer with 64-bit wide * pointers. - * @param coll_mask The mask for the multicast operation - * @param coll_op operation type + * @param mask The mask for the multicast operation * @see snrt_dma_start_1d(uint64_t, uint64_t, size_t, uint32_t) for a * description of the other parameters. 
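 * The transfer is wrapped between snrt_dma_enable_multicast() and
 * snrt_dma_disable_multicast(), so the multicast configuration does not
 * leak into subsequent transfers.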
*/ static inline uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src, size_t size, uint64_t mask, const uint32_t channel = 0) { - snrt_dma_enable_collective(mask, SNRT_COLL_MULTICAST); + snrt_dma_enable_multicast(mask); uint32_t txid = snrt_dma_start_1d(dst, src, size, channel); - snrt_dma_disable_collective(); + snrt_dma_disable_multicast(); return txid; } /** - * @brief Start an asynchronous collective 1D DMA transfer using native-size + * @brief Start an asynchronous reduction 1D DMA transfer using native-size * pointers. * - * This is a convenience overload of snrt_dma_start_1d_collective() using `void*` + * This is a convenience overload of snrt_dma_start_1d_reduction() using `void*` * pointers. * - * @see snrt_dma_start_1d_collective(uint64_t, uint64_t, size_t, uint64_t, uint32_t, uint32_t) + * @see snrt_dma_start_1d_reduction(uint64_t, uint64_t, size_t, uint64_t, uint32_t, uint32_t) */ -static inline uint32_t snrt_dma_start_1d_collective(volatile void *dst, - volatile void *src, size_t size, - volatile void * coll_mask, - uint32_t coll_op, - const uint32_t channel = 0) { - return snrt_dma_start_1d_collective((uint64_t)dst, (uint64_t)src, size, (uint64_t)coll_mask, coll_op, channel); +static inline uint32_t snrt_dma_start_1d_reduction(volatile void *dst, + volatile void *src, size_t size, + uint64_t mask, + snrt_reduction_opcode_t opcode, + const uint32_t channel = 0) { + return snrt_dma_start_1d_reduction((uint64_t)dst, (uint64_t)src, size, (uint64_t)mask, opcode, channel); } /** - * @brief Start an asynchronous collective 1D DMA transfer using native-size + * @brief Start an asynchronous multicast 1D DMA transfer using native-size * pointers. * * This is a convenience overload of snrt_dma_start_1d_mcast() using `void*` @@ -150,9 +185,9 @@ static inline uint32_t snrt_dma_start_1d_collective(volatile void *dst, */ static inline uint32_t snrt_dma_start_1d_mcast(volatile void *dst, volatile void *src, size_t size, - volatile void * mask, + uint64_t mask, const uint32_t channel = 0) { - return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, (uint64_t)mask, channel); + return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, mask, channel); } /** @@ -331,16 +366,17 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(volatile void *dst, * @param tile_idx Index of the tile in the 1D array. * @param tile_size Number of elements within a tile of the 1D array. * @param prec Number of bytes of each element in the 1D array. - * @param mcast Multicast mask applied on the destination address. + * @param mask Multicast mask applied on the destination address. */ inline snrt_dma_txid_t snrt_dma_mcast_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec, - void *mcast) { + uint64_t mask) { size_t tile_nbytes = tile_size * prec; - return snrt_dma_start_1d_mcast(dst, (void*) ((char*) src + (tile_idx * tile_nbytes)), - tile_nbytes, mcast); + return snrt_dma_start_1d_mcast((uintptr_t)dst, + (uintptr_t)src + tile_idx * tile_nbytes, + tile_nbytes, mask); } /** @@ -350,19 +386,19 @@ inline snrt_dma_txid_t snrt_dma_mcast_load_1d_tile(void *dst, void *src, * @param tile_idx Index of the tile in the 1D array. * @param tile_size Number of elements within a tile of the 1D array. * @param prec Number of bytes of each element in the 1D array. - * @param coll_mask Multicast mask for collective operation applied on the destination address. - * @param coll_op Type of operation (Should only work for multicast) + * @param mask Mask for reduction operation. 
+ * @param opcode Reduction operation. */ -inline snrt_dma_txid_t snrt_dma_collective_load_1d_tile(void *dst, void *src, - size_t tile_idx, - size_t tile_size, - uint32_t prec, - void *coll_mask, - uint32_t coll_op) { +inline snrt_dma_txid_t snrt_dma_reduction_load_1d_tile(void *dst, void *src, + size_t tile_idx, + size_t tile_size, + uint32_t prec, + uint64_t mask, + snrt_collective_opcode_t opcode) { size_t tile_nbytes = tile_size * prec; - return snrt_dma_start_1d_collective((uintptr_t)dst, - (uintptr_t)src + tile_idx * tile_nbytes, - tile_nbytes, (uintptr_t)coll_mask, coll_op); + return snrt_dma_start_1d_reduction((uintptr_t)dst, + (uintptr_t)src + tile_idx * tile_nbytes, + tile_nbytes, mask, opcode); } /** From 280bea7a6f217cf3c1fe7a141ec643085e46887a Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 16:55:52 +0200 Subject: [PATCH 20/38] hw: Derive *UserWidth parameters from user_*_t parameters --- hw/snitch_cluster/src/snitch_cluster.sv | 11 +- .../src/snitch_cluster_pkg.sv.tpl | 4 +- .../src/snitch_cluster_wrapper.sv.tpl | 2 - target/snitch_cluster/cfg/default.json | 14 +- target/snitch_cluster/cfg/dma_mchan.json | 6 +- target/snitch_cluster/cfg/frep_xl.json | 6 +- target/snitch_cluster/cfg/frep_xs.json | 6 +- target/snitch_cluster/cfg/github-ci.json | 9 +- target/snitch_cluster/cfg/omega.json | 6 +- target/snitch_cluster/cfg/reduction.hjson | 153 ------------------ .../schema/snitch_cluster.schema.json | 14 +- 11 files changed, 25 insertions(+), 206 deletions(-) delete mode 100644 target/snitch_cluster/cfg/reduction.hjson diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 15bca9f83..aca0a8b08 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -34,10 +34,6 @@ module snitch_cluster parameter int unsigned NarrowIdWidthIn = 2, /// AXI: dma id width in. parameter int unsigned WideIdWidthIn = 2, - /// AXI: user width. - parameter int unsigned NarrowUserWidth = 1, - /// AXI: dma user width. - parameter int unsigned WideUserWidth = 1, /// Width of the atomic ID to be used in a system. parameter int unsigned AtomicIdWidth = 1, /// Width of the collective operation @@ -317,6 +313,10 @@ module snitch_cluster localparam int unsigned NumTCDMIn = NrTCDMPortsCores + 1; localparam logic [PhysicalAddrWidth-1:0] TCDMMask = ~(TCDMSizeNapotRounded - 1); + // User widths + localparam int unsigned NarrowUserWidth = $bits(user_narrow_t); + localparam int unsigned WideUserWidth = $bits(user_dma_t); + // Core Requests, SoC Request, PTW. localparam int unsigned NrNarrowMasters = 3; localparam int unsigned NarrowIdWidthOut = $clog2(NrNarrowMasters) + NarrowIdWidthIn; @@ -1706,8 +1706,5 @@ module snitch_cluster ~AliasRegionEnable || ((TCDMSizeNapotRounded - 1) & AliasRegionBase) == 0) // Make sure we only have one DMA in the system. 
`ASSERT_INIT(NumberDMA, $onehot0(Xdma)) - // Verify that the size of the user field matches - `ASSERT_INIT(CheckNarrowUserFieldWidth, NarrowUserWidth == $bits(user_narrow_t)); - `ASSERT_INIT(CheckWideUserFieldWidth, WideUserWidth == $bits(user_dma_t)); endmodule diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl index edfcfb7c4..ac8794920 100644 --- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -63,8 +63,6 @@ package ${cfg['cluster']['name']}_pkg; localparam int unsigned EnableMulticast = ${int(cfg['cluster']['enable_multicast'])}; localparam int unsigned ReRouteCollectiveOp = ${int(cfg['cluster']['enable_reroute_collective'])}; - localparam int unsigned NarrowUserWidth = ${cfg['cluster']['user_width']}; - localparam int unsigned WideUserWidth = ${cfg['cluster']['dma_user_width']}; localparam int unsigned AtomicIdWidth = ${cfg['cluster']['atomic_id_width']}; localparam int unsigned CollectiveWidth = ${cfg['cluster']['collective_width']}; @@ -119,7 +117,7 @@ package ${cfg['cluster']['name']}_pkg; logic [CollectiveWidth-1:0] collective_op; } user_dma_t; %else: - typedef logic [WideUserWidth-1:0] user_dma_t; + typedef logic user_dma_t; %endif `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_narrow_t) diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 79aeab8fe..6b5c285f2 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -72,8 +72,6 @@ module ${cfg['cluster']['name']}_wrapper ( .WideDataWidth (${cfg['cluster']['dma_data_width']}), .NarrowIdWidthIn (${cfg['cluster']['name']}_pkg::NarrowIdWidthIn), .WideIdWidthIn (${cfg['cluster']['name']}_pkg::WideIdWidthIn), - .NarrowUserWidth (${cfg['cluster']['name']}_pkg::NarrowUserWidth), - .WideUserWidth (${cfg['cluster']['name']}_pkg::WideUserWidth), .AtomicIdWidth (${cfg['cluster']['name']}_pkg::AtomicIdWidth), .CollectiveWidth (${cfg['cluster']['name']}_pkg::CollectiveWidth), .BootAddr (${to_sv_hex(cfg['cluster']['boot_addr'], 32)}), diff --git a/target/snitch_cluster/cfg/default.json b/target/snitch_cluster/cfg/default.json index d4dd79301..eb64d23ec 100644 --- a/target/snitch_cluster/cfg/default.json +++ b/target/snitch_cluster/cfg/default.json @@ -9,8 +9,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, - atomic_id_width: 5, // clog2(total number of clusters) - user_width: 5, // atomic_id_width + 6 + addr_width if we enable_multicast (not enable_dma_multicast) + atomic_id_width: 5, tcdm: { size: 128, banks: 32, @@ -23,14 +22,9 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - // Collective operation on narrow interface - enable_multicast: false, - enable_reduction: false, - // To support the multicast for the DMA - enable_dma_multicast: false, - dma_user_width: 1, // 6 + addr_width if we enable_dma_multicast (not enable_multicast) - // Only enable if collective operation are handled in SoC - enable_reroute_collective: false, + enable_multicast: true, + enable_reduction: true, + enable_dma_multicast: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/dma_mchan.json b/target/snitch_cluster/cfg/dma_mchan.json index 01ee47cd0..e6751f090 100644 --- a/target/snitch_cluster/cfg/dma_mchan.json +++ 
b/target/snitch_cluster/cfg/dma_mchan.json @@ -9,8 +9,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, - atomic_id_width: 5, // clog2(total number of clusters) - user_width: 53, // addr_width + atomic_id_width + atomic_id_width: 5, tcdm: { size: 128, banks: 32, @@ -24,7 +23,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - dma_user_width: 48, + enable_multicast: true, + enable_reduction: true, enable_dma_multicast: true, // We don't need Snitch debugging in Occamy enable_debug: false, diff --git a/target/snitch_cluster/cfg/frep_xl.json b/target/snitch_cluster/cfg/frep_xl.json index 98bf3f666..8098355ec 100644 --- a/target/snitch_cluster/cfg/frep_xl.json +++ b/target/snitch_cluster/cfg/frep_xl.json @@ -9,8 +9,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, - atomic_id_width: 5, // clog2(total number of clusters) - user_width: 53, // addr_width + atomic_id_width + atomic_id_width: 5, tcdm: { size: 128, banks: 32, @@ -23,8 +22,9 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - dma_user_width: 48, enable_multicast: true, + enable_reduction: true, + enable_dma_multicast: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/frep_xs.json b/target/snitch_cluster/cfg/frep_xs.json index a9f70656c..a9b99b04b 100644 --- a/target/snitch_cluster/cfg/frep_xs.json +++ b/target/snitch_cluster/cfg/frep_xs.json @@ -9,7 +9,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, - user_width: 5, // clog2(total number of clusters) + atomic_id_width: 5, tcdm: { size: 128, banks: 32, @@ -22,7 +22,9 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - dma_user_width: 1, + enable_multicast: true, + enable_reduction: true, + enable_dma_multicast: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/github-ci.json b/target/snitch_cluster/cfg/github-ci.json index 496d79a53..f8bfccb28 100644 --- a/target/snitch_cluster/cfg/github-ci.json +++ b/target/snitch_cluster/cfg/github-ci.json @@ -9,9 +9,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, - atomic_id_width: 5, // clog2(total number of clusters) - collective_width: 6, // # bits used for the collective operation - user_width: 59, // addr_width + atomic_id_width + collective_width + atomic_id_width: 5, tcdm: { size: 128, banks: 32, @@ -24,14 +22,9 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - // Collective operation on narrow interface enable_multicast: true, enable_reduction: true, - // To support the multicast for the DMA enable_dma_multicast: true, - dma_user_width: 54, // 6 + addr_width if we enable_dma_multicast (not enable_multicast) - // Only enable if collective operation are handled in SoC - enable_reroute_collective: false, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/omega.json b/target/snitch_cluster/cfg/omega.json index a2641a2fe..c3a949a1c 100644 --- a/target/snitch_cluster/cfg/omega.json +++ b/target/snitch_cluster/cfg/omega.json @@ -9,8 +9,7 @@ cluster_base_hartid: 0, addr_width: 48, data_width: 64, - atomic_id_width: 5, // clog2(total number of clusters) - user_width: 53, // addr_width + atomic_id_width + atomic_id_width: 5, tcdm: { size: 128, banks: 32, @@ -24,8 +23,9 @@ dma_req_fifo_depth: 8, narrow_trans: 
4, wide_trans: 32, - dma_user_width: 48, enable_multicast: true, + enable_reduction: true, + enable_dma_multicast: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/reduction.hjson b/target/snitch_cluster/cfg/reduction.hjson deleted file mode 100644 index 729ba89d2..000000000 --- a/target/snitch_cluster/cfg/reduction.hjson +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Cluster configuration for a simple testbench system. -{ - cluster: { - cluster_base_addr: 0x10000000, - cluster_base_offset: 0, - cluster_base_hartid: 0, - addr_width: 48, - data_width: 64, - atomic_id_width: 5, // clog2(total number of clusters) - collective_width: 6, // # bits used for the collective operation - user_width: 59, // addr_width + atomic_id_width + collective_width - tcdm: { - size: 128, - banks: 32, - }, - cluster_periph_size: 64, // kB - zero_mem_size: 64, // kB - alias_region_enable: false, - dma_data_width: 512, - dma_axi_req_fifo_depth: 24, - dma_req_fifo_depth: 8, - narrow_trans: 4, - wide_trans: 32, - dma_user_width: 54, - // Enable the multicast and the reduction feature in the sw (Only Narrow IF - Not DMA calls) - enable_multicast: true, - enable_reduction: true, - // To support the multicast for the DMA - enable_dma_multicast: true, - // Currently we need to reroute collective operation to the SoC IF independent of the dst address - enable_reroute_collective: true, - // We don't need Snitch debugging in Occamy - enable_debug: false, - // We don't need Snitch (core-internal) virtual memory support - vm_support: false, - // Memory configuration inputs - sram_cfg_expose: true, - sram_cfg_fields: { - ema: 3, - emaw: 2, - emas: 1 - }, - // Timing parameters - timing: { - lat_comp_fp32: 2, - lat_comp_fp64: 3, - lat_comp_fp16: 1, - lat_comp_fp16_alt: 1, - lat_comp_fp8: 1, - lat_comp_fp8_alt: 1, - lat_noncomp: 1, - lat_conv: 2, - lat_sdotp: 3, - fpu_pipe_config: "BEFORE", - narrow_xbar_latency: "CUT_ALL_PORTS", - wide_xbar_latency: "CUT_ALL_PORTS", - // Isolate the core. - register_core_req: true, - register_core_rsp: true, - register_offload_req: true, - register_offload_rsp: true, - register_fpu_req: true, - register_ext_narrow: false, - register_ext_wide: false - }, - hives: [ - // Hive 0 - { - icache: { - size: 8, // total instruction cache size in kByte - ways: 2, // number of ways - cacheline: 256 // word size in bits - }, - cores: [ - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/compute_core_template" }, - { $ref: "#/dma_core_template" }, - ] - } - ] - }, - external_addr_regions: [ - { - name: "dram", - address: 0x80000000, - length: 0x80000000, - cacheable: true - }, - { - name: "clint", - address: 0xFFFF0000, - length: 0x1000 - }, - ], - // Templates. 
- compute_core_template: { - isa: "rv32imafd", - xssr: true, - xfrep: true, - xdma: false, - xf16: true, - xf16alt: true, - xf8: true, - xf8alt: true, - xfdotp: true, - xfvec: true, - ssr_nr_credits: 4, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - // SSSR configuration below - ssr_intersection: true, - ssr_intersection_triple: [0, 1, 2], - ssrs: [ - {indirection: true}, // Master 0 - {indirection: true}, // Master 1 - {}, // Slave - ], - }, - dma_core_template: { - isa: "rv32imafd", - xdma: true, - xssr: false, - xfrep: false, - xf16: false, - xf16alt: false, - xf8: false, - xf8alt: false, - xfdotp: false, - xfvec: false, - num_int_outstanding_loads: 1, - num_int_outstanding_mem: 4, - num_fp_outstanding_loads: 4, - num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, - num_dtlb_entries: 1, - num_itlb_entries: 1, - } -} diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json index 372047cba..281823703 100644 --- a/util/clustergen/schema/snitch_cluster.schema.json +++ b/util/clustergen/schema/snitch_cluster.schema.json @@ -161,24 +161,14 @@ "description": "The number of separate DMA channels to instantiate.", "default": 1 }, - "user_width": { - "type": "number", - "description": "User width of the narrower AXI plug into the cluster.", - "default": 1 - }, - "dma_user_width": { - "type": "number", - "description": "User width of the wide AXI plug into the cluster.", - "default": 1 - }, "atomic_id_width": { "type": "number", - "description": "Width of the cluster's atomics ID.", + "description": "Width of the cluster's atomic ID.", "default": 1 }, "collective_width": { "type": "number", - "description": "Width of the collective operation field", + "description": "Width of the collective operation field.", "default": 6 }, "enable_multicast": { From c7550c0bc2d540c25b2ee03888b11d590f8e8fea Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 17:12:33 +0200 Subject: [PATCH 21/38] treewide: Remove reroute parameter and rename enable_multicast parameters --- hw/snitch_cluster/src/snitch_cc.sv | 18 +++++--------- hw/snitch_cluster/src/snitch_cluster.sv | 24 +++++++------------ .../src/snitch_cluster_pkg.sv.tpl | 5 ++-- .../src/snitch_cluster_wrapper.sv.tpl | 5 ++-- sw/snRuntime/src/sync.h | 4 ++-- .../runtime/common/snitch_cluster_cfg.h.tpl | 8 +++---- .../schema/snitch_cluster.schema.json | 18 ++++---------- 7 files changed, 28 insertions(+), 54 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 21ab2b00e..5844e9f10 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -66,16 +66,14 @@ module snitch_cc #( parameter bit Xfrep = 1, /// Has `SSR` support. parameter bit Xssr = 1, - /// Reroute collective Operation (Multicast + Reduction) to the AXI Crossbar anyway! - parameter bit ReRouteCollectiveOp = 0, - /// Size of the collectiv width - parameter int unsigned CollectiveWidth = 1, /// Has `COPIFT` support. parameter bit Xcopift = 1, /// Has `IPU` support. parameter bit Xipu = 1, /// Has virtual memory support. 
parameter bit VMSupport = 1, + /// Width of the collective operation field + parameter int unsigned CollectiveWidth = 1, parameter int unsigned NumIntOutstandingLoads = 0, parameter int unsigned NumIntOutstandingMem = 0, parameter int unsigned NumFPOutstandingLoads = 0, @@ -672,14 +670,10 @@ module snitch_cc #( // as they are routed internally within the cluster. In order for collectives destined to // the TCDM to work, we need to handle them differently, and always forward them to the // SoC interconnect, which will reroute them back to the TCDM from outside the cluster. - if (ReRouteCollectiveOp) begin - // We use the collective mask, in the user field, to detect collective operations. - addr_t collective_mask; - assign collective_mask = addr_t'((merged_dreq.q.user >> CollectiveWidth) & ((1 << AddrWidth) - 1)); - assign slave_select_coll_op = (collective_mask != 0) ? SelectSoc : slave_select; - end else begin - assign slave_select_coll_op = slave_select; - end + // The collective mask, in the user field, is used to detect collective operations. + addr_t collective_mask; + assign collective_mask = addr_t'(merged_dreq.q.user[CollectiveWidth+:AddrWidth]); + assign slave_select_coll_op = (collective_mask != 0) ? SelectSoc : slave_select; tcdm_req_t core_tcdm_req; tcdm_rsp_t core_tcdm_rsp; diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index aca0a8b08..026c86de9 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -76,13 +76,10 @@ module snitch_cluster parameter int unsigned ICacheWays [NrHives] = '{default: 0}, /// Enable virtual memory support. parameter bit VMSupport = 1, - /// Enable multicast on DMA XBAR. - parameter bit EnableDmaMulticast = 0, - /// Enable multicast on the Narrow XBAR - parameter bit EnableMulticast = 0, - /// Cluster will forward any collective operation request to the SoC - /// independent of the address range. The SoC has to handle MCast loopbacks - parameter bit ReRouteCollectiveOp = 0, + /// Enable wide collective operations. + parameter bit EnableWideCollectives = 0, + /// Enable narrow collective operations. + parameter bit EnableNarrowCollectives = 0, /// Per-core enabling of the standard `E` ISA reduced-register extension. parameter bit [NrCores-1:0] RVE = '0, /// Per-core enabling of the standard `F` ISA extensions. @@ -722,7 +719,7 @@ module snitch_cluster end end - if (EnableDmaMulticast) begin : gen_mcast_dma_xbar + if (EnableWideCollectives) begin : gen_mcast_dma_xbar // Define the collective connectivity matrix! typedef bit [DmaMcastXbarCfg.NoMstPorts-1:0] dma_line_t; @@ -730,8 +727,7 @@ module snitch_cluster // If we want to reroute collective operation the only available collective operation // port is the SoC port - localparam dma_line_t DMAlocalArray = (ReRouteCollectiveOp) ? 
- dma_line_t'{SoCDMAOut: 1'b1, default: 1'b0} : dma_line_t'{default: 1'b1}; + localparam dma_line_t DMAlocalArray = dma_line_t'{SoCDMAOut: 1'b1, default: 1'b0}; localparam dma_matrix_t DMACollectiveConnectivity = dma_matrix_t'{default: DMAlocalArray}; axi_mcast_xbar #( @@ -1120,7 +1116,6 @@ module snitch_cluster .DebugSupport (DebugSupport), .TCDMAliasEnable (AliasRegionEnable), .TCDMAliasStart (TCDMAliasStart), - .ReRouteCollectiveOp (ReRouteCollectiveOp), .CollectiveWidth (CollectiveWidth) ) i_snitch_cc ( .clk_i, @@ -1293,7 +1288,7 @@ module snitch_cluster // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters) assign atomic_id = (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1; - if (EnableMulticast) begin : gen_user + if (EnableNarrowCollectives) begin : gen_user assign cluster_user = '{ collective_mask: addr_t'(core_to_axi_req.q.user[CollectiveWidth+:PhysicalAddrWidth]), collective_op: collective_op_t'(core_to_axi_req.q.user[0+:CollectiveWidth]), @@ -1392,15 +1387,14 @@ module snitch_cluster assign cluster_xbar_default_port = '{default: SoC}; // Instance the narrow axi xbar - if (EnableMulticast) begin : gen_narrow_mcast_axi_crossbar + if (EnableNarrowCollectives) begin : gen_narrow_mcast_axi_crossbar // Define the collective connectivity matrix! typedef bit [ClusterMcastXbarCfg.NoMstPorts-1:0] cluster_line_t; typedef bit [ClusterMcastXbarCfg.NoSlvPorts-1:0][ClusterMcastXbarCfg.NoMstPorts-1:0] cluster_matrix_t; // If we want to reroute collective operation the only available collective operation port is // the SoC port - localparam cluster_line_t ClusterlocalArray = (ReRouteCollectiveOp) ? - cluster_line_t'{SoC: 1'b1, default: 1'b0} : cluster_line_t'{default: 1'b1}; + localparam cluster_line_t ClusterlocalArray = cluster_line_t'{SoC: 1'b1, default: 1'b0}; localparam cluster_matrix_t ClusterCollectiveConnectivity = cluster_matrix_t'{default: ClusterlocalArray}; diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl index ac8794920..24fc54c9c 100644 --- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -59,9 +59,8 @@ package ${cfg['cluster']['name']}_pkg; localparam int unsigned WideIdWidthIn = ${cfg['cluster']['dma_id_width_in']}; localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; - localparam int unsigned EnableDmaMulticast = ${int(cfg['cluster']['enable_dma_multicast'])}; - localparam int unsigned EnableMulticast = ${int(cfg['cluster']['enable_multicast'])}; - localparam int unsigned ReRouteCollectiveOp = ${int(cfg['cluster']['enable_reroute_collective'])}; + localparam int unsigned EnableWideCollectives = ${int(cfg['cluster']['enable_wide_collectives'])}; + localparam int unsigned EnableNarrowCollectives = ${int(cfg['cluster']['enable_narrow_collectives'])}; localparam int unsigned AtomicIdWidth = ${cfg['cluster']['atomic_id_width']}; localparam int unsigned CollectiveWidth = ${cfg['cluster']['collective_width']}; diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index 6b5c285f2..b7f32217e 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -104,9 +104,8 @@ module ${cfg['cluster']['name']}_wrapper ( .ICacheLineCount (${cfg['cluster']['name']}_pkg::ICacheLineCount), .ICacheWays (${cfg['cluster']['name']}_pkg::ICacheWays), .VMSupport (${int(cfg['cluster']['vm_support'])}), - 
.EnableDmaMulticast (${cfg['cluster']['name']}_pkg::EnableDmaMulticast), - .EnableMulticast (${cfg['cluster']['name']}_pkg::EnableMulticast), - .ReRouteCollectiveOp (${cfg['cluster']['name']}_pkg::ReRouteCollectiveOp), + .EnableWideCollectives (${cfg['cluster']['name']}_pkg::EnableWideCollectives), + .EnableNarrowCollectives (${cfg['cluster']['name']}_pkg::EnableNarrowCollectives), .RVE (${core_isa('e')}), .RVF (${core_isa('f')}), .RVD (${core_isa('d')}), diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index affdf4929..f8b415e75 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -87,7 +87,7 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) { * in the non-multicast case even if it was not 100% necessary! */ inline void snrt_wake_all(uint32_t core_mask) { -#ifdef SNRT_SUPPORTS_MULTICAST +#ifdef SNRT_SUPPORTS_NARROW_MULTICAST // Multicast cluster interrupt to every other cluster's core // Note: we need to address another cluster's address space // because the cluster XBAR has not been extended to support @@ -135,7 +135,7 @@ inline void snrt_cluster_hw_barrier() { */ inline void snrt_inter_cluster_barrier() { -#ifdef SNRT_SUPPORTS_REDUCTION +#ifdef SNRT_SUPPORTS_NARROW_REDUCTION // Only continue with dma core's - send the rest into the next hw barrier if(snrt_is_dm_core()){ // fetch the address for the reduction diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl index 81f96ef45..ec9f5d054 100644 --- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl +++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl @@ -21,13 +21,11 @@ #define SNRT_CLUSTER_OFFSET ${cfg['cluster']['cluster_base_offset']} #define SNRT_NUM_SEQUENCER_LOOPS ${cfg['cluster']['hives'][0]['cores'][0]['num_sequencer_loops']} -% if cfg['cluster']['enable_multicast']: -#define SNRT_SUPPORTS_MULTICAST +% if cfg['cluster']['enable_narrow_collectives']: +#define SNRT_SUPPORTS_NARROW_MULTICAST +#define SNRT_SUPPORTS_NARROW_REDUCTION % endif -% if cfg['cluster']['enable_reduction']: -#define SNRT_SUPPORTS_REDUCTION -% endif #define SNRT_COLLECTIVE_WIDTH ${cfg['cluster']['collective_width']} // Software configuration diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json index 281823703..a63c9659a 100644 --- a/util/clustergen/schema/snitch_cluster.schema.json +++ b/util/clustergen/schema/snitch_cluster.schema.json @@ -171,24 +171,14 @@ "description": "Width of the collective operation field.", "default": 6 }, - "enable_multicast": { + "enable_narrow_collectives": { "type": "boolean", - "description": "Whether to enable multicast in the sw & hw for the cluster.", + "description": "Whether to enable narrow multicast and reduction support in the cluster.", "default": false }, - "enable_reduction": { + "enable_wide_collectives": { "type": "boolean", - "description": "Whether to enable multicast in the sw for the cluster.", - "default": false - }, - "enable_dma_multicast": { - "type": "boolean", - "description": "Whether to enable the multicast capable axi-crossbar in the snitch cluster", - "default": false - }, - "enable_reroute_collective": { - "type": "boolean", - "description": "Whether to reroute any collective operation request to the SoC port independent of the address", + "description": "Whether to enable wide multicast and reduction support in the cluster.", "default": false }, "hart_base_id": { From 
c105f7499f61911e1bbcfc2f7a7c56c8e8f680ac Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 18:14:36 +0200 Subject: [PATCH 22/38] hw: Fix compilation errors --- hw/snitch_cluster/src/snitch_cluster.sv | 2 +- hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 026c86de9..9ac444959 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -1383,7 +1383,7 @@ module snitch_cluster end // Set default master port for all multicast's crossbar input's - localparam bit [ClusterMcastXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = 1'b1; + localparam bit ClusterEnableDefaultMstPort = 1'b1; assign cluster_xbar_default_port = '{default: SoC}; // Instance the narrow axi xbar diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl index 24fc54c9c..620b37d1e 100644 --- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -119,6 +119,9 @@ package ${cfg['cluster']['name']}_pkg; typedef logic user_dma_t; %endif + localparam int unsigned NarrowUserWidth = $bits(user_narrow_t); + localparam int unsigned WideUserWidth = $bits(user_dma_t); + `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_narrow_t) `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_narrow_t) `AXI_TYPEDEF_ALL(wide_in, addr_t, wide_in_id_t, data_dma_t, strb_dma_t, user_dma_t) From 2d335b32d089f183239d16ea4dab656125fda698 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Thu, 10 Jul 2025 18:23:59 +0200 Subject: [PATCH 23/38] sw: Fix compilation errors --- sw/snRuntime/api/sync_decls.h | 2 +- sw/snRuntime/src/dma.h | 24 ++++++++-------- sw/snRuntime/src/riscv.h | 2 ++ sw/snRuntime/src/sync.h | 52 +++++++++++++++++------------------ 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h index d25b75420..e79c63c4f 100644 --- a/sw/snRuntime/api/sync_decls.h +++ b/sw/snRuntime/api/sync_decls.h @@ -66,7 +66,7 @@ inline void snrt_enable_multicast(uint64_t mask); inline void snrt_disable_multicast(); -inline void snrt_enable_reduction(uint64_t mask, uint32_t reduction); +inline void snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t reduction); inline void snrt_disable_reduction(); diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index b34cc3738..95ff2016d 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -83,10 +83,9 @@ inline void snrt_dma_set_awuser(uint64_t field) { * @param mask Multicast mask value */ inline void snrt_dma_enable_multicast(uint64_t mask) { - snrt_collective_op_t op = { - .f.collective_opcode = SNRT_COLLECTIVE_MULTICAST, - .f.mask = mask, - }; + snrt_collective_op_t op; + op.f.collective_opcode = SNRT_COLLECTIVE_MULTICAST; + op.f.mask = mask; snrt_dma_set_awuser(op.w); } @@ -99,11 +98,10 @@ inline void snrt_dma_set_awuser(uint64_t field) { * @param opcode Type of reduction operation */ inline void snrt_dma_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) { - snrt_collective_op_t op = { - .f.reduction_opcode = opcode, - .f.collective_opcode = SNRT_COLLECTIVE_OFFLOAD_REDUCTION, - .f.mask = mask, - }; + snrt_collective_op_t op; + op.f.reduction_opcode = opcode; + op.f.collective_opcode = SNRT_COLLECTIVE_OFFLOAD_REDUCTION; + op.f.mask = 
mask; snrt_dma_set_awuser(op.w); } @@ -112,7 +110,7 @@ inline void snrt_dma_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opc * @details Successive DMA transfers will be unicast transfers */ inline void snrt_dma_disable_multicast() { - asm volatile("dmuser zero, zero \n"); + snrt_dma_set_awuser(0); } /** @@ -120,7 +118,7 @@ inline void snrt_dma_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opc * @details Successive DMA transfers will be unicast transfers */ inline void snrt_dma_disable_reduction() { - asm volatile("dmuser zero, zero \n"); + snrt_dma_set_awuser(0); } /** @@ -171,7 +169,7 @@ static inline uint32_t snrt_dma_start_1d_reduction(volatile void *dst, uint64_t mask, snrt_reduction_opcode_t opcode, const uint32_t channel = 0) { - return snrt_dma_start_1d_reduction((uint64_t)dst, (uint64_t)src, size, (uint64_t)mask, opcode, channel); + return snrt_dma_start_1d_reduction((uint64_t)dst, (uint64_t)src, size, mask, opcode, channel); } /** @@ -394,7 +392,7 @@ inline snrt_dma_txid_t snrt_dma_reduction_load_1d_tile(void *dst, void *src, size_t tile_size, uint32_t prec, uint64_t mask, - snrt_collective_opcode_t opcode) { + snrt_reduction_opcode_t opcode) { size_t tile_nbytes = tile_size * prec; return snrt_dma_start_1d_reduction((uintptr_t)dst, (uintptr_t)src + tile_idx * tile_nbytes, diff --git a/sw/snRuntime/src/riscv.h b/sw/snRuntime/src/riscv.h index 5adb825bf..0dbe08b93 100644 --- a/sw/snRuntime/src/riscv.h +++ b/sw/snRuntime/src/riscv.h @@ -16,6 +16,8 @@ inline void snrt_wfi() { asm volatile("wfi"); } inline void snrt_nop() { asm volatile("nop" : : :); } +inline void snrt_fence() { asm volatile("fence" : : :); } + inline uint32_t snrt_mcycle() { uint32_t r; asm volatile("csrr %0, mcycle" : "=r"(r) : : "memory"); diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index f8b415e75..bcdd3f56a 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -339,6 +339,23 @@ inline void snrt_wait_writeback(uint32_t val) { asm volatile("mv %0, %0" : "+r"(val)::); } +//================================================================================ +// User functions +//================================================================================ + +/** + * @brief Enable LSU AW user field + * @details All stores performed after this call are equipped with the given AW user field + * + * @param field Defines the AW user field for the AXI transfer + */ + inline void snrt_set_awuser(uint64_t field){ + uint32_t user_low = (uint32_t) (field); + uint32_t user_high = (uint32_t) (field >> 32); + write_csr(0x7c4, user_low); + write_csr(0x7c5, user_high); +} + //================================================================================ // Multicast functions //================================================================================ @@ -351,10 +368,9 @@ inline void snrt_wait_writeback(uint32_t val) { * @param mask Multicast mask value */ inline void snrt_enable_multicast(uint64_t mask){ - snrt_collective_op_t op = { - .f.collective_opcode = SNRT_COLLECTIVE_MULTICAST, - .f.mask = mask, - }; + snrt_collective_op_t op; + op.f.collective_opcode = SNRT_COLLECTIVE_MULTICAST; + op.f.mask = mask; snrt_set_awuser(op.w); } @@ -378,7 +394,7 @@ inline void snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) snrt_collective_opcode_t coll_opcode; switch (opcode) { - case SNRT_COLL_NARROW_BARRIER: + case SNRT_REDUCTION_BARRIER: coll_opcode = SNRT_COLLECTIVE_PARALLEL_REDUCTION; break; default: @@ -386,11 +402,10 @@ inline void 
snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) break; } - snrt_collective_op_t op = { - .f.reduction_opcode = opcode, - .f.collective_opcode = coll_opcode, - .f.mask = mask, - }; + snrt_collective_op_t op; + op.f.reduction_opcode = opcode; + op.f.collective_opcode = coll_opcode; + op.f.mask = mask; snrt_set_awuser(op.w); } @@ -398,20 +413,3 @@ inline void snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) * @brief Disable LSU reduction */ inline void snrt_disable_reduction() { snrt_set_awuser(0); } - -//================================================================================ -// User functions -//================================================================================ - -/** - * @brief Enable LSU AW user field - * @details All stores performed after this call are equipped with the given AW user field - * - * @param field Defines the AW user field for the AXI transfer - */ -inline void snrt_set_awuser(uint64_t field){ - uint32_t user_low = (uint32_t) (field); - uint32_t user_high = (uint32_t) (field >> 32); - write_csr(0x7c4, user_low); - write_csr(0x7c5, user_high); -} \ No newline at end of file From f24633ffb77542f24a2a73d078c2207f225b42f8 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 11:12:44 +0200 Subject: [PATCH 24/38] hw: Clean up collective connectivity matrices --- hw/snitch_cluster/src/snitch_cluster.sv | 42 ++++++++++++++----------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 9ac444959..3c03bd0cb 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -721,18 +721,20 @@ module snitch_cluster if (EnableWideCollectives) begin : gen_mcast_dma_xbar - // Define the collective connectivity matrix! - typedef bit [DmaMcastXbarCfg.NoMstPorts-1:0] dma_line_t; - typedef bit [DmaMcastXbarCfg.NoSlvPorts-1:0][DmaMcastXbarCfg.NoMstPorts-1:0] dma_matrix_t; - - // If we want to reroute collective operation the only available collective operation - // port is the SoC port - localparam dma_line_t DMAlocalArray = dma_line_t'{SoCDMAOut: 1'b1, default: 1'b0}; - localparam dma_matrix_t DMACollectiveConnectivity = dma_matrix_t'{default: DMAlocalArray}; + // Define collective connectivity matrix to ensure collectives are routed to the SoC port only + typedef bit [DmaMcastXbarCfg.NoMstPorts-1:0] wide_mst_connectivity_t; + typedef wide_mst_connectivity_t [DmaMcastXbarCfg.NoSlvPorts-1:0] wide_xbar_connectivity_t; + localparam wide_mst_connectivity_t WideMstCollectiveConnectivity = wide_mst_connectivity_t'{ + SoCDMAOut: 1'b1, + default: 1'b0 + }; + localparam wide_xbar_connectivity_t DmaCollectiveConnectivity = wide_xbar_connectivity_t'{ + default: WideMstCollectiveConnectivity + }; axi_mcast_xbar #( .Cfg (DmaMcastXbarCfg), - .CollectiveOpsConnectivity (DMACollectiveConnectivity), + .CollectiveOpsConnectivity (DmaCollectiveConnectivity), .ATOPs (0), .slv_aw_chan_t (axi_mst_dma_aw_chan_t), .mst_aw_chan_t (axi_slv_dma_aw_chan_t), @@ -1382,21 +1384,23 @@ module snitch_cluster }; end - // Set default master port for all multicast's crossbar input's + // Set default master port for multicast XBAR localparam bit ClusterEnableDefaultMstPort = 1'b1; assign cluster_xbar_default_port = '{default: SoC}; - // Instance the narrow axi xbar + // Instantiate the narrow AXI XBAR if (EnableNarrowCollectives) begin : gen_narrow_mcast_axi_crossbar - // Define the collective connectivity matrix! 
- typedef bit [ClusterMcastXbarCfg.NoMstPorts-1:0] cluster_line_t; - typedef bit [ClusterMcastXbarCfg.NoSlvPorts-1:0][ClusterMcastXbarCfg.NoMstPorts-1:0] cluster_matrix_t; - // If we want to reroute collective operation the only available collective operation port is - // the SoC port - localparam cluster_line_t ClusterlocalArray = cluster_line_t'{SoC: 1'b1, default: 1'b0}; - localparam cluster_matrix_t ClusterCollectiveConnectivity = - cluster_matrix_t'{default: ClusterlocalArray}; + // Define collective connectivity matrix to ensure collectives are routed to the SoC port only + typedef bit [ClusterMcastXbarCfg.NoMstPorts-1:0] master_connectivity_t; + typedef master_connectivity_t [ClusterMcastXbarCfg.NoSlvPorts-1:0] xbar_connectivity_t; + localparam master_connectivity_t MasterCollectiveConnectivity = master_connectivity_t'{ + SoC: 1'b1, + default: 1'b0 + }; + localparam xbar_connectivity_t ClusterCollectiveConnectivity = xbar_connectivity_t'{ + default: MasterCollectiveConnectivity + }; axi_mcast_xbar #( .Cfg (ClusterMcastXbarCfg), From d35f6d0470e8fdeb5eb49dd1c6319366b8fb33f9 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 11:21:03 +0200 Subject: [PATCH 25/38] sw: Fix linting errors --- sw/snRuntime/api/sync_decls.h | 3 +- sw/snRuntime/src/dma.h | 58 ++++++++++++++++------------------- sw/snRuntime/src/start.c | 3 +- sw/snRuntime/src/sync.c | 1 - sw/snRuntime/src/sync.h | 46 ++++++++++++++------------- 5 files changed, 55 insertions(+), 56 deletions(-) diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h index e79c63c4f..a96a03040 100644 --- a/sw/snRuntime/api/sync_decls.h +++ b/sw/snRuntime/api/sync_decls.h @@ -66,7 +66,8 @@ inline void snrt_enable_multicast(uint64_t mask); inline void snrt_disable_multicast(); -inline void snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t reduction); +inline void snrt_enable_reduction(uint64_t mask, + snrt_reduction_opcode_t reduction); inline void snrt_disable_reduction(); diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 95ff2016d..8cd0e11e9 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -64,15 +64,17 @@ static inline uint32_t snrt_dma_start_1d(volatile void *dst, volatile void *src, /** * @brief Set AW user field of the DMA's AXI interface - * @details All DMA transfers performed after this call are equipped with the given AW user field + * @details All DMA transfers performed after this call are equipped with the + * given AW user field * * @param field Defines the AW user field for the AXI transfer */ inline void snrt_dma_set_awuser(uint64_t field) { - uint32_t user_low = (uint32_t) (field); - uint32_t user_high = (uint32_t) (field >> 32); - asm volatile("dmuser %[user_low], %[user_high] \n" : : - [ user_low ] "r"(user_low), [ user_high ] "r"(user_high)); + uint32_t user_low = (uint32_t)(field); + uint32_t user_high = (uint32_t)(field >> 32); + asm volatile("dmuser %[user_low], %[user_high] \n" + : + : [ user_low ] "r"(user_low), [ user_high ] "r"(user_high)); } /** @@ -82,7 +84,7 @@ inline void snrt_dma_set_awuser(uint64_t field) { * * @param mask Multicast mask value */ - inline void snrt_dma_enable_multicast(uint64_t mask) { +inline void snrt_dma_enable_multicast(uint64_t mask) { snrt_collective_op_t op; op.f.collective_opcode = SNRT_COLLECTIVE_MULTICAST; op.f.mask = mask; @@ -97,7 +99,8 @@ inline void snrt_dma_set_awuser(uint64_t field) { * @param mask Mask defines all involved members * @param opcode Type of reduction operation */ -inline void 
snrt_dma_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) { +inline void snrt_dma_enable_reduction(uint64_t mask, + snrt_reduction_opcode_t opcode) { snrt_collective_op_t op; op.f.reduction_opcode = opcode; op.f.collective_opcode = SNRT_COLLECTIVE_OFFLOAD_REDUCTION; @@ -109,17 +112,13 @@ inline void snrt_dma_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opc * @brief Disable multicast for successive transfers * @details Successive DMA transfers will be unicast transfers */ - inline void snrt_dma_disable_multicast() { - snrt_dma_set_awuser(0); -} +inline void snrt_dma_disable_multicast() { snrt_dma_set_awuser(0); } /** * @brief Disable reduction operations for successive transfers * @details Successive DMA transfers will be unicast transfers */ -inline void snrt_dma_disable_reduction() { - snrt_dma_set_awuser(0); -} +inline void snrt_dma_disable_reduction() { snrt_dma_set_awuser(0); } /** * @brief Start an asynchronous reduction 1D DMA transfer with 64-bit wide @@ -129,10 +128,9 @@ inline void snrt_dma_disable_reduction() { * @see snrt_dma_start_1d(uint64_t, uint64_t, size_t, uint32_t) for a * description of the other parameters. */ -static inline uint32_t snrt_dma_start_1d_reduction(uint64_t dst, uint64_t src, - size_t size, uint64_t mask, - snrt_reduction_opcode_t opcode, - const uint32_t channel = 0) { +static inline uint32_t snrt_dma_start_1d_reduction( + uint64_t dst, uint64_t src, size_t size, uint64_t mask, + snrt_reduction_opcode_t opcode, const uint32_t channel = 0) { snrt_dma_enable_reduction(mask, opcode); uint32_t txid = snrt_dma_start_1d(dst, src, size, channel); snrt_dma_disable_reduction(); @@ -162,14 +160,14 @@ static inline uint32_t snrt_dma_start_1d_mcast(uint64_t dst, uint64_t src, * This is a convenience overload of snrt_dma_start_1d_reduction() using `void*` * pointers. * - * @see snrt_dma_start_1d_reduction(uint64_t, uint64_t, size_t, uint64_t, uint32_t, uint32_t) + * @see snrt_dma_start_1d_reduction(uint64_t, uint64_t, size_t, uint64_t, + * uint32_t, uint32_t) */ -static inline uint32_t snrt_dma_start_1d_reduction(volatile void *dst, - volatile void *src, size_t size, - uint64_t mask, - snrt_reduction_opcode_t opcode, - const uint32_t channel = 0) { - return snrt_dma_start_1d_reduction((uint64_t)dst, (uint64_t)src, size, mask, opcode, channel); +static inline uint32_t snrt_dma_start_1d_reduction( + volatile void *dst, volatile void *src, size_t size, uint64_t mask, + snrt_reduction_opcode_t opcode, const uint32_t channel = 0) { + return snrt_dma_start_1d_reduction((uint64_t)dst, (uint64_t)src, size, mask, + opcode, channel); } /** @@ -185,7 +183,8 @@ static inline uint32_t snrt_dma_start_1d_mcast(volatile void *dst, volatile void *src, size_t size, uint64_t mask, const uint32_t channel = 0) { - return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, mask, channel); + return snrt_dma_start_1d_mcast((uint64_t)dst, (uint64_t)src, size, mask, + channel); } /** @@ -387,12 +386,9 @@ inline snrt_dma_txid_t snrt_dma_mcast_load_1d_tile(void *dst, void *src, * @param mask Mask for reduction operation. * @param opcode Reduction operation. 
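 * A minimal usage sketch, assuming a hypothetical integer-add member
 * SNRT_REDUCTION_ADD of snrt_reduction_opcode_t:
 *   snrt_dma_reduction_load_1d_tile(dst, src, 0, 256, sizeof(uint32_t),
 *                                   mask, SNRT_REDUCTION_ADD);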
 */
-inline snrt_dma_txid_t snrt_dma_reduction_load_1d_tile(void *dst, void *src,
-                                                       size_t tile_idx,
-                                                       size_t tile_size,
-                                                       uint32_t prec,
-                                                       uint64_t mask,
-                                                       snrt_reduction_opcode_t opcode) {
+inline snrt_dma_txid_t snrt_dma_reduction_load_1d_tile(
+    void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec,
+    uint64_t mask, snrt_reduction_opcode_t opcode) {
     size_t tile_nbytes = tile_size * prec;
     return snrt_dma_start_1d_reduction((uintptr_t)dst,
                                        (uintptr_t)src + tile_idx * tile_nbytes,
diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c
index a495d4350..ba4703ba3 100644
--- a/sw/snRuntime/src/start.c
+++ b/sw/snRuntime/src/start.c
@@ -70,12 +70,11 @@ static inline void snrt_init_bss() {
 
 #ifdef SNRT_WAKE_UP
 static inline void snrt_wake_up() {
-
     // Core 0 of cluster 0 wakes up all other cores
     if (snrt_cluster_idx() == 0 && snrt_cluster_core_idx() == 0) {
         snrt_wake_all((1 << snrt_cluster_core_num()) - 1);
         snrt_fence();
-    }
+    }
 
     // Synchronize all cores
     snrt_cluster_hw_barrier();
diff --git a/sw/snRuntime/src/sync.c b/sw/snRuntime/src/sync.c
index 1a333bf53..05649590b 100644
--- a/sw/snRuntime/src/sync.c
+++ b/sw/snRuntime/src/sync.c
@@ -44,4 +44,3 @@ extern void snrt_enable_reduction(uint64_t mask, uint32_t reduction);
 extern void snrt_disable_reduction();
 
 extern void snrt_set_user_field(uint64_t field);
-
diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index bcdd3f56a..4e154d1ec 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -81,10 +81,10 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) {
 * @brief Wakes up all cores by writing in their respective clint var.
 *        Can only be called by a single core inside the whole system!
 * @note When multicast is enabled, the core mask is sent to itself too,
- * therefore setting the wake up flag althrough the core is awake.
- * As consequence the function "snrt_int_clr_mcip()" needs to be called
- * even if the core was awake. For a simplified flow we copy this behaviour
- * in the non-multicast case even if it was not 100% necessary!
+ * therefore setting the wake-up flag although the core is awake.
+ * As a consequence, the function "snrt_int_clr_mcip()" needs to be called
+ * even if the core was awake. For a simplified flow we copy this
+ * behaviour in the non-multicast case even if it is not strictly necessary!
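+ * For example, snrt_wake_all((1 << snrt_cluster_core_num()) - 1), as used
+ * by snrt_wake_up() in start.c, wakes every core of the cluster.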
*/ inline void snrt_wake_all(uint32_t core_mask) { #ifdef SNRT_SUPPORTS_NARROW_MULTICAST @@ -137,19 +137,20 @@ inline void snrt_cluster_hw_barrier() { inline void snrt_inter_cluster_barrier() { #ifdef SNRT_SUPPORTS_NARROW_REDUCTION // Only continue with dma core's - send the rest into the next hw barrier - if(snrt_is_dm_core()){ + if (snrt_is_dm_core()) { // fetch the address for the reduction - cls_t * ctrl_red = cls(); - void * addr = (void *) snrt_remote_l1_ptr(&(ctrl_red->reduction), snrt_cluster_idx(), 0); - + cls_t *ctrl_red = cls(); + void *addr = (void *)snrt_remote_l1_ptr(&(ctrl_red->reduction), + snrt_cluster_idx(), 0); + // clear the memory location of any previouse reduction - if(snrt_cluster_idx() == 0){ - *((uint32_t *) addr) = 0; + if (snrt_cluster_idx() == 0) { + *((uint32_t *)addr) = 0; } // init the reduction snrt_enable_reduction(SNRT_BROADCAST_MASK, SNRT_COLL_NARROW_BARRIER); - *((uint32_t *) addr) = 1; + *((uint32_t *)addr) = 1; snrt_disable_reduction(); // Fence to wait until the reduction is finished @@ -157,12 +158,13 @@ inline void snrt_inter_cluster_barrier() { } #else // Only continue with dma core's - send the rest into sleep mode - if(snrt_is_dm_core()){ - uint32_t cnt = __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED); + if (snrt_is_dm_core()) { + uint32_t cnt = + __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED); // All but the last cluster enter WFI, while the last cluster resets the - // counter for the next barrier and multicasts an interrupt to wake up the - // other clusters. + // counter for the next barrier and multicasts an interrupt to wake up + // the other clusters. if (cnt == snrt_cluster_num()) { _snrt_barrier.cnt = 0; // Wake all clusters @@ -345,13 +347,14 @@ inline void snrt_wait_writeback(uint32_t val) { /** * @brief Enable LSU AW user field - * @details All stores performed after this call are equipped with the given AW user field + * @details All stores performed after this call are equipped with the given AW + * user field * * @param field Defines the AW user field for the AXI transfer */ - inline void snrt_set_awuser(uint64_t field){ - uint32_t user_low = (uint32_t) (field); - uint32_t user_high = (uint32_t) (field >> 32); +inline void snrt_set_awuser(uint64_t field) { + uint32_t user_low = (uint32_t)(field); + uint32_t user_high = (uint32_t)(field >> 32); write_csr(0x7c4, user_low); write_csr(0x7c5, user_high); } @@ -367,7 +370,7 @@ inline void snrt_wait_writeback(uint32_t val) { * * @param mask Multicast mask value */ -inline void snrt_enable_multicast(uint64_t mask){ +inline void snrt_enable_multicast(uint64_t mask) { snrt_collective_op_t op; op.f.collective_opcode = SNRT_COLLECTIVE_MULTICAST; op.f.mask = mask; @@ -390,7 +393,8 @@ inline void snrt_disable_multicast() { snrt_set_awuser(0); } * @param mask Mask defines all involved members * @param opcode Type of reduction operation */ -inline void snrt_enable_reduction(uint64_t mask, snrt_reduction_opcode_t opcode) { +inline void snrt_enable_reduction(uint64_t mask, + snrt_reduction_opcode_t opcode) { snrt_collective_opcode_t coll_opcode; switch (opcode) { From 7081573e8f26c6485108e201a387fd3d9c2774ec Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 11:23:05 +0200 Subject: [PATCH 26/38] bender: Bump AXI --- Bender.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Bender.lock b/Bender.lock index 2b7a97918..1a401ec68 100644 --- a/Bender.lock +++ b/Bender.lock @@ -7,7 +7,7 @@ packages: dependencies: - common_cells axi: - 
revision: 9debffc1b1b2b4e4045ec10dfe6eb87a412eab95 + revision: 458c8c58cbe0acd0c5e6bb8592b52111bc9a5d84 version: null source: Git: https://github.com/Lura518/axi.git From d6686f6251f0ba3e5754ffed622281c0d9e0f04d Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 16:20:52 +0200 Subject: [PATCH 27/38] hw: Update reqrsp and TCDM interface testbenches --- hw/reqrsp_interface/test/reqrsp_to_axi_tb.sv | 3 +-- hw/tcdm_interface/test/reqrsp_to_tcdm_tb.sv | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/reqrsp_interface/test/reqrsp_to_axi_tb.sv b/hw/reqrsp_interface/test/reqrsp_to_axi_tb.sv index ebc509a2b..4a1cb85c7 100644 --- a/hw/reqrsp_interface/test/reqrsp_to_axi_tb.sv +++ b/hw/reqrsp_interface/test/reqrsp_to_axi_tb.sv @@ -58,11 +58,10 @@ module reqrsp_to_axi_tb import reqrsp_pkg::*; #( .AxiIdWidth (IW), .AddrWidth (AW), .DataWidth (DW), - .AxiUserWidth (UW) + .UserWidth (UW) ) i_reqrsp_to_axi ( .clk_i (clk), .rst_ni (rst_n), - .user_i ('0), .reqrsp (master), .axi (slave) ); diff --git a/hw/tcdm_interface/test/reqrsp_to_tcdm_tb.sv b/hw/tcdm_interface/test/reqrsp_to_tcdm_tb.sv index d7b6608cb..fdd68fc83 100644 --- a/hw/tcdm_interface/test/reqrsp_to_tcdm_tb.sv +++ b/hw/tcdm_interface/test/reqrsp_to_tcdm_tb.sv @@ -50,8 +50,7 @@ module reqrsp_to_tcdm_tb import reqrsp_pkg::*; #( reqrsp_to_tcdm_intf #( .AddrWidth (AW), .DataWidth (DW), - .BufDepth (BufDepth), - .user_t (logic) + .BufDepth (BufDepth) ) i_dut ( .clk_i (clk), .rst_ni (rst_n), From 00f96fbf9fc72bfa7440276ec82f755710cf58c9 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 16:29:11 +0200 Subject: [PATCH 28/38] iis-setup.sh: Fix oseda container version --- iis-setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iis-setup.sh b/iis-setup.sh index 72994fefc..d88cec94e 100755 --- a/iis-setup.sh +++ b/iis-setup.sh @@ -8,7 +8,7 @@ export BENDER=bender-0.28.1 export CC=gcc-9.2.0 export CXX=g++-9.2.0 export VCS_SEPP=vcs-2024.09 -export VERILATOR_SEPP=oseda +export VERILATOR_SEPP="oseda -2025.03" export QUESTA_SEPP=questa-2023.4 export LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.2.0/bin From b9d0774978cc7122ef3f5df6ea20cacbc554e530 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 16:58:21 +0200 Subject: [PATCH 29/38] hw: Fix errors related to select_t type --- hw/snitch_cluster/src/snitch_cc.sv | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 5844e9f10..3b0d57030 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -607,7 +607,8 @@ module snitch_cc #( // Decide whether to go to SoC or TCDM localparam int unsigned SelectWidth = cf_math_pkg::idx_width(2); - typedef enum logic [SelectWidth-1:0] {SelectTcdm = 1, SelectSoc = 0} select_t; + typedef logic [SelectWidth-1:0] select_t; + typedef enum select_t {SelectTcdm = 1, SelectSoc = 0} select_e; dreq_t data_tcdm_req; drsp_t data_tcdm_rsp; @@ -638,13 +639,13 @@ module snitch_cc #( reqrsp_rule_t [TCDMAliasEnable:0] addr_map; assign addr_map[0] = '{ - idx: 1, + idx: SelectTcdm, base: tcdm_addr_base_i, mask: ({AddrWidth{1'b1}} << TCDMAddrWidth) }; if (TCDMAliasEnable) begin : gen_tcdm_alias_rule assign addr_map[1] = '{ - idx: 1, + idx: SelectTcdm, base: TCDMAliasStart, mask: ({AddrWidth{1'b1}} << TCDMAddrWidth) }; @@ -662,7 +663,7 @@ module snitch_cc #( .dec_valid_o (), .dec_error_o (), .en_default_idx_i 
(1'b1), - .default_idx_i ('0) + .default_idx_i (SelectSoc) ); // Collective communication operations are performed within the interconnect at the SoC From 976282f8fe41620e9c8ab2dbd3a12f91b4c0cc4e Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 17:54:57 +0200 Subject: [PATCH 30/38] hw: Fix port mismatch errors --- hw/snitch_cluster/src/snitch_cc.sv | 3 +++ hw/snitch_cluster/src/snitch_cluster.sv | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 3b0d57030..31ded5c2d 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -288,6 +288,7 @@ module snitch_cc #( reqrsp_iso #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (64), .req_t (dreq_t), .rsp_t (drsp_t), .BypassReq (!RegisterCoreReq), @@ -558,6 +559,7 @@ module snitch_cc #( .NrPorts (2), .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (64), .req_t (dreq_t), .rsp_t (drsp_t), // TODO(zarubaf): Wire-up to top-level. @@ -682,6 +684,7 @@ module snitch_cc #( reqrsp_to_tcdm #( .AddrWidth (AddrWidth), .DataWidth (DataWidth), + .UserWidth (64), // TODO(zarubaf): Make a parameter. .BufDepth (4), .reqrsp_req_t (dreq_t), diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 3c03bd0cb..7af20679d 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -311,6 +311,7 @@ module snitch_cluster localparam logic [PhysicalAddrWidth-1:0] TCDMMask = ~(TCDMSizeNapotRounded - 1); // User widths + localparam int unsigned CoreUserWidth = 64; localparam int unsigned NarrowUserWidth = $bits(user_narrow_t); localparam int unsigned WideUserWidth = $bits(user_dma_t); @@ -426,7 +427,7 @@ module snitch_cluster typedef logic [PhysicalAddrWidth-1:0] addr_t; typedef logic [NarrowDataWidth-1:0] data_t; typedef logic [NarrowDataWidth/8-1:0] strb_t; - typedef logic [63:0] user_t; + typedef logic [CoreUserWidth-1:0] user_t; typedef logic [WideDataWidth-1:0] data_dma_t; typedef logic [WideDataWidth/8-1:0] strb_dma_t; typedef logic [NarrowIdWidthIn-1:0] id_mst_t; @@ -1220,6 +1221,7 @@ module snitch_cluster .NrPorts (NrHives), .AddrWidth (PhysicalAddrWidth), .DataWidth (NarrowDataWidth), + .UserWidth (CoreUserWidth), .req_t (reqrsp_req_t), .rsp_t (reqrsp_rsp_t), .RespDepth (2) @@ -1268,6 +1270,7 @@ module snitch_cluster .NrPorts (NrCores), .AddrWidth (PhysicalAddrWidth), .DataWidth (NarrowDataWidth), + .UserWidth (CoreUserWidth), .req_t (reqrsp_req_t), .rsp_t (reqrsp_rsp_t), .RespDepth (2) @@ -1322,8 +1325,8 @@ module snitch_cluster reqrsp_to_axi #( .DataWidth (NarrowDataWidth), - .reqrsp_req_t (reqrsp_req_t), - .reqrsp_rsp_t (reqrsp_rsp_t), + .reqrsp_req_t (reqrsp_amo_req_t), + .reqrsp_rsp_t (reqrsp_amo_rsp_t), .axi_req_t (axi_mst_req_t), .axi_rsp_t (axi_mst_resp_t) ) i_reqrsp_to_axi_core ( From fb1342b71b8cab6b463b1f8095319d66bcd530b4 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 18:40:11 +0200 Subject: [PATCH 31/38] treewide: Replace legacy parameters with enable_*_collectives parameters --- hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl | 4 ++-- target/snitch_cluster/cfg/default.json | 5 ++--- target/snitch_cluster/cfg/dma_mchan.json | 5 ++--- target/snitch_cluster/cfg/frep_xl.json | 5 ++--- target/snitch_cluster/cfg/frep_xs.json | 5 ++--- target/snitch_cluster/cfg/github-ci.json | 5 ++--- target/snitch_cluster/cfg/omega.json | 5 ++--- 7 files changed, 14 insertions(+), 20 
deletions(-) diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl index 620b37d1e..ac8bdd2ff 100644 --- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -97,7 +97,7 @@ package ${cfg['cluster']['name']}_pkg; // Generate the typedef's for the userfield's with the required subfields depending // on the configuration -% if cfg['cluster']['enable_multicast']: +% if cfg['cluster']['enable_narrow_collectives']: typedef struct packed { addr_t collective_mask; logic [CollectiveWidth-1:0] collective_op; @@ -110,7 +110,7 @@ package ${cfg['cluster']['name']}_pkg; %endif // Will be extended when implementing collective operation on the wide dma link -% if cfg['cluster']['enable_dma_multicast']: +% if cfg['cluster']['enable_wide_collectives']: typedef struct packed { addr_t collective_mask; logic [CollectiveWidth-1:0] collective_op; diff --git a/target/snitch_cluster/cfg/default.json b/target/snitch_cluster/cfg/default.json index eb64d23ec..cf15680fc 100644 --- a/target/snitch_cluster/cfg/default.json +++ b/target/snitch_cluster/cfg/default.json @@ -22,9 +22,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_multicast: true, - enable_reduction: true, - enable_dma_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/dma_mchan.json b/target/snitch_cluster/cfg/dma_mchan.json index e6751f090..17f79e34e 100644 --- a/target/snitch_cluster/cfg/dma_mchan.json +++ b/target/snitch_cluster/cfg/dma_mchan.json @@ -23,9 +23,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_multicast: true, - enable_reduction: true, - enable_dma_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/frep_xl.json b/target/snitch_cluster/cfg/frep_xl.json index 8098355ec..a7f0d434f 100644 --- a/target/snitch_cluster/cfg/frep_xl.json +++ b/target/snitch_cluster/cfg/frep_xl.json @@ -22,9 +22,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_multicast: true, - enable_reduction: true, - enable_dma_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/frep_xs.json b/target/snitch_cluster/cfg/frep_xs.json index a9b99b04b..13b850eb7 100644 --- a/target/snitch_cluster/cfg/frep_xs.json +++ b/target/snitch_cluster/cfg/frep_xs.json @@ -22,9 +22,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_multicast: true, - enable_reduction: true, - enable_dma_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/github-ci.json b/target/snitch_cluster/cfg/github-ci.json index f8bfccb28..ed33820b4 100644 --- a/target/snitch_cluster/cfg/github-ci.json +++ b/target/snitch_cluster/cfg/github-ci.json @@ -22,9 +22,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_multicast: 
true, - enable_reduction: true, - enable_dma_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/omega.json b/target/snitch_cluster/cfg/omega.json index c3a949a1c..caf531bff 100644 --- a/target/snitch_cluster/cfg/omega.json +++ b/target/snitch_cluster/cfg/omega.json @@ -23,9 +23,8 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_multicast: true, - enable_reduction: true, - enable_dma_multicast: true, + enable_narrow_collectives: true, + enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support From 52d09c3dab172f92e407a9751de9f240bb2e95b3 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 19:06:34 +0200 Subject: [PATCH 32/38] cfg: Disable collectives as this requires loopback from SoC --- target/snitch_cluster/cfg/default.json | 2 -- target/snitch_cluster/cfg/dma_mchan.json | 2 -- target/snitch_cluster/cfg/frep_xl.json | 2 -- target/snitch_cluster/cfg/frep_xs.json | 2 -- target/snitch_cluster/cfg/github-ci.json | 2 -- target/snitch_cluster/cfg/omega.json | 2 -- util/clustergen/schema/snitch_cluster.schema.json | 4 ++-- 7 files changed, 2 insertions(+), 14 deletions(-) diff --git a/target/snitch_cluster/cfg/default.json b/target/snitch_cluster/cfg/default.json index cf15680fc..a2f891680 100644 --- a/target/snitch_cluster/cfg/default.json +++ b/target/snitch_cluster/cfg/default.json @@ -22,8 +22,6 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_narrow_collectives: true, - enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/dma_mchan.json b/target/snitch_cluster/cfg/dma_mchan.json index 17f79e34e..aa2cf56fe 100644 --- a/target/snitch_cluster/cfg/dma_mchan.json +++ b/target/snitch_cluster/cfg/dma_mchan.json @@ -23,8 +23,6 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_narrow_collectives: true, - enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/frep_xl.json b/target/snitch_cluster/cfg/frep_xl.json index a7f0d434f..520b2bc13 100644 --- a/target/snitch_cluster/cfg/frep_xl.json +++ b/target/snitch_cluster/cfg/frep_xl.json @@ -22,8 +22,6 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_narrow_collectives: true, - enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/frep_xs.json b/target/snitch_cluster/cfg/frep_xs.json index 13b850eb7..41ee7278e 100644 --- a/target/snitch_cluster/cfg/frep_xs.json +++ b/target/snitch_cluster/cfg/frep_xs.json @@ -22,8 +22,6 @@ dma_req_fifo_depth: 8, narrow_trans: 4, wide_trans: 32, - enable_narrow_collectives: true, - enable_wide_collectives: true, // We don't need Snitch debugging in Occamy enable_debug: false, // We don't need Snitch (core-internal) virtual memory support diff --git a/target/snitch_cluster/cfg/github-ci.json b/target/snitch_cluster/cfg/github-ci.json index ed33820b4..c62c13c17 100644 --- 
a/target/snitch_cluster/cfg/github-ci.json
+++ b/target/snitch_cluster/cfg/github-ci.json
@@ -22,8 +22,6 @@
         dma_req_fifo_depth: 8,
         narrow_trans: 4,
         wide_trans: 32,
-        enable_narrow_collectives: true,
-        enable_wide_collectives: true,
         // We don't need Snitch debugging in Occamy
         enable_debug: false,
         // We don't need Snitch (core-internal) virtual memory support
diff --git a/target/snitch_cluster/cfg/omega.json b/target/snitch_cluster/cfg/omega.json
index caf531bff..34c06333b 100644
--- a/target/snitch_cluster/cfg/omega.json
+++ b/target/snitch_cluster/cfg/omega.json
@@ -23,8 +23,6 @@
         dma_req_fifo_depth: 8,
         narrow_trans: 4,
         wide_trans: 32,
-        enable_narrow_collectives: true,
-        enable_wide_collectives: true,
         // We don't need Snitch debugging in Occamy
         enable_debug: false,
         // We don't need Snitch (core-internal) virtual memory support
diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json
index a63c9659a..b96daa18b 100644
--- a/util/clustergen/schema/snitch_cluster.schema.json
+++ b/util/clustergen/schema/snitch_cluster.schema.json
@@ -173,12 +173,12 @@
     },
     "enable_narrow_collectives": {
       "type": "boolean",
-      "description": "Whether to enable narrow multicast and reduction support in the cluster.",
+      "description": "Whether to enable narrow multicast and reduction support in the cluster. Requires multicast and reduction requests to be handled in the SoC interconnect.",
       "default": false
     },
     "enable_wide_collectives": {
       "type": "boolean",
-      "description": "Whether to enable wide multicast and reduction support in the cluster.",
+      "description": "Whether to enable wide multicast and reduction support in the cluster. Requires multicast and reduction requests to be handled in the SoC interconnect.",
       "default": false
     },
     "hart_base_id": {

From d67396af3ffe7c4014408e84b3b0e2781d3dd526 Mon Sep 17 00:00:00 2001
From: Luca Colagrande
Date: Fri, 11 Jul 2025 19:09:41 +0200
Subject: [PATCH 33/38] sw: Revert `snrt_inter_cluster_barrier` to work for
 cores other than the DMA core

---
 sw/snRuntime/src/sync.h | 75 ++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 42 deletions(-)

diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index 4e154d1ec..02c0ce8fa 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -130,54 +130,46 @@ inline void snrt_cluster_hw_barrier() {
 /**
  * @brief Synchronize one core from every cluster with the others.
  * @details Implemented as a software barrier.
- * @note All cores per cluster must invoke this function, or the calling cores
- * will stall indefinitely.
+ * @note One core per cluster must invoke this function (the same across all
+ * clusters), or the calling cores will stall indefinitely.
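+ *
+ * A minimal usage sketch (one possible election of the per-cluster core,
+ * mirroring `snrt_global_barrier`; for illustration only):
+ *
+ *     if (snrt_is_dm_core()) {
+ *         snrt_inter_cluster_barrier();
+ *     }
+ *     snrt_cluster_hw_barrier();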
*/ inline void snrt_inter_cluster_barrier() { #ifdef SNRT_SUPPORTS_NARROW_REDUCTION - // Only continue with dma core's - send the rest into the next hw barrier - if (snrt_is_dm_core()) { - // fetch the address for the reduction - cls_t *ctrl_red = cls(); - void *addr = (void *)snrt_remote_l1_ptr(&(ctrl_red->reduction), - snrt_cluster_idx(), 0); - - // clear the memory location of any previouse reduction - if (snrt_cluster_idx() == 0) { - *((uint32_t *)addr) = 0; - } + // Fetch the address for the reduction + cls_t *ctrl_red = cls(); + uint32_t *addr = (uint32_t *)snrt_remote_l1_ptr(&(ctrl_red->reduction), + snrt_cluster_idx(), 0); + + // Clear the memory location of any previous reduction + if (snrt_cluster_idx() == 0) { + *addr = 0; + } - // init the reduction - snrt_enable_reduction(SNRT_BROADCAST_MASK, SNRT_COLL_NARROW_BARRIER); - *((uint32_t *)addr) = 1; - snrt_disable_reduction(); + // Launch the reduction + snrt_enable_reduction(SNRT_BROADCAST_MASK, SNRT_REDUCTION_BARRIER); + *addr = 1; + snrt_disable_reduction(); - // Fence to wait until the reduction is finished - snrt_fence(); - } + // Fence to wait until the reduction is finished + snrt_fence(); #else - // Only continue with dma core's - send the rest into sleep mode - if (snrt_is_dm_core()) { - uint32_t cnt = - __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED); - - // All but the last cluster enter WFI, while the last cluster resets the - // counter for the next barrier and multicasts an interrupt to wake up - // the other clusters. - if (cnt == snrt_cluster_num()) { - _snrt_barrier.cnt = 0; - // Wake all clusters - snrt_wake_all((1 << snrt_cluster_core_num()) - 1); - } else { - snrt_wfi(); - } + // Everyone increments a shared counter + uint32_t cnt = + __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED); + + // All but the last cluster enter WFI, while the last cluster resets the + // counter for the next barrier and multicasts an interrupt to wake up the + // other clusters. + if (cnt == snrt_cluster_num()) { + _snrt_barrier.cnt = 0; + // Wake all clusters + snrt_wake_all(1 << snrt_cluster_core_idx()); } else { snrt_wfi(); + // Clear interrupt for next barrier + snrt_int_clr_mcip(); } - - // Clear the reset flag - snrt_int_clr_mcip(); #endif } @@ -195,7 +187,9 @@ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); // Synchronize all clusters - snrt_inter_cluster_barrier(); + if (snrt_is_dm_core()) { + snrt_inter_cluster_barrier(); + } // Synchronize cores in a cluster with the HW barrier snrt_cluster_hw_barrier(); @@ -253,9 +247,6 @@ inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { __ATOMIC_RELAXED); snrt_inter_cluster_barrier(); *cluster_result = _reduction_result; - } else { - // All core need to invoke the barrier - snrt_inter_cluster_barrier(); } snrt_cluster_hw_barrier(); return *cluster_result; From 0741e9ae07a057e5bd6d7fe0814f752bed75f3e3 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Fri, 11 Jul 2025 19:10:12 +0200 Subject: [PATCH 34/38] sw: Add comment in `multi_cluster` test --- sw/tests/multi_cluster.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sw/tests/multi_cluster.c b/sw/tests/multi_cluster.c index fb771e0b1..9d275f196 100644 --- a/sw/tests/multi_cluster.c +++ b/sw/tests/multi_cluster.c @@ -15,11 +15,16 @@ int main() { uint32_t *cluster_sum = (uint32_t *)snrt_l3_next(); uint32_t *core_cluster_sum = (uint32_t *)snrt_l3_next() + 4; + // One at a time, the zero-th cores in all clusters increment + // cluster_sum (i.e. 
should contain cluster_num at the end), + // and all cores in cluster i increment core_cluster_sum[i] + // (i.e. should contain cluster_core_num at the end). for (uint32_t i = 0; i < global_core_num; i++) { snrt_global_barrier(); if (i == global_core_id) { *cluster_sum += (cluster_core_id == 0); core_cluster_sum[cluster_id] += 1; + snrt_fence(); } } From 6a7e9afa0fd0648be174a2f046bf297a0917e2dc Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 15 Jul 2025 17:45:39 +0200 Subject: [PATCH 35/38] hw: Remove external memory from multicast rules Doesn't satisfy multicast rule conversion constraints. --- hw/snitch_cluster/src/snitch_cluster.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 7af20679d..d8c405005 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -347,7 +347,7 @@ module snitch_cluster AxiAddrWidth: PhysicalAddrWidth, AxiDataWidth: NarrowDataWidth, NoAddrRules: NrRules, - NoMulticastRules: 3, + NoMulticastRules: 2, NoMulticastPorts: 3, default: '0 }; From d65167653304a3b7c3ca653778f851c9197aad04 Mon Sep 17 00:00:00 2001 From: Luca Colagrande Date: Tue, 15 Jul 2025 17:47:25 +0200 Subject: [PATCH 36/38] treewide: Add `reduction_opcode_width` configuration parameter --- sw/snRuntime/api/sync_decls.h | 10 +++++++--- .../sw/runtime/common/snitch_cluster_cfg.h.tpl | 3 ++- util/clustergen/schema/snitch_cluster.schema.json | 7 ++++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h index a96a03040..457bf5532 100644 --- a/sw/snRuntime/api/sync_decls.h +++ b/sw/snRuntime/api/sync_decls.h @@ -6,6 +6,9 @@ #include +#define SNRT_COLLECTIVE_MASK_WIDTH \ + (64 - SNRT_REDUCTION_OPCODE_WIDTH - SNRT_COLLECTIVE_OPCODE_WIDTH) + typedef struct { uint32_t volatile cnt; uint32_t volatile iteration; @@ -35,9 +38,10 @@ typedef enum { typedef union { struct __attribute__((__packed__)) { - snrt_reduction_opcode_t reduction_opcode : SNRT_COLLECTIVE_WIDTH; - snrt_collective_opcode_t collective_opcode : 2; - uint64_t mask : (64 - SNRT_COLLECTIVE_WIDTH - 2); + snrt_reduction_opcode_t reduction_opcode : SNRT_REDUCTION_OPCODE_WIDTH; + snrt_collective_opcode_t collective_opcode + : SNRT_COLLECTIVE_OPCODE_WIDTH; + uint64_t mask : SNRT_COLLECTIVE_MASK_WIDTH; } f; uint64_t w; } snrt_collective_op_t; diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl index ec9f5d054..345425d30 100644 --- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl +++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl @@ -26,7 +26,8 @@ #define SNRT_SUPPORTS_NARROW_REDUCTION % endif -#define SNRT_COLLECTIVE_WIDTH ${cfg['cluster']['collective_width']} +#define SNRT_REDUCTION_OPCODE_WIDTH ${cfg['cluster']['reduction_opcode_width']} +#define SNRT_COLLECTIVE_OPCODE_WIDTH ${cfg['cluster']['collective_width'] - cfg['cluster']['reduction_opcode_width']} // Software configuration #define SNRT_LOG2_STACK_SIZE 10 diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json index b96daa18b..e8c7b72d7 100644 --- a/util/clustergen/schema/snitch_cluster.schema.json +++ b/util/clustergen/schema/snitch_cluster.schema.json @@ -166,9 +166,14 @@ "description": "Width of the cluster's atomic ID.", "default": 1 }, + "reduction_opcode_width": { + "type": 
"number", + "description": "Width of the reduction opcode field.", + "default": 4 + }, "collective_width": { "type": "number", - "description": "Width of the collective operation field.", + "description": "Width of the collective operation information in the awuser field, which includes the reduction opcode.", "default": 6 }, "enable_narrow_collectives": { From 9459607db9c461f753b90af1ed1fb5a0a041fe26 Mon Sep 17 00:00:00 2001 From: Raphael Date: Mon, 16 Jun 2025 09:59:18 +0200 Subject: [PATCH 37/38] (feat) Add external DCA (direct compute access) to the snitch cluster. * Access the combined FPU from outside of the cluster * Extension of the tracer (Cherry-Picked from 5d029e64a9a2e7d42533506a4830da95f87ef944) --- hw/snitch/src/snitch_pkg.sv | 48 +++- hw/snitch_cluster/src/snitch_cc.sv | 147 +++++++++-- hw/snitch_cluster/src/snitch_cluster.sv | 144 ++++++++++- .../src/snitch_cluster_pkg.sv.tpl | 24 ++ .../src/snitch_cluster_wrapper.sv.tpl | 32 ++- hw/snitch_cluster/src/snitch_fp_ss.sv | 228 +++++++++++++++--- hw/snitch_cluster/src/snitch_fpu.sv | 10 +- target/snitch_cluster/test/testharness.sv | 8 +- .../schema/snitch_cluster.schema.json | 15 ++ util/trace/gen_trace.py | 153 +++++++++++- 10 files changed, 739 insertions(+), 70 deletions(-) diff --git a/hw/snitch/src/snitch_pkg.sv b/hw/snitch/src/snitch_pkg.sv index 5bc04f790..cfed8d7d4 100644 --- a/hw/snitch/src/snitch_pkg.sv +++ b/hw/snitch/src/snitch_pkg.sv @@ -184,7 +184,8 @@ package snitch_pkg; typedef enum logic [1:0] { SrcSnitch = 0, SrcFpu = 1, - SrcFpuSeq = 2 + SrcFpuSeq = 2, + SrcDca = 3 } trace_src_e; typedef struct packed { @@ -354,6 +355,51 @@ package snitch_pkg; extras_str = $sformatf("%s}", extras_str); return extras_str; endfunction + + typedef struct packed { + longint source; + longint dca_in_hs; // Handshake to indicate DCA Data in + longint dca_out_hs; // Handshake to indicate DCA Data out + longint dca_in_op_code; // OPS-Code of the FPU (@FPNEW Doku) + longint dca_in_op_mode; // OP-Mode of the FPU (@FPNEW Doku) + longint dca_in_rnd_mode; // Round-Mode of the FPU (@FPNEW Doku) + longint dca_in_vector_mode; // Vector-Mode of the FPU (@FPNEW Doku) + longint dca_in_op_0; // First Operand of the FPU + longint dca_in_op_1; // Second Operand of the FPU + longint dca_in_op_2; // Third Operand of the FPU + longint dca_in_src_fmt; // Input SRC format (@FPNEW Doku) + longint dca_in_dst_fmt; // Output SRC format (@FPNEW Doku) + longint dca_in_int_fmt; // Intermidiate format (@FPNEW Doku) + longint dca_in_tag; // Unique input Tag + longint dca_out_tag; // Unique output Tag + longint dca_out_status; // Status of the FPU (@FPNEW Doku) + longint dca_out_result; // Result of the FPU + } dca_trace_port_t; + // All Dokumentation with (@FPNEW Doku) can be found here: + // https://github.com/openhwgroup/cvfpu/tree/master/docs + + function automatic string print_dca_trace(dca_trace_port_t dca_trace); + string extras_str = "{"; + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "source", dca_trace.source); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "dca_in_hs", dca_trace.dca_in_hs); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "dca_out_hs", dca_trace.dca_out_hs); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "op_code", dca_trace.dca_in_op_code); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "op_mode", dca_trace.dca_in_op_mode); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "rnd_mode", dca_trace.dca_in_rnd_mode); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "vector_mode", 
dca_trace.dca_in_vector_mode); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "op_0", dca_trace.dca_in_op_0); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "op_1", dca_trace.dca_in_op_1); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "op_2", dca_trace.dca_in_op_2); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "src_format", dca_trace.dca_in_src_fmt); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "dst_format", dca_trace.dca_in_dst_fmt); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "int_format", dca_trace.dca_in_int_fmt); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "in_tag", dca_trace.dca_in_tag); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "status", dca_trace.dca_out_status); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "result", dca_trace.dca_out_result); + extras_str = $sformatf("%s'%s': 0x%0x, ", extras_str, "out_tag", dca_trace.dca_out_tag); + extras_str = $sformatf("%s}", extras_str); + return extras_str; + endfunction // pragma translate_on endpackage diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 31ded5c2d..a5a477014 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -44,6 +44,8 @@ module snitch_cc #( parameter type hive_rsp_t = logic, parameter type acc_req_t = logic, parameter type acc_resp_t = logic, + parameter type dca_req_t = logic, + parameter type dca_resp_t = logic, parameter type dma_events_t = logic, parameter fpnew_pkg::fpu_implementation_t FPUImplementation = '0, /// Boot address of core. @@ -66,6 +68,8 @@ module snitch_cc #( parameter bit Xfrep = 1, /// Has `SSR` support. parameter bit Xssr = 1, + /// Has `DCA` support. + parameter bit Xdca = 0, /// Has `COPIFT` support. parameter bit Xcopift = 1, /// Has `IPU` support. @@ -106,6 +110,10 @@ module snitch_cc #( parameter bit RegisterFPUIn = 0, /// Insert Pipeline registers immediately after FPU datapath parameter bit RegisterFPUOut = 0, + /// Insert Pipeline register between DCA from Router and FPU + parameter bit RegisterDCAIn = 0, + /// Insert Pipeline register between DCA from FPU and Router + parameter bit RegisterDCAOut = 0, parameter snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{default: 0}, /// Consistency Address Queue (CAQ) parameters. 
parameter int unsigned CaqDepth = 0, @@ -146,7 +154,14 @@ module snitch_cc #( input addr_t tcdm_addr_base_i, // Cluster HW barrier output logic barrier_o, - input logic barrier_i + input logic barrier_i, + // Direct Compute Access (DCA) interface + input dca_req_t dca_req_i, + input logic dca_req_valid_i, + output logic dca_req_ready_o, + output dca_resp_t dca_resp_o, + output logic dca_resp_valid_o, + input logic dca_resp_ready_i ); // FMA architecture is "merged" -> mulexp and macexp instructions are supported @@ -469,6 +484,7 @@ module snitch_cc #( // pragma translate_off snitch_pkg::fpu_trace_port_t fpu_trace; snitch_pkg::fpu_sequencer_trace_port_t fpu_sequencer_trace; + snitch_pkg::dca_trace_port_t dca_trace; // pragma translate_on logic [2:0][4:0] ssr_raddr; @@ -485,6 +501,54 @@ module snitch_cc #( logic ssr_streamctl_valid; logic ssr_streamctl_ready; + // Signals for the DCA + dca_req_t dca_req_q; // Delayed Request by the (optional) Spill Register + logic dca_req_valid_q; + logic dca_req_ready_q; + dca_resp_t dca_resp; // Response from the FPU in front of the (optional) Spill Register + logic dca_resp_valid; + logic dca_resp_ready; + + // Cut off-DCA Interface Request + if(Xdca) begin : gen_spill_register + spill_register #( + .T (dca_req_t), + .Bypass (~RegisterDCAIn) + ) i_spill_reg_dca_req ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (dca_req_valid_i), + .ready_o (dca_req_ready_o), + .data_i (dca_req_i), + .valid_o (dca_req_valid_q), + .ready_i (dca_req_ready_q), + .data_o (dca_req_q) + ); + + // Cut off-DCA Interface Response + spill_register #( + .T (dca_resp_t), + .Bypass (~RegisterDCAOut) + ) i_spill_reg_dca_resp ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (dca_resp_valid), + .ready_o (dca_resp_ready), + .data_i (dca_resp), + .valid_o (dca_resp_valid_o), + .ready_i (dca_resp_ready_i), + .data_o (dca_resp_o) + ); + end else begin + assign dca_req_ready_o = 1'b0; + assign dca_req_valid_q = 1'b0; + assign dca_req_q = '0; + + assign dca_resp_ready = 1'b0; + assign dca_resp_valid_o = 1'b0; + assign dca_resp_o = '0; + end + if (FPEn) begin : gen_fpu snitch_pkg::core_events_t fp_ss_core_events; @@ -505,11 +569,14 @@ module snitch_cc #( .drsp_t (drsp_t), .acc_req_t (acc_req_t), .acc_resp_t (acc_resp_t), + .dca_req_t (dca_req_t), + .dca_resp_t (dca_resp_t), .RegisterSequencer (RegisterSequencer), .RegisterFPUIn (RegisterFPUIn), .RegisterFPUOut (RegisterFPUOut), .Xfrep (Xfrep), .Xssr (Xssr), + .Xdca (Xdca), .Xcopift (Xcopift), .RVF (RVF), .RVD (RVD), @@ -525,34 +592,41 @@ module snitch_cc #( // pragma translate_off .trace_port_o ( fpu_trace ), .sequencer_tracer_port_o ( fpu_sequencer_trace ), + .dca_trace_port_o ( dca_trace ), // pragma translate_on - .hart_id_i ( hart_id_i ), - .acc_req_i ( acc_snitch_req ), - .acc_req_valid_i ( acc_qvalid ), - .acc_req_ready_o ( acc_qready ), - .acc_resp_o ( acc_seq ), - .acc_resp_valid_o ( acc_pvalid ), - .acc_resp_ready_i ( acc_pready ), - .caq_pvalid_o ( caq_pvalid ), - .data_req_o ( fpu_dreq ), - .data_rsp_i ( fpu_drsp ), - .fpu_rnd_mode_i ( fpu_rnd_mode ), - .fpu_fmt_mode_i ( fpu_fmt_mode ), - .fpu_status_o ( fpu_status ), - .ssr_raddr_o ( ssr_raddr ), - .ssr_rdata_i ( ssr_rdata ), - .ssr_rvalid_o ( ssr_rvalid ), - .ssr_rready_i ( ssr_rready ), - .ssr_rdone_o ( ssr_rdone ), - .ssr_waddr_o ( ssr_waddr ), - .ssr_wdata_o ( ssr_wdata ), - .ssr_wvalid_o ( ssr_wvalid ), - .ssr_wready_i ( ssr_wready ), - .ssr_wdone_o ( ssr_wdone ), + .hart_id_i ( hart_id_i ), + .acc_req_i ( acc_snitch_req ), + .acc_req_valid_i ( acc_qvalid ), + 
.acc_req_ready_o ( acc_qready ), + .acc_resp_o ( acc_seq ), + .acc_resp_valid_o ( acc_pvalid ), + .acc_resp_ready_i ( acc_pready ), + .caq_pvalid_o ( caq_pvalid ), + .data_req_o ( fpu_dreq ), + .data_rsp_i ( fpu_drsp ), + .fpu_rnd_mode_i ( fpu_rnd_mode ), + .fpu_fmt_mode_i ( fpu_fmt_mode ), + .fpu_status_o ( fpu_status ), + .ssr_raddr_o ( ssr_raddr ), + .ssr_rdata_i ( ssr_rdata ), + .ssr_rvalid_o ( ssr_rvalid ), + .ssr_rready_i ( ssr_rready ), + .ssr_rdone_o ( ssr_rdone ), + .ssr_waddr_o ( ssr_waddr ), + .ssr_wdata_o ( ssr_wdata ), + .ssr_wvalid_o ( ssr_wvalid ), + .ssr_wready_i ( ssr_wready ), + .ssr_wdone_o ( ssr_wdone ), .streamctl_done_i ( ssr_streamctl_done ), .streamctl_valid_i ( ssr_streamctl_valid ), .streamctl_ready_o ( ssr_streamctl_ready ), - .core_events_o ( fp_ss_core_events ) + .core_events_o ( fp_ss_core_events ), + .dca_req_i ( dca_req_q ), + .dca_req_valid_i ( dca_req_valid_q ), + .dca_req_ready_o ( dca_req_ready_q ), + .dca_resp_o ( dca_resp ), + .dca_resp_valid_o ( dca_resp_valid ), + .dca_resp_ready_i ( dca_resp_ready ) ); reqrsp_mux #( @@ -604,6 +678,10 @@ module snitch_cc #( assign core_events_o.issue_fpu = '0; assign core_events_o.issue_fpu_seq = '0; assign core_events_o.issue_core_to_fpu = '0; + + assign dca_resp_valid = 1'b0; + assign dca_resp = '0; + assign dca_req_ready_q = 1'b0; end // Decide whether to go to SoC or TCDM @@ -902,6 +980,7 @@ module snitch_cc #( automatic snitch_pkg::snitch_trace_port_t extras_snitch; automatic snitch_pkg::fpu_trace_port_t extras_fpu; automatic snitch_pkg::fpu_sequencer_trace_port_t extras_fpu_seq_out; + automatic snitch_pkg::dca_trace_port_t extras_dca; if (rst_ni) begin extras_snitch = '{ @@ -953,6 +1032,11 @@ module snitch_cc #( end end + // If dca enabled then forward the trace port + if(Xdca) begin + extras_dca = dca_trace; + end + cycle++; // Trace snitch iff: // we are not stalled <==> we have issued and processed an instruction (including offloads) @@ -988,6 +1072,16 @@ module snitch_cc #( end end end + if(Xdca) begin + // Trace DCA iff + // When either an input or output handshake occures + if(extras_dca.dca_in_hs || extras_dca.dca_out_hs) begin + $sformat(trace_entry, "%t %1d %8d 0x%h DASM(%h) #; %s\n", + $time, cycle, i_snitch.priv_lvl_q, 32'hz, extras_dca.dca_in_op_code, + snitch_pkg::print_dca_trace(extras_dca)); + $fwrite(f, trace_entry); + end + end end else begin cycle = '0; end @@ -1000,5 +1094,8 @@ module snitch_cc #( // pragma translate_on `ASSERT_INIT(BootAddrAligned, BootAddr[1:0] == 2'b00) + + // For the DCA Extension the is is required that each core has the FPU D-ext loaded + `ASSERT_INIT(DCACoreConfiguration, (~Xdca) || RVD) endmodule diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index d8c405005..405945948 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -105,6 +105,8 @@ module snitch_cluster parameter bit [NrCores-1:0] Xdma = '0, /// Per-core enabling of the custom `Xssr` ISA extensions. parameter bit [NrCores-1:0] Xssr = '0, + /// Per-cluster enabling of the custom `DCA` extension. + parameter bit Xdca = 0, /// Per-core enabling of the custom `Xfrep` ISA extensions. parameter bit [NrCores-1:0] Xfrep = '0, /// Per-core enabling of the custom `Xcopift` ISA extensions. 
@@ -178,6 +180,10 @@ module snitch_cluster parameter bit RegisterFPUIn = 0, /// Insert Pipeline registers immediately after FPU datapath parameter bit RegisterFPUOut = 0, + /// Insert Pipeline register between DCA from Router and FPU + parameter bit RegisterDCAIn = 0, + /// Insert Pipeline register between DCA from FPU and Router + parameter bit RegisterDCAOut = 0, /// Run Snitch (the integer part) at half of the clock frequency parameter bit IsoCrossing = 0, parameter axi_pkg::xbar_latency_e NarrowXbarLatency = axi_pkg::CUT_ALL_PORTS, @@ -222,7 +228,10 @@ module snitch_cluster parameter bit AliasRegionEnable = 1'b0, parameter logic [PhysicalAddrWidth-1:0] AliasRegionBase = '0, /// Instantiate internal bootrom. - parameter bit IntBootromEnable = 1'b1 + parameter bit IntBootromEnable = 1'b1, + /// IF for the DCA Access + parameter type dca_router_req_t = logic, + parameter type dca_router_resp_t = logic ) ( /// System clock. If `IsoCrossing` is enabled this port is the _fast_ clock. /// The slower, half-frequency clock, is derived internally. @@ -281,11 +290,23 @@ module snitch_cluster input narrow_out_resp_t narrow_ext_resp_i, // External TCDM ports input tcdm_dma_req_t tcdm_ext_req_i, - output tcdm_dma_rsp_t tcdm_ext_resp_o + output tcdm_dma_rsp_t tcdm_ext_resp_o, + /// DCA IF to the FPU's + input dca_router_req_t dca_8x_req_i, + input logic dca_8x_req_valid_i, + output logic dca_8x_req_ready_o, + /// DCA IF from the FPU's + output dca_router_resp_t dca_8x_resp_o, + output logic dca_8x_resp_valid_o, + input logic dca_8x_resp_ready_i ); // --------- // Constants // --------- + + /// DCA Bit width - Currently only applicable for the double extension (e.g. 64 Bit) + localparam int unsigned DCA_DATA_WIDTH = 64; + /// Minimum width to hold the core number. localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); localparam int unsigned TCDMMemAddrWidth = $clog2(TCDMDepth); @@ -547,6 +568,26 @@ module snitch_cluster logic [1:0] ptw_is_4mega; } hive_rsp_t; + // Typedef to generate the DCA request containing all information. + typedef struct packed { + logic [2:0][DCA_DATA_WIDTH-1:0] dca_operands; // FP-Operands from the Router + fpnew_pkg::roundmode_e dca_rnd_mode; // Round Mode for the FPU for this OP --> logic [2:0] (@FPNEW Doku) + fpnew_pkg::operation_e dca_op_code; // OP Code for the FPU Command --> logic [3:0] (@FPNEW Doku) + logic dca_op_mode; // OP Mode for the corresponding Code + fpnew_pkg::fp_format_e dca_src_format; // Format for the Source --> logic [2:0] (@FPNEW Doku) + fpnew_pkg::fp_format_e dca_dst_format; // Format for the Destination --> logic [2:0] (@FPNEW Doku) + fpnew_pkg::int_format_e dca_int_format; // Format for the Integer --> logic [1:0] (@FPNEW Doku) + logic dca_vector_op; // Flag to indicate an vector operation + logic [6:0] dca_tag; // Make the tag accessible by the outside world + } dca_req_t; + + // Typedef to generate the DCA response containing all information + typedef struct packed { + logic [6:0] dca_tag; // Return the tag to the outside world + fpnew_pkg::status_t dca_status; // Status Flag(s) of the FPU --> logic [4:0] (@FPNEW Doku) + logic [DCA_DATA_WIDTH-1:0] dca_result; // Result of the FPU Operation + } dca_resp_t; + // --------------------------- // Cluster-internal Addressing // --------------------------- @@ -1025,6 +1066,89 @@ module snitch_cluster hive_req_t [NrCores-1:0] hive_req; hive_rsp_t [NrCores-1:0] hive_rsp; + // Split the DCA into the cores! 
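+  // In essence this builds a 512-bit SIMD front end for the compute cores'
+  // FPUs: each 512-bit router operand is sliced into DCA_DATA_WIDTH-bit lanes,
+  // one per compute core (operand j of core i is dca_operands[j][64*i +: 64]),
+  // the control fields are broadcast unchanged, and stream_fork/stream_join
+  // keep the per-core handshakes in lock-step so the router observes a single
+  // request/response pair. The DMA core (index NrCores-1) is excluded.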
+ dca_req_t [NrCores-1:0] dca_req; + logic [NrCores-1:0] dca_req_valid; + logic [NrCores-1:0] dca_req_ready; + + dca_resp_t [NrCores-1:0] dca_resp; + logic [NrCores-1:0] dca_resp_valid; + logic [NrCores-1:0] dca_resp_ready; + + // Only generate the DCA logic if the Extension is requested + if(Xdca) begin : gen_dca_handshake_control + + // DCA Fork the Handshaking to the available number of cores + stream_fork #( + .N_OUP (NrCores-1) + ) i_dca_fork_fpu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .valid_i (dca_8x_req_valid_i), + .ready_o (dca_8x_req_ready_o), + .valid_o (dca_req_valid[NrCores-2:0]), + .ready_i (dca_req_ready[NrCores-2:0]) + ); + + // Disable the DCA for the DMA Core (NrCores - 1)! + assign dca_req_valid[NrCores-1] = 1'b0; + assign dca_resp_ready[NrCores-1] = 1'b0; + + // DCA Join the Handshaking from the available number of cores + stream_join #( + .N_INP (NrCores-1) + ) i_dca_join_fpu ( + .inp_valid_i (dca_resp_valid[NrCores-2:0]), + .inp_ready_o (dca_resp_ready[NrCores-2:0]), + .oup_valid_o (dca_8x_resp_valid_o), + .oup_ready_i (dca_8x_resp_ready_i) + ); + + for (genvar i = 0; i < NrCores; i++) begin : gen_dca_split + // Request Splitting + if(i < (NrCores-1)) begin + assign dca_req[i].dca_operands[2][DCA_DATA_WIDTH-1:0] = dca_8x_req_i.dca_operands[2][(DCA_DATA_WIDTH*(i+1))-1:DCA_DATA_WIDTH*i]; // Split up here the data + assign dca_req[i].dca_operands[1][DCA_DATA_WIDTH-1:0] = dca_8x_req_i.dca_operands[1][(DCA_DATA_WIDTH*(i+1))-1:DCA_DATA_WIDTH*i]; // Split up here the data + assign dca_req[i].dca_operands[0][DCA_DATA_WIDTH-1:0] = dca_8x_req_i.dca_operands[0][(DCA_DATA_WIDTH*(i+1))-1:DCA_DATA_WIDTH*i]; // Split up here the data + + assign dca_req[i].dca_rnd_mode = dca_8x_req_i.dca_rnd_mode; + assign dca_req[i].dca_op_code = dca_8x_req_i.dca_op_code; + assign dca_req[i].dca_op_mode = dca_8x_req_i.dca_op_mode; + assign dca_req[i].dca_src_format = dca_8x_req_i.dca_src_format; + assign dca_req[i].dca_dst_format = dca_8x_req_i.dca_dst_format; + assign dca_req[i].dca_int_format = dca_8x_req_i.dca_int_format; + assign dca_req[i].dca_vector_op = dca_8x_req_i.dca_vector_op; + assign dca_req[i].dca_tag = dca_8x_req_i.dca_tag; + + // Response Merging + assign dca_8x_resp_o.dca_result[(DCA_DATA_WIDTH*(i+1)-1):DCA_DATA_WIDTH*i] = dca_resp[i].dca_result[DCA_DATA_WIDTH-1:0]; + end else begin + // The Connection to the DMA core is separated, we do not need it + assign dca_req[i] = '0; + end + end + + // Copy the tag + assign dca_8x_resp_o.dca_tag = dca_resp[0].dca_tag; + + // OR - Connect the overall status Bits + always_comb begin + dca_8x_resp_o.dca_status = '0; // Initialize to avoid latch inference + for (int i = 0; i < (NrCores-1); i++) begin + dca_8x_resp_o.dca_status |= dca_resp[i].dca_status; // Bitwise OR reduction + end + end + + end else begin + assign dca_req = '0; + assign dca_req_valid = '0; + assign dca_8x_req_ready_o = 1'b0; + + assign dca_resp_ready = '0; + assign dca_8x_resp_valid_o = 1'b0; + assign dca_8x_resp_o = '0; + end + for (genvar i = 0; i < NrCores; i++) begin : gen_core localparam int unsigned TcdmPorts = get_tcdm_ports(i); localparam int unsigned TcdmPortsOffs = get_tcdm_port_offs(i); @@ -1073,6 +1197,8 @@ module snitch_cluster .hive_rsp_t (hive_rsp_t), .acc_req_t (acc_req_t), .acc_resp_t (acc_resp_t), + .dca_req_t (dca_req_t), + .dca_resp_t (dca_resp_t), .dma_events_t (dma_events_t), .BootAddr (BootAddrInternal), .RVE (RVE[i]), @@ -1089,6 +1215,7 @@ module snitch_cluster .IsoCrossing (IsoCrossing), .Xfrep (Xfrep[i]), .Xssr (Xssr[i]), + .Xdca (Xdca), .Xcopift 
(Xcopift[i]), .Xipu (1'b0), .VMSupport (VMSupport), @@ -1113,6 +1240,8 @@ module snitch_cluster .RegisterSequencer (RegisterSequencer), .RegisterFPUIn (RegisterFPUIn), .RegisterFPUOut (RegisterFPUOut), + .RegisterDCAIn (RegisterDCAIn), + .RegisterDCAOut (RegisterDCAOut), .TCDMAddrWidth (TCDMAddrWidth), .CaqDepth (CaqDepth), .CaqTagWidth (CaqTagWidth), @@ -1141,7 +1270,13 @@ module snitch_cluster .core_events_o (core_events[i]), .tcdm_addr_base_i (tcdm_start_address), .barrier_o (barrier_in[i]), - .barrier_i (barrier_out) + .barrier_i (barrier_out), + .dca_req_i (dca_req[i]), + .dca_req_valid_i (dca_req_valid[i]), + .dca_req_ready_o (dca_req_ready[i]), + .dca_resp_o (dca_resp[i]), + .dca_resp_valid_o (dca_resp_valid[i]), + .dca_resp_ready_i (dca_resp_ready[i]) ); for (genvar j = 0; j < TcdmPorts; j++) begin : gen_tcdm_user always_comb begin @@ -1707,5 +1842,8 @@ module snitch_cluster ~AliasRegionEnable || ((TCDMSizeNapotRounded - 1) & AliasRegionBase) == 0) // Make sure we only have one DMA in the system. `ASSERT_INIT(NumberDMA, $onehot0(Xdma)) + // For the DCA Extension the (Currently) only allowed configuration is 8 CC and 1 DC (9 Cores) + `ASSERT_INIT(DCASystemConfiguration, (~Xdca) || (NrCores == 9)) + `ASSERT_INIT(DCASystemDMAWidth, (~Xdca) || (WideDataWidth == 512)) endmodule diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl index ac8bdd2ff..5c6d39d07 100644 --- a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -85,6 +85,30 @@ package ${cfg['cluster']['name']}_pkg; sram_cfg_t tcdm; } sram_cfgs_t; +% if cfg['cluster']['enable_dca']: + // Typedef to generate the 512 Bit request from the Router + typedef struct packed { + logic [2:0][WideDataWidth-1:0] dca_operands; // FP-Operands from the Router + fpnew_pkg::roundmode_e dca_rnd_mode; // Round Mode for the FPU for this OP --> logic [2:0] (@FPNEW Doku) + fpnew_pkg::operation_e dca_op_code; // OP Code for the FPU Command --> logic [3:0] (@FPNEW Doku) + logic dca_op_mode; // OP Mode for the corresponding Code + fpnew_pkg::fp_format_e dca_src_format; // Format for the Source --> logic [2:0] (@FPNEW Doku) + fpnew_pkg::fp_format_e dca_dst_format; // Format for the Destination --> logic [2:0] (@FPNEW Doku) + fpnew_pkg::int_format_e dca_int_format; // Format for the Integer --> logic [1:0] (@FPNEW Doku) + logic dca_vector_op; // Flag to indicate an vector operation + logic [6:0] dca_tag; // Make the tag accessible by the outside world + } dca_router_req_t; + + // Typedef to generate the 512 Bit response to the router + typedef struct packed { + logic [6:0] dca_tag; // Return the tag to the outside world + fpnew_pkg::status_t dca_status; // Status Flag(s) of the FPU --> logic [4:0] (@FPNEW Doku) + logic [WideDataWidth-1:0] dca_result; // Result of the FPU Operation + } dca_router_resp_t; +%else: + typedef logic dca_router_req_t; + typedef logic dca_router_resp_t; +%endif typedef logic [AddrWidth-1:0] addr_t; typedef logic [NarrowDataWidth-1:0] data_t; typedef logic [NarrowDataWidth/8-1:0] strb_t; diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index b7f32217e..fb6778656 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -51,7 +51,13 @@ module ${cfg['cluster']['name']}_wrapper ( output ${cfg['cluster']['name']}_pkg::narrow_out_req_t narrow_ext_req_o, input 
${cfg['cluster']['name']}_pkg::narrow_out_resp_t narrow_ext_resp_i, input ${cfg['cluster']['name']}_pkg::tcdm_dma_req_t tcdm_ext_req_i, - output ${cfg['cluster']['name']}_pkg::tcdm_dma_rsp_t tcdm_ext_resp_o + output ${cfg['cluster']['name']}_pkg::tcdm_dma_rsp_t tcdm_ext_resp_o, + input ${cfg['cluster']['name']}_pkg::dca_router_req_t dca_8x_req_i, + input logic dca_8x_req_valid_i, + output logic dca_8x_req_ready_o, + output ${cfg['cluster']['name']}_pkg::dca_router_resp_t dca_8x_resp_o, + output logic dca_8x_resp_valid_o, + input logic dca_8x_resp_ready_i ); localparam int unsigned NumIntOutstandingLoads [${cfg['cluster']['nr_cores']}] = '{${core_cfg('num_int_outstanding_loads')}}; @@ -119,6 +125,7 @@ module ${cfg['cluster']['name']}_wrapper ( .Xdma (${core_cfg_flat('xdma')}), .Xssr (${core_cfg_flat('xssr')}), .Xfrep (${core_cfg_flat('xfrep')}), + .Xdca (${int(cfg['cluster']['enable_dca'])}), .Xcopift (${core_cfg_flat('xcopift')}), .FPUImplementation (${cfg['cluster']['name']}_pkg::FPUImplementation), .SnitchPMACfg (${cfg['cluster']['name']}_pkg::SnitchPMACfg), @@ -151,6 +158,8 @@ module ${cfg['cluster']['name']}_wrapper ( .RegisterFPUReq (${int(cfg['cluster']['timing']['register_fpu_req'])}), .RegisterFPUIn (${int(cfg['cluster']['timing']['register_fpu_in'])}), .RegisterFPUOut (${int(cfg['cluster']['timing']['register_fpu_out'])}), + .RegisterDCAIn (${int(cfg['cluster']['timing']['register_dca_in'])}), + .RegisterDCAOut (${int(cfg['cluster']['timing']['register_dca_out'])}), .RegisterSequencer (${int(cfg['cluster']['timing']['register_sequencer'])}), .IsoCrossing (${int(cfg['cluster']['timing']['iso_crossings'])}), .NarrowXbarLatency (axi_pkg::${cfg['cluster']['timing']['narrow_xbar_latency']}), @@ -165,7 +174,9 @@ module ${cfg['cluster']['name']}_wrapper ( .CaqTagWidth (${int(cfg['cluster']['caq_tag_width'])}), .DebugSupport (${int(cfg['cluster']['enable_debug'])}), .AliasRegionEnable (${int(cfg['cluster']['alias_region_enable'])}), - .AliasRegionBase (${int(cfg['cluster']['alias_region_base'])}) + .AliasRegionBase (${int(cfg['cluster']['alias_region_base'])}), + .dca_router_req_t (snitch_cluster_pkg::dca_router_req_t), + .dca_router_resp_t (snitch_cluster_pkg::dca_router_resp_t) ) i_cluster ( .clk_i, .rst_ni, @@ -220,6 +231,21 @@ module ${cfg['cluster']['name']}_wrapper ( .wide_out_req_o, .wide_out_resp_i, .wide_in_req_i, - .wide_in_resp_o + .wide_in_resp_o, +% if cfg['cluster']['enable_dca']: + .dca_8x_req_i, + .dca_8x_req_valid_i, + .dca_8x_req_ready_o, + .dca_8x_resp_o, + .dca_8x_resp_valid_o, + .dca_8x_resp_ready_i +% else: + .dca_8x_req_i ('0), + .dca_8x_req_valid_i (1'b0), + .dca_8x_req_ready_o, + .dca_8x_resp_o, + .dca_8x_resp_valid_o, + .dca_8x_resp_ready_i (1'b0) +%endif ); endmodule diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index 345e1eb3a..f8d936cc7 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -21,11 +21,14 @@ module snitch_fp_ss import snitch_pkg::*; #( parameter bit Xfrep = 1, parameter fpnew_pkg::fpu_implementation_t FPUImplementation = '0, parameter bit Xssr = 1, + parameter bit Xdca = 0, parameter bit Xcopift = 1, parameter int unsigned NumSsrs = 0, parameter logic [NumSsrs-1:0][4:0] SsrRegs = '0, parameter type acc_req_t = logic, parameter type acc_resp_t = logic, + parameter type dca_req_t = logic, + parameter type dca_resp_t = logic, parameter bit RVF = 1, parameter bit RVD = 1, parameter bit XF16 = 0, @@ -43,6 +46,7 @@ module snitch_fp_ss import snitch_pkg::*; #( // 
pragma translate_off
     output fpu_trace_port_t trace_port_o,
     output fpu_sequencer_trace_port_t sequencer_tracer_port_o,
+    output dca_trace_port_t dca_trace_port_o,
     // pragma translate_on
     input logic [31:0] hart_id_i,
     // Accelerator Interface - Slave
@@ -80,7 +84,14 @@
     // TODO: is it good enough to assert this at issuing time instead?
     output logic caq_pvalid_o,
     // Core event strobes
-    output core_events_t core_events_o
+    output core_events_t core_events_o,
+    // Direct Compute Access (DCA) interface
+    input dca_req_t dca_req_i,
+    input logic dca_req_valid_i,
+    output logic dca_req_ready_o,
+    output dca_resp_t dca_resp_o,
+    output logic dca_resp_valid_o,
+    input logic dca_resp_ready_i
 );

   fpnew_pkg::operation_e fpu_op;
@@ -170,6 +181,30 @@

   logic dst_ready;

+
+  // Type for DCA (Direct Compute Access)
+
+  // Typedef to bundle the FPU request
+  typedef struct packed {
+    logic [2:0][FLEN-1:0] operands; // FP-Operands from the Router
+    fpnew_pkg::roundmode_e rnd_mode; // Round Mode for the FPU for this OP --> logic [2:0] (@FPNEW Doku)
+    fpnew_pkg::operation_e op_code; // OP Code for the FPU Command --> logic [3:0] (@FPNEW Doku)
+    logic op_mode; // OP Mode for the corresponding Code
+    fpnew_pkg::fp_format_e src_format; // Format for the Source --> logic [2:0] (@FPNEW Doku)
+    fpnew_pkg::fp_format_e dst_format; // Format for the Destination --> logic [2:0] (@FPNEW Doku)
+    fpnew_pkg::int_format_e int_format; // Format for the Integer --> logic [1:0] (@FPNEW Doku)
+    logic vector_op; // Flag to indicate a vector operation
+    logic [7:0] tag; // Make the tag accessible by the outside world
+  } fpu_req_t;
+
+  // Typedef to bundle the FPU response
+  typedef struct packed {
+    logic [7:0] tag; // Return the tag to the outside world
+    fpnew_pkg::status_t status; // Status Flag(s) of the FPU --> logic [4:0] (@FPNEW Doku)
+    logic [FLEN-1:0] result; // Result of the FPU Operation
+  } fpu_resp_t;
+
+
   // -------------
   // FPU Sequencer
   // -------------
@@ -2587,43 +2622,150 @@
   end
   end

+  // ----------------------
+  // Direct Compute Access
+  // ----------------------
+  // This part allows the router to "hijack" the FPU from the core to execute FP operations.
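+  // In short: core and DCA requests are merged round-robin into the single
+  // FPU, and the tag is widened by one MSB (1 = DCA, 0 = core) so that the
+  // demultiplexer behind the FPU can steer every result back to its
+  // originator without additional bookkeeping state.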
+ + // Define signals to connect the FPU + fpu_req_t dca_req; + fpu_req_t snitch_req; + + fpu_req_t mux_dca_snitch; + logic mux_dca_snitch_valid; + logic mux_dca_snitch_ready; + + fpu_resp_t demux_dca_snitch; + logic demux_dca_snitch_valid; + logic demux_dca_snitch_ready; + + // Assign the signal for the DCA Input if the extension is enabled + if (Xdca) begin : gen_assign_input_dca + assign dca_req.operands = dca_req_i.dca_operands; + assign dca_req.rnd_mode = dca_req_i.dca_rnd_mode; + assign dca_req.op_code = dca_req_i.dca_op_code; + assign dca_req.op_mode = dca_req_i.dca_op_mode; + assign dca_req.src_format = dca_req_i.dca_src_format; + assign dca_req.dst_format = dca_req_i.dca_dst_format; + assign dca_req.int_format = dca_req_i.dca_int_format; + assign dca_req.vector_op = dca_req_i.dca_vector_op; + assign dca_req.tag = {1'b1, dca_req_i.dca_tag}; + end + + // Assign the signal for the FPU Input + assign snitch_req.operands = op; + assign snitch_req.rnd_mode = fpu_rnd_mode; + assign snitch_req.op_code = fpu_op; + assign snitch_req.op_mode = op_mode; + assign snitch_req.src_format = src_fmt; + assign snitch_req.dst_format = dst_fmt; + assign snitch_req.int_format = int_fmt; + assign snitch_req.vector_op = vectorial_op; + assign snitch_req.tag = {1'b0, fpu_tag_in}; + + // We use rotating priority - Could be changed afterwards but the problem is the rr_arb_tree doesn't really support locked'in & priority @ same time + // --> Would violate AXI Timing + // Can write by myself but is this really a good use case? + + // -------------------------------------------- + // Multiplexer between Core and Direct Access + // -------------------------------------------- + if (Xdca) begin : gen_xdca_mux + stream_arbiter #( + .DATA_T (fpu_req_t), + .N_INP (2), + .ARBITER ("rr") + ) i_mux_dca ( + .clk_i (clk_i), + .rst_ni (~rst_i), + + .inp_data_i ({dca_req, snitch_req}), + .inp_valid_i ({dca_req_valid_i, fpu_in_valid}), + .inp_ready_o ({dca_req_ready_o, fpu_in_ready}), + + .oup_data_o (mux_dca_snitch), + .oup_valid_o (mux_dca_snitch_valid), + .oup_ready_i (mux_dca_snitch_ready) + ); + end else begin : gen_xdca_bypass_mux + assign mux_dca_snitch_valid = fpu_in_valid; + assign fpu_in_ready = mux_dca_snitch_ready; + assign mux_dca_snitch = snitch_req; + assign dca_req_ready_o = 1'b0; + end + // ---------------------- // Floating Point Unit // ---------------------- snitch_fpu #( - .RVF ( RVF ), - .RVD ( RVD ), - .XF16 ( XF16 ), - .XF16ALT ( XF16ALT ), - .XF8 ( XF8 ), - .XF8ALT ( XF8ALT ), - .XFVEC ( XFVEC ), - .FLEN ( FLEN ), - .FPUImplementation (FPUImplementation), - .RegisterFPUIn (RegisterFPUIn), - .RegisterFPUOut (RegisterFPUOut) + .RVF ( RVF ), + .RVD ( RVD ), + .XF16 ( XF16 ), + .XF16ALT ( XF16ALT ), + .XF8 ( XF8 ), + .XF8ALT ( XF8ALT ), + .XFVEC ( XFVEC ), + .FLEN ( FLEN ), + .FPUImplementation ( FPUImplementation ), + .RegisterFPUIn ( RegisterFPUIn ), + .RegisterFPUOut ( RegisterFPUOut ) ) i_fpu ( - .clk_i , - .rst_ni ( ~rst_i ), - .hart_id_i ( hart_id_i ), - .operands_i ( op ), - .rnd_mode_i ( fpu_rnd_mode ), - .op_i ( fpu_op ), - .op_mod_i ( op_mode ), // Sign of operand? 
- .src_fmt_i ( src_fmt ), - .dst_fmt_i ( dst_fmt ), - .int_fmt_i ( int_fmt ), - .vectorial_op_i ( vectorial_op ), - .tag_i ( fpu_tag_in ), - .in_valid_i ( fpu_in_valid ), - .in_ready_o ( fpu_in_ready ), - .result_o ( fpu_result ), - .status_o ( fpu_status_o ), - .tag_o ( fpu_tag_out ), - .out_valid_o ( fpu_out_valid ), - .out_ready_i ( fpu_out_ready ) + .clk_i , + .rst_ni ( ~rst_i ), + .hart_id_i ( hart_id_i ), + .operands_i ( mux_dca_snitch.operands ), + .rnd_mode_i ( mux_dca_snitch.rnd_mode ), + .op_i ( mux_dca_snitch.op_code ), + .op_mod_i ( mux_dca_snitch.op_mode ), // Sign of operand? + .src_fmt_i ( mux_dca_snitch.src_format ), + .dst_fmt_i ( mux_dca_snitch.dst_format ), + .int_fmt_i ( mux_dca_snitch.int_format ), + .vectorial_op_i ( mux_dca_snitch.vector_op ), + .tag_i ( mux_dca_snitch.tag ), + .in_valid_i ( mux_dca_snitch_valid ), + .in_ready_o ( mux_dca_snitch_ready ), + .result_o ( demux_dca_snitch.result ), + .status_o ( demux_dca_snitch.status ), + .tag_o ( demux_dca_snitch.tag ), + .out_valid_o ( demux_dca_snitch_valid ), + .out_ready_i ( demux_dca_snitch_ready ) ); + // -------------------------------------------- + // Demultiplexer between Core and Direct Access + // -------------------------------------------- + if (Xdca) begin : gen_xdca_demux + stream_demux #( + .N_OUP (32'd2) + ) i_demux_dca ( + .inp_valid_i (demux_dca_snitch_valid), + .inp_ready_o (demux_dca_snitch_ready), + + .oup_sel_i (demux_dca_snitch.tag[7]), + + .oup_valid_o ({dca_resp_valid_o, fpu_out_valid}), + .oup_ready_i ({dca_resp_ready_i, fpu_out_ready}) + ); + // Connect the data signals to the output + assign dca_resp_o.dca_result = demux_dca_snitch.result; + assign dca_resp_o.dca_status = demux_dca_snitch.status; + assign dca_resp_o.dca_tag = demux_dca_snitch.tag[6:0]; + // No DCA IF is required + end else begin : gen_xdca_bypass_demux + assign demux_dca_snitch_ready = fpu_out_ready; + assign fpu_out_valid = demux_dca_snitch_valid; + + assign dca_resp_valid_o = 1'b0; + assign dca_resp_o = '0; + end + + // Assign the signals to the corresponding outputs + assign fpu_result = demux_dca_snitch.result; + assign fpu_status_o = demux_dca_snitch.status; + assign fpu_tag_out = demux_dca_snitch.tag[6:0]; + + // DCA End + assign ssr_waddr_o = fpr_waddr; assign ssr_wdata_o = fpr_wdata; logic [63:0] nan_boxed_arga; @@ -2751,6 +2893,7 @@ module snitch_fp_ss import snitch_pkg::*; #( // Tracer // pragma translate_off + // Assign the FPU trace assign trace_port_o.source = snitch_pkg::SrcFpu; assign trace_port_o.acc_q_hs = (acc_req_valid_q && acc_req_ready_q ); assign trace_port_o.fpu_out_hs = (fpu_out_valid && fpu_out_ready ); @@ -2785,6 +2928,29 @@ module snitch_fp_ss import snitch_pkg::*; #( assign trace_port_o.fpr_waddr = fpr_waddr[0]; assign trace_port_o.fpr_wdata = fpr_wdata[0]; assign trace_port_o.fpr_we = fpr_we[0]; + + // Assign the DCA tracer + if (Xdca) begin : gen_xdca_tracer + assign dca_trace_port_o.source = snitch_pkg::SrcDca; + assign dca_trace_port_o.dca_in_hs = (dca_req_valid_i && dca_req_ready_o); + assign dca_trace_port_o.dca_out_hs = (dca_resp_valid_o && dca_resp_ready_i); + assign dca_trace_port_o.dca_in_op_code = dca_req_i.dca_op_code; + assign dca_trace_port_o.dca_in_op_mode = dca_req_i.dca_op_mode; + assign dca_trace_port_o.dca_in_rnd_mode = dca_req_i.dca_rnd_mode; + assign dca_trace_port_o.dca_in_vector_mode = dca_req_i.dca_vector_op; + assign dca_trace_port_o.dca_in_op_0 = dca_req_i.dca_operands[0]; + assign dca_trace_port_o.dca_in_op_1 = dca_req_i.dca_operands[1]; + assign 
dca_trace_port_o.dca_in_op_2 = dca_req_i.dca_operands[2]; + assign dca_trace_port_o.dca_in_src_fmt = dca_req_i.dca_src_format; + assign dca_trace_port_o.dca_in_dst_fmt = dca_req_i.dca_dst_format; + assign dca_trace_port_o.dca_in_int_fmt = dca_req_i.dca_int_format; + assign dca_trace_port_o.dca_in_tag = dca_req_i.dca_tag; + assign dca_trace_port_o.dca_out_tag = dca_resp_o.dca_tag; + assign dca_trace_port_o.dca_out_status = dca_resp_o.dca_status; + assign dca_trace_port_o.dca_out_result = dca_resp_o.dca_result; + end else begin + assign dca_trace_port_o = '0; + end // pragma translate_on /// Assertions diff --git a/hw/snitch_cluster/src/snitch_fpu.sv b/hw/snitch_cluster/src/snitch_fpu.sv index ed7958edc..8782e2368 100644 --- a/hw/snitch_cluster/src/snitch_fpu.sv +++ b/hw/snitch_cluster/src/snitch_fpu.sv @@ -28,14 +28,14 @@ module snitch_fpu import snitch_pkg::*; #( input fpnew_pkg::fp_format_e dst_fmt_i, input fpnew_pkg::int_format_e int_fmt_i, input logic vectorial_op_i, - input logic [6:0] tag_i, + input logic [7:0] tag_i, // Input Handshake input logic in_valid_i, output logic in_ready_o, // Output signals output logic [FLEN-1:0] result_o, output logic [4:0] status_o, - output logic [6:0] tag_o, + output logic [7:0] tag_o, // Output handshake output logic out_valid_o, input logic out_ready_i @@ -58,13 +58,13 @@ module snitch_fpu import snitch_pkg::*; #( fpnew_pkg::fp_format_e dst_fmt; fpnew_pkg::int_format_e int_fmt; logic vectorial_op; - logic [6:0] tag; + logic [7:0] tag; } fpu_in_t; typedef struct packed { logic [FLEN-1:0] result; logic [4:0] status; - logic [6:0] tag; + logic [7:0] tag; } fpu_out_t; fpu_in_t fpu_in_q, fpu_in; @@ -102,7 +102,7 @@ module snitch_fpu import snitch_pkg::*; #( // FPU configuration .Features ( FPUFeatures ), .Implementation ( FPUImplementation ), - .TagType ( logic[6:0] ), + .TagType ( logic[7:0] ), .CompressedVecCmpResult ( 1 ), .StochasticRndImplementation ( fpnew_pkg::DEFAULT_RSR ) ) i_fpu ( diff --git a/target/snitch_cluster/test/testharness.sv b/target/snitch_cluster/test/testharness.sv index 84b17ef49..fdefe8e78 100644 --- a/target/snitch_cluster/test/testharness.sv +++ b/target/snitch_cluster/test/testharness.sv @@ -51,7 +51,13 @@ module testharness #( .narrow_ext_req_o (), .narrow_ext_resp_i ('0), .tcdm_ext_req_i ('0), - .tcdm_ext_resp_o () + .tcdm_ext_resp_o (), + .dca_8x_req_i ('0), + .dca_8x_req_valid_i ('0), + .dca_8x_req_ready_o (), + .dca_8x_resp_o (), + .dca_8x_resp_valid_o (), + .dca_8x_resp_ready_i ('0) ); /////////// diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json index e8c7b72d7..ba5fb1a98 100644 --- a/util/clustergen/schema/snitch_cluster.schema.json +++ b/util/clustergen/schema/snitch_cluster.schema.json @@ -186,6 +186,11 @@ "description": "Whether to enable wide multicast and reduction support in the cluster. Requires multicast and reduction requests to be handled in the SoC interconnect.", "default": false }, + "enable_dca": { + "type": "boolean", + "description": "Whether to enable the direct compute access into the cluster", + "default": false + }, "hart_base_id": { "type": "number", "description": "Base hart id of the cluster. 
All cores get the respective cluster id plus their cluster position as the final `hart_id`.",
@@ -331,6 +336,16 @@
       "description": "Insert Pipeline registers immediately after FPU datapath",
       "default": false
     },
+    "register_dca_in": {
+      "type": "boolean",
+      "description": "Insert Pipeline registers immediately before the DCA request path merges into the FPU",
+      "default": false
+    },
+    "register_dca_out": {
+      "type": "boolean",
+      "description": "Insert Pipeline registers immediately after the DCA response path diverges from the FPU",
+      "default": false
+    },
     "register_tcdm_cuts": {
       "type": "boolean",
       "description": "Insert Pipeline registers after each memory cut.",
       "default": false
     },
diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py
index ecf0d54f8..3e89b5492 100755
--- a/util/trace/gen_trace.py
+++ b/util/trace/gen_trace.py
@@ -81,7 +81,7 @@
                    for i in range(2, 12)),
                  *('ft{}'.format(i)
                    for i in range(8, 12)))
-TRACE_SRCES = {'snitch': 0, 'fpu': 1, 'sequencer': 2}
+TRACE_SRCES = {'snitch': 0, 'fpu': 1, 'sequencer': 2, 'dca': 3}
 
 LS_SIZES = ('Byte', 'Half', 'Word', 'Doub')
 
@@ -834,6 +834,144 @@ def annotate_fpu(
     return ', '.join(ret)
 
 
+# Small helper function which parses the data for one input operand.
+# The vector flag indicates if multiple entries are available.
+def helper_parse_register_data(data, vector_flag, isIntFormat, Format) -> str:
+    ret = ''
+    decoded_fp_format = [
+        {"name":"FP32", "vec":2,"length":32},
+        {"name":"FP64", "vec":1,"length":64},
+        {"name":"FP16", "vec":4,"length":16},
+        {"name":"FP8", "vec":8,"length":8},
+        {"name":"FP16A", "vec":4,"length":16},
+        {"name":"FP8A", "vec":8,"length":8}
+    ]
+
+    decoded_int_format = [
+        {"name":"INT8", "vec":8,"length":8},
+        {"name":"INT16", "vec":4,"length":16},
+        {"name":"INT32", "vec":2,"length":32},
+        {"name":"INT64", "vec":1,"length":64}
+    ]
+
+    # Parse the data depending on whether it is a vector and/or a float
+    if(vector_flag == 0):
+        if(isIntFormat == False):
+            ret = f'[{flt_fmt(flt_decode(data, Format), 6)}]'
+        else:
+            temp_data = (data & ((1 << decoded_int_format[Format]["length"])-1))
+            ret = f'[{temp_data}]'
+    else:
+        if(isIntFormat == False):
+            ret = '['
+            for i in range(decoded_fp_format[Format]["vec"]):
+                temp_data = (data >> (i*decoded_fp_format[Format]["length"])) & ((1 << decoded_fp_format[Format]["length"])-1)
+                ret = ret + f'{flt_fmt(flt_decode(temp_data, Format), 6)} ,'
+            ret = ret[:-2] + ']'
+        else:
+            ret = '['
+            for i in range(decoded_int_format[Format]["vec"]):
+                temp_data = (data >> (i*decoded_int_format[Format]["length"])) & ((1 << decoded_int_format[Format]["length"])-1)
+                ret = ret + f'{temp_data} ,'
+            ret = ret[:-2] + ']'
+
+    # Append the Format to the Values
+    if(isIntFormat == False):
+        ret = ret + f' ({decoded_fp_format[Format]["name"]})'
+    else:
+        ret = ret + f' ({decoded_int_format[Format]["name"]})'
+    return ret
+
+# Helper function to evaluate the status of the FPU
+def helper_eval_status(status: int) -> str:
+    flags = ["NV, ", "DZ, ", "OF, ", "UF, ", "NX, "]
+    ret = "".join(flag for i, flag in enumerate(flags) if status & (1 << (len(flags) - 1 - i)))
+
+    if(ret == ""):
+        return "IO"
+    else:
+        return ret[:-2]
+
+# Annotate DCA Instruction
+# Info: We receive the pure op-code / op-mode / vector-mode from the tracer
+# rather than the 32-bit inst. As it is more tedious to parse them back into
+# the 32-bit format, it is easier to parse them by hand.
+# Docs: https://github.com/openhwgroup/cvfpu/tree/master/docs
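+# Note: requests and responses are matched through a single deque, which
+# assumes the FPU completes DCA operations in order: each accepted request
+# pushes its expected result format and each response pops one entry.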
+def annotate_dca(
+        extras: dict,
+        insn: str,
+        cycle: int,
+        dca_wb_info: dict,  # One deque (FIFO) for storing information about the DCA access
+        perf_metrics: list,
+        force_hex_addr: bool = True,
+        permissive: bool = False
+        ):
+
+    inst_decoded = [
+        {"ops_name":"FMADD",    "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"FNMSUB",   "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"ADD",      "req_op_0":False, "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"MUL",      "req_op_0":True,  "req_op_1":True,  "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"DIV",      "req_op_0":True,  "req_op_1":True,  "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"SQRT",     "req_op_0":True,  "req_op_1":False, "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"SGNJ",     "req_op_0":True,  "req_op_1":True,  "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"MINMAX",   "req_op_0":True,  "req_op_1":True,  "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"CMP",      "req_op_0":True,  "req_op_1":True,  "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"CLASSIFY", "req_op_0":True,  "req_op_1":False, "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"F2F",      "req_op_0":True,  "req_op_1":False, "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"F2I",      "req_op_0":True,  "req_op_1":False, "req_op_2":False, "SrcFmtInt":False, "DstFmtInt":True},
+        {"ops_name":"I2F",      "req_op_0":True,  "req_op_1":False, "req_op_2":False, "SrcFmtInt":True,  "DstFmtInt":False},
+        {"ops_name":"CPKAB",    "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"CPKCD",    "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"SDOTP",    "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"EXVSUM",   "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+        {"ops_name":"VSUM",     "req_op_0":True,  "req_op_1":True,  "req_op_2":True,  "SrcFmtInt":False, "DstFmtInt":False},
+    ]
+
+    rnd_decode = ['RNE','RTZ','RDN','RUP','RMM','ROD','RSR','DYN']
+
+    inst = ''
+    annot = ''
+
+    if(extras['dca_in_hs'] == 1):
+        # Decode the instruction name / op mode / rounding mode
+        inst = f"DCA {inst_decoded[extras['op_code']]['ops_name']} (M:{extras['op_mode']} RND:{rnd_decode[extras['rnd_mode']]})"
+
+        # Add Operand 0 if required
+        if(inst_decoded[extras['op_code']]['req_op_0'] == True):
+            # Either pass the SRC FP Format or the Int Format depending on the Instruction (Only valid on op[0])
+            if(inst_decoded[extras['op_code']]['SrcFmtInt'] == True):
+                annot = annot + 'op[0] = ' + helper_parse_register_data(extras['op_0'], extras['vector_mode'], True, extras['int_format']) + ', '
+            else:
+                annot = annot + 'op[0] = ' + helper_parse_register_data(extras['op_0'], extras['vector_mode'], False, extras['src_format']) + ', '
+
+        # Add Operand 1 if required
+        if(inst_decoded[extras['op_code']]['req_op_1'] == True):
+            annot = annot + 'op[1] = ' + helper_parse_register_data(extras['op_1'], extras['vector_mode'], False, extras['src_format']) + ', '
+
+        # Add Operand 2 if required
+        if(inst_decoded[extras['op_code']]['req_op_2'] == True):
+            annot = annot + 'op[2] = ' + helper_parse_register_data(extras['op_2'], extras['vector_mode'], False, extras['src_format']) + ', '
+
+        # Finally add the tag
+        annot = annot + f"tag {extras['in_tag']};"
+
+        # Update the performance metric
+        perf_metrics[-1]['dca_fpu_issues'] += 1
+
+        # Push the dst format to the queue, along with whether it is an int format or not
+        if(inst_decoded[extras['op_code']]['DstFmtInt'] == True):
+            dca_wb_info[0].appendleft((True, extras['int_format'], extras['vector_mode']))
+        else:
+            dca_wb_info[0].appendleft((False, extras['dst_format'], extras['vector_mode']))
+
+    # Handle the Output Handshake
+    if(extras['dca_out_hs'] == 1):
+        isIntFormat, Format, VectorMode = dca_wb_info[0].pop()
+        annot = annot + '(d:dca) res = ' + helper_parse_register_data(extras['result'], VectorMode, isIntFormat, Format) + ', status = ' + helper_eval_status(extras['status']) + f", tag {extras['out_tag']};"
+
+    return inst, annot
+
 # noinspection PyTypeChecker
 def annotate_insn(
     line: str,
@@ -841,6 +979,8 @@
     dict,  # One deque (FIFO) per GPR storing start cycles for each GPR WB
     fpr_wb_info:
     dict,  # One deque (FIFO) per FPR storing start cycles and formats for each FPR WB
+    dca_wb_info:
+    dict,  # One deque (FIFO) for storing information about the DCA access
     sequencer: Sequencer,  # Sequencer model to properly map tunneled instruction PCs
     perf_metrics: list,  # A list performance metric dicts
@@ -925,6 +1065,11 @@
                                           sequencer.curr_sec, int_as_hex,
                                           permissive))
             annot = ', '.join(annot_list)
+
+        # Annotate DCA
+        elif extras['source'] == TRACE_SRCES['dca']:
+            insn, annot = annotate_dca(extras, insn, time_info[1], dca_wb_info, perf_metrics)
+
         else:
             raise ValueError('Unknown trace source: {}'.format(
                 extras['source']))
@@ -1103,6 +1248,7 @@ def custom_formatwarning(message, category, filename, lineno, line=None):
     time_info = None
     gpr_wb_info = defaultdict(deque)
     fpr_wb_info = defaultdict(deque)
+    dca_wb_info = defaultdict(deque)
     sequencer = Sequencer()
     dma_trans = [{'rep': 1}]
     perf_metrics = [
@@ -1117,6 +1263,7 @@
                 line,
                 gpr_wb_info,
                 fpr_wb_info,
+                dca_wb_info,
                 sequencer,
                 perf_metrics,
                 args.mc_exec,
@@ -1182,6 +1329,10 @@ def wb_msg(reg_name, transactions):
         if len(que) != 0:
             warn_trip = True
             warnings.warn(wb_msg(REG_ABI_NAMES_I[gpr], que))
+    for dca, que in dca_wb_info.items():
+        if len(que) != 0:
+            warn_trip = True
+            warnings.warn(wb_msg("DCA", que))
     # Check final state of sequencer is clean
     if sequencer.terminate():
         warn_trip = True

From 12d417831640b78320a4f70a1a26485b9ca30fd8 Mon Sep 17 00:00:00 2001
From: Raphael
Date: Fri, 25 Jul 2025 14:28:47 +0200
Subject: [PATCH 38/38] Merge everything relevant to the GEMM / MHA from
 snitch into my branch

Squashed commit of the following commits:

* commit 8fd7a66500bc0caf7091dc945231f263e097004c - Fix tracing
* commit e27b57e4fb3ad87c741e00bcc5a6366800ed0727 - sw: Use DataGen class in FlashAttention-2 and FusedConcatLinear data generators
* commit cfee4e1823763407da0fef9826b91e6a346316bf - sw: Add MHA kernel
* commit e04070414722c0ef3e8bb9af88f8ee6ab536f116 - sw: Enable GEMM parallelized over K on subset of clusters
* commit 43b8dd84e965f8e812cab4a4c2b7de83e31bbaf2 - target: Separate HAL source and build dirs
* commit 16b74eadc89827a485d237763e500e8ab500c368 - target: Add missing RDL files to clean targets
* commit 4d2b3121f0f563df6366b889af8427e37edb4334 - flashattention_2: Fix to work on multi-cluster systems
* commit 2a9536f0185353441536c3503cf0f0df48658cfa - docs: Add system integration page
* commit 5769bbdd782a8f4e210cfddc4c8873ca7d7c2ecd - sw: Make `snitch_cluster_cfg.h.tpl` depend only on config
* commit c937cfcad9d54608ad3d74dc20d1d240c1db317a - docs: Add system integration guide
* commit 5fcf257337324f0c9fe25b49c1f57d08a90e25b6 - runtime: Fix global reduction with DMA
* commit 165065430ffe5d8ed5b980d4a70b583476463629 - target: Streamline `SNRT_APPS` integration in derived systems
* commit bc60d210b378ab00e7faaffdab3d4ab94217bbd8 - target: Pick up CLI gentrace flags and set `--permissive` when debugging
* commit 155f764a74faa8b1b0f57cdb03ab7cc1b3923c98 - target: Update trace visualization command after `SN_CFG` name change
* commit 3720c55a1a11e9b8fc6cbe60ddead8b65a93c7cb - sw: Add multicast 2D tile transfer functions
* commit d17d87c72df5047ba56b4baf54ccf855a3097387 - sw: Enable overriding scripts directory
* commit 6b75d9992c3c457a7132038588059fdcce14ab8c - runtime: Fix CLS pointer initialization
* commit 9528b4a6ab3f5275c72f6a29fba3c5404a4cad13 - runtime: Fix `snrt_wake_up` with fence
* commit 8d7345049d2611fad7750c037908c2f23f869812 - runtime: Add `snrt_fence` routine
* commit 7f430f2772616398659d9cd2f2767617e0fe6cf8 - Expose multiple wide TCDM ports (#258)
---
 docs/ug/system_integration.md                 |  51 ++
 hw/snitch_cluster/src/snitch_cluster.sv       |  62 +--
 .../src/snitch_cluster_wrapper.sv.tpl         |  20 +-
 mkdocs.yml                                    |   1 +
 sw/apps/common.mk                             |  10 +-
 sw/blas/gemm/src/gemm.h                       |  13 +-
 sw/dnn/flashattention_2/scripts/datagen.py    | 475 ++++++++----------
 .../src/flashattention_2_fp16.h               |   2 +-
 .../src/flashattention_2_fp32.h               |   2 +-
 .../src/flashattention_2_fp8.h                |   2 +-
 .../src/fused_concat_linear.h                 |   2 +-
 sw/dnn/mha/data/params.json                   |  14 +
 sw/dnn/mha/scripts/datagen.py                 | 259 ++++++++++
 sw/dnn/mha/scripts/verify.py                  | 106 ++++
 sw/dnn/mha/src/main.c                         |  14 +
 sw/dnn/mha/src/mha.h                          |  60 +++
 sw/dnn/mha/src/mha_fp32.h                     |  47 ++
 sw/dnn/src/dnn.h                              |   1 +
 sw/snRuntime/api/sync_decls.h                 |   2 +-
 sw/snRuntime/src/dma.c                        |  41 +-
 sw/snRuntime/src/dma.h                        | 130 ++++-
 sw/snRuntime/src/sync.h                       |  71 ++-
 target/common/common.mk                       |  10 +-
 target/common/rtl.mk                          |   2 +-
 target/snitch_cluster/sw.mk                   |  94 ++--
 target/snitch_cluster/sw/apps/dnn/mha/app.mk  |  13 +
 .../runtime/common/snitch_cluster_cfg.h.tpl   |   1 -
 .../snitch_cluster/sw/runtime/rtl/src/snrt.S  |   3 +
 .../snitch_cluster/sw/runtime/rtl/src/snrt.h  |   1 +
 target/snitch_cluster/sw/runtime/runtime.mk   |   2 +-
 .../schema/snitch_cluster.schema.json         |   8 +-
 util/trace/gen_trace.py                       |   3 +-
 32 files changed, 1130 insertions(+), 392 deletions(-)
 create mode 100644 docs/ug/system_integration.md
 create mode 100644 sw/dnn/mha/data/params.json
 create mode 100755 sw/dnn/mha/scripts/datagen.py
 create mode 100755 sw/dnn/mha/scripts/verify.py
 create mode 100644 sw/dnn/mha/src/main.c
 create mode 100644 sw/dnn/mha/src/mha.h
 create mode 100644 sw/dnn/mha/src/mha_fp32.h
 create mode 100644 target/snitch_cluster/sw/apps/dnn/mha/app.mk

diff --git a/docs/ug/system_integration.md b/docs/ug/system_integration.md
new file mode 100644
index 000000000..a31b292c5
--- /dev/null
+++ b/docs/ug/system_integration.md
@@ -0,0 +1,51 @@
+# Integrating the Snitch cluster in an SoC
+
+While this repository provides many IPs that can be reused independently, we suggest integrating the Snitch cluster as a whole, that is, the `snitch_cluster` module, into derived systems.
+
+The `snitch_cluster` module is implemented in [snitch_cluster.sv](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/hw/snitch_cluster/src/snitch_cluster.sv).
+
+## Configurability
+
+A reference instantiation of the Snitch cluster can be found in the testbench used to test the cluster within this repository, see [testharness.sv](https://github.com/pulp-platform/{{ repo }}/blob/{{ branch }}/target/snitch_cluster/test/testharness.sv).
+
+As you may note, we do not instantiate the `snitch_cluster` directly, but a so-called `snitch_cluster_wrapper` with a much simpler interface. All parameters of the `snitch_cluster` module are set within the wrapper.
+
+The benefit of the wrapper is that it can be programmatically generated from a single source of truth, namely a JSON5 configuration file, from which the software hardware-abstraction layer (HAL) and all other configuration-dependent sources in the repository are also generated.
+
+This way, if you want to modify the cluster configuration, you don't have to go and manually change it in multiple places (the RTL, the HAL, etc.), but only in the single-source-of-truth cluster configuration file. More information on the configuration file can be found in the [tutorial](tutorial.md#configuring-the-hardware).
+
+We suggest using the same approach when integrating the Snitch cluster in an SoC. This allows you to easily test different configurations of the cluster inside your SoC.
+
+## Integrating the RTL
+
+We provide Make rules to generate the cluster wrapper and other RTL files. Include the following lines in a Makefile to inherit Snitch's rules:
+```Makefile
+SN_ROOT = $(shell $(BENDER) path snitch_cluster)
+
+include $(SN_ROOT)/target/common/common.mk
+include $(SN_ROOT)/target/common/rtl.mk
+```
+
+!!! note
+    Snitch's Makefiles require `SN_ROOT` to be defined and to point to the root of the Snitch cluster repository. You can set this however you prefer, i.e. you don't have to use Bender if you manage your dependencies in a different way.
+
+You can then use the `sn-rtl` and `sn-clean-rtl` targets to build and clean, respectively, all of Snitch's generated RTL sources.
+
+
+## Integrating the software
+
+Similarly, Snitch comes with a collection of software tests and applications. These build on the functions provided by the Snitch runtime library, so they must be linked against an implementation of the latter. The runtime library abstracts away all the low-level characteristics of the system, allowing applications to be written in a mostly system-independent way, and to be portable to any multi-cluster Snitch-based system.
+To this end, every system must implement a hardware abstraction layer (HAL) for the Snitch runtime, which the mentioned infrastructure builds on.
+
+Given a path to the platform-specific HAL sources, you can reuse the Snitch cluster's Make rules to build the runtime, tests and applications for the target platform.
+Include the following lines in a Makefile to inherit Snitch's rules:
+
+```Makefile
+SN_RUNTIME_HAL_DIR = sw/runtime/hal
+
+include $(SN_ROOT)/target/common/sw.mk
+```
+
+The included Makefile(s) can be customized to some extent by overriding some variables before the Makefile inclusion line.
+For example, by setting `SNRT_BUILD_APPS = OFF`, none of the default Snitch applications will be built.
+You can explicitly set the list of applications to be built via the `SNRT_APPS` variable, which can include additional system-dependent applications you may develop in the system repository, as sketched below. For further information on the available customization options, you may want to take a look inside the recursively included Makefiles.
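+
+As a minimal sketch, assuming a hypothetical system-specific application under `sw/apps/axpy_custom` (the HAL path and application are placeholders, and the exact value format expected by `SNRT_APPS` should be checked in the included Makefiles), a derived system's Makefile could combine the above as follows:
+
+```Makefile
+SN_ROOT = $(shell $(BENDER) path snitch_cluster)
+SN_RUNTIME_HAL_DIR = sw/runtime/hal
+
+# Customization variables must be set before the inclusion lines
+SNRT_BUILD_APPS = OFF
+SNRT_APPS = sw/apps/axpy_custom
+
+include $(SN_ROOT)/target/common/common.mk
+include $(SN_ROOT)/target/common/rtl.mk
+include $(SN_ROOT)/target/common/sw.mk
+```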
diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 405945948..078561da5 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -68,6 +68,8 @@ module snitch_cluster parameter int unsigned DMAReqFifoDepth = 3, /// Number of DMA channels. parameter int unsigned DMANumChannels = 1, + /// Number of exposed TCDM wide ports + parameter int unsigned NumExpWideTcdmPorts = 1, /// Width of a single icache line. parameter int unsigned ICacheLineWidth [NrHives] = '{default: 0}, /// Number of icache lines per set. @@ -235,70 +237,70 @@ module snitch_cluster ) ( /// System clock. If `IsoCrossing` is enabled this port is the _fast_ clock. /// The slower, half-frequency clock, is derived internally. - input logic clk_i, + input logic clk_i, /// Asynchronous active high reset. This signal is assumed to be _async_. - input logic rst_ni, + input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic [NrCores-1:0] debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic [NrCores-1:0] meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic [NrCores-1:0] mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic [NrCores-1:0] msip_i, // External interrupt pending. - input logic [NrCores-1:0] mxip_i, + input logic [NrCores-1:0] mxip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. - input logic [9:0] hart_base_id_i, + input logic [9:0] hart_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. - input logic [PhysicalAddrWidth-1:0] cluster_base_addr_i, + input logic [PhysicalAddrWidth-1:0] cluster_base_addr_i, /// Configuration inputs for the memory cuts used in implementation. /// These signals are pseudo-static. - input sram_cfgs_t sram_cfgs_i, + input sram_cfgs_t sram_cfgs_i, /// Bypass half-frequency clock. (`d2` = divide-by-two). This signal is /// pseudo-static. - input logic clk_d2_bypass_i, + input logic clk_d2_bypass_i, /// AXI Core cluster in-port. - input narrow_in_req_t narrow_in_req_i, - output narrow_in_resp_t narrow_in_resp_o, + input narrow_in_req_t narrow_in_req_i, + output narrow_in_resp_t narrow_in_resp_o, /// AXI Core cluster out-port. - output narrow_out_req_t narrow_out_req_o, - input narrow_out_resp_t narrow_out_resp_i, + output narrow_out_req_t narrow_out_req_o, + input narrow_out_resp_t narrow_out_resp_i, /// AXI DMA cluster out-port. Usually wider than the cluster ports so that the /// DMA engine can efficiently transfer bulk of data. - output wide_out_req_t wide_out_req_o, - input wide_out_resp_t wide_out_resp_i, + output wide_out_req_t wide_out_req_o, + input wide_out_resp_t wide_out_resp_i, /// AXI DMA cluster in-port. 
- input wide_in_req_t wide_in_req_i, - output wide_in_resp_t wide_in_resp_o, + input wide_in_req_t wide_in_req_i, + output wide_in_resp_t wide_in_resp_o, // An additional AXI Core cluster out-port, used e.g. to connect // to the configuration interface of an external accelerator. // Compared to the `narrow_out` interface, the address space of // this port extends the cluster address space. We refer to the prior // as an external AXI plug, and to this as an externally-exposed // internal AXI plug. - output narrow_out_req_t narrow_ext_req_o, - input narrow_out_resp_t narrow_ext_resp_i, + output narrow_out_req_t narrow_ext_req_o, + input narrow_out_resp_t narrow_ext_resp_i, // External TCDM ports - input tcdm_dma_req_t tcdm_ext_req_i, - output tcdm_dma_rsp_t tcdm_ext_resp_o, + input tcdm_dma_req_t [NumExpWideTcdmPorts-1:0] tcdm_ext_req_i, + output tcdm_dma_rsp_t [NumExpWideTcdmPorts-1:0] tcdm_ext_resp_o, /// DCA IF to the FPU's - input dca_router_req_t dca_8x_req_i, - input logic dca_8x_req_valid_i, - output logic dca_8x_req_ready_o, + input dca_router_req_t dca_8x_req_i, + input logic dca_8x_req_valid_i, + output logic dca_8x_req_ready_o, /// DCA IF from the FPU's - output dca_router_resp_t dca_8x_resp_o, - output logic dca_8x_resp_valid_o, - input logic dca_8x_resp_ready_i + output dca_router_resp_t dca_8x_resp_o, + output logic dca_8x_resp_valid_o, + input logic dca_8x_resp_ready_i ); // --------- // Constants @@ -907,7 +909,7 @@ module snitch_cluster ); snitch_tcdm_interconnect #( - .NumInp (1), + .NumInp (NumExpWideTcdmPorts), .NumOut (NrSuperBanks), .NumHyperBanks (NrHyperBanks), .tcdm_req_t (tcdm_dma_req_t), diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index fb6778656..875d0b923 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -28,6 +28,12 @@ ${int(getattr(c['isa_parsed'], isa))}\ % endfor \ +<% + actual_num_exposed_wide_tcdm_ports = cfg['cluster']['num_exposed_wide_tcdm_ports'] + if actual_num_exposed_wide_tcdm_ports == 0: + actual_num_exposed_wide_tcdm_ports += 1 +%> + module ${cfg['cluster']['name']}_wrapper ( input logic clk_i, input logic rst_ni, @@ -50,8 +56,8 @@ module ${cfg['cluster']['name']}_wrapper ( output ${cfg['cluster']['name']}_pkg::wide_in_resp_t wide_in_resp_o, output ${cfg['cluster']['name']}_pkg::narrow_out_req_t narrow_ext_req_o, input ${cfg['cluster']['name']}_pkg::narrow_out_resp_t narrow_ext_resp_i, - input ${cfg['cluster']['name']}_pkg::tcdm_dma_req_t tcdm_ext_req_i, - output ${cfg['cluster']['name']}_pkg::tcdm_dma_rsp_t tcdm_ext_resp_o, + input ${cfg['cluster']['name']}_pkg::tcdm_dma_req_t [${actual_num_exposed_wide_tcdm_ports}-1:0] tcdm_ext_req_i, + output ${cfg['cluster']['name']}_pkg::tcdm_dma_rsp_t [${actual_num_exposed_wide_tcdm_ports}-1:0] tcdm_ext_resp_o, input ${cfg['cluster']['name']}_pkg::dca_router_req_t dca_8x_req_i, input logic dca_8x_req_valid_i, output logic dca_8x_req_ready_o, @@ -106,6 +112,7 @@ module ${cfg['cluster']['name']}_wrapper ( .DMANumAxInFlight (${cfg['cluster']['dma_axi_req_fifo_depth']}), .DMAReqFifoDepth (${cfg['cluster']['dma_req_fifo_depth']}), .DMANumChannels (${cfg['cluster']['dma_nr_channels']}), + .NumExpWideTcdmPorts (${actual_num_exposed_wide_tcdm_ports}), .ICacheLineWidth (${cfg['cluster']['name']}_pkg::ICacheLineWidth), .ICacheLineCount (${cfg['cluster']['name']}_pkg::ICacheLineCount), .ICacheWays (${cfg['cluster']['name']}_pkg::ICacheWays), @@ -217,13 +224,12 @@ module 
${cfg['cluster']['name']}_wrapper ( .narrow_ext_req_o (narrow_ext_req_o), .narrow_ext_resp_i (${cfg['cluster']['name']}_pkg::narrow_out_resp_t'('0)), % endif -% if cfg['cluster']['wide_tcdm_port_expose']: - .tcdm_ext_req_i (tcdm_ext_req_i), - .tcdm_ext_resp_o (tcdm_ext_resp_o), -% else: +% if cfg['cluster']['num_exposed_wide_tcdm_ports']==0: .tcdm_ext_req_i (${cfg['cluster']['name']}_pkg::tcdm_dma_req_t'('0)), - .tcdm_ext_resp_o (tcdm_ext_resp_o), +% else: + .tcdm_ext_req_i (tcdm_ext_req_i), % endif + .tcdm_ext_resp_o (tcdm_ext_resp_o), .narrow_in_req_i, .narrow_in_resp_o, .narrow_out_req_o, diff --git a/mkdocs.yml b/mkdocs.yml index 3a30f0bab..b706e2102 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,6 +51,7 @@ nav: - Advanced: - Trace Analysis: ug/trace_analysis.md - Code Optimization: ug/code_optimization.md + - System Integration: ug/system_integration.md - Documentation: ug/documentation.md - Reference Manual: - Hardware: diff --git a/sw/apps/common.mk b/sw/apps/common.mk index e45da98ea..68f5b9f94 100644 --- a/sw/apps/common.mk +++ b/sw/apps/common.mk @@ -5,12 +5,12 @@ # Luca Colagrande DATA_DIR := $(realpath $(SRC_DIR)/../data) -SCRIPTS_DIR := $(realpath $(SRC_DIR)/../scripts) -$(APP)_DATA_CFG ?= $(DATA_DIR)/params.json -SECTION ?= -DATA_H := $($(APP)_BUILD_DIR)/data.h -DATAGEN_PY = $(SCRIPTS_DIR)/datagen.py +$(APP)_SCRIPT_DIR ?= $(realpath $(SRC_DIR)/../scripts) +$(APP)_DATA_CFG ?= $(DATA_DIR)/params.json +SECTION ?= +DATA_H := $($(APP)_BUILD_DIR)/data.h +DATAGEN_PY := $($(APP)_SCRIPT_DIR)/datagen.py $(APP)_HEADERS := $(DATA_H) $(APP)_INCDIRS += $(dir $(DATA_H)) $(SRC_DIR) diff --git a/sw/blas/gemm/src/gemm.h b/sw/blas/gemm/src/gemm.h index 08003873a..523d99df8 100644 --- a/sw/blas/gemm/src/gemm.h +++ b/sw/blas/gemm/src/gemm.h @@ -175,7 +175,7 @@ static inline uint32_t calculate_partitioned_banks_stride( * 3. Allocates space in TCDM for local copies of matrix tiles, unless * matrix tiles are already stored in TCDM (see `load_* arguments`). * 4. Distributes tiles to clusters for parallel processing. - * 5. Iterates over the tiles, performing the following: + * 5. Each cluster iterates over the assigned tiles, performing the following: * - Copies data for the current tile into local memory. * - Performs the tile computation using the `sc_st_gemm` function. * - Performs a logarithmic reduction to combine partial results across @@ -226,8 +226,15 @@ static inline int gemm(const gemm_args_t *args) { // Distribute m and k tiles to clusters uint32_t cluster_m_tiles = largs->m_tiles; uint32_t cluster_k_tiles = largs->k_tiles; + uint32_t num_working_clusters = snrt_cluster_num(); if (largs->parallelize_m) cluster_m_tiles /= snrt_cluster_num(); - if (largs->parallelize_k) cluster_k_tiles /= snrt_cluster_num(); + if (largs->parallelize_k) { + uint32_t k_tiles_quotient = cluster_k_tiles / snrt_cluster_num(); + uint32_t k_tiles_remainder = cluster_k_tiles % snrt_cluster_num(); + cluster_k_tiles = k_tiles_quotient; + if (snrt_cluster_idx() < k_tiles_remainder) cluster_k_tiles++; + if (k_tiles_quotient == 0) num_working_clusters = k_tiles_remainder; + } // Calculate number of iterations uint32_t num_tiles = cluster_m_tiles * largs->n_tiles * cluster_k_tiles; @@ -456,7 +463,7 @@ static inline int gemm(const gemm_args_t *args) { // Note: both compute and DMA cores participate in this step. 
if (largs->parallelize_k && (comp_k == (cluster_k_tiles - 1))) { snrt_global_reduction_dma( - (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n); + (double *)lcr, (double *)lc[c_buff_idx], tile_m * tile_n, num_working_clusters); } } diff --git a/sw/dnn/flashattention_2/scripts/datagen.py b/sw/dnn/flashattention_2/scripts/datagen.py index 2ad1c98f0..a670936d2 100755 --- a/sw/dnn/flashattention_2/scripts/datagen.py +++ b/sw/dnn/flashattention_2/scripts/datagen.py @@ -6,276 +6,241 @@ # Viviane Potocnik # Luca Colagrande -import argparse import numpy as np -import pathlib -import json5 import torch import pyflexfloat as ff +import sys + +import snitch.util.sim.data_utils as du -from snitch.util.sim import data_utils -from snitch.util.sim.data_utils import format_struct_definition, \ - format_array_definition, format_array_declaration, emit_license from snitch.blas import gemm np.random.seed(42) torch.manual_seed(42) - -# AXI splits bursts crossing 4KB address boundaries. To minimize -# the occurrence of these splits the data should be aligned to 4KB -BURST_ALIGNMENT = 4096 - - -def torch_golden_model(Q, K, V): - return torch.nn.functional.scaled_dot_product_attention(Q, K, V) - - -def exact_golden_model(Q, K, V, B_r, B_c): - # Convert torch tensors to numpy arrays - Q = Q.numpy() - K = K.numpy() - V = V.numpy() - # Get layer dimensions - L = Q.shape[0] - S = K.shape[0] - # Calculate tiling parameters - T_r = L // B_r - T_c = S // B_c - # Transpose K - K_t = np.transpose(K) - # Iterate tiles - O_tiles = [] - for i in range(T_r): - # Tile Q - start_row = i * B_r - end_row = start_row + B_r - Q_i = Q[start_row:end_row, :] - # Initialize l_i, m_i, O_i - m_i = np.full((B_r, 1), -np.inf) - for j in range(T_c): - # Tile K_t and V - start_col = j * B_c - end_col = start_col + B_c - K_t_j = K_t[:, start_col:end_col] - V_j = V[start_col:end_col, ] - # Compute O tile update - S_ij = np.matmul(Q_i, K_t_j) - m_i_prev = m_i - m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True)) - shifted_exp = np.exp(m_i_prev - m_i) - P_ij = np.exp(S_ij - m_i) - PxV = np.matmul(P_ij, V_j) - if j == 0: - l_i = np.sum(P_ij, 1, keepdims=True) - O_i = PxV - else: - l_i = (shifted_exp * l_i) + np.sum(P_ij, 1, keepdims=True) - diag = np.diag(shifted_exp[:, 0]) - diag_inv = np.linalg.inv(diag) - O_i = np.matmul(diag_inv, O_i) - O_i += PxV - # Finalize O tile - diag_l_i = np.diag(l_i[:, 0]) - diag_l_inv_i = np.linalg.inv(diag_l_i) - O_i = np.matmul(diag_l_inv_i, O_i) - O_tiles.append(O_i) - return np.concatenate(O_tiles, 0) - - np.set_printoptions(formatter={'object': str}) -def exact_flexfloat_golden_model(Q, K, V, B_r, B_c, desc): - # Get layer dimensions - L = Q.shape[0] - d = Q.shape[1] - S = K.shape[0] - # Calculate tiling parameters - T_r = L // B_r - T_c = S // B_c - # Transpose K - K_t = np.transpose(K) - # Iterate tiles - O_tiles = [] - for i in range(T_r): - # Tile Q - start_row = i * B_r - end_row = start_row + B_r - Q_i = Q[start_row:end_row, :] - # Initialize l_i, m_i, O_i - m_i = np.full((B_r, 1), -np.inf) - for j in range(T_c): - # Tile K_t and V - start_col = j * B_c - end_col = start_col + B_c - K_t_j = K_t[:, start_col:end_col] - V_j = V[start_col:end_col,] - # Compute O tile update - S_ij = ff.array(np.zeros((B_r, B_c)), desc) - S_ij = gemm.GemmDataGen().exact_golden_model(1, Q_i, K_t_j, 0, S_ij) - m_i_prev = m_i - m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True)) - shifted_exp = np.exp((m_i_prev.astype(np.float32) - m_i.astype(np.float32))) - P_ij = np.exp((S_ij - m_i).astype(np.float32)) - 
PxV = ff.array(np.zeros((B_r, d)), desc) - PxV = gemm.GemmDataGen().exact_golden_model(1, P_ij, V_j, 0, PxV) - row_sum = np.sum(P_ij.astype(np.float32), 1, keepdims=True) - if j == 0: - l_i = row_sum - O_i = PxV - else: - l_i = (shifted_exp * l_i) + row_sum - diag = np.diag(shifted_exp[:, 0]) - diag_inv = np.linalg.inv(diag) - O_i = np.matmul(diag_inv, O_i) - O_i += PxV - # Finalize O tile - diag_l_i = np.diag(l_i[:, 0]) - diag_l_inv_i = np.linalg.inv(diag_l_i) - O_i = np.matmul(diag_l_inv_i, O_i) - O_tiles.append(O_i) - return np.concatenate(O_tiles, 0) - - -# Verify layer parameters are valid -def validate(L, S, d, B_r, B_c, dtype, baseline, gemm_impl): - assert (L % B_r) == 0, 'L is not an integer multiple of B_r' - assert (S % B_c) == 0, 'S is not an integer multiple of B_c' - assert dtype != 'FP64', 'FP64 precision is not supported yet' - - # Calculate total TCDM occupation - prec = data_utils.size_from_precision_t(dtype) - q_fa_size = B_r * d * prec - k_fa_size = B_c * d * prec - v_fa_size = B_c * d * prec - s_fa_size = B_r * B_c * prec - p_fa_size = B_r * B_c * prec - o_fa_size = B_r * d * prec - m_i_size = B_r * prec - l_i_size = B_r * prec - total_size = q_fa_size - total_size += k_fa_size - total_size += v_fa_size * 2 # V and V^t - total_size += s_fa_size - total_size += p_fa_size - total_size += o_fa_size - total_size += m_i_size * 2 # m_i and m_i_prev - total_size += l_i_size - data_utils.validate_tcdm_footprint(total_size) - - # Q*K^t - gemm.GemmDataGen().validate( - gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, - k_tiles=1, transa=0, transb=1, m=B_r, n=B_c, k=d, beta=0, load_a=0, load_b=0, load_c=0 - ) - - # P*V - if baseline: - gemm.GemmDataGen().validate( - gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, - k_tiles=1, transa=0, transb=0, m=B_r, n=d, k=B_c, beta=1, load_a=0, load_b=0, load_c=0 - ) - else: - # P*(V^t)^t +class FlashAttention2DataGen(du.DataGen): + + # AXI splits bursts crossing 4KB address boundaries. 
To minimize + # the occurrence of these splits the data should be aligned to 4KB + BURST_ALIGNMENT = 4096 + + def torch_golden_model(self, Q, K, V): + return torch.nn.functional.scaled_dot_product_attention(Q, K, V) + + def exact_golden_model(self, Q, K, V, B_r, B_c): + # Convert torch tensors to numpy arrays + Q = Q.numpy() + K = K.numpy() + V = V.numpy() + # Get layer dimensions + L = Q.shape[0] + S = K.shape[0] + # Calculate tiling parameters + T_r = L // B_r + T_c = S // B_c + # Transpose K + K_t = np.transpose(K) + # Iterate tiles + O_tiles = [] + for i in range(T_r): + # Tile Q + start_row = i * B_r + end_row = start_row + B_r + Q_i = Q[start_row:end_row, :] + # Initialize l_i, m_i, O_i + m_i = np.full((B_r, 1), -np.inf) + for j in range(T_c): + # Tile K_t and V + start_col = j * B_c + end_col = start_col + B_c + K_t_j = K_t[:, start_col:end_col] + V_j = V[start_col:end_col, ] + # Compute O tile update + S_ij = np.matmul(Q_i, K_t_j) + m_i_prev = m_i + m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True)) + shifted_exp = np.exp(m_i_prev - m_i) + P_ij = np.exp(S_ij - m_i) + PxV = np.matmul(P_ij, V_j) + if j == 0: + l_i = np.sum(P_ij, 1, keepdims=True) + O_i = PxV + else: + l_i = (shifted_exp * l_i) + np.sum(P_ij, 1, keepdims=True) + diag = np.diag(shifted_exp[:, 0]) + diag_inv = np.linalg.inv(diag) + O_i = np.matmul(diag_inv, O_i) + O_i += PxV + # Finalize O tile + diag_l_i = np.diag(l_i[:, 0]) + diag_l_inv_i = np.linalg.inv(diag_l_i) + O_i = np.matmul(diag_l_inv_i, O_i) + O_tiles.append(O_i) + return np.concatenate(O_tiles, 0) + + def exact_flexfloat_golden_model(self, Q, K, V, B_r, B_c, desc): + # Get layer dimensions + L = Q.shape[0] + d = Q.shape[1] + S = K.shape[0] + # Calculate tiling parameters + T_r = L // B_r + T_c = S // B_c + # Transpose K + K_t = np.transpose(K) + # Iterate tiles + O_tiles = [] + for i in range(T_r): + # Tile Q + start_row = i * B_r + end_row = start_row + B_r + Q_i = Q[start_row:end_row, :] + # Initialize l_i, m_i, O_i + m_i = np.full((B_r, 1), -np.inf) + for j in range(T_c): + # Tile K_t and V + start_col = j * B_c + end_col = start_col + B_c + K_t_j = K_t[:, start_col:end_col] + V_j = V[start_col:end_col,] + # Compute O tile update + S_ij = ff.array(np.zeros((B_r, B_c)), desc) + S_ij = gemm.GemmDataGen().exact_golden_model(1, Q_i, K_t_j, 0, S_ij) + m_i_prev = m_i + m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True)) + shifted_exp = np.exp((m_i_prev.astype(np.float32) - m_i.astype(np.float32))) + P_ij = np.exp((S_ij - m_i).astype(np.float32)) + PxV = ff.array(np.zeros((B_r, d)), desc) + PxV = gemm.GemmDataGen().exact_golden_model(1, P_ij, V_j, 0, PxV) + row_sum = np.sum(P_ij.astype(np.float32), 1, keepdims=True) + if j == 0: + l_i = row_sum + O_i = PxV + else: + l_i = (shifted_exp * l_i) + row_sum + diag = np.diag(shifted_exp[:, 0]) + diag_inv = np.linalg.inv(diag) + O_i = np.matmul(diag_inv, O_i) + O_i += PxV + # Finalize O tile + diag_l_i = np.diag(l_i[:, 0]) + diag_l_inv_i = np.linalg.inv(diag_l_i) + O_i = np.matmul(diag_l_inv_i, O_i) + O_tiles.append(O_i) + return np.concatenate(O_tiles, 0) + + # Verify layer parameters are valid + def validate(self, L, S, d, B_r, B_c, dtype, baseline, gemm_impl, **kwargs): + assert (L % B_r) == 0, 'L is not an integer multiple of B_r' + assert (S % B_c) == 0, 'S is not an integer multiple of B_c' + assert dtype != 'FP64', 'FP64 precision is not supported yet' + + # Calculate total TCDM occupation + prec = du.size_from_precision_t(dtype) + q_fa_size = B_r * d * prec + k_fa_size = B_c * d * prec + v_fa_size 
= B_c * d * prec + s_fa_size = B_r * B_c * prec + p_fa_size = B_r * B_c * prec + o_fa_size = B_r * d * prec + m_i_size = B_r * prec + l_i_size = B_r * prec + total_size = q_fa_size + total_size += k_fa_size + total_size += v_fa_size * 2 # V and V^t + total_size += s_fa_size + total_size += p_fa_size + total_size += o_fa_size + total_size += m_i_size * 2 # m_i and m_i_prev + total_size += l_i_size + du.validate_tcdm_footprint(total_size) + + # Q*K^t gemm.GemmDataGen().validate( gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, - k_tiles=1, transa=0, transb=1, m=B_r, n=d, k=B_c, beta=1, load_a=0, load_b=0, load_c=0 + k_tiles=1, transa=0, transb=1, m=B_r, n=B_c, k=d, beta=0, load_a=0, load_b=0, load_c=0 ) - -def get_gemm_implementation(params): - prec = params['dtype'].lower() - impl = f'gemm_{prec}_' - if params['baseline']: - impl += 'naive' - else: - impl += 'opt' - if prec == 'fp8': - impl += '_ex' - return impl - - -def emit_header(section, params): - L = params['L'] - S = params['S'] - d = params['d'] - B_r = params['B_r'] - B_c = params['B_c'] - prec = params['dtype'] - gemm_impl = get_gemm_implementation(params) - - validate(gemm_impl=gemm_impl, **params) - - # torch_type = data_utils.torch_type_from_precision_t(prec) - ff_desc = data_utils.ff_desc_from_precision_t(prec) - ctype = data_utils.ctype_from_precision_t(prec) - - # Generate same data for all dtypes for easier debugging. - # To achieve this, we always generate in FP16 and then convert. - # Q = torch.rand(L, d, requires_grad=False, dtype=torch.float16).to(dtype=torch_type) - # K = torch.rand(S, d, requires_grad=False, dtype=torch.float16).to(dtype=torch_type) - # V = torch.rand(S, d, requires_grad=False, dtype=torch.float16).to(dtype=torch_type) - Q = ff.array(np.random.rand(L, d), ff_desc) - K = ff.array(np.random.rand(S, d), ff_desc) - V = ff.array(np.random.rand(S, d), ff_desc) - - output = exact_flexfloat_golden_model(Q, K, V, B_r, B_c, ff_desc) - - q_uid = 'Q' - k_uid = 'K' - v_uid = 'V' - o_uid = 'O' - - layer_cfg = { - **params, - 'gemm_implementation': gemm_impl, - 'Q': q_uid, - 'K': k_uid, - 'V': v_uid, - 'O': o_uid, - } - - data_str = [emit_license()] - data_str += [format_array_declaration(f'extern {ctype}', q_uid, Q.shape)] - data_str += [format_array_declaration(f'extern {ctype}', k_uid, K.shape)] - data_str += [format_array_declaration(f'extern {ctype}', v_uid, V.shape)] - data_str += [format_array_declaration(ctype, o_uid, output.shape)] - data_str += [format_struct_definition('flashattention_2_layer_t', 'layer', layer_cfg)] - data_str += [format_array_definition(ctype, q_uid, Q)] - data_str += [format_array_definition(ctype, k_uid, K)] - data_str += [format_array_definition(ctype, v_uid, V)] - data_str = '\n\n'.join(data_str) - - return data_str - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for layernorm kernel') - parser.add_argument( - "-c", "--cfg", - type=pathlib.Path, - required=True, - help='Select param config file kernel' - ) - parser.add_argument( - '--section', - type=str, - help='Section to store matrices in') - parser.add_argument( - 'output', - type=pathlib.Path, - help='Path of the output header file') - args = parser.parse_args() - - # Load param config file - with args.cfg.open() as f: - param = json5.loads(f.read()) - - # Emit header file - with open(args.output, 'w') as f: - f.write(emit_header(args.section, param)) + # P*V + if baseline: + gemm.GemmDataGen().validate( + gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, 
n_tiles=1, + k_tiles=1, transa=0, transb=0, m=B_r, n=d, k=B_c, beta=1, load_a=0, load_b=0, load_c=0 + ) + else: + # P*(V^t)^t + gemm.GemmDataGen().validate( + gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, + k_tiles=1, transa=0, transb=1, m=B_r, n=d, k=B_c, beta=1, load_a=0, load_b=0, load_c=0 + ) + + def get_gemm_implementation(self, params): + prec = params['dtype'].lower() + impl = f'gemm_{prec}_' + if params['baseline']: + impl += 'naive' + else: + impl += 'opt' + if prec == 'fp8': + impl += '_ex' + return impl + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + L = kwargs['L'] + S = kwargs['S'] + d = kwargs['d'] + B_r = kwargs['B_r'] + B_c = kwargs['B_c'] + prec = kwargs['dtype'] + gemm_impl = self.get_gemm_implementation(kwargs) + + self.validate(gemm_impl=gemm_impl, **kwargs) + + # torch_type = du.torch_type_from_precision_t(prec) + ff_desc = du.ff_desc_from_precision_t(prec) + ctype = du.ctype_from_precision_t(prec) + + # Generate same data for all dtypes for easier debugging. + # To achieve this, we always generate in FP16 and then convert. + # Q = torch.rand(L, d, requires_grad=False, dtype=torch.float16).to(dtype=torch_type) + # K = torch.rand(S, d, requires_grad=False, dtype=torch.float16).to(dtype=torch_type) + # V = torch.rand(S, d, requires_grad=False, dtype=torch.float16).to(dtype=torch_type) + Q = ff.array(np.random.rand(L, d), ff_desc) + K = ff.array(np.random.rand(S, d), ff_desc) + V = ff.array(np.random.rand(S, d), ff_desc) + + output = self.exact_flexfloat_golden_model(Q, K, V, B_r, B_c, ff_desc) + + q_uid = 'Q' + k_uid = 'K' + v_uid = 'V' + o_uid = 'O' + + layer_cfg = { + **kwargs, + 'gemm_implementation': gemm_impl, + 'Q': q_uid, + 'K': k_uid, + 'V': v_uid, + 'O': o_uid, + } + + header += [du.format_array_declaration(f'extern {ctype}', q_uid, Q.shape)] + header += [du.format_array_declaration(f'extern {ctype}', k_uid, K.shape)] + header += [du.format_array_declaration(f'extern {ctype}', v_uid, V.shape)] + header += [du.format_array_declaration(ctype, o_uid, output.shape)] + header += [du.format_struct_definition('flashattention_2_layer_t', 'layer', layer_cfg)] + header += [du.format_array_definition(ctype, q_uid, Q)] + header += [du.format_array_definition(ctype, k_uid, K)] + header += [du.format_array_definition(ctype, v_uid, V)] + header = '\n\n'.join(header) + + return header if __name__ == '__main__': - main() + sys.exit(FlashAttention2DataGen().main()) diff --git a/sw/dnn/flashattention_2/src/flashattention_2_fp16.h b/sw/dnn/flashattention_2/src/flashattention_2_fp16.h index 5e50efe70..1ac3be5ae 100644 --- a/sw/dnn/flashattention_2/src/flashattention_2_fp16.h +++ b/sw/dnn/flashattention_2/src/flashattention_2_fp16.h @@ -35,7 +35,7 @@ static inline void flashattention_2_fp16(flashattention_2_layer_t layer) { gemm_args.alpha = 1; // alias system parameters - uint32_t compute_id = snrt_global_core_idx(); + uint32_t compute_id = snrt_cluster_core_idx(); uint32_t cluster_id = snrt_cluster_idx(); uint32_t num_cores = snrt_cluster_compute_core_num(); uint32_t num_clusters = snrt_cluster_num(); diff --git a/sw/dnn/flashattention_2/src/flashattention_2_fp32.h b/sw/dnn/flashattention_2/src/flashattention_2_fp32.h index dc0369de8..de8a0b736 100644 --- a/sw/dnn/flashattention_2/src/flashattention_2_fp32.h +++ b/sw/dnn/flashattention_2/src/flashattention_2_fp32.h @@ -35,7 +35,7 @@ static inline void flashattention_2_fp32(flashattention_2_layer_t layer) { gemm_args.alpha = 1; // alias system parameters - uint32_t compute_id 
= snrt_global_core_idx(); + uint32_t compute_id = snrt_cluster_core_idx(); uint32_t cluster_id = snrt_cluster_idx(); uint32_t num_cores = snrt_cluster_compute_core_num(); uint32_t num_clusters = snrt_cluster_num(); diff --git a/sw/dnn/flashattention_2/src/flashattention_2_fp8.h b/sw/dnn/flashattention_2/src/flashattention_2_fp8.h index ef91c0d9a..cfa6a66a9 100644 --- a/sw/dnn/flashattention_2/src/flashattention_2_fp8.h +++ b/sw/dnn/flashattention_2/src/flashattention_2_fp8.h @@ -56,7 +56,7 @@ static inline void flashattention_2_fp8(flashattention_2_layer_t layer) { gemm_args.alpha = 1; // alias system parameters - uint32_t compute_id = snrt_global_core_idx(); + uint32_t compute_id = snrt_cluster_core_idx(); uint32_t cluster_id = snrt_cluster_idx(); uint32_t num_cores = snrt_cluster_compute_core_num(); uint32_t num_clusters = snrt_cluster_num(); diff --git a/sw/dnn/fused_concat_linear/src/fused_concat_linear.h b/sw/dnn/fused_concat_linear/src/fused_concat_linear.h index 5bf9a3fd3..f716b8c97 100644 --- a/sw/dnn/fused_concat_linear/src/fused_concat_linear.h +++ b/sw/dnn/fused_concat_linear/src/fused_concat_linear.h @@ -89,7 +89,7 @@ static inline int fused_concat_linear_optimized(fused_concat_linear_layer_t l) { size_t size_a = m * k * l.dtype; void *a = snrt_l1_alloc_cluster_local(size_a, l.dtype); - if (snrt_is_dm_core()) { + if (snrt_is_dm_core() && (snrt_cluster_idx() < l.num_inputs)) { snrt_dma_load_2d_tile(a, l.inputs[snrt_cluster_idx()], 0, 0, m, k, k, l.dtype); snrt_dma_wait_all(); diff --git a/sw/dnn/mha/data/params.json b/sw/dnn/mha/data/params.json new file mode 100644 index 000000000..a637c6535 --- /dev/null +++ b/sw/dnn/mha/data/params.json @@ -0,0 +1,14 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +{ + num_heads: 1, + L: 16, + S: 16, + d: 16, + B_r: 16, + B_c: 16, + dtype: "FP32", + baseline: true +} \ No newline at end of file diff --git a/sw/dnn/mha/scripts/datagen.py b/sw/dnn/mha/scripts/datagen.py new file mode 100755 index 000000000..0e6ce28d3 --- /dev/null +++ b/sw/dnn/mha/scripts/datagen.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Viviane Potocnik +# Luca Colagrande + +import argparse +import numpy as np +import pathlib +import json5 +import torch +import pyflexfloat as ff + +from snitch.util.sim import data_utils +from snitch.util.sim.data_utils import format_struct_definition, \ + format_array_definition, format_array_declaration, emit_license, format_scalar_definition +from snitch.blas.gemm.scripts.datagen import GemmDataGen + +np.random.seed(42) +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. 
To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + + +np.set_printoptions(formatter={'object': str}) + + +def exact_flexfloat_golden_model(Q, K, V, B_r, B_c, desc): + # Get layer dimensions + L = Q.shape[0] + d = Q.shape[1] + S = K.shape[0] + # Calculate tiling parameters + T_r = L // B_r + T_c = S // B_c + # Transpose K + K_t = np.transpose(K) + # Iterate tiles + O_tiles = [] + for i in range(T_r): + # Tile Q + start_row = i * B_r + end_row = start_row + B_r + Q_i = Q[start_row:end_row, :] + # Initialize l_i, m_i, O_i + m_i = np.full((B_r, 1), -np.inf) + for j in range(T_c): + # Tile K_t and V + start_col = j * B_c + end_col = start_col + B_c + K_t_j = K_t[:, start_col:end_col] + V_j = V[start_col:end_col,] + # Compute O tile update + S_ij = ff.array(np.zeros((B_r, B_c)), desc) + S_ij = GemmDataGen().exact_golden_model(1, Q_i, K_t_j, 0, S_ij) + m_i_prev = m_i + m_i = np.maximum(m_i_prev, np.max(S_ij, 1, keepdims=True)) + shifted_exp = np.exp((m_i_prev.astype(np.float32) - m_i.astype(np.float32))) + P_ij = np.exp((S_ij - m_i).astype(np.float32)) + PxV = ff.array(np.zeros((B_r, d)), desc) + PxV = GemmDataGen().exact_golden_model(1, P_ij, V_j, 0, PxV) + row_sum = np.sum(P_ij.astype(np.float32), 1, keepdims=True) + if j == 0: + l_i = row_sum + O_i = PxV + else: + l_i = (shifted_exp * l_i) + row_sum + diag = np.diag(shifted_exp[:, 0]) + diag_inv = np.linalg.inv(diag) + O_i = np.matmul(diag_inv, O_i) + O_i += PxV + # Finalize O tile + diag_l_i = np.diag(l_i[:, 0]) + diag_l_inv_i = np.linalg.inv(diag_l_i) + O_i = np.matmul(diag_l_inv_i, O_i) + O_tiles.append(O_i) + return np.concatenate(O_tiles, 0) + + +# Verify layer parameters are valid +def validate(num_heads, L, S, d, B_r, B_c, dtype, baseline, gemm_impl): + assert num_heads > 0, 'num_heads must be greater than 0' + # assert num_heads <= 255, 'num_heads must be less than or equal to 255 (with 8 bits allocated)' + assert (L % B_r) == 0, 'L is not an integer multiple of B_r' + assert (S % B_c) == 0, 'S is not an integer multiple of B_c' + assert dtype != 'FP64', 'FP64 precision is not supported yet' + + # Calculate total TCDM occupation + prec = data_utils.size_from_precision_t(dtype) + q_fa_size = B_r * d * prec + k_fa_size = B_c * d * prec + v_fa_size = B_c * d * prec + s_fa_size = B_r * B_c * prec + p_fa_size = B_r * B_c * prec + o_fa_size = B_r * d * prec + m_i_size = B_r * prec + l_i_size = B_r * prec + total_size = q_fa_size + total_size += k_fa_size + total_size += v_fa_size * 2 # V and V^t + # total_size *= num_heads ######## is this correct? 
jiayi ####### + total_size += s_fa_size + total_size += p_fa_size + total_size += o_fa_size + total_size += m_i_size * 2 # m_i and m_i_prev + total_size += l_i_size + + data_utils.validate_tcdm_footprint(total_size) + + # Q*K^t + GemmDataGen().validate( + gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, + k_tiles=1, transa=0, transb=1, m=B_r, n=B_c, k=d, beta=0, load_a=0, load_b=0, load_c=0 + ) + + # P*V + if baseline: + GemmDataGen().validate( + gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, + k_tiles=1, transa=0, transb=0, m=B_r, n=d, k=B_c, beta=1, load_a=0, load_b=0, load_c=0 + ) + else: + # P*(V^t)^t + GemmDataGen().validate( + gemm_fp=gemm_impl, parallelize_m=0, parallelize_k=0, m_tiles=1, n_tiles=1, + k_tiles=1, transa=0, transb=1, m=B_r, n=d, k=B_c, beta=1, load_a=0, load_b=0, load_c=0 + ) + + +def get_gemm_implementation(params): + prec = params['dtype'].lower() + impl = f'gemm_{prec}_' + if params['baseline']: + impl += 'naive' + else: + impl += 'opt' + if prec == 'fp8': + impl += '_ex' + return impl + + +def emit_header(section, params): + num_heads = params['num_heads'] + L = params['L'] + S = params['S'] + d = params['d'] + B_r = params['B_r'] + B_c = params['B_c'] + prec = params['dtype'] + gemm_impl = get_gemm_implementation(params) + + validate(gemm_impl=gemm_impl, **params) + + # torch_type = data_utils.torch_type_from_precision_t(prec) + ff_desc = data_utils.ff_desc_from_precision_t(prec) + ctype = data_utils.ctype_from_precision_t(prec) + + data_list = [] + data_list.append(emit_license()) + + Q_list = [] + K_list = [] + V_list = [] + O_list = [] + + for head_idx in range(num_heads): + + Q = ff.array(np.random.rand(L, d), ff_desc) + K = ff.array(np.random.rand(S, d), ff_desc) + V = ff.array(np.random.rand(S, d), ff_desc) + O = exact_flexfloat_golden_model(Q, K, V, B_r, B_c, ff_desc) + + q_uid = 'Q_' + str(head_idx) + k_uid = 'K_' + str(head_idx) + v_uid = 'V_' + str(head_idx) + o_uid = 'O_' + str(head_idx) + + Q_list.append(q_uid) + K_list.append(k_uid) + V_list.append(v_uid) + O_list.append(o_uid) + + data_list.append(format_array_declaration(f'extern {ctype}', q_uid, Q.shape)) + data_list.append(format_array_declaration(f'extern {ctype}', k_uid, K.shape)) + data_list.append(format_array_declaration(f'extern {ctype}', v_uid, V.shape)) + data_list.append(format_array_declaration(ctype, o_uid, O.shape)) + + data_list.append(format_array_definition(ctype, q_uid, Q)) + data_list.append(format_array_definition(ctype, k_uid, K)) + data_list.append(format_array_definition(ctype, v_uid, V)) + + w_uid = 'W' + W = ff.array(np.random.rand(d * num_heads, O.shape[1]), ff_desc) + + o_uid = 'O' + # TODO(colluca): replace with FusedConcatLinear golden model + O = np.zeros((L, d), dtype=ctype) + + data_list.append(format_array_declaration(f'extern {ctype}', w_uid, W.shape)) + data_list.append(format_array_definition(ctype, w_uid, W)) + data_list.append(format_array_declaration(ctype, o_uid, O.shape)) + + V_list_str = ', '.join(V_list) + Q_list_str = ', '.join(Q_list) + K_list_str = ', '.join(K_list) + O_list_str = ', '.join(O_list) + + layer_cfg = { + **params, + 'gemm_implementation': gemm_impl, + 'Q': '(void*[]){' + Q_list_str + '}', + 'K': '(void*[]){' + K_list_str + '}', + 'V': '(void*[]){' + V_list_str + '}', + 'W': w_uid, + 'head_outputs': '(void*[]){' + O_list_str + '}', + 'O': o_uid, + } + + data_list.append(format_struct_definition('mha_layer_t', 'layer', layer_cfg)) + + data_str = '\n\n'.join(data_list) + + return data_str 
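+
+# Note: the pointer-array fields assembled in `layer_cfg` above (`Q`, `K`,
+# `V`, `head_outputs`) are emitted as C compound literals of the form
+# `(void*[]){Q_0, Q_1, ...}`, so the generated `mha_layer_t` struct can be
+# initialized statically with the per-head arrays in the emitted header.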
+ + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for layernorm kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = json5.loads(f.read()) + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(args.section, param)) + + +if __name__ == '__main__': + main() diff --git a/sw/dnn/mha/scripts/verify.py b/sw/dnn/mha/scripts/verify.py new file mode 100755 index 000000000..ce4208192 --- /dev/null +++ b/sw/dnn/mha/scripts/verify.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + + +import numpy as np +import sys + +import pyflexfloat as ff + +from snitch.util.sim.verif_utils import Verifier +import snitch.util.sim.data_utils as du +from sw.blas.gemm.scripts.datagen import GemmDataGen +from sw.dnn.flashattention_2.scripts.datagen import FlashAttention2DataGen + + +class MhaVerifier(Verifier): + + OUTPUT_UIDS = ['O'] + + ERR_THRESHOLD = {4: 1e-6, 2: 8e-3, 1: 3e-1} + + def __init__(self): + super().__init__() + self.layer_struct = { + 'num_heads': 'I', + 'L': 'I', + 'S': 'I', + 'd': 'I', + 'B_r': 'I', + 'B_c': 'I', + 'dtype': 'I', + 'baseline': 'I', + 'gemm_implementation': 'I', + 'Q': 'I', + 'K': 'I', + 'V': 'I', + 'O': 'I', + 'W_O': 'I' + } + self.layer = self.get_input_from_symbol('layer', self.layer_struct) + self.L = self.layer['L'] + self.S = self.layer['S'] + self.d = self.layer['d'] + self.B_r = self.layer['B_r'] + self.B_c = self.layer['B_c'] + self.prec = self.layer['dtype'] + self.num_heads = self.layer['num_heads'] + self.W_O = self.layer['W_O'] + self.Q = [self.get_input_from_symbol('Q_' + str(head), du.ctype_from_precision_t(self.prec)) for head in range(self.num_heads)] + + def get_actual_results(self): + # Iterates over OUTPUT_UIDS in case we want to verify the intermediate outputs + # from every head + outputs = [] + for uid in self.OUTPUT_UIDS: + outputs.append(self.get_output_from_symbol(uid, du.ctype_from_precision_t(self.prec))) + results = np.concatenate(outputs, axis=None) + return results + + def get_expected_results(self): + # FlashAttention-2 calculation for each head + head_outputs = [] + for head in range(self.num_heads): + + # Get input tensors for the current head (raw bytes) + Q = self.get_input_from_symbol('Q_' + str(head), du.ctype_from_precision_t(self.prec)) + K = self.get_input_from_symbol('K_' + str(head), du.ctype_from_precision_t(self.prec)) + V = self.get_input_from_symbol('V_' + str(head), du.ctype_from_precision_t(self.prec)) + + # Convert input tensors to float using ff.FlexFloat.__float__ + Q_f = np.array([q.__float__() for q in Q]) + K_f = np.array([k.__float__() for k in K]) + V_f = np.array([v.__float__() for v in V]) + + # Reshape input tensors + ff_desc = du.ff_desc_from_precision_t(self.prec) + Q = ff.array(Q_f.reshape(self.L, self.d), ff_desc) + V = ff.array(V_f.reshape(self.S, self.d), ff_desc) + K = ff.array(K_f.reshape(self.S, self.d), ff_desc) + + # Calculate head output + head_outputs.append(FlashAttention2DataGen().exact_flexfloat_golden_model(Q, K, 
V, self.B_r, self.B_c, ff_desc)) + + # Verify outputs from all heads (change also OUTPUT_UIDS accordingly) + # return np.concatenate(head_outputs, axis=None) + + # Concatenate heads + concat_output = np.concatenate(head_outputs, axis=1) + + # Final projection + W = self.get_input_from_symbol('W', du.ctype_from_precision_t(self.prec)) + W = np.array([w.__float__() for w in W]) + W = ff.array(W.reshape(self.d*self.num_heads, self.d), du.ff_desc_from_precision_t(self.prec)) + O = GemmDataGen().exact_golden_model(1, concat_output, W, 0, np.zeros((self.L, self.d), dtype=du.ctype_from_precision_t(self.prec))) + return O.flatten() + + def check_results(self, *args): + return super().check_results(*args, rtol=self.ERR_THRESHOLD[self.prec]) + + +if __name__ == "__main__": + sys.exit(MhaVerifier().main()) diff --git a/sw/dnn/mha/src/main.c b/sw/dnn/mha/src/main.c new file mode 100644 index 000000000..df0c1f4b8 --- /dev/null +++ b/sw/dnn/mha/src/main.c @@ -0,0 +1,14 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "dnn.h" + +#include "data.h" + +int main() { + mha_layer(layer); + return 0; +} diff --git a/sw/dnn/mha/src/mha.h b/sw/dnn/mha/src/mha.h new file mode 100644 index 000000000..0ab9c4fed --- /dev/null +++ b/sw/dnn/mha/src/mha.h @@ -0,0 +1,60 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +// #include "blas.h" +// #include "snrt.h" + +// /** +// * @struct mha_layer_t +// * @brief This structure contains all parameters necessary +// * for computing a Multihead attention layer using FlashAttention-2. Refer to +// * "FlashAttention-2: Faster Attention with Better +// * Parallelism and Work Partitioning" for more info. +// * The FlashAttention-2 paper refers to a single sequence +// * length N. To support auto-regressive inference we +// * define two separate parameters L and S, following the +// * PyTorch naming scheme. +// * @var mha_layer_t::L +// * Target sequence length +// * @var mha_layer_t::S +// * Source sequence length +// * @var mha_layer_t::d +// * Head dimension +// * @var mha_layer_t::Q +// * Pointer to query tensor +// * @var mha_layer_t::K +// * Pointer to key tensor +// * @var mha_layer_t::V +// * Pointer to value tensor +// * @var mha_layer_t::head_outputs +// * Pointer to output tensor of each head +// * @var mha_layer_t::O +// * Pointer to output tensor +// */ +typedef struct { + uint32_t num_heads; + uint32_t L; + uint32_t S; + uint32_t d; + uint32_t B_r; + uint32_t B_c; + precision_t dtype; + uint32_t baseline; + gemm_fp_t gemm_implementation; + void **Q; + void **K; + void **V; + void *W; + void **head_outputs; + void *O; +} mha_layer_t; + +#include "../mha/src/mha_fp32.h" + + +static inline void mha_layer(mha_layer_t layer) { + mha_fp32(layer); +} \ No newline at end of file diff --git a/sw/dnn/mha/src/mha_fp32.h b/sw/dnn/mha/src/mha_fp32.h new file mode 100644 index 000000000..45d12b4d6 --- /dev/null +++ b/sw/dnn/mha/src/mha_fp32.h @@ -0,0 +1,47 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+//
+// Author: Luca Colagrande
+
+static inline void mha_fp32(mha_layer_t layer) {
+
+    if (snrt_cluster_idx() < layer.num_heads) {
+
+        // Prepare arguments for FlashAttention-2
+        flashattention_2_layer_t fa2_args;
+        fa2_args.L = layer.L;
+        fa2_args.S = layer.S;
+        fa2_args.d = layer.d;
+        fa2_args.B_r = layer.B_r;
+        fa2_args.B_c = layer.B_c;
+        fa2_args.Q = ((float **)layer.Q)[snrt_cluster_idx()];
+        fa2_args.K = ((float **)layer.K)[snrt_cluster_idx()];
+        fa2_args.V = ((float **)layer.V)[snrt_cluster_idx()];
+        fa2_args.O = ((float **)layer.head_outputs)[snrt_cluster_idx()];
+        fa2_args.dtype = layer.dtype;
+        fa2_args.baseline = layer.baseline;
+        fa2_args.gemm_implementation = layer.gemm_implementation;
+
+        // Call FlashAttention-2
+        flashattention_2_fp32(fa2_args);
+
+        // Prepare arguments for the FusedConcatLinear layer
+        fcl_args.num_inputs = layer.num_heads;
+        fused_concat_linear_layer_t fcl_args;
+        fcl_args.input_shape[0] = layer.L;
+        fcl_args.input_shape[1] = layer.d;
+        fcl_args.output_shape[0] = layer.L;
+        fcl_args.output_shape[1] = layer.d * layer.num_heads;
+        fcl_args.inputs = layer.head_outputs;
+        fcl_args.weights = layer.W;
+        fcl_args.concat_output = NULL;
+        fcl_args.linear_output = layer.O;
+        fcl_args.dtype = layer.dtype;
+        fcl_args.gemm_implementation = layer.gemm_implementation;
+
+        // Call the FusedConcatLinear layer
+        fused_concat_linear_layer(fcl_args);
+
+    }
+}
diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h
index 7344c4794..6bc47b7ac 100644
--- a/sw/dnn/src/dnn.h
+++ b/sw/dnn/src/dnn.h
@@ -200,4 +200,5 @@ typedef struct network_single_cluster_t_ {
 #include "../gelu/src/gelu.h"
 #include "../layernorm/src/layernorm.h"
 #include "../maxpool/src/maxpool.h"
+#include "../mha/src/mha.h"
 #include "../softmax/src/softmax.h"
diff --git a/sw/snRuntime/api/sync_decls.h b/sw/snRuntime/api/sync_decls.h
index 457bf5532..efbab1b57 100644
--- a/sw/snRuntime/api/sync_decls.h
+++ b/sw/snRuntime/api/sync_decls.h
@@ -60,7 +60,7 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx);
 
 inline void snrt_cluster_hw_barrier();
 
-inline void snrt_global_barrier();
+inline void snrt_global_barrier(uint32_t num_participants);
 
 inline uint32_t snrt_global_all_to_all_reduction(uint32_t value);
 
diff --git a/sw/snRuntime/src/dma.c b/sw/snRuntime/src/dma.c
index b3c6c8c81..455d15c65 100644
--- a/sw/snRuntime/src/dma.c
+++ b/sw/snRuntime/src/dma.c
@@ -2,6 +2,10 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0 +extern void snrt_dma_enable_mcast(uint32_t mask); + +extern void snrt_dma_disable_mcast(); + extern void snrt_dma_wait_all_channels(uint32_t num_channels); extern void snrt_dma_start_tracking(); @@ -10,10 +14,6 @@ extern void snrt_dma_stop_tracking(); extern void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len); -extern void snrt_dma_enable_mcast(uint64_t mask); - -extern void snrt_dma_disable_mcast(); - extern void snrt_dma_enable_collective(uint64_t coll_mask, uint32_t coll_op); extern void snrt_dma_disable_collective(); @@ -22,6 +22,19 @@ extern snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec); +extern snrt_dma_txid_t snrt_dma_load_1d_tile_mcast(void *dst, void *src, + size_t tile_idx, + size_t tile_size, + uint32_t prec); + +extern snrt_dma_txid_t snrt_dma_1d_to_2d(volatile void *dst, volatile void *src, + size_t size, size_t row_size, + size_t stride); + +extern snrt_dma_txid_t snrt_dma_2d_to_1d(volatile void *dst, volatile void *src, + size_t size, size_t row_size, + size_t stride); + extern snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec); @@ -36,7 +49,27 @@ extern snrt_dma_txid_t snrt_dma_load_2d_tile( size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec); +extern snrt_dma_txid_t snrt_dma_load_2d_tile_mcast( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec, size_t tile_ld, uint32_t mask); + +extern snrt_dma_txid_t snrt_dma_load_2d_tile_mcast( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec, uint32_t mask); + +extern snrt_dma_txid_t snrt_dma_load_2d_tile_in_banks( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec, size_t num_banks); + extern snrt_dma_txid_t snrt_dma_store_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, uint32_t prec); + +extern snrt_dma_txid_t snrt_dma_store_2d_tile_from_banks( + void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, + size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, + uint32_t prec, size_t num_banks); diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 8cd0e11e9..1c5967418 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -234,10 +234,8 @@ static inline snrt_dma_txid_t snrt_dma_start_2d(uint64_t dst, uint64_t src, /** * @brief Start an asynchronous 2D DMA transfer using native-size pointers. * - * This is a convenience overload of snrt_dma_start_2d() using `void*` pointers. - * - * @see snrt_dma_start_2d(uint64_t, uint64_t, size_t, size_t, size_t, size_t, - * uint32_t) + * This is a convenience overload of \ref snrt_dma_start_2d(uint64_t, uint64_t, + * size_t, size_t, size_t, size_t, uint32_t) using `void*` pointers. */ static inline uint32_t snrt_dma_start_2d(volatile void *dst, volatile void *src, size_t size, size_t dst_stride, @@ -247,6 +245,46 @@ static inline uint32_t snrt_dma_start_2d(volatile void *dst, volatile void *src, src_stride, repeat, channel); } +/** + * @brief Start an asynchronous, multicast 2D DMA transfer with 64-bit wide + * pointers. + * + * @param mask Multicast mask. 
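+ *
+ * A minimal usage sketch (the mask encoding is platform-defined and the
+ * argument names below are purely illustrative):
+ * @code
+ * // Copy a 2D tile to every destination selected by `mask`.
+ * snrt_dma_start_2d_mcast(dst, src, size, dst_stride, src_stride, repeat,
+ *                         mask);
+ * @endcode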
+ *
+ * @see snrt_dma_start_2d(uint64_t, uint64_t, size_t, size_t, size_t, size_t,
+ *      uint32_t) for a description of the other parameters.
+ */
+static inline uint32_t snrt_dma_start_2d_mcast(uint64_t dst, uint64_t src,
+                                               size_t size, size_t dst_stride,
+                                               size_t src_stride, size_t repeat,
+                                               uint32_t mask,
+                                               const uint32_t channel = 0) {
+    snrt_dma_enable_mcast(mask);
+    uint32_t txid = snrt_dma_start_2d(dst, src, size, dst_stride, src_stride,
+                                      repeat, channel);
+    snrt_dma_disable_mcast();
+    return txid;
+}
+
+/**
+ * @brief Start an asynchronous, multicast 2D DMA transfer using native-size
+ *        pointers.
+ *
+ * This is a convenience overload of \ref snrt_dma_start_2d_mcast(uint64_t,
+ * uint64_t, size_t, size_t, size_t, size_t, uint32_t, uint32_t) using `void*`
+ * pointers.
+ */
+static inline uint32_t snrt_dma_start_2d_mcast(volatile void *dst,
+                                               volatile void *src, size_t size,
+                                               size_t dst_stride,
+                                               size_t src_stride, size_t repeat,
+                                               uint32_t mask,
+                                               const uint32_t channel = 0) {
+    return snrt_dma_start_2d_mcast((uint64_t)dst, (uint64_t)src, size,
+                                   dst_stride, src_stride, repeat, mask,
+                                   channel);
+}
+
 /**
  * @brief Block until a DMA transfer finishes on a specific DMA channel.
  * @param txid The DMA transfer's ID.
@@ -365,7 +403,7 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(volatile void *dst,
  * @param prec Number of bytes of each element in the 1D array.
  * @param mask Multicast mask applied on the destination address.
  */
-inline snrt_dma_txid_t snrt_dma_mcast_load_1d_tile(void *dst, void *src,
+inline snrt_dma_txid_t snrt_dma_load_1d_tile_mcast(void *dst, void *src,
                                                    size_t tile_idx,
                                                    size_t tile_size,
                                                    uint32_t prec,
@@ -475,6 +513,15 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile(
     );
 }
 
+/**
+ * @brief Load a 2D tile of a 2D array.
+ *
+ * The stride in the destination tile is assumed to be that of a 1D tile,
+ * effectively. In other words, this is the same as \ref snrt_dma_2d_to_1d().
+ *
+ * @see snrt_dma_load_2d_tile(void *, void *, size_t, size_t, size_t, size_t,
+ *      size_t, uint32_t, size_t) for a detailed description of the parameters.
+ */
 inline snrt_dma_txid_t snrt_dma_load_2d_tile(
     void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
     size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
@@ -484,6 +531,52 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile(
                                  tile_x0_size * prec);
 }
 
+/**
+ * @brief Load a 2D tile of a 2D array using multicast.
+ * @param mask Multicast mask.
+ *
+ * @see snrt_dma_load_2d_tile(void *, void *, size_t, size_t, size_t, size_t,
+ *      size_t, uint32_t, size_t) for a description of the other parameters.
+ */
+inline snrt_dma_txid_t snrt_dma_load_2d_tile_mcast(
+    void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
+    size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
+    uint32_t prec, size_t tile_ld, uint32_t mask) {
+    size_t src_offset = 0;
+    // Advance src array in x0 and x1 dimensions, and convert to byte offset
+    src_offset += tile_x0_idx * tile_x0_size;
+    src_offset += tile_x1_idx * tile_x1_size * full_x0_size;
+    src_offset *= prec;
+    // Initiate transfer
+    return snrt_dma_start_2d_mcast((uint64_t)dst,               // dst
+                                   (uint64_t)src + src_offset,  // src
+                                   tile_x0_size * prec,         // size
+                                   tile_ld,                     // dst_stride
+                                   full_x0_size * prec,         // src_stride
+                                   tile_x1_size,                // repeat
+                                   mask                         // multicast mask
+    );
+}
+
+/**
+ * @brief Load a 2D tile of a 2D array using multicast.
+ *
+ * The stride in the destination tile is assumed to be that of a 1D tile,
+ * effectively. In other words, this is similar to \ref snrt_dma_2d_to_1d().
+ *
+ * @see snrt_dma_load_2d_tile_mcast(void *, void *, size_t, size_t, size_t,
+ *      size_t, size_t, uint32_t, size_t, uint32_t) for a detailed description
+ *      of the parameters.
+ */
+inline snrt_dma_txid_t snrt_dma_load_2d_tile_mcast(
+    void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
+    size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
+    uint32_t prec, uint32_t mask) {
+    return snrt_dma_load_2d_tile_mcast(dst, src, tile_x1_idx, tile_x0_idx,
+                                       tile_x1_size, tile_x0_size, full_x0_size,
+                                       prec, tile_x0_size * prec, mask);
+}
+
 /**
  * @brief Load a 2D tile of a 2D array and reshape it to occupy a subset of
  *        TCDM banks.
@@ -548,6 +641,15 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile_in_banks(
     );
 }
 
+/**
+ * @brief Store a 2D tile of a 2D array.
+ *
+ * @details The stride in the source tile is assumed to be that of a 1D tile,
+ * effectively. In other words, this is the same as \ref snrt_dma_1d_to_2d().
+ *
+ * @see snrt_dma_store_2d_tile(void *, void *, size_t, size_t, size_t, size_t,
+ *      size_t, uint32_t, size_t) for a detailed description of the parameters.
+ */
 inline snrt_dma_txid_t snrt_dma_store_2d_tile(
     void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
     size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
@@ -557,6 +659,22 @@ inline snrt_dma_txid_t snrt_dma_store_2d_tile(
                                   prec, tile_x0_size * prec);
 }
 
+/**
+ * @brief Store a 2D tile of a 2D array from a 1D layout occupying a subset of
+ *        TCDM banks.
+ * @param dst Pointer to the destination array.
+ * @param src Pointer to the source tile.
+ * @param tile_x1_idx Outermost coordinate of the tile in the 2D array.
+ * @param tile_x0_idx Innermost coordinate of the tile in the 2D array.
+ * @param tile_x1_size Number of elements in the outermost dimension of the
+ *                     tile.
+ * @param tile_x0_size Number of elements in the innermost dimension of the
+ *                     tile.
+ * @param full_x0_size Number of elements in the innermost dimension of the
+ *                     array.
+ * @param prec Number of bytes of each element in the 2D array.
+ * @param num_banks Number of banks the tile is stored in.
+ */
 inline snrt_dma_txid_t snrt_dma_store_2d_tile_from_banks(
     void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx,
     size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size,
@@ -569,4 +687,4 @@ inline snrt_dma_txid_t snrt_dma_store_2d_tile_from_banks(
     return snrt_dma_store_2d_tile(dst, src, tile_x1_idx, tile_x0_idx,
                                   tile_x1_size_in_banks, tile_x0_size_in_banks,
                                   full_x0_size, prec, tile_ld);
-}
\ No newline at end of file
+}
diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h
index 02c0ce8fa..934911249 100644
--- a/sw/snRuntime/src/sync.h
+++ b/sw/snRuntime/src/sync.h
@@ -76,6 +76,22 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) {
 //================================================================================
 // Barrier functions
 //================================================================================
+/**
+ * @brief Wake the first \p num_clusters clusters.
+ * @param core_mask Bitmask of the cores to wake within each cluster.
+ * @param num_clusters The number of clusters to wake up.
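+ *
+ * A minimal usage sketch (illustrative values; the calling cluster skips
+ * itself):
+ * @code
+ * // Wake core 0 in the first four clusters.
+ * snrt_wake_clusters(1 << 0, 4);
+ * @endcode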
+ */
+inline void snrt_wake_clusters(uint32_t core_mask, uint32_t num_clusters = 0) {
+    // If the number of clusters is not specified, assume all clusters
+    if (num_clusters == 0) num_clusters = snrt_cluster_num();
+
+    // Wake clusters sequentially
+    for (uint32_t i = 0; i < num_clusters; i++) {
+        if (snrt_cluster_idx() != i) {
+            snrt_cluster(i)->peripheral_reg.cl_clint_set.f.cl_clint_set =
+                core_mask;
+        }
+    }
+}
 
 /**
  * @brief Wakes up all core by writing in their respective clint var.
@@ -108,13 +124,7 @@ inline void snrt_wake_all(uint32_t core_mask) {
         snrt_disable_multicast();
     }
 #else
-    // loop to send cluster interrupt to every other cluster's core
-    for (int i = 0; i < snrt_cluster_num(); i++) {
-        if (snrt_cluster_idx() != i) {
-            snrt_cluster(i)->peripheral_reg.cl_clint_set.f.cl_clint_set =
-                core_mask;
-        }
-    }
+    snrt_wake_clusters(core_mask);
 #endif
 }
 
@@ -129,12 +139,16 @@ inline void snrt_cluster_hw_barrier() {
 
 /**
  * @brief Synchronize one core from every cluster with the others.
+ * @param num_participants The number of clusters that participate in the
+ *                         barrier. If set to 0, all clusters are assumed
+ *                         to participate.
  * @details Implemented as a software barrier.
- * @note One core per cluster must invoke this function (the same across all
- *       clusters), or the calling cores will stall indefinitely.
+ * @note One core per cluster participating in the barrier must invoke this
+ *       function, or the calling cores will stall indefinitely.
+ * @todo Collective reduction does not support num_participants yet
 */
-inline void snrt_inter_cluster_barrier() {
+inline void snrt_inter_cluster_barrier(uint32_t num_participants = 0) {
 #ifdef SNRT_SUPPORTS_NARROW_REDUCTION
     // Fetch the address for the reduction
     cls_t *ctrl_red = cls();
@@ -154,6 +168,8 @@
     // Fence to wait until the reduction is finished
     snrt_fence();
 #else
+    // If the number of participants is not specified, assume all clusters
+    if (num_participants == 0) num_participants = snrt_cluster_num();
     // Everyone increments a shared counter
     uint32_t cnt =
         __atomic_add_fetch(&(_snrt_barrier.cnt), 1, __ATOMIC_RELAXED);
 
     // All but the last cluster enter WFI, while the last cluster resets the
     // counter for the next barrier and multicasts an interrupt to wake up the
     // other clusters.
-    if (cnt == snrt_cluster_num()) {
+    if (cnt == num_participants) {
         _snrt_barrier.cnt = 0;
-        // Wake all clusters
-        snrt_wake_all(1 << snrt_cluster_core_idx());
+        // Wake other clusters
+        snrt_wake_clusters(1 << snrt_cluster_core_idx(), num_participants);
     } else {
         snrt_wfi();
         // Clear interrupt for next barrier
@@ -179,16 +195,18 @@
  * cores are synchronized through a hardware barrier (see
 * @ref snrt_cluster_hw_barrier). Clusters are synchronized through
 * a software barrier (see @ref snrt_inter_cluster_barrier).
+ * @param num_participants The number of clusters that participate in the
+ *                         barrier. If set to 0, all clusters are assumed
+ *                         to participate.
 * @note Every Snitch core must invoke this function, or the calling cores
 *       will stall indefinitely.
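+ *
+ * A minimal usage sketch (assuming the first four clusters cooperate and
+ * every core of those clusters makes this call):
+ * @code
+ * if (snrt_cluster_idx() < 4) {
+ *     // ... work distributed over clusters 0 through 3 ...
+ *     snrt_global_barrier(4);
+ * }
+ * @endcode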
 */
-inline void snrt_global_barrier() {
-    // Synchronize cores in a cluster with the HW barrier
+inline void snrt_global_barrier(uint32_t num_participants = 0) {
     snrt_cluster_hw_barrier();
 
     // Synchronize all clusters
     if (snrt_is_dm_core()) {
-        snrt_inter_cluster_barrier();
+        snrt_inter_cluster_barrier(num_participants);
     }
 
     // Synchronize cores in a cluster with the HW barrier
@@ -265,15 +283,27 @@ inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) {
  * @param dst_buffer The pointer to the calling cluster's destination buffer.
  * @param src_buffer The pointer to the calling cluster's source buffer.
  * @param len The amount of data in each buffer.
+ * @param num_participants The number of clusters that participate in the
+ *                         reduction. If set to 0, all clusters are assumed
+ *                         to participate.
  * @note The destination buffers must lie at the same offset in every cluster's
 *       TCDM.
 */
 inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer,
-                                      size_t len) {
+                                      size_t len, uint32_t num_participants = 0) {
+    // If the number of participants is not specified, assume all clusters
+    if (num_participants == 0) num_participants = snrt_cluster_num();
+
     // If we have a single cluster, no reduction has to be done
-    if (snrt_cluster_num() > 1) {
+    if (num_participants > 1) {
+
+        // The DMA core will send the compute cores' data, so it must wait
+        // until that data is available
+        snrt_fpu_fence();
+        snrt_cluster_hw_barrier();
+
         // Iterate levels in the binary reduction tree
-        int num_levels = ceil(log2(snrt_cluster_num()));
+        int num_levels = ceil(log2(num_participants));
         for (unsigned int level = 0; level < num_levels; level++) {
             // Determine whether the current cluster is an active cluster.
             // An active cluster is a cluster that participates in the current
@@ -295,7 +325,7 @@
             }
 
             // Synchronize senders and receivers
-            snrt_global_barrier();
+            snrt_global_barrier(num_participants);
 
             // Every cluster which is not a sender performs the reduction
             if (is_active && !is_sender) {
@@ -313,6 +343,7 @@
             }
 
             // Synchronize compute and DM cores for next tree level
+            snrt_fpu_fence();
             snrt_cluster_hw_barrier();
         }
     }
diff --git a/target/common/common.mk b/target/common/common.mk
index 46a109e9e..b396ec5b4 100644
--- a/target/common/common.mk
+++ b/target/common/common.mk
@@ -148,6 +148,12 @@ ROI_DUMP = $(LOGS_DIR)/roi.json
 VISUAL_TRACE = $(LOGS_DIR)/trace.json
 VISUALIZE_PY_FLAGS += --tracevis "$(BINARY) $(SNITCH_TXT_TRACES) --addr2line $(ADDR2LINE) -f snitch"
 
+GENTRACE_PY_FLAGS += --mc-exec $(RISCV_MC) --mc-flags "$(RISCV_MC_FLAGS)"
+
+# Do not abort trace generation on gentrace errors when debugging
+ifeq ($(DEBUG),ON)
+GENTRACE_PY_FLAGS += --permissive
+endif
 
 .PHONY: traces annotate visual-trace clean-traces clean-annotate clean-perf clean-visual-trace
 traces: $(TXT_TRACES)
@@ -164,7 +170,7 @@ clean-visual-trace:
 	rm -f $(VISUAL_TRACE)
 
 $(addprefix $(LOGS_DIR)/,trace_hart_%.txt hart_%_perf.json dma_%_perf.json): $(LOGS_DIR)/trace_hart_%.dasm $(GENTRACE_PY) $(SN_GENTRACE_SRC)
-	$(GENTRACE_PY) $< --mc-exec $(RISCV_MC) --mc-flags "$(RISCV_MC_FLAGS)" --dma-trace $(SIM_DIR)/dma_trace_$*_00000.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json --dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt
+	$(GENTRACE_PY) $< $(GENTRACE_PY_FLAGS) --dma-trace $(SIM_DIR)/dma_trace_$*_00000.log --dump-hart-perf $(LOGS_DIR)/hart_$*_perf.json
--dump-dma-perf $(LOGS_DIR)/dma_$*_perf.json -o $(LOGS_DIR)/trace_hart_$*.txt # Generate source-code interleaved traces for all harts. Reads the binary from # the logs/.rtlbinary file that is written at start of simulation in the vsim script @@ -178,7 +184,7 @@ $(JOINT_PERF_DUMP): $(PERF_DUMPS) $(JOIN_PY) $(JOIN_PY) -i $(shell ls $(LOGS_DIR)/*_perf.json) -o $@ $(ROI_DUMP): $(JOINT_PERF_DUMP) $(ROI_SPEC) $(ROI_PY) - $(ROI_PY) $(JOINT_PERF_DUMP) $(ROI_SPEC) --cfg $(CFG) -o $@ + $(ROI_PY) $(JOINT_PERF_DUMP) $(ROI_SPEC) --cfg $(SN_CFG) -o $@ $(VISUAL_TRACE): $(ROI_DUMP) $(VISUALIZE_PY) $(VISUALIZE_PY) $(ROI_DUMP) $(VISUALIZE_PY_FLAGS) -o $@ diff --git a/target/common/rtl.mk b/target/common/rtl.mk index 1502fefa3..53de11483 100644 --- a/target/common/rtl.mk +++ b/target/common/rtl.mk @@ -49,7 +49,7 @@ $(SN_BOOTROM_DIR)/bootrom.elf $(SN_BOOTROM_DIR)/bootrom.dump $(SN_BOOTROM_DIR)/b sn-rtl: $(SN_GEN_RTL_SRCS) sn-clean-rtl: - rm -f $(SN_GEN_RTL_SRCS) + rm -f $(SN_GEN_RTL_SRCS) $(SN_CLUSTER_RDL) $(SN_BOOTROM_DIR): mkdir -p $@ diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index f0a8c2d5b..2ebe7eb33 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -10,20 +10,25 @@ .PHONY: sn-sw sn-clean-sw -sn-sw: sn-runtime sn-tests -sn-clean-sw: sn-clean-runtime sn-clean-tests +sn-sw: sn-runtime sn-tests sn-apps +sn-clean-sw: sn-clean-runtime sn-clean-tests sn-clean-apps #################### # Platform headers # #################### -SNRT_HAL_HDRS_DIR ?= $(SN_ROOT)/target/snitch_cluster/sw/runtime/common +SNRT_HAL_SRC_DIR ?= $(SN_ROOT)/target/snitch_cluster/sw/runtime/common +SNRT_HAL_BUILD_DIR ?= $(SNRT_HAL_SRC_DIR) -SNITCH_CLUSTER_CFG_H = $(SNRT_HAL_HDRS_DIR)/snitch_cluster_cfg.h -SNITCH_CLUSTER_ADDRMAP_H = $(SNRT_HAL_HDRS_DIR)/snitch_cluster_addrmap.h -SNITCH_CLUSTER_RAW_ADDRMAP_H = $(SNRT_HAL_HDRS_DIR)/snitch_cluster_raw_addrmap.h -SNITCH_CLUSTER_PERIPHERAL_H = $(SNRT_HAL_HDRS_DIR)/snitch_cluster_peripheral.h -SNITCH_CLUSTER_PERIPHERAL_ADDRMAP_H = $(SNRT_HAL_HDRS_DIR)/snitch_cluster_peripheral_addrmap.h +SNITCH_CLUSTER_CFG_H = $(SNRT_HAL_BUILD_DIR)/snitch_cluster_cfg.h +SNITCH_CLUSTER_ADDRMAP_H = $(SNRT_HAL_BUILD_DIR)/snitch_cluster_addrmap.h +SNITCH_CLUSTER_RAW_ADDRMAP_H = $(SNRT_HAL_BUILD_DIR)/snitch_cluster_raw_addrmap.h +SNITCH_CLUSTER_PERIPHERAL_H = $(SNRT_HAL_BUILD_DIR)/snitch_cluster_peripheral.h +SNITCH_CLUSTER_PERIPHERAL_ADDRMAP_H = $(SNRT_HAL_BUILD_DIR)/snitch_cluster_peripheral_addrmap.h +SNITCH_CLUSTER_ADDRMAP_RDL = $(SNRT_HAL_BUILD_DIR)/snitch_cluster_addrmap.rdl + +SNITCH_CLUSTER_CFG_H_TPL = $(SNRT_HAL_SRC_DIR)/snitch_cluster_cfg.h.tpl +SNITCH_CLUSTER_ADDRMAP_RDL_TPL = $(SNRT_HAL_SRC_DIR)/snitch_cluster_addrmap.rdl.tpl SNRT_HAL_HDRS += $(SNITCH_CLUSTER_CFG_H) SNRT_HAL_HDRS += $(SNITCH_CLUSTER_ADDRMAP_H) @@ -31,11 +36,9 @@ SNRT_HAL_HDRS += $(SNITCH_CLUSTER_RAW_ADDRMAP_H) SNRT_HAL_HDRS += $(SNITCH_CLUSTER_PERIPHERAL_H) SNRT_HAL_HDRS += $(SNITCH_CLUSTER_PERIPHERAL_ADDRMAP_H) -SNITCH_CLUSTER_ADDRMAP_RDL = $(SNRT_HAL_HDRS_DIR)/snitch_cluster_addrmap.rdl - # CLUSTERGEN rules -$(eval $(call sn_cluster_gen_rule,$(SNITCH_CLUSTER_CFG_H),$(SNITCH_CLUSTER_CFG_H).tpl)) -$(eval $(call sn_cluster_gen_rule,$(SNITCH_CLUSTER_ADDRMAP_RDL),$(SNITCH_CLUSTER_ADDRMAP_RDL).tpl)) +$(eval $(call sn_cluster_gen_rule,$(SNITCH_CLUSTER_CFG_H),$(SNITCH_CLUSTER_CFG_H_TPL))) +$(eval $(call sn_cluster_gen_rule,$(SNITCH_CLUSTER_ADDRMAP_RDL),$(SNITCH_CLUSTER_ADDRMAP_RDL_TPL))) # peakrdl headers SN_PEAKRDL_INCDIRS += -I $(SN_ROOT)/hw/snitch_cluster/src/snitch_cluster_peripheral @@ 
-57,7 +60,7 @@ $(SNITCH_CLUSTER_PERIPHERAL_ADDRMAP_H): $(SN_ROOT)/hw/snitch_cluster/src/snitch_ .PHONY: sn-clean-headers sn-clean-sw: sn-clean-headers sn-clean-headers: - rm -f $(SNRT_HAL_HDRS) + rm -f $(SNRT_HAL_HDRS) $(SNITCH_CLUSTER_ADDRMAP_RDL) ################## # Subdirectories # @@ -71,41 +74,38 @@ include $(SN_ROOT)/target/snitch_cluster/sw/riscv-tests/riscv-tests.mk SNRT_BUILD_APPS ?= ON ifeq ($(SNRT_BUILD_APPS), ON) -SNRT_APPS = sw/apps/nop -SNRT_APPS += sw/apps/blas/axpy -SNRT_APPS += sw/apps/blas/gemm -SNRT_APPS += sw/apps/blas/gemv -SNRT_APPS += sw/apps/blas/dot -SNRT_APPS += sw/apps/blas/syrk -SNRT_APPS += sw/apps/dnn/batchnorm -# SNRT_APPS += sw/apps/dnn/conv2d -# SNRT_APPS += sw/apps/dnn/fusedconv -SNRT_APPS += sw/apps/dnn/gelu -SNRT_APPS += sw/apps/dnn/layernorm -SNRT_APPS += sw/apps/dnn/maxpool -SNRT_APPS += sw/apps/dnn/softmax -SNRT_APPS += sw/apps/dnn/flashattention_2 -SNRT_APPS += sw/apps/dnn/concat -SNRT_APPS += sw/apps/dnn/fused_concat_linear -SNRT_APPS += sw/apps/dnn/transpose -SNRT_APPS += sw/apps/montecarlo/pi_estimation -SNRT_APPS += sw/apps/atax -SNRT_APPS += sw/apps/correlation -SNRT_APPS += sw/apps/covariance -SNRT_APPS += sw/apps/doitgen -SNRT_APPS += sw/apps/kmeans -SNRT_APPS += sw/apps/exp -SNRT_APPS += sw/apps/log -SNRT_APPS += sw/apps/kbpcpa -SNRT_APPS += sw/apps/box3d1r -SNRT_APPS += sw/apps/j3d27pt +SNRT_APPS = $(SN_ROOT)/target/snitch_cluster/sw/apps/nop +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/blas/axpy +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/blas/gemm +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/blas/gemv +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/blas/dot +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/blas/syrk +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/batchnorm +# SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/conv2d +# SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/fusedconv +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/gelu +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/layernorm +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/maxpool +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/softmax +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/flashattention_2 +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/concat +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/fused_concat_linear +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/transpose +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/mha +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/montecarlo/pi_estimation +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/atax +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/correlation +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/covariance +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/doitgen +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/kmeans +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/exp +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/log +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/kbpcpa +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/box3d1r +SNRT_APPS += $(SN_ROOT)/target/snitch_cluster/sw/apps/j3d27pt +endif # Include Makefile from each app subdirectory $(foreach app,$(SNRT_APPS), \ - $(eval include $(SN_ROOT)/target/snitch_cluster/$(app)/app.mk) \ + $(eval include $(app)/app.mk) \ ) - -sn-sw: sn-apps -sn-clean-sw: sn-clean-apps - -endif diff --git 
a/target/snitch_cluster/sw/apps/dnn/mha/app.mk b/target/snitch_cluster/sw/apps/dnn/mha/app.mk new file mode 100644 index 000000000..7154f135b --- /dev/null +++ b/target/snitch_cluster/sw/apps/dnn/mha/app.mk @@ -0,0 +1,13 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := mha +$(APP)_BUILD_DIR ?= $(SN_ROOT)/target/snitch_cluster/sw/apps/dnn/$(APP)/build +SRC_DIR := $(SN_ROOT)/sw/dnn/$(APP)/src +SRCS := $(SRC_DIR)/main.c + +include $(SN_ROOT)/sw/dnn/common.mk +include $(SN_ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl index 345425d30..c4784cd24 100644 --- a/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl +++ b/target/snitch_cluster/sw/runtime/common/snitch_cluster_cfg.h.tpl @@ -10,7 +10,6 @@ #define SNRT_CLUSTER_CORE_NUM CFG_CLUSTER_NR_CORES #define SNRT_CLUSTER_NUM ${cfg['nr_clusters']} #define SNRT_CLUSTER_DM_CORE_NUM 1 -#define SNRT_TCDM_START_ADDR SNITCH_CLUSTER_ADDRMAP_CLUSTER_TCDM_BASE_ADDR #define SNRT_TCDM_BANK_WIDTH ${cfg['cluster']['data_width'] // 8} #define SNRT_TCDM_BANK_NUM ${cfg['cluster']['tcdm']['banks']} #define SNRT_TCDM_HYPERBANK_NUM ${cfg['cluster']['tcdm']['hyperbanks']} diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.S b/target/snitch_cluster/sw/runtime/rtl/src/snrt.S index 536b69c0a..8e4b90ac8 100644 --- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.S +++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.S @@ -11,5 +11,8 @@ #define SNRT_INIT_TLS #define SNRT_CRT0_PARK +#include "snitch_cluster_raw_addrmap.h" +#define SNRT_TCDM_START_ADDR SNITCH_CLUSTER_ADDRMAP_CLUSTER_TCDM_BASE_ADDR + #include "snitch_cluster_cfg.h" #include "start.S" diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h index 2c27901f3..3330a8272 100644 --- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h +++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h @@ -12,6 +12,7 @@ #include "snitch_cluster_cfg.h" #include "snitch_cluster_peripheral_addrmap.h" #include "snitch_cluster_raw_addrmap.h" +#define SNRT_TCDM_START_ADDR SNITCH_CLUSTER_ADDRMAP_CLUSTER_TCDM_BASE_ADDR // Forward declarations #include "alloc_decls.h" diff --git a/target/snitch_cluster/sw/runtime/runtime.mk b/target/snitch_cluster/sw/runtime/runtime.mk index bba19a9ca..5f693ab11 100644 --- a/target/snitch_cluster/sw/runtime/runtime.mk +++ b/target/snitch_cluster/sw/runtime/runtime.mk @@ -26,7 +26,7 @@ SNRT_INCDIRS += $(SNRT_DIR)/src/omp SNRT_INCDIRS += $(SNRT_DIR)/api/omp SNRT_INCDIRS += $(SNRT_DIR)/vendor/riscv-opcodes SNRT_INCDIRS += $(SNRT_SRCDIR) -SNRT_INCDIRS += $(SNRT_HAL_HDRS_DIR) +SNRT_INCDIRS += $(SNRT_HAL_BUILD_DIR) SNRT_RISCV_CFLAGS += $(RISCV_CFLAGS) SNRT_RISCV_CFLAGS += $(addprefix -I,$(SNRT_INCDIRS)) diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json index ba5fb1a98..6e8358e97 100644 --- a/util/clustergen/schema/snitch_cluster.schema.json +++ b/util/clustergen/schema/snitch_cluster.schema.json @@ -241,10 +241,10 @@ "description": "Whether to expose memory cut configuration inputs for implementation", "default": false }, - "wide_tcdm_port_expose": { - "type": "boolean", - "description": "Whether to expose a wide port into the TCDM at the cluster interface. 
Used to provide external masters, such as accelerators, with wide access to the TCDM.", - "default": false + "num_exposed_wide_tcdm_ports": { + "type": "number", + "description": "Number of exposed wide ports into the TCDM at the cluster interface. Used to provide external masters, such as accelerators, with wide access to the TCDM.", + "default": 0 }, "narrow_axi_port_expose": { "type": "boolean", diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index 3e89b5492..d4eb60a5d 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -1288,7 +1288,8 @@ def custom_formatwarning(message, category, filename, lineno, line=None): message += f'line {lineno}.' print(traceback.format_exc(), file=sys.stderr) print(message, file=sys.stderr) - raise e + if not args.permissive: + raise e else: break # Nothing more in pipe, EOF perf_metrics[-1]['tend'] = time_info[0] // 1000
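
A brief usage sketch of the reworked synchronization API introduced above
(illustrative only: the buffer length, the participant count, and the TCDM
allocation via snrt_l1_next() are assumptions, not part of these patches):

    #include "snrt.h"

    #define LEN 256

    int main() {
        // Buffers at the same TCDM offset in every cluster, as required
        // by snrt_global_reduction_dma().
        double *dst = (double *)snrt_l1_next();
        double *src = dst + LEN;

        uint32_t participants = 4;  // clusters 0 through 3 take part
        if (snrt_cluster_idx() < participants) {
            // ... compute cores fill src with partial results ...
            snrt_global_reduction_dma(dst, src, LEN, participants);
        }
        return 0;
    }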