diff --git a/.dockerignore b/.dockerignore index 9b6a1acdfe..73707cc4c9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,5 @@ * !sw/**/*.py !util/**/*.py +!target/snitch_cluster/util/*.py !pyproject.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 05b4fefb24..2d8a6e354b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,7 +39,7 @@ jobs: cache-to: type=gha,mode=max` file: util/container/Dockerfile push: true - tags: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + tags: ghcr.io/${{ github.repository_owner }}/snitch_cluster:${{ github.ref_name }} ######## # Docs # @@ -53,7 +53,7 @@ jobs: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.repository container: - image: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + image: ghcr.io/${{ github.repository_owner }}/snitch_cluster:${{ github.ref_name }} steps: - uses: actions/checkout@v2 - name: Build docs @@ -71,7 +71,7 @@ jobs: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.repository container: - image: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + image: ghcr.io/${{ github.repository_owner }}/snitch_cluster:${{ github.ref_name }} steps: - uses: actions/checkout@v2 - name: Run pytest @@ -89,7 +89,7 @@ jobs: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.repository container: - image: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + image: ghcr.io/${{ github.repository_owner }}/snitch_cluster:${{ github.ref_name }} steps: - uses: actions/checkout@v2 with: @@ -140,7 +140,7 @@ jobs: # github.event_name != 'pull_request' || # github.event.pull_request.head.repo.full_name != github.repository # container: - # image: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + # image: ghcr.io/${{ github.repository_owner }}/snitch_cluster:${{ github.ref_name }} # steps: # - uses: actions/checkout@v2 # with: @@ -167,7 +167,7 @@ jobs: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.repository container: - image: ghcr.io/pulp-platform/snitch_cluster:${{ github.ref_name }} + image: ghcr.io/${{ github.repository_owner }}/snitch_cluster:${{ github.ref_name }} steps: - uses: actions/checkout@v4 with: diff --git a/Bender.lock b/Bender.lock index d48a9c04d8..c7961670e9 100644 --- a/Bender.lock +++ b/Bender.lock @@ -7,8 +7,8 @@ packages: dependencies: - common_cells axi: - revision: 4e54ac6766b160217a83a74d5a23af9bbf59e6ee - version: null + revision: 853ede23b2a9837951b74dbdc6d18c3eef5bac7d + version: 0.39.5 source: Git: https://github.com/pulp-platform/axi dependencies: @@ -32,8 +32,8 @@ packages: dependencies: - common_cells cluster_icache: - revision: 0e1fb6751d9684d968ba7fb40836e6118b448ecd - version: 0.1.1 + revision: 64e21ae455bbdde850c4df13bef86ea55ac42537 + version: null source: Git: https://github.com/pulp-platform/cluster_icache.git dependencies: @@ -42,8 +42,8 @@ packages: - scm - tech_cells_generic common_cells: - revision: c27bce39ebb2e6bae52f60960814a2afca7bd4cb - version: 1.37.0 + revision: 842753eabe166818bde831908c865872d115528d + version: null source: Git: https://github.com/pulp-platform/common_cells dependencies: @@ -91,8 +91,8 @@ packages: - common_cells - common_verification register_interface: - revision: ae616e5a1ec2b41e72d200e5ab09c65e94aebd3d - version: 0.4.4 + revision: 5daa85d164cf6b54ad061ea1e4c6f3624556e467 + version: 0.4.5 source: 
Git: https://github.com/pulp-platform/register_interface dependencies: diff --git a/Bender.yml b/Bender.yml index 29fc783b54..453117bf63 100644 --- a/Bender.yml +++ b/Bender.yml @@ -19,14 +19,14 @@ package: - Matheus Cavalcante dependencies: - axi: { git: https://github.com/pulp-platform/axi, rev: 4e54ac6766b160217a83a74d5a23af9bbf59e6ee } - axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 } - common_cells: { git: https://github.com/pulp-platform/common_cells, version: 1.35.0 } - FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } - register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 } - tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } - riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } - cluster_icache: { git: https://github.com/pulp-platform/cluster_icache.git, version: 0.1.0 } + axi: { git: https://github.com/pulp-platform/axi, version: 0.39.5 } + axi_riscv_atomics: { git: https://github.com/pulp-platform/axi_riscv_atomics, version: 0.6.0 } + common_cells: { git: https://github.com/pulp-platform/common_cells, rev: remove-ffnr } + FPnew: { git: https://github.com/pulp-platform/cvfpu.git, rev: pulp-v0.1.3 } + register_interface: { git: https://github.com/pulp-platform/register_interface, version: 0.4.2 } + tech_cells_generic: { git: https://github.com/pulp-platform/tech_cells_generic, version: 0.2.11 } + riscv-dbg: { git: https://github.com/pulp-platform/riscv-dbg, version: 0.8.0 } + cluster_icache: { git: https://github.com/pulp-platform/cluster_icache.git, rev: 64e21ae455bbdde850c4df13bef86ea55ac42537 } idma: { git: https://github.com/pulp-platform/iDMA, rev: __deploy__9cbcd30__snitch-tracing } export_include_dirs: @@ -35,6 +35,7 @@ export_include_dirs: - hw/tcdm_interface/include - hw/snitch/include - hw/snitch_ssr/include + - target/snitch_cluster/generated sources: # reqrsp_interface @@ -93,6 +94,8 @@ sources: # Level 0 - hw/snitch/src/snitch_pma_pkg.sv - hw/snitch/src/riscv_instr.sv + - hw/snitch/src/snitch_fp_queue.sv + - hw/snitch/src/snitch_in_queue.sv # Level 1 - hw/snitch/src/snitch_pkg.sv # Level 2 @@ -116,14 +119,14 @@ sources: - files: - hw/snitch_vm/src/snitch_ptw.sv - # snitch_ipu - - files: - # Level 0 - - hw/snitch_ipu/src/snitch_ipu_pkg.sv - # Level 1 - - hw/snitch_ipu/src/snitch_ipu_alu.sv - # Level 2 - - hw/snitch_ipu/src/snitch_int_ss.sv + # # snitch_ipu + # - files: + # # Level 0 + # - hw/snitch_ipu/src/snitch_ipu_pkg.sv + # # Level 1 + # - hw/snitch_ipu/src/snitch_ipu_alu.sv + # # Level 2 + # - hw/snitch_ipu/src/snitch_int_ss.sv # snitch_ssr - files: @@ -185,8 +188,19 @@ sources: # target/snitch_cluster - target: snitch_cluster + files: + - target/snitch_cluster/generated/snitch_cluster_pkg.sv + - target: all(snitch_cluster, not(postlayout)) files: - target/snitch_cluster/generated/snitch_cluster_wrapper.sv + - target: all(snitch_cluster, postlayout) + files: + - nonfree/gf12/fusion/runs/0/out/15/snitch_cluster_wrapper.v - target: all(snitch_cluster, any(simulation, verilator)) files: - target/snitch_cluster/test/testharness.sv + + - target: gf12 + files: + - nonfree/gf12/mems/tc_sram.sv + - nonfree/gf12/sourcecode/tc_clk.sv diff --git a/Makefile b/Makefile index bdfa044c21..46e3fc9668 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ ROOT = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) ############ NONFREE_REMOTE ?= 
git@iis-git.ee.ethz.ch:pulp-restricted/snitch-cluster-nonfree.git -NONFREE_COMMIT ?= e30961e20a23a76442da27d2ba07c9fe20f3b575 +NONFREE_COMMIT ?= synth NONFREE_DIR = $(ROOT)/nonfree all: nonfree diff --git a/docs/ug/trace_analysis.md b/docs/ug/trace_analysis.md index 2a9d1bad40..7ba1dfccb6 100644 --- a/docs/ug/trace_analysis.md +++ b/docs/ug/trace_analysis.md @@ -128,6 +128,7 @@ In the following table you can find a complete list of all the performance metri |`snitch_issues` |Total number of instructions issued by Snitch, excluding those offloaded to the FPSS (see `snitch_fseq_offloads`). | |`snitch_fseq_offloads` |Total number of instructions offloaded by Snitch to the FPSS. | |`snitch_fseq_rel_offloads`|The ratio between `snitch_fseq_offloads` and the total number of instructions issued by Snitch core proper, i.e. `snitch_issues + snitch_fseq_offloads`. | +|`snitch_occupancy` |IPC of the Snitch core, calculated as `snitch_issues / cycles`. | |`fpss_issues` |Total number of instructions issued by the FPSS. It counts repeated issues from the FREP sequencer. | |`fpss_fpu_issues` |Similar to `fpss_issues`, but counts only instructions destined to the FPU proper. It does not for instance include instructions issued to the FPSS's LSU. | |`fseq_yield` |The ratio between `fpss_issues` and `snitch_fseq_offloads`. The difference lies in the FREP sequencer possibly replicating instructions. If the sequencer is not used this ratio should amount to 1.| diff --git a/docs/ug/tutorial.md b/docs/ug/tutorial.md index 27c5ce4d46..6a1f18d8b4 100644 --- a/docs/ug/tutorial.md +++ b/docs/ug/tutorial.md @@ -149,7 +149,7 @@ bin/snitch_cluster.vsim.gui sw/apps/blas/axpy/build/axpy.elf ## Debugging and benchmarking -When you run a simulation, every core logs all the instructions it executes in a trace file. The traces are located in the `logs` folder within the simulation directory. Every trace is identified by a hart ID, that is a unique ID for every _hardware thread (hart)_ in a RISC-V system (and since all our cores have a single thread that is a unique ID per core). +When you run a simulation, every core logs all the instructions it executes in a trace file. The traces are located in the `logs` folder within the _simulation directory_. Every trace is identified by a hart ID, that is a unique ID for every _hardware thread (hart)_ in a RISC-V system (and since all our cores have a single thread that is a unique ID per core). The simulation dumps the traces in a non-human-readable format with `.dasm` extension. To convert these to a human-readable form run: @@ -345,3 +345,95 @@ As you may have noticed, there is a good deal of code which is independent of th It is thus preferable to develop the data generation scripts and Snitch kernels in a shared location, from which multiple platforms can take and include the code. The `sw` directory in the root of this repository was created with this goal in mind. For the AXPY example, shared sources are hosted under the `sw/blas/axpy` directory. We recommend that you follow this approach also in your own developments for as much of the code which can be reused. + +## Implementing the hardware + +If you make changes to the hardware, you probably also want to physically implement it to estimate the PPA impact of your modifications. As the physical implementation flow involves proprietary tools licensed under non-disclosure agreements, our physical implementation flow is contained in a separate private git repository. 
If you are an IIS user with access to our Gitlab server and IIS machines, you may follow the instructions below to replicate our implementation flow. + +First, we need to clone all the sources for the physical flow. The following command takes care of everything for you: +```shell +make nonfree +``` + +Behind the scenes, it will clone the `snitch-cluster-nonfree` repo under the `nonfree` folder. Let's move into this folder: + +```shell +cd nonfree +``` + +Here, you will find a Makefile with a series of convenience targets to launch our flow up to a certain stage, be it elaboration (`elab`), synthesis (`synth`) or place-and-route (`pnr`). If you can wait long enough, you may also launch the entire flow to produce a final optimized post-layout netlist: + +```shell +make post-layout-netlist +``` + +This may take a day or more, depending on your machine's performance. If you previously launched the flow up to a certain stage, you can resume it from that point without restarting from scratch. Just specify the `FIRST_STAGE` flag with the name of the stage you want to start from, e.g.: + +```shell +make FIRST_STAGE=synth-init-opto post-layout-netlist +``` + +You will find the reports and output files produced by the flow in the `nonfree/gf12/fusion/runs/0/` folder, in the `reports` and `out` subdirectories respectively, further separated into individual subdirectories for every stage in the flow. These are all you should need to derive area and timing numbers for your design. + +## Running a physical simulation + +Once your design is physically implemented, you will also want to verify that it works as intended. +Assuming you used the previous command to obtain a final optimized post-layout netlist, you can directly build a simulation model out of it. Head back to the main repository, into the `target/snitch_cluster` folder, and build the simulation model with the following flag: + +```shell +make clean-vsim +make PL_SIM=1 bin/snitch_cluster.vsim +``` + +This resembles the commands you've previously seen in section [Building the hardware](#building-the-hardware). In fact, all testbench components are the same; we simply use the added flag to tell [Bender](https://github.com/pulp-platform/bender) to reference the physical netlist in place of the source RTL as the DUT during compilation. +The `Bender.yml` file in the root of the repository automatically references the final netlist in our flow, but you could replace that with a netlist from an intermediate stage if you do not intend to run the whole flow. + +!!! note + Make is not aware of the effect of this flag, so it will not update the RTL source list for compilation. To ensure that it is updated, we can delete the compilation script, which was implicitly generated when you last built the simulation model. The first command above achieves this by deleting all artifacts from the last build. + +Running a physical simulation is then no different from running a functional simulation, so you may continue using the commands introduced in section [Running a simulation](#running-a-simulation). + +## Power estimation + +During physical implementation, the tools are able to independently generate area and timing numbers. For a complete PPA analysis, you will want to include power estimates as well.
+ +Power numbers are extremely dependent on the switching activity in your circuit, which in turn depends on the stimuli you intend to feed into your DUT, so you are in charge of providing this information to the tools. The switching activity is typically recorded in the form of a [VCD](https://en.wikipedia.org/wiki/Value_change_dump) file, and can be generated by most RTL simulators. + +To do so, set the `VCD_DUMP` flag when building the physical simulation model: +```shell +make PL_SIM=1 VCD_DUMP=1 DEBUG=ON bin/snitch_cluster.vsim +``` + +!!! danger + When using QuestaSim for VCD generation, you must build the model with the `DEBUG=ON` flag to ensure that all nets are preserved during compilation, preventing them from being optimized away. This guarantees that the VCD file contains switching activity for all nets in your circuit. + +When you run a simulation, the simulator will now automatically create a `vcd` subdirectory within the _simulation directory_, where a VCD file is generated. + +Most often you are not interested in estimating the power of an entire simulation, but only of a specific section, e.g. a kernel execution. +You can pass start and end times for VCD recording to the simulation as environment variables: + +```shell +VCD_START=127ns VCD_END=8898ns bin/snitch_cluster.vsim sw/apps/blas/axpy/build/axpy.elf +``` + +!!! note + Variable assignments must precede the executable in a shell command to be interpreted as environment variable assignments. Note that environment variables set this way only persist for the current command. + +A benefit of RTL simulations is that they are cycle-accurate. You can thus use them as a reference to find the start and end times of interest with the help of the simulation traces (which are unavailable during physical simulation), and directly apply these to the physical simulation. + +With a VCD file at your disposal, you can now estimate the power consumption of your circuit. In the non-free repository, run the following command: +```shell +make SIM_DIR= power +``` +You need to point the command to the _simulation directory_ in which the VCD dump was generated, so that it can find the VCD file. + +!!! note + Since the actual simulation command is run in a different directory, you need to point to the _simulation directory_ using an absolute path. + +Once the command terminates, you will find power reports in the `nonfree/gf12/synopsys/reports` folder, from which you can extract relevant power numbers.
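+As a recap, an end-to-end power estimation run could look roughly as follows. This is only a sketch: the VCD window is the one from the example above, the simulation directory path is a placeholder you must replace with the absolute path of your own run, and the `cd` assumes you are still in the `target/snitch_cluster` folder.
+
+```shell
+# Build a post-layout simulation model with VCD dumping enabled
+make PL_SIM=1 VCD_DUMP=1 DEBUG=ON bin/snitch_cluster.vsim
+# Simulate the region of interest and record its switching activity
+VCD_START=127ns VCD_END=8898ns bin/snitch_cluster.vsim sw/apps/blas/axpy/build/axpy.elf
+# Estimate power from the recorded VCD, in the non-free repository
+cd ../../nonfree
+make SIM_DIR=/absolute/path/to/the/simulation/directory power
+```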
diff --git a/hw/snitch/src/riscv_instr.sv b/hw/snitch/src/riscv_instr.sv index b415628afb..6504530166 100644 --- a/hw/snitch/src/riscv_instr.sv +++ b/hw/snitch/src/riscv_instr.sv @@ -334,6 +334,9 @@ package riscv_instr; localparam logic [31:0] SCFGWI = 32'b?????????????????010000000101011; localparam logic [31:0] SCFGR = 32'b0000000?????00001001?????0101011; localparam logic [31:0] SCFGW = 32'b0000000??????????010000010101011; + localparam logic [31:0] FLT_D_COPIFT = 32'b1010001??????????001?????0101011; + localparam logic [31:0] FCVT_D_W_COPIFT = 32'b110100100000?????????????0101011; + localparam logic [31:0] FCVT_D_WU_COPIFT = 32'b110100100001?????????????0101011; localparam logic [31:0] FLH = 32'b?????????????????001?????0000111; localparam logic [31:0] FSH = 32'b?????????????????001?????0100111; localparam logic [31:0] FMADD_H = 32'b?????10??????????????????1000011; @@ -700,6 +703,8 @@ package riscv_instr; localparam logic [31:0] VFCPKB_B_D = 32'b1011010??????????111?????0110011; localparam logic [31:0] VFCPKC_B_D = 32'b1011011??????????011?????0110011; localparam logic [31:0] VFCPKD_B_D = 32'b1011011??????????111?????0110011; + localparam logic [31:0] VFCVT_D_S = 32'b100110000100?????001?????0110011; + localparam logic [31:0] VFCVTU_D_S = 32'b100110000100?????101?????0110011; localparam logic [31:0] VFCVT_S_B = 32'b100110000111?????000?????0110011; localparam logic [31:0] VFCVTU_S_B = 32'b100110000111?????100?????0110011; localparam logic [31:0] VFCVT_B_S = 32'b100110000100?????011?????0110011; @@ -839,88 +844,6 @@ package riscv_instr; localparam logic [31:0] VL2R_V = 32'b000001101000?????101?????0000111; localparam logic [31:0] VL4R_V = 32'b000011101000?????110?????0000111; localparam logic [31:0] VL8R_V = 32'b000111101000?????111?????0000111; - localparam logic [31:0] IMV_X_W = 32'b111000000000?????000?????1011011; - localparam logic [31:0] IMV_W_X = 32'b111100000000?????000?????1011011; - localparam logic [31:0] IADDI = 32'b?????????????????000?????1111011; - localparam logic [31:0] ISLLI = 32'b000000???????????001?????1111011; - localparam logic [31:0] ISLTI = 32'b?????????????????010?????1111011; - localparam logic [31:0] ISLTIU = 32'b?????????????????011?????1111011; - localparam logic [31:0] IXORI = 32'b?????????????????100?????1111011; - localparam logic [31:0] ISRLI = 32'b000000???????????101?????1111011; - localparam logic [31:0] ISRAI = 32'b010000???????????101?????1111011; - localparam logic [31:0] IORI = 32'b?????????????????110?????1111011; - localparam logic [31:0] IANDI = 32'b?????????????????111?????1111011; - localparam logic [31:0] IADD = 32'b0000000??????????000?????1011011; - localparam logic [31:0] ISUB = 32'b0100000??????????000?????1011011; - localparam logic [31:0] ISLL = 32'b0000000??????????001?????1011011; - localparam logic [31:0] ISLT = 32'b0000000??????????010?????1011011; - localparam logic [31:0] ISLTU = 32'b0000000??????????011?????1011011; - localparam logic [31:0] IXOR = 32'b0000000??????????100?????1011011; - localparam logic [31:0] ISRL = 32'b0000000??????????101?????1011011; - localparam logic [31:0] ISRA = 32'b0100000??????????101?????1011011; - localparam logic [31:0] IOR = 32'b0000000??????????110?????1011011; - localparam logic [31:0] IAND = 32'b0000000??????????111?????1011011; - localparam logic [31:0] IMADD = 32'b?????01??????????000?????1011011; - localparam logic [31:0] IMSUB = 32'b?????01??????????001?????1011011; - localparam logic [31:0] INMSUB = 32'b?????01??????????010?????1011011; - localparam logic [31:0] INMADD = 
32'b?????01??????????011?????1011011; - localparam logic [31:0] IMUL = 32'b0000010??????????000?????1011011; - localparam logic [31:0] IMULH = 32'b0000010??????????001?????1011011; - localparam logic [31:0] IMULHSU = 32'b0000010??????????010?????1011011; - localparam logic [31:0] IMULHU = 32'b0000010??????????011?????1011011; - localparam logic [31:0] IANDN = 32'b0100000??????????111?????1011011; - localparam logic [31:0] IORN = 32'b0100000??????????110?????1011011; - localparam logic [31:0] IXNOR = 32'b0100000??????????100?????1011011; - localparam logic [31:0] ISLO = 32'b0010000??????????001?????1011011; - localparam logic [31:0] ISRO = 32'b0010000??????????101?????1011011; - localparam logic [31:0] IROL = 32'b0110000??????????001?????1011011; - localparam logic [31:0] IROR = 32'b0110000??????????101?????1011011; - localparam logic [31:0] ISBCLR = 32'b0100100??????????001?????1011011; - localparam logic [31:0] ISBSET = 32'b0010100??????????001?????1011011; - localparam logic [31:0] ISBINV = 32'b0110100??????????001?????1011011; - localparam logic [31:0] ISBEXT = 32'b0100100??????????101?????1011011; - localparam logic [31:0] IGORC = 32'b0010100??????????101?????1011011; - localparam logic [31:0] IGREV = 32'b0110100??????????101?????1011011; - localparam logic [31:0] ISLOI = 32'b001000???????????001?????1111011; - localparam logic [31:0] ISROI = 32'b001000???????????101?????1111011; - localparam logic [31:0] IRORI = 32'b011000???????????101?????1111011; - localparam logic [31:0] ISBCLRI = 32'b010010???????????001?????1111011; - localparam logic [31:0] ISBSETI = 32'b001010???????????001?????1111011; - localparam logic [31:0] ISBINVI = 32'b011010???????????001?????1111011; - localparam logic [31:0] ISBEXTI = 32'b010010???????????101?????1111011; - localparam logic [31:0] IGORCI = 32'b001010???????????101?????1111011; - localparam logic [31:0] IGREVI = 32'b011010???????????101?????1111011; - localparam logic [31:0] ICLZ = 32'b011000000000?????010?????1011011; - localparam logic [31:0] ICTZ = 32'b011000000001?????010?????1011011; - localparam logic [31:0] IPCNT = 32'b011000000010?????010?????1011011; - localparam logic [31:0] ISEXT_B = 32'b011000000100?????010?????1011011; - localparam logic [31:0] ISEXT_H = 32'b011000000101?????010?????1011011; - localparam logic [31:0] ICRC32_B = 32'b011000010000?????001?????1011011; - localparam logic [31:0] ICRC32_H = 32'b011000010001?????001?????1011011; - localparam logic [31:0] ICRC32_W = 32'b011000010010?????001?????1011011; - localparam logic [31:0] ICRC32C_B = 32'b011000011000?????001?????1011011; - localparam logic [31:0] ICRC32C_H = 32'b011000011001?????001?????1011011; - localparam logic [31:0] ICRC32C_W = 32'b011000011010?????001?????1011011; - localparam logic [31:0] ISH1ADD = 32'b0010000??????????010?????1011011; - localparam logic [31:0] ISH2ADD = 32'b0010000??????????100?????1011011; - localparam logic [31:0] ISH3ADD = 32'b0010000??????????110?????1011011; - localparam logic [31:0] ICLMUL = 32'b0000101??????????001?????1011011; - localparam logic [31:0] ICLMULR = 32'b0000101??????????010?????1011011; - localparam logic [31:0] ICLMULH = 32'b0000101??????????011?????1011011; - localparam logic [31:0] IMIN = 32'b0000101??????????100?????1011011; - localparam logic [31:0] IMAX = 32'b0000101??????????101?????1011011; - localparam logic [31:0] IMINU = 32'b0000101??????????110?????1011011; - localparam logic [31:0] IMAXU = 32'b0000101??????????111?????1011011; - localparam logic [31:0] ISHFL = 32'b0000100??????????001?????1011011; - localparam 
logic [31:0] IUNSHFL = 32'b0000100??????????101?????1011011; - localparam logic [31:0] IBEXT = 32'b0000100??????????110?????1011011; - localparam logic [31:0] IBDEP = 32'b0100100??????????110?????1011011; - localparam logic [31:0] IPACK = 32'b0000100??????????100?????1011011; - localparam logic [31:0] IPACKU = 32'b0100100??????????100?????1011011; - localparam logic [31:0] IPACKH = 32'b0000100??????????111?????1011011; - localparam logic [31:0] IBFP = 32'b0100100??????????111?????1011011; - localparam logic [31:0] ISHFLI = 32'b0000100??????????001?????1111011; - localparam logic [31:0] IUNSHFLI = 32'b0000100??????????101?????1111011; /* CSR Addresses */ localparam logic [11:0] CSR_FFLAGS = 12'h1; localparam logic [11:0] CSR_FRM = 12'h2; @@ -1134,6 +1057,8 @@ package riscv_instr; localparam logic [11:0] CSR_SSR = 12'h7c0; localparam logic [11:0] CSR_FPMODE = 12'h7c1; localparam logic [11:0] CSR_BARRIER = 12'h7c2; + localparam logic [11:0] CSR_SC = 12'h7c3; + localparam logic [11:0] CSR_FPQ = 12'h7c4; localparam logic [11:0] CSR_HTIMEDELTAH = 12'h615; localparam logic [11:0] CSR_CYCLEH = 12'hc80; localparam logic [11:0] CSR_TIMEH = 12'hc81; diff --git a/hw/snitch/src/snitch.sv b/hw/snitch/src/snitch.sv index 42c86dc717..7f0ca9c272 100644 --- a/hw/snitch/src/snitch.sv +++ b/hw/snitch/src/snitch.sv @@ -90,6 +90,14 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( input acc_resp_t acc_prsp_i, input logic acc_pvalid_i, output logic acc_pready_o, + // FP Queue read interface + output logic [31:0] fpq_pdata_o, + output logic fpq_pvalid_o, + input logic fpq_pready_i, + // IN Queue write interface + input logic [31:0] inq_qdata_i, + input logic inq_qvalid_i, + output logic inq_qready_o, /// TCDM Data Interface /// Write transactions do not return data on the `P Channel` /// Transactions need to be handled strictly in-order. @@ -111,6 +119,8 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( input logic caq_pvalid_i, // Core events for performance counters output snitch_pkg::core_events_t core_events_o, + // FP Queue CSR + output logic en_fpinq_o, // Cluster HW barrier output logic barrier_o, input logic barrier_i @@ -161,6 +171,26 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( logic [0:0] gpr_we; logic [2**RegWidth-1:0] sb_d, sb_q; + // IN->FP Queue write + logic rd_is_fpq; // Write to queue this cycle + logic fpq_qready; // Whether queue is full + logic [31:0] fpq_wdata; + // FP->IN Queue read + logic rs_is_inq; // Read from queue this cycle + logic inq_pvalid; // Whether queue is empty + logic [31:0] inq_rdata; + + logic rs1_is_inq; + logic rs2_is_inq; + + // When queues are enabled: + // Do NOT write to IN RF if t6 (any IN RF) is written to in FPSS. + // These are FP instrs that write back to IN RF through AccBus + logic inq_false_write; + // Do NOT read from inq if t6 must be read in FPSS. 
+ // These are FP instrs that read from int RF + logic inq_false_read; + // Load/Store Defines logic is_load, is_store, is_signed; logic is_fp_load, is_fp_store; @@ -257,6 +287,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( logic [0:0][4:0] cause_d, cause_q; logic [0:0] cause_irq_d, cause_irq_q; logic spp_d, spp_q; + logic csr_fpinq_d, csr_fpinq_q; snitch_pkg::priv_lvl_t mpp_d, mpp_q; logic [0:0] ie_d, ie_q; logic [0:0] pie_d, pie_q; @@ -293,6 +324,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( `FFAR(spp_q, spp_d, 1'b0, clk_i, rst_i) `FFAR(ie_q, ie_d, '0, clk_i, rst_i) `FFAR(pie_q, pie_d, '0, clk_i, rst_i) + `FFAR(csr_fpinq_q, csr_fpinq_d, 1'b0, clk_i, rst_i) // Interrupts `FFAR(eie_q, eie_d, '0, clk_i, rst_i) `FFAR(tie_q, tie_d, '0, clk_i, rst_i) @@ -436,20 +468,24 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( sb_d = sb_q; if (retire_load) sb_d[lsu_rd] = 1'b0; // only place the reservation if we actually executed the load or offload instruction - if ((is_load | acc_register_rd) && !stall && !exception) sb_d[rd] = 1'b1; + if ((is_load | (acc_register_rd & ~inq_false_write)) && !stall && !exception) sb_d[rd] = 1'b1; if (retire_acc) sb_d[acc_prsp_i.id[RegWidth-1:0]] = 1'b0; sb_d[0] = 1'b0; end // TODO(zarubaf): This can probably be described a bit more efficient - assign opa_ready = (opa_select != Reg) | ~sb_q[rs1]; - assign opb_ready = (opb_select != Reg & opb_select != SImmediate) | ~sb_q[rs2]; + // Whether rs1 and rs2 are ready to be read + assign opa_ready = (opa_select != Reg) | (rs1_is_inq ? inq_pvalid : ~sb_q[rs1]); + assign opb_ready = (opb_select != Reg & opb_select != SImmediate) | (rs2_is_inq ? inq_pvalid : ~sb_q[rs2]); assign operands_ready = opa_ready & opb_ready; // either we are not using the destination register or we need to make // sure that its destination operand is not marked busy in the scoreboard. - assign dst_ready = ~uses_rd | (uses_rd & ~sb_q[rd]); + // Whether there are any write hazards for writing into destination + assign dst_ready = rd_is_fpq ? 
fpq_qready : ~uses_rd | (uses_rd & ~sb_q[rd]); - assign valid_instr = inst_ready_i - & inst_valid_o + // inst_valid_o eg - To stall reading an inst from icache + // valid_instr: Whether the instruction being sent to acc etc is valid: + assign valid_instr = inst_ready_i // Whether the icache is ready to send an instr + & inst_valid_o // Whether the req sent by snitch core was indeed valid & operands_ready & dst_ready & ((itlb_valid & itlb_ready) | ~trans_active); @@ -528,6 +564,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( // if we are writing the field this cycle we need // an int destination register uses_rd = write_rd; + inq_false_read = 1'b0; rd_bypass = '0; zero_lsb = 1'b0; @@ -1021,78 +1058,78 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( acc_register_rd = 1'b1; acc_qreq_o.addr = SHARED_MULDIV; end - // Off-loaded to IPU - ANDN, ORN, XNOR, SLO, SRO, ROL, ROR, SBCLR, SBSET, SBINV, SBEXT, - GORC, GREV, CLZ, CTZ, PCNT, SEXT_B, - SEXT_H, CRC32_B, CRC32_H, CRC32_W, CRC32C_B, CRC32C_H, CRC32C_W, - CLMUL, CLMULR, CLMULH, MIN, MAX, MINU, MAXU, SHFL, UNSHFL, BEXT, - BDEP, PACK, PACKU, PACKH, BFP: begin - write_rd = 1'b0; - uses_rd = 1'b1; - acc_qvalid_o = valid_instr; - opa_select = Reg; - opb_select = Reg; - acc_register_rd = 1'b1; - acc_qreq_o.addr = INT_SS; - end - SLOI, SROI, RORI, SBCLRI, SBSETI, SBINVI, SBEXTI, GORCI, - GREVI, SHFLI, UNSHFLI: begin - write_rd = 1'b0; - uses_rd = 1'b1; - acc_qvalid_o = valid_instr; - opa_select = Reg; - opb_select = IImmediate; - acc_register_rd = 1'b1; - acc_qreq_o.addr = INT_SS; - end - IADDI, ISLLI, ISLTI, ISLTIU, IXORI, ISRLI, ISRAI, IORI, IANDI, IADD, - ISUB, ISLL, ISLT, ISLTU, IXOR, ISRL, ISRA, IOR, IAND, - IAND, IMADD, IMSUB, INMSUB, INMADD, IMUL, IMULH, IMULHSU, IMULHU, - IANDN, IORN, IXNOR, ISLO, ISRO, IROL, IROR, ISBCLR, ISBSET, ISBINV, - ISBEXT, IGORC, IGREV, ISLOI, ISROI, IRORI, ISBCLRI, ISBSETI, ISBINVI, - ISBEXTI, IGORCI, IGREVI, ICLZ, ICTZ, IPCNT, ISEXT_B, ISEXT_H, ICRC32_B, - ICRC32_H, ICRC32_W, ICRC32C_B, ICRC32C_H, ICRC32C_W, ICLMUL, ICLMULR, - ICLMULH, IMIN, IMAX, IMINU, IMAXU, ISHFL, IUNSHFL, IBEXT, IBDEP, IPACK, - IPACKU, IPACKH, IBFP: begin - if (Xipu) begin - acc_qreq_o.addr = INT_SS; - write_rd = 1'b0; - acc_qvalid_o = valid_instr; - end else begin - illegal_inst = 1'b1; - end - end - IMV_X_W: begin - if (Xipu) begin - acc_qreq_o.addr = INT_SS; - write_rd = 1'b0; - uses_rd = 1'b1; - acc_qvalid_o = valid_instr; - acc_register_rd = 1'b1; // No RS in GPR but RD in GPR, register in int scoreboard - end else begin - illegal_inst = 1'b1; - end - end - IMV_W_X: begin - if (Xipu) begin - acc_qreq_o.addr = INT_SS; - opa_select = Reg; - write_rd = 1'b0; - acc_qvalid_o = valid_instr; - end else begin - illegal_inst = 1'b1; - end - end - IREP: begin - if (Xipu) begin - acc_qreq_o.addr = INT_SS; - opa_select = Reg; - write_rd = 1'b0; - acc_qvalid_o = valid_instr; - end else begin - illegal_inst = 1'b1; - end - end + // // Off-loaded to IPU + // ANDN, ORN, XNOR, SLO, SRO, ROL, ROR, SBCLR, SBSET, SBINV, SBEXT, + // GORC, GREV, CLZ, CTZ, PCNT, SEXT_B, + // SEXT_H, CRC32_B, CRC32_H, CRC32_W, CRC32C_B, CRC32C_H, CRC32C_W, + // CLMUL, CLMULR, CLMULH, MIN, MAX, MINU, MAXU, SHFL, UNSHFL, BEXT, + // BDEP, PACK, PACKU, PACKH, BFP: begin + // write_rd = 1'b0; + // uses_rd = 1'b1; + // acc_qvalid_o = valid_instr; + // opa_select = Reg; + // opb_select = Reg; + // acc_register_rd = 1'b1; + // acc_qreq_o.addr = INT_SS; + // end + // SLOI, SROI, RORI, SBCLRI, SBSETI, SBINVI, SBEXTI, GORCI, + // GREVI, SHFLI, UNSHFLI: 
begin + // write_rd = 1'b0; + // uses_rd = 1'b1; + // acc_qvalid_o = valid_instr; + // opa_select = Reg; + // opb_select = IImmediate; + // acc_register_rd = 1'b1; + // acc_qreq_o.addr = INT_SS; + // end + // IADDI, ISLLI, ISLTI, ISLTIU, IXORI, ISRLI, ISRAI, IORI, IANDI, IADD, + // ISUB, ISLL, ISLT, ISLTU, IXOR, ISRL, ISRA, IOR, IAND, + // IAND, IMADD, IMSUB, INMSUB, INMADD, IMUL, IMULH, IMULHSU, IMULHU, + // IANDN, IORN, IXNOR, ISLO, ISRO, IROL, IROR, ISBCLR, ISBSET, ISBINV, + // ISBEXT, IGORC, IGREV, ISLOI, ISROI, IRORI, ISBCLRI, ISBSETI, ISBINVI, + // ISBEXTI, IGORCI, IGREVI, ICLZ, ICTZ, IPCNT, ISEXT_B, ISEXT_H, ICRC32_B, + // ICRC32_H, ICRC32_W, ICRC32C_B, ICRC32C_H, ICRC32C_W, ICLMUL, ICLMULR, + // ICLMULH, IMIN, IMAX, IMINU, IMAXU, ISHFL, IUNSHFL, IBEXT, IBDEP, IPACK, + // IPACKU, IPACKH, IBFP: begin + // if (Xipu) begin + // acc_qreq_o.addr = INT_SS; + // write_rd = 1'b0; + // acc_qvalid_o = valid_instr; + // end else begin + // illegal_inst = 1'b1; + // end + // end + // IMV_X_W: begin + // if (Xipu) begin + // acc_qreq_o.addr = INT_SS; + // write_rd = 1'b0; + // uses_rd = 1'b1; + // acc_qvalid_o = valid_instr; + // acc_register_rd = 1'b1; // No RS in GPR but RD in GPR, register in int scoreboard + // end else begin + // illegal_inst = 1'b1; + // end + // end + // IMV_W_X: begin + // if (Xipu) begin + // acc_qreq_o.addr = INT_SS; + // opa_select = Reg; + // write_rd = 1'b0; + // acc_qvalid_o = valid_instr; + // end else begin + // illegal_inst = 1'b1; + // end + // end + // IREP: begin + // if (Xipu) begin + // acc_qreq_o.addr = INT_SS; + // opa_select = Reg; + // write_rd = 1'b0; + // acc_qvalid_o = valid_instr; + // end else begin + // illegal_inst = 1'b1; + // end + // end // Offload FP-FP Instructions - fire and forget // TODO (smach): Check legal rounding modes and issue illegal isn if needed // Single Precision Floating-Point @@ -1722,6 +1759,15 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( illegal_inst = 1'b1; end end + FLT_D_COPIFT: begin + if(FP_EN && RVD) begin + write_rd = 1'b0; + //uses_rd? and write_rd? 
+ acc_qvalid_o = valid_instr; + end else begin + illegal_inst = 1'b1; + end + end // Single Precision Floating-Point FLE_S, FLT_S, @@ -1912,6 +1958,17 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( opa_select = Reg; write_rd = 1'b0; acc_qvalid_o = valid_instr; + inq_false_read = 1'b1; + end else begin + illegal_inst = 1'b1; + end + end + // Double Precision Floating Point operate on SSRs + FCVT_D_W_COPIFT, + FCVT_D_WU_COPIFT: begin + if(FP_EN && RVD) begin + write_rd = 1'b0; + acc_qvalid_o = valid_instr; end else begin illegal_inst = 1'b1; end @@ -1924,6 +1981,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( opa_select = Reg; write_rd = 1'b0; acc_qvalid_o = valid_instr; + inq_false_read = 1'b1; end else begin illegal_inst = 1'b1; end @@ -2273,6 +2331,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( fcsr_d.fflags = fcsr_q.fflags | fpu_status_i; fcsr_d.fmode.src = fcsr_q.fmode.src; fcsr_d.fmode.dst = fcsr_q.fmode.dst; + csr_fpinq_d = csr_fpinq_q; scratch_d = scratch_q; epc_d = epc_q; cause_d = cause_q; @@ -2520,10 +2579,15 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( end else illegal_csr = 1'b1; end // HW cluster barrier + // Custom CSRs CSR_BARRIER: begin barrier_o = 1'b1; csr_stall_d = 1'b1; end + CSR_FPQ: begin + csr_rvalue = {31'b0, csr_fpinq_q}; + if(!exception) csr_fpinq_d = alu_result[0]; + end default: begin csr_rvalue = '0; csr_dump = 1'b1; @@ -2622,6 +2686,54 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( end // pragma translate_on + // -------------------- + // COPIFT Queue + // -------------------- + + // Common enable signal for the IN->FP and FP->IN Queues + assign en_fpinq_o = csr_fpinq_q; + + // Read from inq if rs==t6 & queues are enabled, but not if t6 is to be read in FPSS. + assign rs1_is_inq = (rs1=='d31) & en_fpinq_o & ~inq_false_read; + assign rs2_is_inq = (rs2=='d31) & en_fpinq_o & ~inq_false_read; + assign rs_is_inq = rs1_is_inq | rs2_is_inq; + // False Writes to IN RF: FP instrs that write to back int RF through AccBus when queue is enabled + assign inq_false_write = (acc_qreq_o.addr == FP_SS) & acc_register_rd & en_fpinq_o; + + // IN->FP Queue (fpq) + snitch_fp_queue #( + .Depth (32) + ) i_fp_queue ( + .clk_i, + .rst_i, + + // Writing into fpq (Handled in snitch. Bypasses write to t6) + .fpq_qdata_i (fpq_wdata), // Data to be written into FP queue + .fpq_qvalid_i(rd_is_fpq), // Whether snitch wants to write to queue + .fpq_qready_o(fpq_qready), // Whether the FP Queue is ready to take in data + // Reading from fpq (Handled in FPSS. Bypasses read from AccBus) + .fpq_pdata_o (fpq_pdata_o), // Data available to be read from fpq + .fpq_pvalid_o(fpq_pvalid_o), // Whether queue is not empty + .fpq_pready_i(fpq_pready_i & fpq_pvalid_o) // Whether FP wants to read from fpq + ); + + // FP->IN Queue (inq) + snitch_in_queue #( + .Depth (32) + ) i_in_queue ( + .clk_i, + .rst_i, + + // Writing into inq (Handled in FPSS. Bypasses write to AccBus) + .inq_qdata_i (inq_qdata_i), // Data to be written into IN queue + .inq_qvalid_i(inq_qvalid_i), // Whether FPSS wants to write to queue + .inq_qready_o(inq_qready_o), // Whether the IN Queue is ready to take in data + // Reading from inq (Handled in snitch. 
Bypasses read from t6) + .inq_pdata_o (inq_rdata), // Data available to be read from inq + .inq_pvalid_o(inq_pvalid), // Whether queue is not empty + .inq_pready_i(valid_instr & rs_is_inq & inq_pvalid) // Whether snitch wants to read from inq + ); + snitch_regfile #( .DataWidth ( 32 ), .NrReadPorts ( 2 ), @@ -2644,7 +2756,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( always_comb begin unique case (opa_select) None: opa = '0; - Reg: opa = gpr_rdata[0]; + Reg: opa = rs1_is_inq & en_fpinq_o ? inq_rdata : gpr_rdata[0]; UImmediate: opa = uimm; JImmediate: opa = jimm; CSRImmmediate: opa = {{{32-RegWidth}{1'b0}}, rs1}; @@ -2655,7 +2767,7 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( always_comb begin unique case (opb_select) None: opb = '0; - Reg: opb = gpr_rdata[1]; + Reg: opb = rs2_is_inq & en_fpinq_o ? inq_rdata : gpr_rdata[1]; IImmediate: opb = iimm; SFImmediate, SImmediate: opb = simm; PC: opb = pc_q; @@ -2910,10 +3022,15 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( endcase end + // Set we of gpr to zero if writing to queue always_comb begin gpr_we[0] = 1'b0; gpr_waddr[0] = rd; gpr_wdata[0] = alu_writeback; + + rd_is_fpq = 1'b0; + fpq_wdata = alu_writeback; + // external interfaces lsu_pready = 1'b0; acc_pready_o = 1'b0; @@ -2921,19 +3038,26 @@ module snitch import snitch_pkg::*; import riscv_instr::*; #( retire_load = 1'b0; if (retire_i) begin - gpr_we[0] = 1'b1; + rd_is_fpq = (rd =='d31) & en_fpinq_o; // write to fpq if rd==t6 and csr is enabled + gpr_we[0] = ~rd_is_fpq; // if we are not retiring another instruction retire the load now end else if (lsu_pvalid) begin retire_load = 1'b1; - gpr_we[0] = 1'b1; + rd_is_fpq = (lsu_rd =='d31) & en_fpinq_o; + gpr_we[0] = ~rd_is_fpq; gpr_waddr[0] = lsu_rd; gpr_wdata[0] = ld_result[31:0]; + fpq_wdata = ld_result[31:0]; + lsu_pready = 1'b1; end else if (acc_pvalid_i) begin retire_acc = 1'b1; - gpr_we[0] = 1'b1; + rd_is_fpq = (acc_prsp_i.id =='d31) & en_fpinq_o; + gpr_we[0] = ~rd_is_fpq & ~inq_false_write; // if inq_false_write, FP would have already written into inq. But it works even without this signal because acc_pvalid_i is set to false in FPSS. 
gpr_waddr[0] = acc_prsp_i.id; + gpr_wdata[0] = acc_prsp_i.data[31:0]; + fpq_wdata = acc_prsp_i.data[31:0]; + + acc_pready_o = 1'b1; end end diff --git a/hw/snitch/src/snitch_fp_queue.sv b/hw/snitch/src/snitch_fp_queue.sv new file mode 100644 index 0000000000..05f1483108 --- /dev/null +++ b/hw/snitch/src/snitch_fp_queue.sv @@ -0,0 +1,49 @@ +/// Floating Point Queue (instructions reading from int RF and writing to float RF) + +module snitch_fp_queue #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + + parameter int unsigned Depth = 32, + /// Derived parameter *Do not override* + parameter type addr_t = logic [AddrWidth-1:0], + parameter type data_t = logic [DataWidth-1:0] +) ( + input logic clk_i, + input logic rst_i, + // int to fifo input channel + input data_t fpq_qdata_i, // Data to be written into FP queue + input logic fpq_qvalid_i, // Whether this data is valid + output logic fpq_qready_o, // Whether the FP Queue is ready to take in data + // fifo to fp output channel + output data_t fpq_pdata_o, // Data coming out of FP Queue + output logic fpq_pvalid_o, // Whether this data is valid + input logic fpq_pready_i // Whether the FP Unit is ready to take in this data +); + + `include "common_cells/assertions.svh" + + logic is_full, is_empty; + assign fpq_qready_o = ~is_full; // If FIFO is not full, ready to take input + assign fpq_pvalid_o = ~is_empty; // If FIFO is not empty, ready to be read + + fifo_v3 #( + .FALL_THROUGH ( 1'b0 ), + .DEPTH ( Depth ), + .dtype ( data_t ) + ) i_fifo ( + .clk_i, + .rst_ni (~rst_i), + .flush_i (1'b0), + .testmode_i(1'b0), + .full_o (is_full), + .empty_o (is_empty), + .usage_o (/* open */), // Current fill level of the FIFO, left unconnected here + .data_i (fpq_qdata_i), // Data that will be enqueued + .push_i (fpq_qvalid_i), // If valid data is available + .data_o (fpq_pdata_o), // Data that will be popped + .pop_i (fpq_pready_i) // If fp is ready to read (pready is already gated with pvalid at the instantiation site) + ); + + +endmodule diff --git a/hw/snitch/src/snitch_in_queue.sv b/hw/snitch/src/snitch_in_queue.sv new file mode 100644 index 0000000000..0a94181bbe --- /dev/null +++ b/hw/snitch/src/snitch_in_queue.sv @@ -0,0 +1,49 @@ +/// Integer Queue (instructions reading from float RF and writing to int RF) + +module snitch_in_queue #( + parameter int unsigned AddrWidth = 32, + parameter int unsigned DataWidth = 32, + + parameter int unsigned Depth = 32, + /// Derived parameter *Do not override* + parameter type addr_t = logic [AddrWidth-1:0], + parameter type data_t = logic [DataWidth-1:0] +) ( + input logic clk_i, + input logic rst_i, + // IN to fifo input channel + input data_t inq_qdata_i, // Data to be written into IN queue + input logic inq_qvalid_i, // Whether this data is valid + output logic inq_qready_o, // Whether the IN Queue is ready to take in data + // fifo to IN output channel + output data_t inq_pdata_o, // Data coming out of IN Queue + output logic inq_pvalid_o, // Whether this data is valid + input logic inq_pready_i // Whether the int cc is ready to take in this data +); + + `include "common_cells/assertions.svh" + + logic is_full, is_empty; + assign inq_qready_o = ~is_full; // If FIFO is not full, ready to take input + assign inq_pvalid_o = ~is_empty; // If FIFO is not empty, ready to be read + + fifo_v3 #( + .FALL_THROUGH ( 1'b0 ), + .DEPTH ( Depth ), + .dtype ( data_t ) + ) i_fifo ( + .clk_i, + .rst_ni (~rst_i), + .flush_i (1'b0), + .testmode_i(1'b0), + .full_o (is_full), + .empty_o (is_empty), + .usage_o (/* open */), + .data_i (inq_qdata_i), // Data that will be
enqueued + .push_i (inq_qvalid_i), // If valid data is available + .data_o (inq_pdata_o), // Data that will be popped + .pop_i (inq_pready_i) // If int cc is ready to read + ); + + +endmodule diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 76a35113dc..0dba296669 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -191,6 +191,14 @@ module snitch_cc #( logic ssr_pvalid, ssr_pready; logic acc_demux_snitch_valid, acc_demux_snitch_ready; logic acc_demux_snitch_valid_q, acc_demux_snitch_ready_q; + // FP Queue input interface + logic [31:0] fpq_pdata; + logic fpq_pvalid; + logic fpq_pready; + // IN Queue output interface + logic [31:0] inq_qdata; + logic inq_qvalid; + logic inq_qready; fpnew_pkg::roundmode_e fpu_rnd_mode; fpnew_pkg::fmt_mode_t fpu_fmt_mode; @@ -262,6 +270,12 @@ module snitch_cc #( .acc_prsp_i ( acc_demux_snitch ), .acc_pvalid_i ( acc_demux_snitch_valid ), .acc_pready_o ( acc_demux_snitch_ready ), + .fpq_pdata_o ( fpq_pdata ), + .fpq_pvalid_o ( fpq_pvalid ), + .fpq_pready_i ( fpq_pready ), + .inq_qdata_i ( inq_qdata ), + .inq_qvalid_i ( inq_qvalid ), + .inq_qready_o ( inq_qready ), .caq_pvalid_i ( caq_pvalid_q ), .data_req_o ( snitch_dreq_d ), .data_rsp_i ( snitch_drsp_d ), @@ -276,7 +290,8 @@ module snitch_cc #( .fpu_status_i ( fpu_status ), .core_events_o ( snitch_events), .barrier_o ( barrier_o ), - .barrier_i ( barrier_i ) + .barrier_i ( barrier_i ), + .en_fpinq_o (en_fpinq) ); reqrsp_iso #( @@ -518,12 +533,21 @@ module snitch_cc #( .sequencer_tracer_port_o ( fpu_sequencer_trace ), // pragma translate_on .hart_id_i ( hart_id_i ), + // Accelerator Interface .acc_req_i ( acc_snitch_req ), .acc_req_valid_i ( acc_qvalid ), .acc_req_ready_o ( acc_qready ), .acc_resp_o ( acc_seq ), .acc_resp_valid_o ( acc_pvalid ), .acc_resp_ready_i ( acc_pready ), + // COPIFT Queue Interface + .fpq_pdata_i ( fpq_pdata), + .fpq_pvalid_i ( fpq_pvalid ), + .fpq_pready_o ( fpq_pready ), + .inq_qdata_o ( inq_qdata ), + .inq_qvalid_o ( inq_qvalid ), + .inq_qready_i ( inq_qready ), + .caq_pvalid_o ( caq_pvalid ), .data_req_o ( fpu_dreq ), .data_rsp_i ( fpu_drsp ), @@ -543,7 +567,8 @@ module snitch_cc #( .streamctl_done_i ( ssr_streamctl_done ), .streamctl_valid_i ( ssr_streamctl_valid ), .streamctl_ready_o ( ssr_streamctl_ready ), - .core_events_o ( fp_ss_core_events ) + .core_events_o ( fp_ss_core_events ), + .en_fpinq_i (en_fpinq) ); reqrsp_mux #( @@ -913,6 +938,7 @@ module snitch_cc #( retire_acc: i_snitch.retire_acc, acc_pid: i_snitch.acc_prsp_i.id, acc_pdata_32: i_snitch.acc_prsp_i.data[31:0], + // FPU offload fpu_offload: (i_snitch.acc_qready_i && i_snitch.acc_qvalid_o && i_snitch.acc_qreq_o.addr == 0), diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index 9d68831f37..e86bcd7e23 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -22,6 +22,7 @@ /// Snitch Cluster Top-Level. module snitch_cluster import snitch_pkg::*; + import snitch_icache_pkg::*; #( /// Width of physical address. parameter int unsigned PhysicalAddrWidth = 48, @@ -63,8 +64,8 @@ module snitch_cluster parameter int unsigned ICacheLineWidth [NrHives] = '{default: 0}, /// Number of icache lines per set. parameter int unsigned ICacheLineCount [NrHives] = '{default: 0}, - /// Number of icache sets. - parameter int unsigned ICacheSets [NrHives] = '{default: 0}, + /// Number of icache ways. 
+ parameter int unsigned ICacheWays [NrHives] = '{default: 0}, /// Enable virtual memory support. parameter bit VMSupport = 1, /// Per-core enabling of the standard `E` ISA reduced-register extension. @@ -510,10 +511,10 @@ module snitch_cluster tcdm_req_t [NrTCDMPortsCores-1:0] tcdm_req; tcdm_rsp_t [NrTCDMPortsCores-1:0] tcdm_rsp; - core_events_t [NrCores-1:0] core_events; - tcdm_events_t tcdm_events; - dma_events_t [DMANumChannels-1:0] dma_events; - snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; + core_events_t [NrCores-1:0] core_events; + tcdm_events_t tcdm_events; + dma_events_t [DMANumChannels-1:0] dma_events; + icache_l0_events_t [NrCores-1:0] icache_events; // 4. Memory Subsystem (Core side). reqrsp_req_t [NrCores-1:0] core_req; @@ -969,7 +970,7 @@ module snitch_cluster hive_req_t [HiveSize-1:0] hive_req_reshape; hive_rsp_t [HiveSize-1:0] hive_rsp_reshape; - snitch_icache_pkg::icache_events_t [HiveSize-1:0] icache_events_reshape; + icache_l0_events_t [HiveSize-1:0] icache_events_reshape; for (genvar j = 0; j < NrCores; j++) begin : gen_hive_matrix // Check whether the core actually belongs to the current hive. @@ -993,7 +994,7 @@ module snitch_cluster .CoreCount (HiveSize), .ICacheLineWidth (ICacheLineWidth[i]), .ICacheLineCount (ICacheLineCount[i]), - .ICacheSets (ICacheSets[i]), + .ICacheWays (ICacheWays[i]), .IsoCrossing (IsoCrossing), .sram_cfg_t (sram_cfg_t), .sram_cfgs_t (sram_cfgs_t), diff --git a/hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral.sv b/hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral.sv index d2e5ff319c..df5c375ab2 100644 --- a/hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral.sv +++ b/hw/snitch_cluster/src/snitch_cluster_peripheral/snitch_cluster_peripheral.sv @@ -8,6 +8,7 @@ module snitch_cluster_peripheral import snitch_pkg::*; + import snitch_icache_pkg::*; import snitch_cluster_peripheral_reg_pkg::*; #( // Nr of cores in the cluster @@ -31,13 +32,13 @@ module snitch_cluster_peripheral input core_events_t [NrCores-1:0] core_events_i, input tcdm_events_t tcdm_events_i, input dma_events_t [DMANumChannels-1:0] dma_events_i, - input snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events_i + input icache_l0_events_t [NrCores-1:0] icache_events_i ); // Pipeline register to ease timing. tcdm_events_t tcdm_events_q; dma_events_t [DMANumChannels-1:0] dma_events_q; - snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events_q; + icache_l0_events_t [NrCores-1:0] icache_events_q; `FF(tcdm_events_q, tcdm_events_i, '0) `FF(dma_events_q, dma_events_i, '0) `FF(icache_events_q, icache_events_i, '0) diff --git a/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl new file mode 100644 index 0000000000..342eb31e05 --- /dev/null +++ b/hw/snitch_cluster/src/snitch_cluster_pkg.sv.tpl @@ -0,0 +1,218 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +${disclaimer} + +<%def name="icache_cfg(prop)"> + % for lw in cfg['hives']: + ${lw['icache'][prop]}${',' if not loop.last else ''} + % endfor + + +<%def name="core_cfg(prop)">\ + % for c in cfg['cores']: +${c[prop]}${', ' if not loop.last else ''}\ + % endfor +\ + +<%def name="core_cfg_flat(prop)">\ +${cfg['nr_cores']}'b\ + % for c in cfg['cores'][::-1]: +${int(c[prop])}\ + % endfor +\ + +<%def name="core_isa(isa)">\ +${cfg['nr_cores']}'b\ + % for c in cfg['cores'][::-1]: +${int(getattr(c['isa_parsed'], isa))}\ + % endfor +\ + +<%def name="ssr_cfg(core, ssr_fmt_str, none_str, inner_sep)">\ +% for core in cfg['cores']: + % for s in list(reversed(core['ssrs'] + [None]*(cfg['num_ssrs_max']-len(core['ssrs'])))): +${(" '{" if loop.first else ' ') + \ + (ssr_fmt_str.format(**s) if s is not None else none_str) \ + + (inner_sep if not loop.last else '}')}\ + % endfor +${',' if not loop.last else ''} +% endfor +\ + +`include "axi/typedef.svh" + +// verilog_lint: waive-start package-filename +package ${cfg['pkg_name']}; + + localparam int unsigned NrCores = ${cfg['nr_cores']}; + localparam int unsigned NrHives = ${cfg['nr_hives']}; + + localparam int unsigned AddrWidth = ${cfg['addr_width']}; + localparam int unsigned NarrowDataWidth = ${cfg['data_width']}; + localparam int unsigned WideDataWidth = ${cfg['dma_data_width']}; + + localparam int unsigned NarrowIdWidthIn = ${cfg['id_width_in']}; + localparam int unsigned NrNarrowMasters = 3; + localparam int unsigned NarrowIdWidthOut = $clog2(NrNarrowMasters) + NarrowIdWidthIn; + + localparam int unsigned NrWideMasters = 1 + ${cfg['dma_nr_channels']} + ${cfg['nr_hives']}; + localparam int unsigned WideIdWidthIn = ${cfg['dma_id_width_in']}; + localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; + + localparam int unsigned NarrowUserWidth = ${cfg['user_width']}; + localparam int unsigned WideUserWidth = ${cfg['dma_user_width']}; + + localparam int unsigned ICacheLineWidth [NrHives] = '{${icache_cfg('cacheline')}}; + localparam int unsigned ICacheLineCount [NrHives] = '{${icache_cfg('depth')}}; + localparam int unsigned ICacheWays [NrHives] = '{${icache_cfg('ways')}}; + + localparam int unsigned Hive [NrCores] = '{${core_cfg('hive')}}; + + typedef struct packed { +% for field, width in cfg['sram_cfg_fields'].items(): + logic [${width-1}:0] ${field}; +% endfor + } sram_cfg_t; + + typedef struct packed { + sram_cfg_t icache_tag; + sram_cfg_t icache_data; + sram_cfg_t tcdm; + } sram_cfgs_t; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [NarrowDataWidth-1:0] data_t; + typedef logic [NarrowDataWidth/8-1:0] strb_t; + typedef logic [WideDataWidth-1:0] data_dma_t; + typedef logic [WideDataWidth/8-1:0] strb_dma_t; + typedef logic [NarrowIdWidthIn-1:0] narrow_in_id_t; + typedef logic [NarrowIdWidthOut-1:0] narrow_out_id_t; + typedef logic [WideIdWidthIn-1:0] wide_in_id_t; + typedef logic [WideIdWidthOut-1:0] wide_out_id_t; + typedef logic [NarrowUserWidth-1:0] user_t; + typedef logic [WideUserWidth-1:0] user_dma_t; + + `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_ALL(wide_in, addr_t, wide_in_id_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(wide_out, addr_t, wide_out_id_t, data_dma_t, strb_dma_t, user_dma_t) + + function automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] get_cached_regions(); + automatic snitch_pma_pkg::rule_t 
[snitch_pma_pkg::NrMaxRules-1:0] cached_regions; + cached_regions = '{default: '0}; +% for i, cp in enumerate(cfg['pmas']['cached']): + cached_regions[${i}] = '{base: ${to_sv_hex(cp[0], cfg['addr_width'])}, mask: ${to_sv_hex(cp[1], cfg['addr_width'])}}; +% endfor + return cached_regions; + endfunction + + localparam snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{ + NrCachedRegionRules: ${len(cfg['pmas']['cached'])}, + CachedRegion: get_cached_regions(), + default: 0 + }; + + localparam fpnew_pkg::fpu_implementation_t FPUImplementation [${cfg['nr_cores']}] = '{ + % for c in cfg['cores']: + '{ + PipeRegs: // FMA Block + '{ + '{ ${cfg['timing']['lat_comp_fp32']}, // FP32 + ${cfg['timing']['lat_comp_fp64']}, // FP64 + ${cfg['timing']['lat_comp_fp16']}, // FP16 + ${cfg['timing']['lat_comp_fp8']}, // FP8 + ${cfg['timing']['lat_comp_fp16_alt']}, // FP16alt + ${cfg['timing']['lat_comp_fp8_alt']} // FP8alt + }, + '{1, 1, 1, 1, 1, 1}, // DIVSQRT + '{${cfg['timing']['lat_noncomp']}, + ${cfg['timing']['lat_noncomp']}, + ${cfg['timing']['lat_noncomp']}, + ${cfg['timing']['lat_noncomp']}, + ${cfg['timing']['lat_noncomp']}, + ${cfg['timing']['lat_noncomp']}}, // NONCOMP + '{${cfg['timing']['lat_conv']}, + ${cfg['timing']['lat_conv']}, + ${cfg['timing']['lat_conv']}, + ${cfg['timing']['lat_conv']}, + ${cfg['timing']['lat_conv']}, + ${cfg['timing']['lat_conv']}}, // CONV + '{${cfg['timing']['lat_sdotp']}, + ${cfg['timing']['lat_sdotp']}, + ${cfg['timing']['lat_sdotp']}, + ${cfg['timing']['lat_sdotp']}, + ${cfg['timing']['lat_sdotp']}, + ${cfg['timing']['lat_sdotp']}} // DOTP + }, + UnitTypes: '{'{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // FMA +% if c["Xdiv_sqrt"]: + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // DIVSQRT +% else: + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}, // DIVSQRT +% endif + '{fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL, + fpnew_pkg::PARALLEL}, // NONCOMP + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}, // CONV +% if c["xfdotp"]: + '{fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED, + fpnew_pkg::MERGED}}, // DOTP +% else: + '{fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED, + fpnew_pkg::DISABLED}}, // DOTP +% endif + PipeConfig: fpnew_pkg::${cfg['timing']['fpu_pipe_config']} + }${',\n' if not loop.last else '\n'}\ + % endfor + }; + + localparam snitch_ssr_pkg::ssr_cfg_t [${cfg['num_ssrs_max']}-1:0] SsrCfgs [${cfg['nr_cores']}] = '{ +${ssr_cfg(core, "'{{{indirection:d}, {isect_master:d}, {isect_master_idx:d}, {isect_slave:d}, "\ + "{isect_slave_spill:d}, {indir_out_spill:d}, {num_loops}, {index_width}, {pointer_width}, "\ + "{shift_width}, {rpt_width}, {index_credits}, {isect_slave_credits}, {data_credits}, "\ + "{mux_resp_depth}}}", "/*None*/ '0", ',\n ')}\ + }; + + localparam logic [${cfg['num_ssrs_max']}-1:0][4:0] SsrRegs [${cfg['nr_cores']}] = '{ +${ssr_cfg(core, '{reg_idx}', '/*None*/ 0', ',')}\ + }; + + // Forward potentially optional configuration parameters + localparam logic [9:0] CfgBaseHartId = (${to_sv_hex(cfg['cluster_base_hartid'], 10)}); + localparam 
addr_t CfgClusterBaseAddr = (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}); + +endpackage +// verilog_lint: waive-stop package-filename diff --git a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index b3c35d9ace..687caba617 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -41,182 +41,6 @@ ${',' if not loop.last else ''} % endfor \ -`include "axi/typedef.svh" - -// verilog_lint: waive-start package-filename -package ${cfg['pkg_name']}; - - localparam int unsigned NrCores = ${cfg['nr_cores']}; - localparam int unsigned NrHives = ${cfg['nr_hives']}; - - localparam int unsigned AddrWidth = ${cfg['addr_width']}; - localparam int unsigned NarrowDataWidth = ${cfg['data_width']}; - localparam int unsigned WideDataWidth = ${cfg['dma_data_width']}; - - localparam int unsigned NarrowIdWidthIn = ${cfg['id_width_in']}; - localparam int unsigned NrNarrowMasters = 3; - localparam int unsigned NarrowIdWidthOut = $clog2(NrNarrowMasters) + NarrowIdWidthIn; - - localparam int unsigned NrWideMasters = 1 + ${cfg['dma_nr_channels']} + ${cfg['nr_hives']}; - localparam int unsigned WideIdWidthIn = ${cfg['dma_id_width_in']}; - localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; - - localparam int unsigned NarrowUserWidth = ${cfg['user_width']}; - localparam int unsigned WideUserWidth = ${cfg['dma_user_width']}; - - localparam int unsigned ICacheLineWidth [NrHives] = '{${icache_cfg('cacheline')}}; - localparam int unsigned ICacheLineCount [NrHives] = '{${icache_cfg('depth')}}; - localparam int unsigned ICacheSets [NrHives] = '{${icache_cfg('sets')}}; - - localparam int unsigned Hive [NrCores] = '{${core_cfg('hive')}}; - - typedef struct packed { -% for field, width in cfg['sram_cfg_fields'].items(): - logic [${width-1}:0] ${field}; -% endfor - } sram_cfg_t; - - typedef struct packed { - sram_cfg_t icache_tag; - sram_cfg_t icache_data; - sram_cfg_t tcdm; - } sram_cfgs_t; - - typedef logic [AddrWidth-1:0] addr_t; - typedef logic [NarrowDataWidth-1:0] data_t; - typedef logic [NarrowDataWidth/8-1:0] strb_t; - typedef logic [WideDataWidth-1:0] data_dma_t; - typedef logic [WideDataWidth/8-1:0] strb_dma_t; - typedef logic [NarrowIdWidthIn-1:0] narrow_in_id_t; - typedef logic [NarrowIdWidthOut-1:0] narrow_out_id_t; - typedef logic [WideIdWidthIn-1:0] wide_in_id_t; - typedef logic [WideIdWidthOut-1:0] wide_out_id_t; - typedef logic [NarrowUserWidth-1:0] user_t; - typedef logic [WideUserWidth-1:0] user_dma_t; - - `AXI_TYPEDEF_ALL(narrow_in, addr_t, narrow_in_id_t, data_t, strb_t, user_t) - `AXI_TYPEDEF_ALL(narrow_out, addr_t, narrow_out_id_t, data_t, strb_t, user_t) - `AXI_TYPEDEF_ALL(wide_in, addr_t, wide_in_id_t, data_dma_t, strb_dma_t, user_dma_t) - `AXI_TYPEDEF_ALL(wide_out, addr_t, wide_out_id_t, data_dma_t, strb_dma_t, user_dma_t) - - function automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] get_cached_regions(); - automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] cached_regions; - cached_regions = '{default: '0}; -% for i, cp in enumerate(cfg['pmas']['cached']): - cached_regions[${i}] = '{base: ${to_sv_hex(cp[0], cfg['addr_width'])}, mask: ${to_sv_hex(cp[1], cfg['addr_width'])}}; -% endfor - return cached_regions; - endfunction - - localparam snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{ - NrCachedRegionRules: ${len(cfg['pmas']['cached'])}, - CachedRegion: get_cached_regions(), - default: 0 - }; - - localparam 
fpnew_pkg::fpu_implementation_t FPUImplementation [${cfg['nr_cores']}] = '{ - % for c in cfg['cores']: - '{ - PipeRegs: // FMA Block - '{ - '{ ${cfg['timing']['lat_comp_fp32']}, // FP32 - ${cfg['timing']['lat_comp_fp64']}, // FP64 - ${cfg['timing']['lat_comp_fp16']}, // FP16 - ${cfg['timing']['lat_comp_fp8']}, // FP8 - ${cfg['timing']['lat_comp_fp16_alt']}, // FP16alt - ${cfg['timing']['lat_comp_fp8_alt']} // FP8alt - }, - '{1, 1, 1, 1, 1, 1}, // DIVSQRT - '{${cfg['timing']['lat_noncomp']}, - ${cfg['timing']['lat_noncomp']}, - ${cfg['timing']['lat_noncomp']}, - ${cfg['timing']['lat_noncomp']}, - ${cfg['timing']['lat_noncomp']}, - ${cfg['timing']['lat_noncomp']}}, // NONCOMP - '{${cfg['timing']['lat_conv']}, - ${cfg['timing']['lat_conv']}, - ${cfg['timing']['lat_conv']}, - ${cfg['timing']['lat_conv']}, - ${cfg['timing']['lat_conv']}, - ${cfg['timing']['lat_conv']}}, // CONV - '{${cfg['timing']['lat_sdotp']}, - ${cfg['timing']['lat_sdotp']}, - ${cfg['timing']['lat_sdotp']}, - ${cfg['timing']['lat_sdotp']}, - ${cfg['timing']['lat_sdotp']}, - ${cfg['timing']['lat_sdotp']}} // DOTP - }, - UnitTypes: '{'{fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED}, // FMA -% if c["Xdiv_sqrt"]: - '{fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED}, // DIVSQRT -% else: - '{fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED}, // DIVSQRT -% endif - '{fpnew_pkg::PARALLEL, - fpnew_pkg::PARALLEL, - fpnew_pkg::PARALLEL, - fpnew_pkg::PARALLEL, - fpnew_pkg::PARALLEL, - fpnew_pkg::PARALLEL}, // NONCOMP - '{fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED}, // CONV -% if c["xfdotp"]: - '{fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED, - fpnew_pkg::MERGED}}, // DOTP -% else: - '{fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED, - fpnew_pkg::DISABLED}}, // DOTP -% endif - PipeConfig: fpnew_pkg::${cfg['timing']['fpu_pipe_config']} - }${',\n' if not loop.last else '\n'}\ - % endfor - }; - - localparam snitch_ssr_pkg::ssr_cfg_t [${cfg['num_ssrs_max']}-1:0] SsrCfgs [${cfg['nr_cores']}] = '{ -${ssr_cfg(core, "'{{{indirection:d}, {isect_master:d}, {isect_master_idx:d}, {isect_slave:d}, "\ - "{isect_slave_spill:d}, {indir_out_spill:d}, {num_loops}, {index_width}, {pointer_width}, "\ - "{shift_width}, {rpt_width}, {index_credits}, {isect_slave_credits}, {data_credits}, "\ - "{mux_resp_depth}}}", "/*None*/ '0", ',\n ')}\ - }; - - localparam logic [${cfg['num_ssrs_max']}-1:0][4:0] SsrRegs [${cfg['nr_cores']}] = '{ -${ssr_cfg(core, '{reg_idx}', '/*None*/ 0', ',')}\ - }; - - // Forward potentially optional configuration parameters - localparam logic [9:0] CfgBaseHartId = (${to_sv_hex(cfg['cluster_base_hartid'], 10)}); - localparam addr_t CfgClusterBaseAddr = (${to_sv_hex(cfg['cluster_base_addr'], cfg['addr_width'])}); - -endpackage -// verilog_lint: waive-stop package-filename - module ${cfg['name']}_wrapper ( input logic clk_i, input logic rst_ni, @@ -277,7 +101,7 @@ module ${cfg['name']}_wrapper ( .DMANumChannels (${cfg['dma_nr_channels']}), .ICacheLineWidth (${cfg['pkg_name']}::ICacheLineWidth), .ICacheLineCount (${cfg['pkg_name']}::ICacheLineCount), - .ICacheSets (${cfg['pkg_name']}::ICacheSets), + .ICacheWays 
(${cfg['pkg_name']}::ICacheWays), .VMSupport (${int(cfg['vm_support'])}), .RVE (${core_isa('e')}), .RVF (${core_isa('f')}), diff --git a/hw/snitch_cluster/src/snitch_fp_ss.sv b/hw/snitch_cluster/src/snitch_fp_ss.sv index 65c3e86e4a..7213682010 100644 --- a/hw/snitch_cluster/src/snitch_fp_ss.sv +++ b/hw/snitch_cluster/src/snitch_fp_ss.sv @@ -50,6 +50,14 @@ module snitch_fp_ss import snitch_pkg::*; #( output acc_resp_t acc_resp_o, output logic acc_resp_valid_o, input logic acc_resp_ready_i, + // IN->FP Queue output interface + input logic [31:0] fpq_pdata_i, // Integer Data from INCC + input logic fpq_pvalid_i, // Whether queue is not empty + output logic fpq_pready_o, // Whether FPSS wants to read (pop) + // FP->IN Queue input interface + output logic [31:0] inq_qdata_o, // Floating Point Data from FPSS + output logic inq_qvalid_o, // Whether FPSS wants to write (push) + input logic inq_qready_i, // Whether queue is not full // TCDM Data Interface for regular FP load/stores. output dreq_t data_req_o, input drsp_t data_rsp_i, @@ -77,6 +85,8 @@ module snitch_fp_ss import snitch_pkg::*; #( // Notifies the issuing Snitch core of retired loads/stores. // TODO: is it good enough to assert this at issuing time instead? output logic caq_pvalid_o, + // FP Queue CSR signal + input logic en_fpinq_i, // Core event strobes output core_events_t core_events_o ); @@ -233,7 +243,7 @@ module snitch_fp_ss import snitch_pkg::*; #( // this handles WAW Hazards - Potentially this can be relaxed if necessary // at the expense of increased timing pressure - assign dst_ready = ~(rd_is_fp & sb_q[rd]); + assign dst_ready = inq_qvalid_o ? inq_qready_i : ~(rd_is_fp & sb_q[rd]); // check that either: // 1. The FPU and all operands are ready @@ -249,7 +259,9 @@ module snitch_fp_ss import snitch_pkg::*; #( | (acc_req_valid_q && result_select == ResAccBus)); // either the FPU or the regfile produced a result - assign acc_resp_valid_o = (fpu_tag_out.acc & fpu_out_valid); + // If queue is enabled, data goes to queue instead of AccBus + assign acc_resp_valid_o = ~en_fpinq_i & (fpu_tag_out.acc & fpu_out_valid); + assign inq_qvalid_o = en_fpinq_i & (fpu_tag_out.acc & fpu_out_valid); // stall FPU if we forward from reg assign fpu_out_ready = ((fpu_tag_out.acc & acc_resp_ready_i) | (~fpu_tag_out.acc & fpr_wready)); @@ -260,6 +272,7 @@ module snitch_fp_ss import snitch_pkg::*; #( assign acc_resp_o.id = fpu_tag_out.rd; // accelerator bus write-port assign acc_resp_o.data = fpu_result; + assign inq_qdata_o = fpu_result; assign rd = acc_req_q.data_op[11:7]; assign rs1 = acc_req_q.data_op[19:15]; @@ -1764,6 +1777,15 @@ module snitch_fp_ss import snitch_pkg::*; #( fpu_tag_in.acc = 1'b1; rd_is_fp = 1'b0; end + // Double Precision Floating-Point, MC extension + riscv_instr:: FLT_D_COPIFT: begin + fpu_op = fpnew_pkg::CMP; + op_select[0] = RegA; + op_select[1] = RegB; + // op_select[2] = RegC; + src_fmt = fpnew_pkg::FP64; + dst_fmt = fpnew_pkg::FP64; + end riscv_instr::FCLASS_D: begin fpu_op = fpnew_pkg::CLASSIFY; op_select[0] = RegA; @@ -2234,6 +2256,16 @@ module snitch_fp_ss import snitch_pkg::*; #( dst_fmt = fpnew_pkg::FP64; if (acc_req_q.data_op inside {riscv_instr::FCVT_D_WU}) op_mode = 1'b1; // unsigned end + // Double Precision Floating-Point + riscv_instr:: FCVT_D_W_COPIFT, + riscv_instr:: FCVT_D_WU_COPIFT: begin + fpu_op = fpnew_pkg:: I2F; + op_select[0] = RegA; // The operand comes from SSR which diverts out of FPR + // op_select[1] = RegB; + src_fmt = fpnew_pkg::FP64; + dst_fmt = fpnew_pkg::FP64; + if (acc_req_q.data_op inside 
{riscv_instr::FCVT_D_WU_COPIFT}) op_mode = 1'b1; // unsigned + end // [Alternate] Half Precision Floating-Point riscv_instr::FMV_H_X: begin fpu_op = fpnew_pkg::SGNJ; @@ -2459,6 +2491,9 @@ module snitch_fp_ss import snitch_pkg::*; #( endcase end + logic [2:0] rs_is_fpq; + assign fpq_pready_o = acc_req_valid_q & (rs_is_fpq[2] | rs_is_fpq[1] | rs_is_fpq[0]) ; // If the FPU expects data and we have to read from fpq, we send ready signal to fpq + for (genvar i = 0; i < 3; i++) begin: gen_operand_select logic is_raddr_ssr; always_comb begin @@ -2466,6 +2501,9 @@ module snitch_fp_ss import snitch_pkg::*; #( for (int s = 0; s < NumSsrs; s++) is_raddr_ssr |= (SsrRegs[s] == fpr_raddr[i]); end + always_comb begin + rs_is_fpq[i] = op_select[i]==AccBus ? en_fpinq_i : 0; // Read from any INT RF will be from fpq if queues are enabled + end always_comb begin ssr_rvalid_o[i] = 1'b0; unique case (op_select[i]) @@ -2474,8 +2512,8 @@ module snitch_fp_ss import snitch_pkg::*; #( op_ready[i] = 1'b1; end AccBus: begin - op[i] = acc_qdata[i]; - op_ready[i] = acc_req_valid_q; + op[i] = rs_is_fpq[i] ? { {(FLEN-32){fpq_pdata_i[31]}}, fpq_pdata_i[31:0] } : acc_qdata[i]; + op_ready[i] = rs_is_fpq[i] ? fpq_pvalid_i : acc_req_valid_q; end // Scoreboard or SSR RegA, RegB, RegBRep, RegC, RegDest: begin diff --git a/hw/snitch_cluster/src/snitch_hive.sv b/hw/snitch_cluster/src/snitch_hive.sv index fbb5f6910c..4e5064c461 100644 --- a/hw/snitch_cluster/src/snitch_hive.sv +++ b/hw/snitch_cluster/src/snitch_hive.sv @@ -7,15 +7,15 @@ `include "snitch_vm/typedef.svh" /// Shared subsystems for `CoreCount` cores. -module snitch_hive #( +module snitch_hive import snitch_icache_pkg::*; #( /// Number of cores which share an instruction frontend parameter int unsigned CoreCount = 4, /// Width of a single icache line. parameter int unsigned ICacheLineWidth = CoreCount > 2 ? CoreCount * 32 : 64, /// Number of icache lines per set. parameter int unsigned ICacheLineCount = 128, - /// Number of icache sets. - parameter int unsigned ICacheSets = 4, + /// Number of icache ways. + parameter int unsigned ICacheWays = 4, parameter bit IsoCrossing = 1, /// Address width of the buses parameter int unsigned AddrWidth = 0, @@ -53,7 +53,7 @@ module snitch_hive #( input sram_cfgs_t sram_cfgs_i, - output snitch_icache_pkg::icache_events_t [CoreCount-1:0] icache_events_o + output icache_l0_events_t [CoreCount-1:0] icache_events_o ); // Extend the ID to route back results to the appropriate core. 
localparam int unsigned IdWidth = 5; @@ -87,7 +87,7 @@ module snitch_hive #( .L0_LINE_COUNT ( 8 ), .LINE_WIDTH ( ICacheLineWidth ), .LINE_COUNT ( ICacheLineCount ), - .SET_COUNT ( ICacheSets ), + .WAY_COUNT ( ICacheWays ), .FETCH_AW ( AddrWidth ), .FETCH_DW ( 32 ), .FILL_AW ( AddrWidth ), @@ -107,7 +107,8 @@ module snitch_hive #( .clk_d2_i (clk_d2_i), .rst_ni (rst_ni), .enable_prefetching_i ( icache_prefetch_enable_i ), - .icache_events_o ( icache_events_o), + .icache_l0_events_o ( icache_events_o), + .icache_l1_events_o ( ), .flush_valid_i ( flush_valid ), .flush_ready_o ( flush_ready ), diff --git a/hw/snitch_cluster/src/snitch_sequencer.sv b/hw/snitch_cluster/src/snitch_sequencer.sv index a360102906..2ecaf1891b 100644 --- a/hw/snitch_cluster/src/snitch_sequencer.sv +++ b/hw/snitch_cluster/src/snitch_sequencer.sv @@ -373,7 +373,7 @@ module snitch_sequencer import snitch_pkg::*; #( riscv_instr::FMV_W_X, riscv_instr::FCVT_S_W, riscv_instr::FCVT_S_WU, - riscv_instr::FCVT_D_W, + // riscv_instr::FCVT_D_W, riscv_instr::FCVT_D_WU, riscv_instr::FMV_H_X, riscv_instr::FCVT_H_W, @@ -392,8 +392,8 @@ module snitch_sequencer import snitch_pkg::*; #( riscv_instr::VFCVT_B_X, riscv_instr::VFCVT_B_XU, - riscv_instr::IMV_X_W, - riscv_instr::IMV_W_X, + // riscv_instr::IMV_X_W, + // riscv_instr::IMV_W_X, // CSR accesses riscv_instr::CSRRW, riscv_instr::CSRRS, diff --git a/iis-setup.sh b/iis-setup.sh index 5738136a0a..be510b75a3 100755 --- a/iis-setup.sh +++ b/iis-setup.sh @@ -10,7 +10,7 @@ export CXX=g++-9.2.0 export VCS_SEPP=vcs-2020.12 export VERILATOR_SEPP=verilator-5.020 export QUESTA_SEPP=questa-2022.3 -export LLVM_BINROOT=/usr/pack/riscv-1.0-kgf/pulp-llvm-0.12.0/bin +export LLVM_BINROOT=/usr/scratch2/vulcano/colluca/tools/riscv32-snitch-llvm-almalinux8-15.0.0-snitch-0.1.0/bin # Create Python virtual environment with required packages /usr/local/anaconda3-2023.07/bin/python -m venv .venv diff --git a/pyproject.toml b/pyproject.toml index 3bba3730f5..3b289b055f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,3 +54,4 @@ dependencies = [ "snitch.dnn" = "sw/dnn" "snitch.blas" = "sw/blas" "snitch.util" = "util" +"snitch.target" = "target/snitch_cluster/util" diff --git a/sw/apps/exp/exp.c b/sw/apps/exp/exp.c new file mode 100644 index 0000000000..cd6d11241e --- /dev/null +++ b/sw/apps/exp/exp.c @@ -0,0 +1,86 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Reference implementation +float +__expf (float x) +{ + uint32_t abstop; + uint64_t ki, t; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, xd, z, r, r2, y, s; + + xd = (double_t) x; + abstop = top12 (x) & 0x7ff; + if (__glibc_unlikely (abstop >= top12 (88.0f))) + { + /* |x| >= 88 or x is nan. */ + if (asuint (x) == asuint (-INFINITY)) + return 0.0f; + if (abstop >= top12 (INFINITY)) + return x + x; + if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ + return __math_oflowf (0); + if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ + return __math_uflowf (0); +#if WANT_ERRNO_UFLOW + if (x < -0x1.9d1d9ep6f) /* x < log(0x1p-149) ~= -103.28 */ + return __math_may_uflowf (0); +#endif + } + + /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ + z = InvLn2N * xd; + + /* Round and convert z to int, the result is in [-150*N, 128*N] and + ideally ties-to-even rule is used, otherwise the magnitude of r + can be bigger which gives larger approximation error. 
*/ +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#else +# define SHIFT __exp2f_data.shift + kd = math_narrow_eval ((double) (z + SHIFT)); /* Needs to be double. */ + ki = asuint64 (kd); + kd -= SHIFT; +#endif + r = z - kd; + + /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = T[ki % N]; + t += ki << (52 - EXP2F_TABLE_BITS); + s = asdouble (t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return (float) y; +} + +// Simplified reference implementation +float +__expf (float x) +{ + uint64_t ki, t; + double_t kd, xd, z, r, r2, y, s; + + xd = (double_t) x; + z = InvLn2N * xd; + + kd = (double) (z + SHIFT); + ki = asuint64 (kd); + kd -= SHIFT; + r = z - kd; + + t = T[ki % N]; + t += ki << (52 - EXP2F_TABLE_BITS); + s = asdouble (t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return (float) y; +} diff --git a/sw/apps/exp/main.c b/sw/apps/exp/main.c new file mode 100644 index 0000000000..894c85386e --- /dev/null +++ b/sw/apps/exp/main.c @@ -0,0 +1,48 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "math.h" +#include "snrt.h" + +#include "vexpf.h" + +double a[LEN], b_golden[LEN], b_actual[LEN]; + +int main() { + uint32_t tstart, tend; + + // Initialize input array + if (snrt_cluster_core_idx() == 0) + for (int i = 0; i < LEN; i++) + a[i] = (float)i / LEN; + + // Calculate exponential of input array using reference implementation + if (snrt_cluster_core_idx() == 0) { + for (int i = 0; i < LEN; i++) { + b_golden[i] = (double)expf((float)a[i]); + } + } + + // Synchronize cores + snrt_cluster_hw_barrier(); + + // Calculate exponential of input array using vectorized implementation + vexpf_kernel(a, b_actual); + + // Check if the results are correct + if (snrt_cluster_core_idx() == 0) { + uint32_t n_err = LEN; + for (int i = 0; i < LEN; i++) { + if ((float)b_golden[i] != (float)b_actual[i]) + printf("Error: b_golden[%d] = %f, b_actual[%d] = %f\n", i, + (float)b_golden[i], i, (float)b_actual[i]); + else + n_err--; + } + return n_err; + } else + return 0; +} diff --git a/sw/apps/exp/vexpf.h b/sw/apps/exp/vexpf.h new file mode 100644 index 0000000000..bf5a062580 --- /dev/null +++ b/sw/apps/exp/vexpf.h @@ -0,0 +1,59 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
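
The glibc reference above leans on private helpers (top12, asuint, asuint64, asdouble) and table data (T, C, InvLn2N, SHIFT, EXP2F_TABLE_BITS) that exp.c itself never defines; the SHIFT constant 0x1.8p+52 is what rounds z to an integer and leaves that integer in the low mantissa bits of kd. A minimal sketch of the bit-reinterpretation helpers, assuming the usual memcpy type-punning idiom (not part of the patch):

#include <stdint.h>
#include <string.h>

/* Hedged sketch of the glibc-style helpers the reference assumes. */
static inline uint32_t asuint(float f) {
    uint32_t u; memcpy(&u, &f, sizeof u); return u;      /* raw bits of f */
}
static inline uint64_t asuint64(double d) {
    uint64_t u; memcpy(&u, &d, sizeof u); return u;      /* raw bits of d */
}
static inline double asdouble(uint64_t u) {
    double d; memcpy(&d, &u, sizeof d); return d;        /* bits -> double */
}
static inline uint32_t top12(float f) {
    return asuint(f) >> 20;                              /* sign + exponent bits */
}
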
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#ifndef LEN +#define LEN 12288 +#endif + +#ifndef BATCH_SIZE +#define BATCH_SIZE 96 +#endif + +#define IMPL_NAIVE 0 +#define IMPL_BASELINE 1 +#define IMPL_OPTIMIZED 2 + +#ifndef IMPL +#define IMPL IMPL_OPTIMIZED +#endif + +#if IMPL == IMPL_NAIVE +#define FUNC_PTR vexpf_naive +#elif IMPL == IMPL_BASELINE +#define FUNC_PTR vexpf_baseline +#elif IMPL == IMPL_OPTIMIZED +#define FUNC_PTR vexpf_optimized +#endif + +#define ALLOCATE_BUFFER(type, size) \ + (type *)snrt_l1_alloc_cluster_local(size * sizeof(type), sizeof(type)) + +__thread uint64_t T[64] = { + 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, + 0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, + 0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, + 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, + 0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, + 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, +}; + +__thread const uint32_t EXP2F_TABLE_BITS = 5; +__thread const double N = 1 << EXP2F_TABLE_BITS; +__thread const double InvLn2N = 0x1.71547652b82fep+0 * N; +__thread const double SHIFT = 0x1.8p+52; +__thread const double C[4] = {0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, 1.0}; + +#include "vexpf_naive.h" +#include "vexpf_baseline.h" +#include "vexpf_optimized.h" + +static inline void vexpf_kernel(double *a, double *b) { + snrt_mcycle(); + FUNC_PTR(a, b); + snrt_mcycle(); +} \ No newline at end of file diff --git a/sw/apps/exp/vexpf_baseline.h b/sw/apps/exp/vexpf_baseline.h new file mode 100644 index 0000000000..4e9ceb8952 --- /dev/null +++ b/sw/apps/exp/vexpf_baseline.h @@ -0,0 +1,203 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
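
vexpf.h fixes LEN = 12288 and BATCH_SIZE = 96 and selects one of the three kernels through the IMPL macro at compile time. Every implementation tiles the input into BATCH_SIZE-sized batches and unrolls the inner loop by four, so LEN is assumed to be a multiple of BATCH_SIZE and BATCH_SIZE a multiple of four; a compile-time guard along these lines (not in the patch) would make that assumption explicit:

/* Hedged sketch: guards for the tiling assumptions made by the kernels. */
_Static_assert(LEN % BATCH_SIZE == 0, "LEN must be a multiple of BATCH_SIZE");
_Static_assert(BATCH_SIZE % 4 == 0, "BATCH_SIZE must be a multiple of the 4-way unroll");
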
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#define N_BUFFERS 2 + +static inline void vexpf_baseline(double *a, double *b) { + + int n_batches = LEN / BATCH_SIZE; + int n_iterations = n_batches + 2; + + double *a_buffers[N_BUFFERS]; + double *b_buffers[N_BUFFERS]; + + a_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + a_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + + unsigned int dma_a_idx = 0; + unsigned int dma_b_idx = 0; + unsigned int comp_idx = 0; + double *dma_a_ptr; + double *dma_b_ptr; + double *comp_a_ptr; + double *comp_b_ptr; + + uint64_t ki[4], t[4]; + + // Iterate over batches + for (int iteration = 0; iteration < n_iterations; iteration++) { + snrt_mcycle(); + + // DMA cores + if (snrt_is_dm_core()) { + + // DMA in phase + if (iteration < n_iterations - 2) { + + // Index buffers + dma_a_ptr = a_buffers[dma_a_idx]; + + // DMA transfer + snrt_dma_load_1d_tile( + dma_a_ptr, + a, + iteration, + BATCH_SIZE, + sizeof(double) + ); + + // Increment buffer index for next iteration + dma_a_idx += 1; + dma_a_idx %= N_BUFFERS; + } + + // DMA out phase + if (iteration > 1) { + + // Index buffers + dma_b_ptr = b_buffers[dma_b_idx]; + + // DMA transfer + snrt_dma_store_1d_tile( + b, + dma_b_ptr, + iteration - 2, + BATCH_SIZE, + sizeof(double) + ); + + // Increment buffer index for next iteration + dma_b_idx += 1; + dma_b_idx %= N_BUFFERS; + } + snrt_dma_wait_all(); + } + + if (snrt_cluster_core_idx() == 0) { + + // Compute phase + if (iteration > 0 && iteration < n_iterations - 1) { + + // Index buffers + comp_a_ptr = a_buffers[comp_idx]; + comp_b_ptr = b_buffers[comp_idx]; + + // Loop over samples (unrolled by 4) + for (int i = 0; i < BATCH_SIZE; i += 4) { + asm volatile( + "fmul.d fa3, %[InvLn2N], %[input0] \n" // z = InvLn2N * xd + "fmul.d ft3, %[InvLn2N], %[input1] \n" // z = InvLn2N * xd + "fmul.d ft4, %[InvLn2N], %[input2] \n" // z = InvLn2N * xd + "fmul.d ft5, %[InvLn2N], %[input3] \n" // z = InvLn2N * xd + "fadd.d fa1, fa3, %[SHIFT] \n" // kd = (double) (z + SHIFT) + "fadd.d fa5, ft3, %[SHIFT] \n" // kd = (double) (z + SHIFT) + "fadd.d fa6, ft4, %[SHIFT] \n" // kd = (double) (z + SHIFT) + "fadd.d fa7, ft5, %[SHIFT] \n" // kd = (double) (z + SHIFT) + "fsd fa1, 0(%[ki]) \n" // ki = asuint64 (kd) + "fsd fa5, 8(%[ki]) \n" // ki = asuint64 (kd) + "fsd fa6, 16(%[ki]) \n" // ki = asuint64 (kd) + "fsd fa7, 24(%[ki]) \n" // ki = asuint64 (kd) + "lw a0, 0(%[ki]) \n" // ki = asuint64 (kd) + "lw a3, 8(%[ki]) \n" // ki = asuint64 (kd) + "lw a4, 16(%[ki]) \n" // ki = asuint64 (kd) + "lw a5, 24(%[ki]) \n" // ki = asuint64 (kd) + "andi a1, a0, 0x1f \n" // ki % N + "andi a6, a3, 0x1f \n" // ki % N + "andi a7, a4, 0x1f \n" // ki % N + "andi t0, a5, 0x1f \n" // ki % N + "slli a1, a1, 0x3 \n" // T[ki % N] + "slli a6, a6, 0x3 \n" // T[ki % N] + "slli a7, a7, 0x3 \n" // T[ki % N] + "slli t0, t0, 0x3 \n" // T[ki % N] + "add a1, %[T], a1 \n" // T[ki % N] + "add a6, %[T], a6 \n" // T[ki % N] + "add a7, %[T], a7 \n" // T[ki % N] + "add t0, %[T], t0 \n" // T[ki % N] + "lw a2, 0(a1) \n" // t = T[ki % N] + "lw t1, 0(a6) \n" // t = T[ki % N] + "lw t2, 0(a7) \n" // t = T[ki % N] + "lw t3, 0(t0) \n" // t = T[ki % N] + "lw a1, 4(a1) \n" // t = T[ki % N] + "lw a6, 4(a6) \n" // t = T[ki % N] + "lw a7, 4(a7) \n" // t = T[ki % N] + "lw t0, 4(t0) \n" // t = T[ki % N] + "slli a0, a0, 0xf \n" // ki << (52 - EXP2F_TABLE_BITS) + "slli a3, a3, 0xf \n" // ki << (52 - EXP2F_TABLE_BITS) + 
"slli a4, a4, 0xf \n" // ki << (52 - EXP2F_TABLE_BITS) + "slli a5, a5, 0xf \n" // ki << (52 - EXP2F_TABLE_BITS) + "sw a2, 0(%[t]) \n" // store lower 32b of t (unaffected) + "sw t1, 8(%[t]) \n" // store lower 32b of t (unaffected) + "sw t2, 16(%[t]) \n" // store lower 32b of t (unaffected) + "sw t3, 24(%[t]) \n" // store lower 32b of t (unaffected) + "add a0, a0, a1 \n" // t += ki << (52 - EXP2F_TABLE_BITS) + "add a3, a3, a6 \n" // t += ki << (52 - EXP2F_TABLE_BITS) + "add a4, a4, a7 \n" // t += ki << (52 - EXP2F_TABLE_BITS) + "add a5, a5, t0 \n" // t += ki << (52 - EXP2F_TABLE_BITS) + "sw a0, 4(%[t]) \n" // store upper 32b of t + "sw a3, 12(%[t]) \n" // store upper 32b of t + "sw a4, 20(%[t]) \n" // store upper 32b of t + "sw a5, 28(%[t]) \n" // store upper 32b of t + "fsub.d fa2, fa1, %[SHIFT] \n" // kd -= SHIFT + "fsub.d ft6, fa5, %[SHIFT] \n" // kd -= SHIFT + "fsub.d ft7, fa6, %[SHIFT] \n" // kd -= SHIFT + "fsub.d ft8, fa7, %[SHIFT] \n" // kd -= SHIFT + "fsub.d fa3, fa3, fa2 \n" // r = z - kd + "fsub.d ft3, ft3, ft6 \n" // r = z - kd + "fsub.d ft4, ft4, ft7 \n" // r = z - kd + "fsub.d ft5, ft5, ft8 \n" // r = z - kd + "fmadd.d fa2, %[C0], fa3, %[C1] \n" // z = C[0] * r + C[1] + "fmadd.d ft6, %[C0], ft3, %[C1] \n" // z = C[0] * r + C[1] + "fmadd.d ft7, %[C0], ft4, %[C1] \n" // z = C[0] * r + C[1] + "fmadd.d ft8, %[C0], ft5, %[C1] \n" // z = C[0] * r + C[1] + "fld fa0, 0(%[t]) \n" // s = asdouble (t) + "fld ft9, 8(%[t]) \n" // s = asdouble (t) + "fld ft10, 16(%[t]) \n" // s = asdouble (t) + "fld ft11, 24(%[t]) \n" // s = asdouble (t) + "fmadd.d fa4, %[C2], fa3, %[C3] \n" // y = C[2] * r + C[3] + "fmadd.d fs0, %[C2], ft3, %[C3] \n" // y = C[2] * r + C[3] + "fmadd.d fs1, %[C2], ft4, %[C3] \n" // y = C[2] * r + C[3] + "fmadd.d fs2, %[C2], ft5, %[C3] \n" // y = C[2] * r + C[3] + "fmul.d fa1, fa3, fa3 \n" // r2 = r * r + "fmul.d fa5, ft3, ft3 \n" // r2 = r * r + "fmul.d fa6, ft4, ft4 \n" // r2 = r * r + "fmul.d fa7, ft5, ft5 \n" // r2 = r * r + "fmadd.d fa4, fa2, fa1, fa4 \n" // w = z * r2 + y + "fmadd.d fs0, ft6, fa5, fs0 \n" // w = z * r2 + y + "fmadd.d fs1, ft7, fa6, fs1 \n" // w = z * r2 + y + "fmadd.d fs2, ft8, fa7, fs2 \n" // w = z * r2 + y + "fmul.d %[output0], fa4, fa0 \n" // y = w * s + "fmul.d %[output1], fs0, ft9 \n" // y = w * s + "fmul.d %[output2], fs1, ft10 \n" // y = w * s + "fmul.d %[output3], fs2, ft11 \n" // y = w * s + : [output0] "=f" (comp_b_ptr[i+0]), [output1] "=f" (comp_b_ptr[i+1]), + [output2] "=f" (comp_b_ptr[i+2]), [output3] "=f" (comp_b_ptr[i+3]) + : [input0] "f" (comp_a_ptr[i+0]), [input1] "f" (comp_a_ptr[i+1]), + [input2] "f" (comp_a_ptr[i+2]), [input3] "f" (comp_a_ptr[i+3]), + [InvLn2N] "f" (InvLn2N), [SHIFT] "f" (SHIFT), + [C0] "f" (C[0]), [C1] "f" (C[1]), [C2] "f" (C[2]), [C3] "f" (C[3]), + [ki] "r" (ki), [t] "r" (t), [T] "r" (T) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", + "t1", "t2", "t3", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft3", + "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10", "ft11", "fs0", + "fs1", "fs2" + ); + } + + // Increment buffer indices for next iteration + comp_idx += 1; + comp_idx %= N_BUFFERS; + + snrt_fpu_fence(); + } + } + + // Synchronize cores + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/exp/vexpf_naive.h b/sw/apps/exp/vexpf_naive.h new file mode 100644 index 0000000000..d3eda85864 --- /dev/null +++ b/sw/apps/exp/vexpf_naive.h @@ -0,0 +1,45 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +static inline void vexpf_naive(double *a, double *b) { + + uint64_t ki, t; + + if (snrt_cluster_core_idx() == 0) { + + // Loop over samples + for (int i = 0; i < LEN; i++) { + asm volatile( + "fmul.d fa3, %[InvLn2N], %[input] \n" // z = InvLn2N * xd + "fadd.d fa1, fa3, %[SHIFT] \n" // kd = (double) (z + SHIFT) + "fsd fa1, 0(%[ki]) \n" // ki = asuint64 (kd) + "lw a0, 0(%[ki]) \n" // ki = asuint64 (kd) + "andi a1, a0, 0x1f \n" // ki % N + "slli a1, a1, 0x3 \n" // T[ki % N] + "add a1, %[T], a1 \n" // T[ki % N] + "lw a2, 0(a1) \n" // t = T[ki % N] + "lw a1, 4(a1) \n" // t = T[ki % N] + "slli a0, a0, 0xf \n" // ki << (52 - EXP2F_TABLE_BITS) + "sw a2, 0(%[t]) \n" // store lower 32b of t (unaffected) + "add a0, a0, a1 \n" // t += ki << (52 - EXP2F_TABLE_BITS) + "sw a0, 4(%[t]) \n" // store upper 32b of t + "fsub.d fa2, fa1, %[SHIFT] \n" // kd -= SHIFT + "fsub.d fa3, fa3, fa2 \n" // r = z - kd + "fmadd.d fa2, %[C0], fa3, %[C1] \n" // z = C[0] * r + C[1] + "fld fa0, 0(%[t]) \n" // s = asdouble (t) + "fmadd.d fa4, %[C2], fa3, %[C3] \n" // y = C[2] * r + C[3] + "fmul.d fa1, fa3, fa3 \n" // r2 = r * r + "fmadd.d fa4, fa2, fa1, fa4 \n" // y = z * r2 + y + "fmul.d %[output], fa4, fa0 \n" // y = y * s + : [output] "=f" (b[i]) + : [input] "f" (a[i]), [InvLn2N] "f" (InvLn2N), [SHIFT] "f" (SHIFT), + [C0] "f" (C[0]), [C1] "f" (C[1]), [C2] "f" (C[2]), [C3] "f" (C[3]), + [ki] "r" (&ki), [t] "r" (&t), [T] "r" (T) + : "memory", "a0", "a1", "a2", "fa0", "fa1", "fa2", "fa3", "fa4" + ); + } + } +} diff --git a/sw/apps/exp/vexpf_optimized.h b/sw/apps/exp/vexpf_optimized.h new file mode 100644 index 0000000000..390208ba5b --- /dev/null +++ b/sw/apps/exp/vexpf_optimized.h @@ -0,0 +1,292 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#define N_T_BUFFERS 2 +#define N_W_BUFFERS 3 + +#include "vexpf_optimized_asm.h" + +static inline void vexpf_optimized(double *a, double *b) { + + // Derived parameters + unsigned int n_batches = LEN / BATCH_SIZE; + unsigned int n_iterations = n_batches + 2 + 2; + + // Allocate buffers (ORDER IS IMPORTANT!) 
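
The integer part of the naive kernel reproduces t = T[ki % N] + (ki << (52 - EXP2F_TABLE_BITS)) with 32-bit instructions: the andi/slli pair builds the byte offset of T[ki % N] (N = 32, 8-byte entries), and since 52 - EXP2F_TABLE_BITS = 47 the shifted ki only ever touches the upper word of t, so the low word is stored back unchanged and the high word gets ki << 15 added (47 - 32 = 15). A hedged C restatement of that arithmetic, assuming the integer k fits in the low word of ki as it does for this benchmark's input range:

#include <stdint.h>

/* Hedged sketch: 64-bit table update done with 32-bit halves, as in the asm. */
static inline uint64_t exp_table_entry_rv32(const uint64_t *tab, uint32_t ki_lo) {
    uint32_t idx  = ki_lo & 0x1f;              /* ki % N, N = 32            */
    uint32_t t_lo = (uint32_t)tab[idx];        /* low word stays unchanged  */
    uint32_t t_hi = (uint32_t)(tab[idx] >> 32);
    t_hi += ki_lo << 15;                       /* (ki << 47) hits only the high word */
    return ((uint64_t)t_hi << 32) | t_lo;
}
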
+ uint64_t *ki_buffers[N_W_BUFFERS]; + double *kd_buffers[N_W_BUFFERS]; + double *w_buffers[N_W_BUFFERS]; + double *b_buffers[N_W_BUFFERS]; + double *a_buffers[N_T_BUFFERS]; + uint64_t *t_buffers[N_T_BUFFERS]; + ki_buffers[0] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + ki_buffers[1] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + ki_buffers[2] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + kd_buffers[0] = (double*)ki_buffers[0]; + kd_buffers[1] = (double*)ki_buffers[1]; + kd_buffers[2] = (double*)ki_buffers[2]; + w_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + w_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + w_buffers[2] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[2] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + a_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + a_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + t_buffers[0] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + t_buffers[1] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + + // Define buffer pointers for every phase (fp0, int and fp1) + unsigned int dma_a_idx = 0; + unsigned int dma_b_idx = 0; + unsigned int fp0_a_idx = 0; + unsigned int fp0_w_idx = 0; + unsigned int int_ki_idx = 0; + unsigned int int_t_idx = 0; + unsigned int fp1_w_idx = 0; + unsigned int fp1_t_idx = 0; + double *dma_a_ptr; + double *dma_b_ptr; + double *fp0_a_ptr; + double *fp0_kd_ptr; + double *fp0_w_ptr; + uint64_t *int_ki_ptr; + uint64_t *int_t_ptr; + uint64_t *fp1_t_ptr; + double *fp1_w_ptr; + double *fp1_b_ptr; + + // Exponential function constants + uint32_t EXP2F_TABLE_BITS = 5; + double N = 1 << EXP2F_TABLE_BITS; + double InvLn2N = 0x1.71547652b82fep+0 * N; + double SHIFT = 0x1.8p+52; + double C[4] = {0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, 1.0}; + + snrt_cluster_hw_barrier(); + + // Iterate over batches + for (int iteration = 0; iteration < n_iterations; iteration++) { + snrt_mcycle(); + + // DMA cores + if (snrt_is_dm_core()) { + + // DMA in phase + if (iteration < n_iterations - 4) { + + // Index buffers + dma_a_ptr = a_buffers[dma_a_idx]; + + // DMA transfer + snrt_dma_load_1d_tile( + dma_a_ptr, + a, + iteration, + BATCH_SIZE, + sizeof(double) + ); + + // Increment buffer index for next iteration + dma_a_idx += 1; + dma_a_idx %= N_T_BUFFERS; + } + + // DMA out phase + if (iteration > 3) { + + // Index buffers + dma_b_ptr = b_buffers[dma_b_idx]; + + // DMA transfer + snrt_dma_store_1d_tile( + b, + dma_b_ptr, + iteration - 4, + BATCH_SIZE, + sizeof(double) + ); + + // Increment buffer index for next iteration + dma_b_idx += 1; + dma_b_idx %= N_W_BUFFERS; + } + + snrt_dma_wait_all(); + } + + // Compute cores + if (snrt_cluster_core_idx() == 0) { + + // FP0 phase + if (iteration > 0 && iteration < 3 && iteration < n_iterations - 3) { + + // Index buffers + fp0_a_ptr = a_buffers[fp0_a_idx]; + fp0_kd_ptr = kd_buffers[fp0_w_idx]; + fp0_w_ptr = w_buffers[fp0_w_idx]; + + // Configure SSRs + snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, BATCH_SIZE, sizeof(double)); + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, fp0_a_ptr); + snrt_ssr_write(SNRT_SSR_DM1, SNRT_SSR_1D, fp0_kd_ptr); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, fp0_w_ptr); + snrt_ssr_enable(); + + // FP0 computation + int unroll_factor = 4; + asm volatile( + "frep.o %[n_frep], 36, 0, 0 \n" + FP0_ASM_BODY + : + : [n_frep] "r" (BATCH_SIZE / unroll_factor - 1), + [InvLn2N] "f" (InvLn2N), [SHIFT] "f" (SHIFT), + [C0] "f" (C[0]), [C1] "f" (C[1]), + [C2] "f" (C[2]), [C3] "f" 
(C[3]) + : "memory", "ft0", "ft1", "ft2", "fa3", "ft3", "ft4", "ft5", + "fa1", "fa2", "fa3", "fa4", "fa5", + "fa6", "fa7", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "fs0", "fs1", "fs2", "ft0", "ft1", "ft2" + ); + + // Increment buffer index for next iteration + fp0_w_idx += 1; + fp0_a_idx += 1; + fp0_w_idx %= N_W_BUFFERS; + fp0_a_idx %= N_T_BUFFERS; + } + + // Both FP0 and FP1 phases + if (iteration > 2 && iteration < n_iterations - 3) { + + // Index buffers + fp0_a_ptr = a_buffers[fp0_a_idx]; + fp0_kd_ptr = kd_buffers[fp0_w_idx]; + fp1_w_ptr = w_buffers[fp1_w_idx]; + + // Configure SSRs + int unroll_factor = 4; + if (iteration == 3) { + snrt_ssr_loop_3d( + SNRT_SSR_DM0, + unroll_factor, + 2, + BATCH_SIZE / unroll_factor, + sizeof(double), + N_T_BUFFERS * BATCH_SIZE * sizeof(double), + sizeof(double) * unroll_factor + ); + snrt_ssr_loop_1d(SNRT_SSR_DM1, BATCH_SIZE, sizeof(double)); + snrt_ssr_loop_3d( + SNRT_SSR_DM2, + unroll_factor, + 3, + BATCH_SIZE / unroll_factor, + sizeof(double), + N_W_BUFFERS * BATCH_SIZE * sizeof(double), + sizeof(double) * unroll_factor + ); + } + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, fp0_a_ptr); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, fp1_w_ptr); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_3D, fp0_kd_ptr); + snrt_ssr_enable(); + + // FP0 and FP1 computation + asm volatile( + "frep.o %[n_frep], 40, 0, 0 \n" + FP0_FP1_ASM_BODY + : + : [n_frep] "r" (BATCH_SIZE / unroll_factor - 1), + [InvLn2N] "f" (InvLn2N), [SHIFT] "f" (SHIFT), + [C0] "f" (C[0]), [C1] "f" (C[1]), + [C2] "f" (C[2]), [C3] "f" (C[3]) + : "memory", "ft0", "ft1", "ft2", + "fa1", "fa2", "fa3", "fa4", "fa5", + "fa6", "fa7", "ft3", "ft4", "ft5", "ft6", "ft7", "ft8", + "fs0", "fs1", "fs2" + ); + + // Increment buffer index for next iteration + fp0_w_idx += 1; + fp0_a_idx += 1; + fp1_w_idx += 1; + fp1_t_idx += 1; + fp0_w_idx %= N_W_BUFFERS; + fp0_a_idx %= N_T_BUFFERS; + fp1_w_idx %= N_W_BUFFERS; + fp1_t_idx %= N_T_BUFFERS; + } + + // FP1 phase + if (iteration > 2 && iteration >= n_iterations - 3 && iteration < n_iterations - 1) { + + // Index buffers + fp1_w_ptr = w_buffers[fp1_w_idx]; + fp1_t_ptr = t_buffers[fp1_t_idx]; + fp1_b_ptr = b_buffers[fp1_w_idx]; + + // Configure SSRs + int unroll_factor = 4; + snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, BATCH_SIZE, sizeof(double)); + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, fp1_w_ptr); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, fp1_t_ptr); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, fp1_b_ptr); + snrt_ssr_enable(); + + // FP1 computation + asm volatile( + "frep.o %[n_frep], 4, 0, 0 \n" + FP1_ASM_BODY + : + : [n_frep] "r" (BATCH_SIZE / unroll_factor - 1) + : "memory", "ft0", "ft1", "ft2" + ); + + // Increment buffer indices for next iteration + fp1_w_idx += 1; + fp1_t_idx += 1; + fp1_w_idx %= N_W_BUFFERS; + fp1_t_idx %= N_T_BUFFERS; + } + + // INT phase + if (iteration > 1 && iteration < n_iterations - 2) { + + // Index buffers + int_ki_ptr = ki_buffers[int_ki_idx]; + int_t_ptr = t_buffers[int_t_idx]; + + // INT computation + // Avoid further unrolling by the compiler so that loop fits in L0 cache + int unroll_factor = 4; + #pragma nounroll + for (int i = 0; i < BATCH_SIZE; i += unroll_factor) { + asm volatile( + INT_ASM_BODY + : + : [ki] "r" (int_ki_ptr + i), [T] "r" (T), [t] "r" (int_t_ptr + i) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", + "t0", "t1", "t2", "t3" + ); + } + + // Increment buffer indices for next iteration + int_ki_idx += 1; + int_t_idx += 1; + int_ki_idx %= N_W_BUFFERS; + int_t_idx %= N_T_BUFFERS; + } + + // Synchronize FP and integer 
threads + snrt_ssr_disable(); + snrt_fpu_fence(); + } + + // Synchronize cores + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/exp/vexpf_optimized_asm.h b/sw/apps/exp/vexpf_optimized_asm.h new file mode 100644 index 0000000000..336c20db83 --- /dev/null +++ b/sw/apps/exp/vexpf_optimized_asm.h @@ -0,0 +1,127 @@ +#define FP0_ASM_BODY \ + "fmul.d fa3, %[InvLn2N], ft0 \n" \ + "fmul.d ft3, %[InvLn2N], ft0 \n" \ + "fmul.d ft4, %[InvLn2N], ft0 \n" \ + "fmul.d ft5, %[InvLn2N], ft0 \n" \ + "fadd.d fa1, fa3, %[SHIFT] \n" \ + "fadd.d fa5, ft3, %[SHIFT] \n" \ + "fadd.d fa6, ft4, %[SHIFT] \n" \ + "fadd.d fa7, ft5, %[SHIFT] \n" \ + "fmv.d ft1, fa1 \n" \ + "fmv.d ft1, fa5 \n" \ + "fmv.d ft1, fa6 \n" \ + "fmv.d ft1, fa7 \n" \ + "fsub.d fa2, fa1, %[SHIFT] \n" \ + "fsub.d ft6, fa5, %[SHIFT] \n" \ + "fsub.d ft7, fa6, %[SHIFT] \n" \ + "fsub.d ft8, fa7, %[SHIFT] \n" \ + "fsub.d fa3, fa3, fa2 \n" \ + "fsub.d ft3, ft3, ft6 \n" \ + "fsub.d ft4, ft4, ft7 \n" \ + "fsub.d ft5, ft5, ft8 \n" \ + "fmadd.d fa2, %[C0], fa3, %[C1] \n" \ + "fmadd.d ft6, %[C0], ft3, %[C1] \n" \ + "fmadd.d ft7, %[C0], ft4, %[C1] \n" \ + "fmadd.d ft8, %[C0], ft5, %[C1] \n" \ + "fmadd.d fa4, %[C2], fa3, %[C3] \n" \ + "fmadd.d fs0, %[C2], ft3, %[C3] \n" \ + "fmadd.d fs1, %[C2], ft4, %[C3] \n" \ + "fmadd.d fs2, %[C2], ft5, %[C3] \n" \ + "fmul.d fa1, fa3, fa3 \n" \ + "fmul.d fa5, ft3, ft3 \n" \ + "fmul.d fa6, ft4, ft4 \n" \ + "fmul.d fa7, ft5, ft5 \n" \ + "fmadd.d ft2, fa2, fa1, fa4 \n" \ + "fmadd.d ft2, ft6, fa5, fs0 \n" \ + "fmadd.d ft2, ft7, fa6, fs1 \n" \ + "fmadd.d ft2, ft8, fa7, fs2 \n" + +#define INT_ASM_BODY \ + "lw a0, 0(%[ki]) \n" \ + "lw a3, 8(%[ki]) \n" \ + "lw a4, 16(%[ki]) \n" \ + "lw a5, 24(%[ki]) \n" \ + "andi a1, a0, 0x1f \n" \ + "andi a6, a3, 0x1f \n" \ + "andi a7, a4, 0x1f \n" \ + "andi t0, a5, 0x1f \n" \ + "slli a1, a1, 0x3 \n" \ + "slli a6, a6, 0x3 \n" \ + "slli a7, a7, 0x3 \n" \ + "slli t0, t0, 0x3 \n" \ + "add a1, %[T], a1 \n" \ + "add a6, %[T], a6 \n" \ + "add a7, %[T], a7 \n" \ + "add t0, %[T], t0 \n" \ + "lw a2, 0(a1) \n" \ + "lw t1, 0(a6) \n" \ + "lw t2, 0(a7) \n" \ + "lw t3, 0(t0) \n" \ + "lw a1, 4(a1) \n" \ + "lw a6, 4(a6) \n" \ + "lw a7, 4(a7) \n" \ + "lw t0, 4(t0) \n" \ + "slli a0, a0, 0xf \n" \ + "slli a3, a3, 0xf \n" \ + "slli a4, a4, 0xf \n" \ + "slli a5, a5, 0xf \n" \ + "sw a2, 0(%[t]) \n" \ + "sw t1, 8(%[t]) \n" \ + "sw t2, 16(%[t]) \n" \ + "sw t3, 24(%[t]) \n" \ + "add a0, a0, a1 \n" \ + "add a3, a3, a6 \n" \ + "add a4, a4, a7 \n" \ + "add a5, a5, t0 \n" \ + "sw a0, 4(%[t]) \n" \ + "sw a3, 12(%[t]) \n" \ + "sw a4, 20(%[t]) \n" \ + "sw a5, 28(%[t]) \n" + +#define FP1_ASM_BODY \ + "fmul.d ft2, ft0, ft1 \n" \ + "fmul.d ft2, ft0, ft1 \n" \ + "fmul.d ft2, ft0, ft1 \n" \ + "fmul.d ft2, ft0, ft1 \n" + +#define FP0_FP1_ASM_BODY \ + "fmul.d fa3, %[InvLn2N], ft0 \n" \ + "fmul.d ft3, %[InvLn2N], ft0 \n" \ + "fmul.d ft4, %[InvLn2N], ft0 \n" \ + "fmul.d ft5, %[InvLn2N], ft0 \n" \ + "fadd.d fa1, fa3, %[SHIFT] \n" \ + "fadd.d fa5, ft3, %[SHIFT] \n" \ + "fadd.d fa6, ft4, %[SHIFT] \n" \ + "fadd.d fa7, ft5, %[SHIFT] \n" \ + "fmv.d ft2, fa1 \n" \ + "fmv.d ft2, fa5 \n" \ + "fmv.d ft2, fa6 \n" \ + "fmv.d ft2, fa7 \n" \ + "fsub.d fa2, fa1, %[SHIFT] \n" \ + "fsub.d ft6, fa5, %[SHIFT] \n" \ + "fsub.d ft7, fa6, %[SHIFT] \n" \ + "fsub.d ft8, fa7, %[SHIFT] \n" \ + "fsub.d fa3, fa3, fa2 \n" \ + "fsub.d ft3, ft3, ft6 \n" \ + "fsub.d ft4, ft4, ft7 \n" \ + "fsub.d ft5, ft5, ft8 \n" \ + "fmadd.d fa2, %[C0], fa3, %[C1] \n" \ + "fmadd.d ft6, %[C0], ft3, %[C1] \n" \ + "fmadd.d ft7, %[C0], ft4, %[C1] \n" \ + "fmadd.d ft8, %[C0], ft5, 
%[C1] \n" \ + "fmadd.d fa4, %[C2], fa3, %[C3] \n" \ + "fmadd.d fs0, %[C2], ft3, %[C3] \n" \ + "fmadd.d fs1, %[C2], ft4, %[C3] \n" \ + "fmadd.d fs2, %[C2], ft5, %[C3] \n" \ + "fmul.d fa1, fa3, fa3 \n" \ + "fmul.d fa5, ft3, ft3 \n" \ + "fmul.d fa6, ft4, ft4 \n" \ + "fmul.d fa7, ft5, ft5 \n" \ + "fmadd.d ft2, fa2, fa1, fa4 \n" \ + "fmadd.d ft2, ft6, fa5, fs0 \n" \ + "fmadd.d ft2, ft7, fa6, fs1 \n" \ + "fmadd.d ft2, ft8, fa7, fs2 \n" \ + "fmul.d ft2, ft1, ft0 \n" \ + "fmul.d ft2, ft1, ft0 \n" \ + "fmul.d ft2, ft1, ft0 \n" \ + "fmul.d ft2, ft1, ft0 \n" diff --git a/sw/apps/log/main.c b/sw/apps/log/main.c new file mode 100644 index 0000000000..a5c7836b79 --- /dev/null +++ b/sw/apps/log/main.c @@ -0,0 +1,49 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include "math.h" +#include "snrt.h" + +#include "vlogf.h" + +float a[LEN]; +double b_golden[LEN], b_actual[LEN]; + +int main() { + uint32_t tstart, tend; + + // Initialize input array + if (snrt_cluster_core_idx() == 0) + for (int i = 0; i < LEN; i++) + a[i] = (float)(i + 1) / LEN; + + // Calculate logarithm of input array using reference implementation + if (snrt_cluster_core_idx() == 0) { + for (int i = 0; i < LEN; i++) { + b_golden[i] = (double)logf(a[i]); + } + } + + // Synchronize cores + snrt_cluster_hw_barrier(); + + // Calculate logarithm of input array using vectorized implementation + vlogf_kernel(a, b_actual); + + // Check if the results are correct + if (snrt_cluster_core_idx() == 0) { + uint32_t n_err = LEN; + for (int i = 0; i < LEN; i++) { + if ((float)b_golden[i] != (float)b_actual[i]) + printf("Error: b_golden[%d] = %f, b_actual[%d] = %f\n", i, + (float)b_golden[i], i, (float)b_actual[i]); + else + n_err--; + } + return n_err; + } else + return 0; +} diff --git a/sw/apps/log/vlogf.h b/sw/apps/log/vlogf.h new file mode 100644 index 0000000000..2f0c6ffab3 --- /dev/null +++ b/sw/apps/log/vlogf.h @@ -0,0 +1,72 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
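
In these frep bodies ft0, ft1 and ft2 are stream registers: with SSRs enabled, reads of ft0/ft1 pull the next element from data movers 0/1 and writes to ft2 push the result to data mover 2, while frep.o re-issues the following block of FPU instructions without the integer core in the loop. A minimal sketch of that pattern, using only the snrt calls the patch already relies on and mirroring FP1_ASM_BODY's elementwise multiply (illustrative, not part of the patch):

/* Hedged sketch: b[i] = w[i] * s[i] for n elements via SSRs + FREP. */
static inline void vmul_ssr(const double *w, const double *s, double *b, int n) {
    snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, n, sizeof(double));
    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, (void *)w);   /* ft0 <- w */
    snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, (void *)s);   /* ft1 <- s */
    snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, b);          /* ft2 -> b */
    snrt_ssr_enable();
    asm volatile(
        "frep.o %[n_frep], 1, 0, 0 \n"
        "fmul.d ft2, ft0, ft1 \n"
        :
        : [n_frep] "r"(n - 1)
        : "ft0", "ft1", "ft2", "memory");
    snrt_ssr_disable();
    snrt_fpu_fence();
}
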
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#ifndef LEN +#define LEN 2048 +#endif + +#ifndef BATCH_SIZE +#define BATCH_SIZE 64 +#endif + +#define IMPL_NAIVE 0 +#define IMPL_BASELINE 1 +#define IMPL_OPTIMIZED 2 +#define IMPL_ISSR 3 + +#ifndef IMPL +#define IMPL IMPL_OPTIMIZED +#endif + +#if IMPL == IMPL_NAIVE +#define FUNC_PTR vlogf_naive +#elif IMPL == IMPL_BASELINE +#define FUNC_PTR vlogf_baseline +#elif IMPL == IMPL_OPTIMIZED || IMPL == IMPL_ISSR +#define FUNC_PTR vlogf_optimized +#endif + +#define ALLOCATE_BUFFER(type, size) \ + (type *)snrt_l1_alloc_cluster_local(size * sizeof(type), sizeof(type)) + +#define LOGF_TABLE_BITS 4 + +typedef struct { + double invc, logc; +} log_tab_entry_t; + +__thread const log_tab_entry_t T[16] = { + { 0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2 }, + { 0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2 }, + { 0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2 }, + { 0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3 }, + { 0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3 }, + { 0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3 }, + { 0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4 }, + { 0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4 }, + { 0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5 }, + { 0x1p+0, 0x0p+0 }, + { 0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5 }, + { 0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4 }, + { 0x1.b2036576afce6p-1, 0x1.526e57720db08p-3 }, + { 0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3 }, + { 0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2 }, + { 0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2 } +}; +__thread const double A[4] = {-0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2, -1}; +__thread const double Ln2 = 0x1.62e42fefa39efp-1; +__thread const uint32_t OFF = 0x3f330000; + +#include "vlogf_glibc.h" +#include "vlogf_naive.h" +#include "vlogf_baseline.h" +#include "vlogf_optimized.h" + +static inline void vlogf_kernel(float *a, double *b) { + snrt_mcycle(); + FUNC_PTR(a, b); + snrt_mcycle(); +} diff --git a/sw/apps/log/vlogf_baseline.h b/sw/apps/log/vlogf_baseline.h new file mode 100644 index 0000000000..730734345a --- /dev/null +++ b/sw/apps/log/vlogf_baseline.h @@ -0,0 +1,201 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
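
OFF = 0x3f330000 is roughly asuint(0.699), so tmp = ix - OFF splits x into x = 2^k * z with z in about [0.699, 1.398], a window centered near 1 where the cubic log1p polynomial with the A coefficients is accurate (this reading follows the glibc comment reproduced in vlogf_glibc.h). A small sketch of the reduction, reusing the helpers and constants defined in vlogf.h and vlogf_glibc.h (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Hedged sketch: show k, z and the table index for one input. */
static void show_log_reduction(float x) {
    uint32_t ix  = asuint(x);
    uint32_t tmp = ix - OFF;
    int32_t  k   = (int32_t)tmp >> 23;                     /* scale factor     */
    uint32_t iz  = ix - (tmp & (0x1ffu << 23));            /* z = x / 2^k      */
    uint32_t i   = (tmp >> (23 - LOGF_TABLE_BITS)) % (1 << LOGF_TABLE_BITS);
    printf("x=%g: k=%d z=%g i=%u\n", (double)x, k, (double)asfloat(iz), i);
}
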
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#define N_BUFFERS 2 + +static inline void vlogf_baseline(float *a, double *b) { + + int n_batches = LEN / BATCH_SIZE; + int n_iterations = n_batches + 2; + + float *a_buffers[N_BUFFERS]; + double *b_buffers[N_BUFFERS]; + + a_buffers[0] = ALLOCATE_BUFFER(float, BATCH_SIZE); + a_buffers[1] = ALLOCATE_BUFFER(float, BATCH_SIZE); + b_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + + unsigned int dma_a_idx = 0; + unsigned int dma_b_idx = 0; + unsigned int comp_idx = 0; + float *dma_a_ptr; + double *dma_b_ptr; + float *comp_a_ptr; + double *comp_b_ptr; + + // Iterate over batches + for (int iteration = 0; iteration < n_iterations; iteration++) { + snrt_mcycle(); + + // DMA cores + if (snrt_is_dm_core()) { + + // DMA in phase + if (iteration < n_iterations - 2) { + + // Index buffers + dma_a_ptr = a_buffers[dma_a_idx]; + + // DMA transfer + snrt_dma_load_1d_tile( + dma_a_ptr, + a, + iteration, + BATCH_SIZE, + sizeof(float) + ); + + // Increment buffer index for next iteration + dma_a_idx += 1; + dma_a_idx %= N_BUFFERS; + } + + // DMA out phase + if (iteration > 1) { + + // Index buffers + dma_b_ptr = b_buffers[dma_b_idx]; + + // DMA transfer + snrt_dma_store_1d_tile( + b, + dma_b_ptr, + iteration - 2, + BATCH_SIZE, + sizeof(double) + ); + + // Increment buffer index for next iteration + dma_b_idx += 1; + dma_b_idx %= N_BUFFERS; + } + snrt_dma_wait_all(); + } + + // Compute cores + if (snrt_cluster_core_idx() == 0) { + + // Compute phase + if (iteration > 0 && iteration < n_iterations - 1) { + + // Index buffers + comp_a_ptr = a_buffers[comp_idx]; + comp_b_ptr = b_buffers[comp_idx]; + + // Loop over samples (unrolled by 4) + for (int i = 0; i < BATCH_SIZE; i += 4) { + + asm volatile( + "lw a0, 0(%[input]) \n" // ix = asuint (x) + "lw a4, 4(%[input]) \n" // ix = asuint (x) + "lw a5, 8(%[input]) \n" // ix = asuint (x) + "lw a6, 12(%[input]) \n" // ix = asuint (x) + "sub a1, a0, %[OFF] \n" // tmp = ix - OFF + "sub a7, a4, %[OFF] \n" // tmp = ix - OFF + "sub t0, a5, %[OFF] \n" // tmp = ix - OFF + "sub t1, a6, %[OFF] \n" // tmp = ix - OFF + "srai a2, a1, 23 \n" // k = (int32_t) tmp >> 23 + "srai t2, a7, 23 \n" // k = (int32_t) tmp >> 23 + "srai t3, t0, 23 \n" // k = (int32_t) tmp >> 23 + "srai t4, t1, 23 \n" // k = (int32_t) tmp >> 23 + "lui a3, 1046528 \n" // 0x1ff << 23 + "lui t5, 1046528 \n" // 0x1ff << 23 + "lui t6, 1046528 \n" // 0x1ff << 23 + "lui s0, 1046528 \n" // 0x1ff << 23 + "and a3, a1, a3 \n" // tmp & 0x1ff << 23 + "and t5, a7, t5 \n" // tmp & 0x1ff << 23 + "and t6, t0, t6 \n" // tmp & 0x1ff << 23 + "and s0, t1, s0 \n" // tmp & 0x1ff << 23 + "sub a3, a0, a3 \n" // iz = ix - (tmp & 0x1ff << 23) + "sub t5, a4, t5 \n" // iz = ix - (tmp & 0x1ff << 23) + "sub t6, a5, t6 \n" // iz = ix - (tmp & 0x1ff << 23) + "sub s0, a6, s0 \n" // iz = ix - (tmp & 0x1ff << 23) + "srli a1, a1, 15 \n" // tmp >> (23 - LOGF_TABLE_BITS) + "srli a7, a7, 15 \n" // tmp >> (23 - LOGF_TABLE_BITS) + "srli t0, t0, 15 \n" // tmp >> (23 - LOGF_TABLE_BITS) + "srli t1, t1, 15 \n" // tmp >> (23 - LOGF_TABLE_BITS) + "andi a1, a1, 240 \n" // i = (tmp >> (23 - LOGF_TABLE_BITS)) % N + "andi a7, a7, 240 \n" // i = (tmp >> (23 - LOGF_TABLE_BITS)) % N + "andi t0, t0, 240 \n" // i = (tmp >> (23 - LOGF_TABLE_BITS)) % N + "andi t1, t1, 240 \n" // i = (tmp >> (23 - LOGF_TABLE_BITS)) % N + "add a1, %[T], a1 \n" // T[i] + "add a7, %[T], a7 \n" // T[i] + "add t0, %[T], t0 \n" // T[i] + "add t1, %[T], t1 \n" // T[i] + 
"fld fa0, 0(a1) \n" // invc = T[i].invc + "fld fa4, 0(a7) \n" // invc = T[i].invc + "fld fa5, 0(t0) \n" // invc = T[i].invc + "fld fa6, 0(t1) \n" // invc = T[i].invc + "fld fa1, 8(a1) \n" // logc = T[i].logc + "fld fa7, 8(a7) \n" // logc = T[i].logc + "fld ft3, 8(t0) \n" // logc = T[i].logc + "fld ft4, 8(t1) \n" // logc = T[i].logc + "fmv.w.x fa2, a3 \n" // asfloat (iz) + "fmv.w.x ft5, t5 \n" // asfloat (iz) + "fmv.w.x ft6, t6 \n" // asfloat (iz) + "fmv.w.x ft7, s0 \n" // asfloat (iz) + "fcvt.d.s fa2, fa2 \n" // z = (double_t) asfloat (iz) + "fcvt.d.s ft5, ft5 \n" // z = (double_t) asfloat (iz) + "fcvt.d.s ft6, ft6 \n" // z = (double_t) asfloat (iz) + "fcvt.d.s ft7, ft7 \n" // z = (double_t) asfloat (iz) + "fmadd.d fa2, fa2, fa0, %[A3] \n" // r = z * invc - 1 + "fmadd.d ft5, ft5, fa4, %[A3] \n" // r = z * invc - 1 + "fmadd.d ft6, ft6, fa5, %[A3] \n" // r = z * invc - 1 + "fmadd.d ft7, ft7, fa6, %[A3] \n" // r = z * invc - 1 + "fcvt.d.w fa0, a2 \n" // (double_t) k + "fcvt.d.w fa4, t2 \n" // (double_t) k + "fcvt.d.w fa5, t3 \n" // (double_t) k + "fcvt.d.w fa6, t4 \n" // (double_t) k + "fmadd.d fa1, fa0, %[Ln2], fa1 \n" // y0 = logc + (double_t) k * Ln2 + "fmadd.d fa7, fa4, %[Ln2], fa7 \n" // y0 = logc + (double_t) k * Ln2 + "fmadd.d ft3, fa5, %[Ln2], ft3 \n" // y0 = logc + (double_t) k * Ln2 + "fmadd.d ft4, fa6, %[Ln2], ft4 \n" // y0 = logc + (double_t) k * Ln2 + "fmul.d fa0, fa2, fa2 \n" // r2 = r * r + "fmul.d fa4, ft5, ft5 \n" // r2 = r * r + "fmul.d fa5, ft6, ft6 \n" // r2 = r * r + "fmul.d fa6, ft7, ft7 \n" // r2 = r * r + "fmadd.d fa3, fa2, %[A1], %[A2] \n" // y = A[1] * r + A[2] + "fmadd.d ft8, ft5, %[A1], %[A2] \n" // y = A[1] * r + A[2] + "fmadd.d ft9, ft6, %[A1], %[A2] \n" // y = A[1] * r + A[2] + "fmadd.d ft10, ft7, %[A1], %[A2] \n" // y = A[1] * r + A[2] + "fmadd.d fa3, fa0, %[A0], fa3 \n" // y = A[0] * r2 + y + "fmadd.d ft8, fa4, %[A0], ft8 \n" // y = A[0] * r2 + y + "fmadd.d ft9, fa5, %[A0], ft9 \n" // y = A[0] * r2 + y + "fmadd.d ft10, fa6, %[A0], ft10 \n" // y = A[0] * r2 + y + "fadd.d fa1, fa1, fa2 \n" // y = y * r2 + (y0 + r) + "fadd.d fa7, fa7, ft5 \n" // y = y * r2 + (y0 + r) + "fadd.d ft3, ft3, ft6 \n" // y = y * r2 + (y0 + r) + "fadd.d ft4, ft4, ft7 \n" // y = y * r2 + (y0 + r) + "fmadd.d %[output0], fa3, fa0, fa1 \n" // y = y * r2 + (y0 + r) + "fmadd.d %[output1], ft8, fa4, fa7 \n" // y = y * r2 + (y0 + r) + "fmadd.d %[output2], ft9, fa5, ft3 \n" // y = y * r2 + (y0 + r) + "fmadd.d %[output3], ft10, fa6, ft4 \n" // y = y * r2 + (y0 + r) + : [output0] "=f" (comp_b_ptr[i+0]), [output1] "=f" (comp_b_ptr[i+1]), + [output2] "=f" (comp_b_ptr[i+2]), [output3] "=f" (comp_b_ptr[i+3]) + : [input] "r" (comp_a_ptr + i), + [Ln2] "f" (Ln2), [OFF] "r" (OFF), [T] "r" (T), + [A0] "f" (A[0]), [A1] "f" (A[1]), [A2] "f" (A[2]), [A3] "f" (A[3]) + : "memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", + "t1", "t2", "t3", "t4", "t5", "t6", "s0", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", "ft3", + "ft4", "ft5", "ft6", "ft7", "ft8", "ft9", "ft10" + ); + } + + // Increment buffer indices for next iteration + comp_idx += 1; + comp_idx %= N_BUFFERS; + + snrt_fpu_fence(); + } + + } + + // Synchronize cores + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/log/vlogf_glibc.h b/sw/apps/log/vlogf_glibc.h new file mode 100644 index 0000000000..d5c7822171 --- /dev/null +++ b/sw/apps/log/vlogf_glibc.h @@ -0,0 +1,60 @@ +static inline uint32_t +asuint (float f) +{ + union + { + float f; + uint32_t i; + } u = {f}; + return u.i; +} + +static inline float +asfloat (uint32_t i) 
+{ + union + { + uint32_t i; + float f; + } u = {i}; + return u.f; +} + +float +glibc_logf (float x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t z, r, r2, y, y0, invc, logc; + uint32_t ix, iz, tmp; + int k, i; + + ix = asuint (x); + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOGF_TABLE_BITS)) % (1 << LOGF_TABLE_BITS); + k = (int32_t) tmp >> 23; /* arithmetic shift */ + iz = ix - (tmp & 0x1ff << 23); + invc = T[i].invc; + logc = T[i].logc; + z = (double_t) asfloat (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ + r = z * invc - 1; + y0 = logc + (double_t) k * Ln2; + + /* Pipelined polynomial evaluation to approximate log1p(r). */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + y = y * r2 + (y0 + r); + return (float) y; +} + +void vlogf_glibc(float *a, double *b) { + for (int i = 0; i < LEN; i++) { + b[i] = (double)glibc_logf(a[i]); + } +} diff --git a/sw/apps/log/vlogf_naive.h b/sw/apps/log/vlogf_naive.h new file mode 100644 index 0000000000..5d75a43445 --- /dev/null +++ b/sw/apps/log/vlogf_naive.h @@ -0,0 +1,42 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +static inline void vlogf_naive(float *a, double *b) { + + // Loop over sample + for (int i = 0; i < LEN; i++) { + asm volatile( + "fmv.x.w a0, %[input] \n" // ix = asuint (x) + "sub a1, a0, %[OFF] \n" // tmp = ix - OFF + "srai a2, a1, 23 \n" // k = (int32_t) tmp >> 23 + "lui a3, 1046528 \n" // 0x1ff << 23 + "and a3, a1, a3 \n" // tmp & 0x1ff << 23 + "sub a3, a0, a3 \n" // iz = ix - (tmp & 0x1ff << 23) + "srli a1, a1, 15 \n" // tmp >> (23 - LOGF_TABLE_BITS) + "andi a1, a1, 240 \n" // i = (tmp >> (23 - LOGF_TABLE_BITS)) % N + "add a1, %[T], a1 \n" // T[i] + "fld fa0, 0(a1) \n" // invc = T[i].invc + "fld fa1, 8(a1) \n" // logc = T[i].logc + "fmv.w.x fa2, a3 \n" // asfloat (iz) + "fcvt.d.s fa2, fa2 \n" // z = (double_t) asfloat (iz) + "fmadd.d fa2, fa2, fa0, %[A3] \n" // r = z * invc - 1 + "fcvt.d.w fa0, a2 \n" // (double_t) k + "fmadd.d fa1, fa0, %[Ln2], fa1 \n" // y0 = logc + (double_t) k * Ln2 + "fmul.d fa0, fa2, fa2 \n" // r2 = r * r + "fmadd.d fa3, fa2, %[A1], %[A2] \n" // y = A[1] * r + A[2] + "fmadd.d fa3, fa0, %[A0], fa3 \n" // y = A[0] * r2 + y + "fadd.d fa1, fa1, fa2 \n" // y = y * r2 + (y0 + r) + "fmadd.d %[output], fa3, fa0, fa1 \n" // y = y * r2 + (y0 + r) + : [output] "=f" (b[i]) + : [input] "f" (a[i]), [Ln2] "f" (Ln2), [OFF] "r" (OFF), + [A0] "f" (A[0]), [A1] "f" (A[1]), [A2] "f" (A[2]), [A3] "f" (A[3]), + [T] "r" (T) + : "memory", "a0", "a1", "a2", "a3", + "fa0", "fa1", "fa2", "fa3" + ); + } + snrt_fpu_fence(); +} diff --git a/sw/apps/log/vlogf_optimized.h b/sw/apps/log/vlogf_optimized.h new file mode 100644 index 0000000000..92e9424da5 --- /dev/null +++ b/sw/apps/log/vlogf_optimized.h @@ -0,0 +1,239 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
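
Both hand-written log kernels fold the table indexing into two instructions: because the {invc, logc} entries are 16 bytes and LOGF_TABLE_BITS = 4, (tmp >> 15) & 0xf0 is already the byte offset of T[i], i.e. ((tmp >> 19) & 0xf) * 16, and can be added straight to the base pointer. A hedged C restatement of that equivalence:

#include <assert.h>
#include <stdint.h>

/* Hedged sketch: the srli-15 / andi-240 pair computes i * sizeof(T[0]). */
static inline uint32_t logf_entry_offset(uint32_t tmp) {
    uint32_t i        = (tmp >> (23 - LOGF_TABLE_BITS)) % (1 << LOGF_TABLE_BITS);
    uint32_t byte_off = (tmp >> 15) & 0xf0;
    assert(byte_off == i * sizeof(log_tab_entry_t));   /* 16-byte entries */
    return byte_off;
}
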
+// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#define N_BUFFERS 2 + +#include "vlogf_optimized_asm.h" + +static inline void vlogf_optimized(float *a, double *b) { + + // Derived parameters + unsigned int n_stages = 4; // DMA in, INT, FP, DMA out + unsigned int n_batches = LEN / BATCH_SIZE; + unsigned int n_iterations = n_stages + n_batches - 1; + + // Allocate buffers (ORDER IS IMPORTANT!) + float *a_buffers[N_BUFFERS]; + double *b_buffers[N_BUFFERS]; + uint64_t *z_buffers[N_BUFFERS]; + uint64_t *k_buffers[N_BUFFERS]; + uint64_t *invc_buffers[N_BUFFERS]; + uint64_t *logc_buffers[N_BUFFERS]; + uint8_t *idx_buffers[N_BUFFERS]; + a_buffers[0] = ALLOCATE_BUFFER(float, BATCH_SIZE); + a_buffers[1] = ALLOCATE_BUFFER(float, BATCH_SIZE); + b_buffers[0] = ALLOCATE_BUFFER(double, BATCH_SIZE); + b_buffers[1] = ALLOCATE_BUFFER(double, BATCH_SIZE); + z_buffers[0] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + z_buffers[1] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + k_buffers[0] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + k_buffers[1] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); +#if IMPL == IMPL_ISSR + idx_buffers[0] = ALLOCATE_BUFFER(uint8_t, BATCH_SIZE * 2); + idx_buffers[1] = ALLOCATE_BUFFER(uint8_t, BATCH_SIZE * 2); +#else + invc_buffers[0] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + invc_buffers[1] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + logc_buffers[0] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); + logc_buffers[1] = ALLOCATE_BUFFER(uint64_t, BATCH_SIZE); +#endif + + // Define buffer pointers for every phase (int and fp) + unsigned int dma_a_idx = 0; + unsigned int dma_b_idx = 0; + unsigned int int_buff_idx = 0; + unsigned int fp_buff_idx = 0; + float *dma_a_ptr; + double *dma_b_ptr; + float *int_a_ptr; + uint64_t *int_z_ptr; + uint64_t *int_k_ptr; + uint64_t *int_invc_ptr; + uint64_t *int_logc_ptr; + uint8_t *int_idx_ptr; + uint64_t *fp_z_ptr; + uint64_t *fp_invc_ptr; + uint8_t *fp_idx_ptr; + double *fp_b_ptr; + + // NaN-box values in Z buffer, since conversion from single-precision + // to double-precision floating-point assumes the single-precision values + // are NaN-boxed. 
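
The pre-fill explained above matters because the INT phase later writes only the 32-bit iz value into the low word of each 64-bit slot; under the RISC-V F/D NaN-boxing rule, fcvt.d.s (issued through the SSR stream in the FP phase) would treat a non-boxed single as a canonical NaN, so the upper word must already be all ones. A hedged sketch of the resulting slot layout (the helper name is illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Hedged sketch: NaN-box a 32-bit float into a 64-bit slot. */
static inline void store_nanboxed_float(uint64_t *slot, float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    *slot = 0xffffffff00000000ull | bits;  /* upper word all ones, low word = float */
}
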
+ if (snrt_cluster_core_idx() == 0) + for (int i = 0; i < BATCH_SIZE; i++) { + z_buffers[0][i] = 0xffffffffffffffff; + z_buffers[1][i] = 0xffffffffffffffff; + } + + snrt_cluster_hw_barrier(); + + // Iterate over batches + for (int iteration = 0; iteration < n_iterations; iteration++) { + snrt_mcycle(); + + // DMA cores + if (snrt_is_dm_core()) { + + // DMA in phase + if (iteration < n_iterations - 3) { + + // Index buffers + dma_a_ptr = a_buffers[dma_a_idx]; + + // DMA transfer + snrt_dma_load_1d_tile( + dma_a_ptr, + a, + iteration, + BATCH_SIZE, + sizeof(float) + ); + + // Increment buffer index for next iteration + dma_a_idx += 1; + dma_a_idx %= N_BUFFERS; + } + + // DMA out phase + if (iteration > 2) { + + // Index buffers + dma_b_ptr = b_buffers[dma_b_idx]; + + // DMA transfer + snrt_dma_store_1d_tile( + b, + dma_b_ptr, + iteration - 3, + BATCH_SIZE, + sizeof(double) + ); + + // Increment buffer index for next iteration + dma_b_idx += 1; + dma_b_idx %= N_BUFFERS; + } + + snrt_dma_wait_all(); + } + + // Compute cores + if (snrt_cluster_core_idx() == 0) { + + // FP phase + if (iteration > 1 && iteration < n_iterations - 1) { + + // Index buffers + fp_z_ptr = z_buffers[fp_buff_idx]; + fp_invc_ptr = invc_buffers[fp_buff_idx]; + fp_idx_ptr = idx_buffers[fp_buff_idx]; + fp_b_ptr = b_buffers[fp_buff_idx]; + + // Configure SSRs + int unroll_factor = 4; + if (iteration == 2) { + snrt_ssr_loop_3d( + SNRT_SSR_DM0, + unroll_factor, + 2, + BATCH_SIZE / unroll_factor, + sizeof(uint64_t), + N_BUFFERS * BATCH_SIZE * sizeof(uint64_t), + sizeof(uint64_t) * unroll_factor + ); + snrt_ssr_loop_1d(SNRT_SSR_DM2, BATCH_SIZE, sizeof(double)); +#if IMPL == IMPL_ISSR + } + // Load invc and logc using an ISSR + snrt_issr_read( + SNRT_SSR_DM1, + (void *)T, + fp_idx_ptr, + 2 * BATCH_SIZE, + SNRT_SSR_IDXSIZE_U8 + ); +#else + snrt_ssr_loop_3d( + SNRT_SSR_DM1, + unroll_factor, + 2, + BATCH_SIZE / unroll_factor, + sizeof(uint64_t), + N_BUFFERS * BATCH_SIZE * sizeof(uint64_t), + sizeof(uint64_t) * unroll_factor + ); + } + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_3D, fp_invc_ptr); +#endif + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_3D, fp_z_ptr); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, fp_b_ptr); + snrt_ssr_enable(); + + // FP computation + asm volatile( + "frep.o %[n_frep], 36, 0, 0 \n" + FP_ASM_BODY + : + : [n_frep] "r" (BATCH_SIZE / unroll_factor - 1), + [A0] "f" (A[0]), [A1] "f" (A[1]), + [A2] "f" (A[2]), [A3] "f" (A[3]), [Ln2] "f" (Ln2) + : "ft0", "ft1", "ft2", "fa0", "fa1", "fa2", "fa3", "fa4", + "fa5", "fa6", "fa7", "ft3", "ft4", "ft5", "ft6", "ft7", + "ft8", "ft9", "ft10", "memory" + ); + + // Increment buffer indices for next iteration + fp_buff_idx += 1; + fp_buff_idx %= N_BUFFERS; + } + + // INT phase + if (iteration > 0 && iteration < n_iterations - 2) { + + // Index buffers + int_a_ptr = a_buffers[int_buff_idx]; + int_z_ptr = z_buffers[int_buff_idx]; + int_invc_ptr = invc_buffers[int_buff_idx]; + int_k_ptr = k_buffers[int_buff_idx]; + int_logc_ptr = logc_buffers[int_buff_idx]; + int_idx_ptr = idx_buffers[int_buff_idx]; + + // INT computation + // Avoid further unrolling by the compiler so that loop fits in L0 cache + int unroll_factor = 4; + #pragma nounroll + for (int i = 0; i < BATCH_SIZE; i += unroll_factor) { + asm volatile( + INT_ASM_BODY + : + : [a] "r" (int_a_ptr + i), [OFF] "r" (OFF), + [T] "r" (T), [z] "r" (int_z_ptr + i), + [k] "r" (int_k_ptr + i), +#if IMPL == IMPL_ISSR + [idx] "r" (int_idx_ptr + 2 * i) +#else + [invc] "r" (int_invc_ptr + i), + [logc] "r" (int_logc_ptr + i) +#endif + : "a0", "a1", 
"a2", "a3", "a4", "a5", "a6", "a7", "t0", + "t1", "t2", "t3", "t4", "t5", "t6", "s0", "memory" + ); + } + + // Increment buffer indices for next iteration + int_buff_idx += 1; + int_buff_idx %= N_BUFFERS; + } + + // Synchronize FP and integer threads + snrt_ssr_disable(); + snrt_fpu_fence(); + } + + // Synchronize cores + snrt_cluster_hw_barrier(); + } +} diff --git a/sw/apps/log/vlogf_optimized_asm.h b/sw/apps/log/vlogf_optimized_asm.h new file mode 100644 index 0000000000..9255539c42 --- /dev/null +++ b/sw/apps/log/vlogf_optimized_asm.h @@ -0,0 +1,171 @@ +#define FP_ASM_BODY \ + "fcvt.d.s fa2, ft0 \n" \ + "fcvt.d.s ft5, ft0 \n" \ + "fcvt.d.s ft6, ft0 \n" \ + "fcvt.d.s ft7, ft0 \n" \ + "fmadd.d fa2, fa2, ft1, %[A3] \n" \ + "fmadd.d ft5, ft5, ft1, %[A3] \n" \ + "fmadd.d ft6, ft6, ft1, %[A3] \n" \ + "fmadd.d ft7, ft7, ft1, %[A3] \n" \ + "fcvt.d.w.ssr fa0, ft0 \n" \ + "fcvt.d.w.ssr fa4, ft0 \n" \ + "fcvt.d.w.ssr fa5, ft0 \n" \ + "fcvt.d.w.ssr fa6, ft0 \n" \ + "fmadd.d fa1, fa0, %[Ln2], ft1 \n" \ + "fmadd.d fa7, fa4, %[Ln2], ft1 \n" \ + "fmadd.d ft3, fa5, %[Ln2], ft1 \n" \ + "fmadd.d ft4, fa6, %[Ln2], ft1 \n" \ + "fmul.d fa0, fa2, fa2 \n" \ + "fmul.d fa4, ft5, ft5 \n" \ + "fmul.d fa5, ft6, ft6 \n" \ + "fmul.d fa6, ft7, ft7 \n" \ + "fmadd.d fa3, fa2, %[A1], %[A2] \n" \ + "fmadd.d ft8, ft5, %[A1], %[A2] \n" \ + "fmadd.d ft9, ft6, %[A1], %[A2] \n" \ + "fmadd.d ft10, ft7, %[A1], %[A2] \n" \ + "fmadd.d fa3, fa0, %[A0], fa3 \n" \ + "fmadd.d ft8, fa4, %[A0], ft8 \n" \ + "fmadd.d ft9, fa5, %[A0], ft9 \n" \ + "fmadd.d ft10, fa6, %[A0], ft10 \n" \ + "fadd.d fa1, fa1, fa2 \n" \ + "fadd.d fa7, fa7, ft5 \n" \ + "fadd.d ft3, ft3, ft6 \n" \ + "fadd.d ft4, ft4, ft7 \n" \ + "fmadd.d ft2, fa3, fa0, fa1 \n" \ + "fmadd.d ft2, ft8, fa4, fa7 \n" \ + "fmadd.d ft2, ft9, fa5, ft3 \n" \ + "fmadd.d ft2, ft10, fa6, ft4 \n" + +#if IMPL == IMPL_ISSR +#define INT_ASM_BODY \ + "lw a0, 0(%[a]) \n" \ + "lw a4, 4(%[a]) \n" \ + "lw a5, 8(%[a]) \n" \ + "lw a6, 12(%[a]) \n" \ + "sub a1, a0, %[OFF] \n" \ + "sub a7, a4, %[OFF] \n" \ + "sub t0, a5, %[OFF] \n" \ + "sub t1, a6, %[OFF] \n" \ + "srai a2, a1, 23 \n" \ + "srai t2, a7, 23 \n" \ + "srai t3, t0, 23 \n" \ + "srai t4, t1, 23 \n" \ + "sw a2, 0(%[k]) \n" \ + "sw t2, 8(%[k]) \n" \ + "sw t3, 16(%[k]) \n" \ + "sw t4, 24(%[k]) \n" \ + "lui a3, 1046528 \n" \ + "lui t5, 1046528 \n" \ + "lui t6, 1046528 \n" \ + "lui s0, 1046528 \n" \ + "and a3, a1, a3 \n" \ + "and t5, a7, t5 \n" \ + "and t6, t0, t6 \n" \ + "and s0, t1, s0 \n" \ + "sub a3, a0, a3 \n" \ + "sub t5, a4, t5 \n" \ + "sub t6, a5, t6 \n" \ + "sub s0, a6, s0 \n" \ + "sw a3, 0(%[z]) \n" \ + "sw t5, 8(%[z]) \n" \ + "sw t6, 16(%[z]) \n" \ + "sw s0, 24(%[z]) \n" \ + "srli a1, a1, 18 \n" \ + "srli a7, a7, 18 \n" \ + "srli t0, t0, 18 \n" \ + "srli t1, t1, 18 \n" \ + "andi a1, a1, 30 \n" \ + "andi a7, a7, 30 \n" \ + "andi t0, t0, 30 \n" \ + "andi t1, t1, 30 \n" \ + "sb a1, 0(%[idx]) \n" \ + "sb a7, 1(%[idx]) \n" \ + "sb t0, 2(%[idx]) \n" \ + "sb t1, 3(%[idx]) \n" \ + "addi a1, a1, 1 \n" \ + "addi a7, a7, 1 \n" \ + "addi t0, t0, 1 \n" \ + "addi t1, t1, 1 \n" \ + "sb a1, 4(%[idx]) \n" \ + "sb a7, 5(%[idx]) \n" \ + "sb t0, 6(%[idx]) \n" \ + "sb t1, 7(%[idx]) \n" +#else +#define INT_ASM_BODY \ + "lw a0, 0(%[a]) \n" \ + "lw a4, 4(%[a]) \n" \ + "lw a5, 8(%[a]) \n" \ + "lw a6, 12(%[a]) \n" \ + "sub a1, a0, %[OFF] \n" \ + "sub a7, a4, %[OFF] \n" \ + "sub t0, a5, %[OFF] \n" \ + "sub t1, a6, %[OFF] \n" \ + "srai a2, a1, 23 \n" \ + "srai t2, a7, 23 \n" \ + "srai t3, t0, 23 \n" \ + "srai t4, t1, 23 \n" \ + "sw a2, 0(%[k]) \n" \ + "sw t2, 8(%[k]) 
\n" \ + "sw t3, 16(%[k]) \n" \ + "sw t4, 24(%[k]) \n" \ + "lui a3, 1046528 \n" \ + "lui t5, 1046528 \n" \ + "lui t6, 1046528 \n" \ + "lui s0, 1046528 \n" \ + "and a3, a1, a3 \n" \ + "and t5, a7, t5 \n" \ + "and t6, t0, t6 \n" \ + "and s0, t1, s0 \n" \ + "sub a3, a0, a3 \n" \ + "sub t5, a4, t5 \n" \ + "sub t6, a5, t6 \n" \ + "sub s0, a6, s0 \n" \ + "sw a3, 0(%[z]) \n" \ + "sw t5, 8(%[z]) \n" \ + "sw t6, 16(%[z]) \n" \ + "sw s0, 24(%[z]) \n" \ + "srli a1, a1, 15 \n" \ + "srli a7, a7, 15 \n" \ + "srli t0, t0, 15 \n" \ + "srli t1, t1, 15 \n" \ + "andi a1, a1, 240 \n" \ + "andi a7, a7, 240 \n" \ + "andi t0, t0, 240 \n" \ + "andi t1, t1, 240 \n" \ + "add a1, %[T], a1 \n" \ + "add a7, %[T], a7 \n" \ + "add t0, %[T], t0 \n" \ + "add t1, %[T], t1 \n" \ + "lw a0, 0(a1) \n" \ + "lw a2, 4(a1) \n" \ + "lw a3, 8(a1) \n" \ + "lw a1, 12(a1) \n" \ + "lw a4, 0(a7) \n" \ + "lw a5, 4(a7) \n" \ + "lw a6, 8(a7) \n" \ + "lw a7, 12(a7) \n" \ + "lw t2, 0(t0) \n" \ + "lw t3, 4(t0) \n" \ + "lw t4, 8(t0) \n" \ + "lw t0, 12(t0) \n" \ + "lw t5, 0(t1) \n" \ + "lw t6, 4(t1) \n" \ + "lw s0, 8(t1) \n" \ + "lw t1, 12(t1) \n" \ + "sw a0, 0(%[invc]) \n" \ + "sw a2, 4(%[invc]) \n" \ + "sw a3, 0(%[logc]) \n" \ + "sw a1, 4(%[logc]) \n" \ + "sw a4, 8(%[invc]) \n" \ + "sw a5, 12(%[invc]) \n" \ + "sw a6, 8(%[logc]) \n" \ + "sw a7, 12(%[logc]) \n" \ + "sw t2, 16(%[invc]) \n" \ + "sw t3, 20(%[invc]) \n" \ + "sw t4, 16(%[logc]) \n" \ + "sw t0, 20(%[logc]) \n" \ + "sw t5, 24(%[invc]) \n" \ + "sw t6, 28(%[invc]) \n" \ + "sw s0, 24(%[logc]) \n" \ + "sw t1, 28(%[logc]) \n" +#endif \ No newline at end of file diff --git a/sw/apps/montecarlo/pi_estimation/main.c b/sw/apps/montecarlo/pi_estimation/main.c index f5a279beca..8ff8b4b561 100644 --- a/sw/apps/montecarlo/pi_estimation/main.c +++ b/sw/apps/montecarlo/pi_estimation/main.c @@ -5,36 +5,524 @@ // Fabio Cappellini // Hakim Filali // Luca Colagrande +// Lannan Jiang -#include "lcg.h" #include "math.h" #include "pi_estimation.h" +#include "prng.h" #include "snrt.h" +#ifndef N_SAMPLES #define N_SAMPLES 1024 +#endif -__thread uint32_t seed0, seed1, Ap, Cp; +#ifndef FUNC_PTR +#define FUNC_PTR calculate_psum_optimized +#endif -double pi_estimate; +#ifndef N_CORES +#define N_CORES snrt_cluster_compute_core_num() +#endif -inline void mc_init() { - // Double the sequences as each core produces two random numbers - unsigned int num_sequences = 2 * snrt_cluster_compute_core_num(); - init_2d_lcg_params(num_sequences, 0, LCG_A, LCG_C, &seed0, &seed1, &Ap, - &Cp); +#ifndef BATCH_SIZE +#define BATCH_SIZE 64 +#endif + +#define APPLICATION_PI 0 +#define APPLICATION_POLY 1 + +#ifndef APPLICATION +#define APPLICATION APPLICATION_PI +#endif + +#define PRNG_LCG 0 +#define PRNG_XOSHIRO128P 1 + +#ifndef PRNG +#define PRNG PRNG_LCG +#endif + +#if PRNG == PRNG_LCG +#define PRNG_T lcg_t +#define PRNG_INIT_N lcg_init_n_default +#define PRNG_NEXT lcg_next +#elif PRNG == PRNG_XOSHIRO128P +#define PRNG_T xoshiro128p_t +#define PRNG_INIT_N xoshiro128p_init_n +#define PRNG_NEXT xoshiro128p_next +#endif + +__thread double one = 1.0; +__thread double two = 2.0; +__thread double three = 3.0; + +static inline uint32_t calculate_psum_naive(PRNG_T *prng, + unsigned int n_samples) { + // Only compute cores follow + if (snrt_cluster_core_idx() >= N_CORES) return 0; + + uint32_t int_x, int_y; + double x, y; + unsigned int hit = 0; + unsigned int result = 0; + + snrt_mcycle(); + for (unsigned int i = 0; i < n_samples; i++) { + int_x = PRNG_NEXT(prng); + int_y = PRNG_NEXT(prng); + x = rand_int_to_unit_double(int_x); + y = 
rand_int_to_unit_double(int_y); + +#if APPLICATION == APPLICATION_PI + hit = (x * x + y * y) < one; +#elif APPLICATION == APPLICATION_POLY + hit = (y * 3) < (x * x * x + x * x - x + 2); +#endif + + if (hit) result++; + } + snrt_mcycle(); + + return result; +} + +static inline uint32_t calculate_psum_baseline(PRNG_T *prngs, + unsigned int n_samples) { + if (snrt_cluster_core_idx() < N_CORES) { + int result = 0; + int n_iter = n_samples / 4; + +#if PRNG == PRNG_LCG + // LCG state + uint32_t lcg_state_x0 = prngs[0].state; + uint32_t lcg_state_x1 = prngs[1].state; + uint32_t lcg_state_x2 = prngs[2].state; + uint32_t lcg_state_x3 = prngs[3].state; + uint32_t lcg_state_y0 = prngs[4].state; + uint32_t lcg_state_y1 = prngs[5].state; + uint32_t lcg_state_y2 = prngs[6].state; + uint32_t lcg_state_y3 = prngs[7].state; + uint32_t lcg_Ap = prngs->A; + uint32_t lcg_Cp = prngs->C; +#elif PRNG == PRNG_XOSHIRO128P + // xoshiro128p state + uint32_t xoshiro128p_state_0 = prngs->s[0]; + uint32_t xoshiro128p_state_1 = prngs->s[1]; + uint32_t xoshiro128p_state_2 = prngs->s[2]; + uint32_t xoshiro128p_state_3 = prngs->s[3]; + uint32_t xoshiro128p_tmp; +#endif + + asm volatile( + "1:" + "csrr t4, mcycle \n" + // Generate next 4 pseudo-random integer (X,Y) pairs + // and convert to doubles +#if PRNG == PRNG_LCG + EVAL_LCG_UNROLL4 + FCVT_UNROLL_8(%[int_x0], %[int_y0], %[int_x1], %[int_y1], + %[int_x2], %[int_y2], %[int_x3], %[int_y3], + ft0, fa0, ft1, fa1, ft2, fa2, ft3, fa3) +#elif PRNG == PRNG_XOSHIRO128P + EVAL_XOSHIRO128P_UNROLL4 + FCVT_UNROLL_8(t0, t1, t2, t3, a0, a1, a2, a3, + ft0, ft1, ft2, ft3, fa0, fa1, fa2, fa3) +#endif + + // Normalize PRNs to [0, 1] range + "fmul.d ft0, ft0, %[div] \n" + "fmul.d ft1, ft1, %[div] \n" + "fmul.d ft2, ft2, %[div] \n" + "fmul.d ft3, ft3, %[div] \n" + "fmul.d fa0, fa0, %[div] \n" + "fmul.d fa1, fa1, %[div] \n" + "fmul.d fa2, fa2, %[div] \n" + "fmul.d fa3, fa3, %[div] \n" + +#if APPLICATION == APPLICATION_PI + // x^2 + y^2 + EVAL_X2_PLUS_Y2_UNROLL4(ft0, ft1, ft2, ft3, fa0, fa1, fa2, fa3, + ft0, ft1, ft2, ft3) + // (x^2 + y^2) < 1 + FLT_UNROLL_4(ft0, ft1, ft2, ft3, %[one], %[one], %[one], %[one], + a0, a1, a2, a3) +#elif APPLICATION == APPLICATION_POLY + // y * 3 + // x^3 + x^2 - x + 2 + EVAL_POLY_UNROLL4(ft0, ft1, ft2, ft3, fa0, fa1, fa2, fa3, ft4, ft5, + ft6, ft7, ft0, ft1, ft2, ft3) + // y * 3 < x^3 + x^2 - x + 2 + FLT_UNROLL_4(fa0, fa1, fa2, fa3, ft0, ft1, ft2, ft3, a0, a1, a2, + a3) +#endif + + // Count points in circle + "add %[result], %[result], a0 \n" + "add %[result], %[result], a1 \n" + "add %[result], %[result], a2 \n" + "add %[result], %[result], a3 \n" + + // Loop over batches + "addi %[n_iter], %[n_iter], -1 \n" + "bnez %[n_iter], 1b \n" + + : [ result ] "+r"(result), [ n_iter ] "+r"(n_iter) +#if PRNG == PRNG_LCG + , ASM_LCG_OUTPUTS +#elif PRNG == PRNG_XOSHIRO128P + , ASM_XOSHIRO128P_OUTPUTS +#endif + : [ div ] "f"(max_uint_plus_1_inverse) +#if PRNG == PRNG_LCG + , ASM_LCG_INPUTS +#elif PRNG == PRNG_XOSHIRO128P + , ASM_XOSHIRO128P_INPUTS +#endif +#if APPLICATION == APPLICATION_PI + , ASM_PI_CONSTANTS(one) +#elif APPLICATION == APPLICATION_POLY + , ASM_POLY_CONSTANTS(two, three) +#endif + : "ft0", "ft1", "ft2", "ft3", + "fa0", "fa1", "fa2", "fa3", + "t0", "t1", "t2", "t3", + "a0", "a1", "a2", "a3", + "memory" +#if APPLICATION == APPLICATION_POLY + , ASM_POLY_CLOBBERS +#endif + ); + snrt_fpu_fence(); + + return result; + } + + return 0; +} + +static inline uint32_t calculate_psum_optimized(PRNG_T *prngs, + unsigned int n_samples) { + // Derived parameters + uint32_t 
n_batches = n_samples / BATCH_SIZE; + uint32_t n_iterations = n_batches + 2; + + // Allocate memory on TCDM + unsigned int result = 0; + double *rng_x_all[2]; + double *rng_y_all[2]; + double *rng_z_all[2]; + + // Allocate (double) buffers for communication between integer and FP + // threads in memory + rng_x_all[0] = (double *)snrt_l1_alloc_cluster_local( + BATCH_SIZE * N_CORES * sizeof(double), sizeof(double)); + rng_y_all[0] = (double *)snrt_l1_alloc_cluster_local( + BATCH_SIZE * N_CORES * sizeof(double), sizeof(double)); + rng_z_all[0] = (double *)snrt_l1_alloc_cluster_local( + BATCH_SIZE * N_CORES * sizeof(double), sizeof(double)); + rng_x_all[1] = (double *)snrt_l1_alloc_cluster_local( + BATCH_SIZE * N_CORES * sizeof(double), sizeof(double)); + rng_y_all[1] = (double *)snrt_l1_alloc_cluster_local( + BATCH_SIZE * N_CORES * sizeof(double), sizeof(double)); + rng_z_all[1] = (double *)snrt_l1_alloc_cluster_local( + BATCH_SIZE * N_CORES * sizeof(double), sizeof(double)); + + // Point each core to its own section of the buffers + if (snrt_is_compute_core()) { + unsigned int offset = snrt_cluster_core_idx() * BATCH_SIZE; + rng_x_all[0] += offset; + rng_y_all[0] += offset; + rng_z_all[0] += offset; + rng_x_all[1] += offset; + rng_y_all[1] += offset; + rng_z_all[1] += offset; + } + + // Clear buffers. This is necessary since the PRNG generates 32-bit + // integers, but the fcvt.d.wu.ssr instruction expects 64-bit integers. + // Zero-padding is sufficient to encode a 64-bit unsigned integer from + // a 32-bit integer. Thus, if the buffers are zero-initialized, storing the + // 32-bit integers with a 64-bit stride is sufficient. + if (snrt_is_dm_core()) { + snrt_dma_memset(rng_x_all[0], 0, sizeof(double) * BATCH_SIZE * N_CORES); + snrt_dma_memset(rng_y_all[0], 0, sizeof(double) * BATCH_SIZE * N_CORES); + snrt_dma_memset(rng_z_all[0], 0, sizeof(double) * BATCH_SIZE * N_CORES); + snrt_dma_memset(rng_x_all[1], 0, sizeof(double) * BATCH_SIZE * N_CORES); + snrt_dma_memset(rng_y_all[1], 0, sizeof(double) * BATCH_SIZE * N_CORES); + snrt_dma_memset(rng_z_all[1], 0, sizeof(double) * BATCH_SIZE * N_CORES); + } + snrt_cluster_hw_barrier(); + + // Pointers to current set of buffers + unsigned int fp_xyz_idx = 0; + unsigned int int_xy_idx = 0; + unsigned int int_z_idx = 0; + double *fp_x_ptr = rng_x_all[fp_xyz_idx]; + double *fp_y_ptr = rng_y_all[fp_xyz_idx]; + double *fp_z_ptr = rng_z_all[fp_xyz_idx]; + double *int_x_ptr = rng_x_all[int_xy_idx]; + double *int_y_ptr = rng_y_all[int_xy_idx]; + double *int_z_ptr = rng_z_all[int_z_idx]; + + // Accumulators for partial sums + int temp0 = 0; + int temp1 = 0; + int temp2 = 0; + int temp3 = 0; + int temp4 = 0; + int temp5 = 0; + int temp6 = 0; + int temp7 = 0; + +#if PRNG == PRNG_LCG + // LCG state + uint32_t lcg_state_x0 = prngs[0].state; + uint32_t lcg_state_x1 = prngs[1].state; + uint32_t lcg_state_x2 = prngs[2].state; + uint32_t lcg_state_x3 = prngs[3].state; + uint32_t lcg_state_y0 = prngs[4].state; + uint32_t lcg_state_y1 = prngs[5].state; + uint32_t lcg_state_y2 = prngs[6].state; + uint32_t lcg_state_y3 = prngs[7].state; + uint32_t lcg_Ap = prngs->A; + uint32_t lcg_Cp = prngs->C; +#elif PRNG == PRNG_XOSHIRO128P + // xoshiro128p state + uint32_t xoshiro128p_state_0 = prngs->s[0]; + uint32_t xoshiro128p_state_1 = prngs->s[1]; + uint32_t xoshiro128p_state_2 = prngs->s[2]; + uint32_t xoshiro128p_state_3 = prngs->s[3]; + uint32_t xoshiro128p_tmp; +#endif + + if (snrt_cluster_core_idx() < N_CORES) { + // Set up SSRs + snrt_ssr_loop_1d(SNRT_SSR_DM_ALL, 
BATCH_SIZE, sizeof(double)); + + // Batch iterations + for (int iteration = 0; iteration < n_iterations; iteration++) { + snrt_mcycle(); + + // Floating-point thread works on all but first and last iterations + if (iteration > 0 && iteration < n_iterations - 1) { + // Switch buffers for floating-point thread + fp_xyz_idx ^= 1; + fp_x_ptr = rng_x_all[fp_xyz_idx]; + fp_y_ptr = rng_y_all[fp_xyz_idx]; + fp_z_ptr = rng_z_all[fp_xyz_idx]; + + // Point SSRs to current buffers + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, fp_x_ptr); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_1D, fp_y_ptr); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, fp_z_ptr); + + // Fix register used by 1.0 constant + register double reg_one asm("ft8") = one; + register double reg_two asm("ft9") = two; + register double reg_three asm("ft10") = three; + + // Enable SSRs + snrt_ssr_enable(); + + // Floating-point thread + asm volatile( + // Unrolled by 4 +#if APPLICATION == APPLICATION_PI + "frep.o %[n_frep], 28, 0, 0 \n" +#elif APPLICATION == APPLICATION_POLY + "frep.o %[n_frep], 40, 0, 0 \n" +#endif + + // Convert integer PRNs to doubles + "fcvt.d.wu.ssr fa0, ft0 \n" + "fcvt.d.wu.ssr fa1, ft1 \n" + "fcvt.d.wu.ssr fa2, ft0 \n" + "fcvt.d.wu.ssr fa3, ft1 \n" + "fcvt.d.wu.ssr fa4, ft0 \n" + "fcvt.d.wu.ssr fa5, ft1 \n" + "fcvt.d.wu.ssr fa6, ft0 \n" + "fcvt.d.wu.ssr fa7, ft1 \n" + + // Normalize PRNs to [0, 1] range + "fmul.d fa0, fa0, %[div] \n" + "fmul.d fa2, fa2, %[div] \n" + "fmul.d fa4, fa4, %[div] \n" + "fmul.d fa6, fa6, %[div] \n" + "fmul.d fa1, fa1, %[div] \n" + "fmul.d fa3, fa3, %[div] \n" + "fmul.d fa5, fa5, %[div] \n" + "fmul.d fa7, fa7, %[div] \n" + +#if APPLICATION == APPLICATION_PI + // x^2 + y^2 + EVAL_X2_PLUS_Y2_UNROLL4(fa0, fa2, fa4, fa6, fa1, fa3, fa5, + fa7, fa1, fa3, fa5, fa7) + // (x^2 + y^2) < 1 + FLT_SSR_UNROLL_4(fa1, fa3, fa5, fa7, ft8, ft8, ft8, ft8, + ft2, ft2, ft2, ft2) +#elif APPLICATION == APPLICATION_POLY + // y * 3 + // x^3 + x^2 - x + 2 + EVAL_POLY_UNROLL4(fa0, fa2, fa4, fa6, fa1, fa3, fa5, fa7, + ft3, ft4, ft5, ft6, fa0, fa2, fa4, fa6) + // y * 3 < x^3 + x^2 - x + 2 + FLT_SSR_UNROLL_4(fa1, fa3, fa5, fa7, fa0, fa2, fa4, fa6, + ft2, ft2, ft2, ft2) +#endif + : + : [ n_frep ] "r"(BATCH_SIZE / 4 - 1), + [ div ] "f"(max_uint_plus_1_inverse) +#if APPLICATION == APPLICATION_PI + , ASM_PI_CONSTANTS(reg_one) +#elif APPLICATION == APPLICATION_POLY + , ASM_POLY_CONSTANTS(reg_two, reg_three) +#endif + : "ft0", "ft1", "ft2", "ft3", "ft4", "ft5", "ft6", "ft8", + "fa0", "fa1", "fa2", "fa3", "fa4", "fa5", "fa6", "fa7", + "memory"); + } + + // Integer thread produces PRNs in all but last two iterations + if (iteration < n_iterations - 2) { + // Switch X, Y buffers for integer generation thread + int_xy_idx ^= 1; + int_x_ptr = rng_x_all[int_xy_idx]; + int_y_ptr = rng_y_all[int_xy_idx]; + + // Unrolled by 4 + uint32_t n_iter = BATCH_SIZE / 4; + asm volatile( + "1:" + + // Compute and store 4 integer PRN (X, Y) pairs for + // iteration i+2, zero-padded to 64-bit unsigned + // integers +#if PRNG == PRNG_LCG + EVAL_LCG_UNROLL4 + SW_UNROLL_8(%[int_x0], %[int_y0], %[int_x1], %[int_y1], + %[int_x2], %[int_y2], %[int_x3], %[int_y3], + 0, 0, 8, 8, 16, 16, 24, 24, + %[rng_x], %[rng_y], %[rng_x], %[rng_y], + %[rng_x], %[rng_y], %[rng_x], %[rng_y]) +#elif PRNG == PRNG_XOSHIRO128P + EVAL_XOSHIRO128P_UNROLL4 + SW_UNROLL_8(t0, a0, t1, a1, t2, a2, t3, a3, + 0, 0, 8, 8, 16, 16, 24, 24, + %[rng_x], %[rng_y], %[rng_x], %[rng_y], + %[rng_x], %[rng_y], %[rng_x], %[rng_y]) +#endif + + // Loop over batches + "addi %[n_iter], %[n_iter], -1 \n" + "addi 
%[rng_x], %[rng_x], 32 \n" + "addi %[rng_y], %[rng_y], 32 \n" + "bnez %[n_iter], 1b \n" + : [ n_iter ] "+r"(n_iter) +#if PRNG == PRNG_LCG + , ASM_LCG_OUTPUTS +#elif PRNG == PRNG_XOSHIRO128P + , ASM_XOSHIRO128P_OUTPUTS +#endif + : [ rng_x ] "r"(int_x_ptr), + [ rng_y ] "r"(int_y_ptr) +#if PRNG == PRNG_LCG + , ASM_LCG_INPUTS +#elif PRNG == PRNG_XOSHIRO128P + , ASM_XOSHIRO128P_INPUTS +#endif + : "t0", "a0", "t1", "a1", "t2", "a2", "t3", "a3", + "memory"); + } + + // Integer thread accumulates the comparison results in all + // iterations after the second + if (iteration > 1) { + // Switch Z buffers for integer accumulation thread + int_z_idx ^= 1; + int_z_ptr = rng_z_all[int_z_idx]; + + // Unrolled by 8 + for (int j = 0; j < BATCH_SIZE; j += 8) { + asm volatile( + "lw a0, 0(%[rng_z]) \n" + "lw a1, 8(%[rng_z]) \n" + "lw a2, 16(%[rng_z]) \n" + "lw a3, 24(%[rng_z]) \n" + "add %[temp0], %[temp0], a0 \n" + "lw a4, 32(%[rng_z]) \n" + "add %[temp1], %[temp1], a1 \n" + "lw a5, 40(%[rng_z]) \n" + "add %[temp2], %[temp2], a2 \n" + "lw a6, 48(%[rng_z]) \n" + "add %[temp3], %[temp3], a3 \n" + "lw a7, 56(%[rng_z]) \n" + "add %[temp4], %[temp4], a4 \n" + "add %[temp5], %[temp5], a5 \n" + "add %[temp6], %[temp6], a6 \n" + "add %[temp7], %[temp7], a7 \n" + : [ temp0 ] "+r"(temp0), [ temp1 ] "+r"(temp1), + [ temp2 ] "+r"(temp2), [ temp3 ] "+r"(temp3), + [ temp4 ] "+r"(temp4), [ temp5 ] "+r"(temp5), + [ temp6 ] "+r"(temp6), [ temp7 ] "+r"(temp7) + : [ rng_z ] "r"(&int_z_ptr[j]) + : "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", + "memory"); + } + } + + // Synchronize FP and integer threads and disable SSRs + snrt_fpu_fence(); + snrt_ssr_disable(); + } + + // Reduce partial sums + result += temp0; + result += temp1; + result += temp2; + result += temp3; + result += temp4; + result += temp5; + result += temp6; + result += temp7; + return result; + } + + return 0; } int main() { - // Initialize the PRNGs for parallel Monte Carlo - if (snrt_is_compute_core()) mc_init(); + uint32_t n_seq_per_core, n_seq; + + // Define number of independent subsequences required for parallelization + // or simply to have independent states, allowing to avoid RAW stalls + n_seq_per_core = 1; + if (PRNG == PRNG_LCG) { + if (FUNC_PTR != calculate_psum_naive) n_seq_per_core = 4 * 2; + } + // Every core gets independent subsequences which to calculate in parallel + n_seq = n_seq_per_core * N_CORES; + + // Initialize the PRNGs for parallel Monte Carlo. 
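+    // (Illustrative note on the leapfrog scheme implemented by lcg_init_n:
+    // each of the n_seq LCG subsequences advances n_seq steps per call, i.e.
+    //   x[k + n_seq] = Ap * x[k] + Cp,
+    // with Ap = A^n_seq and Cp = (A^(n_seq-1) + ... + A + 1) * C, and the
+    // subsequences are seeded with successive outputs of the original stream,
+    // so the per-core streams interleave the base sequence without
+    // overlapping.)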
+ PRNG_T *prngs = (PRNG_T *)snrt_l1_alloc_cluster_local(sizeof(PRNG_T) * n_seq, + sizeof(PRNG_T)); + if (snrt_cluster_core_idx() == 0) { + PRNG_INIT_N(42, n_seq, prngs); + } + snrt_cluster_hw_barrier(); // Store partial sum array at first free address in TCDM - uint32_t* reduction_array = (uint32_t*)snrt_l1_next(); + uint32_t *reduction_array = (uint32_t *)snrt_l1_alloc_cluster_local( + sizeof(uint32_t) * N_CORES, sizeof(uint32_t)); // Calculate partial sums - uint32_t n_samples_per_core = N_SAMPLES / snrt_cluster_compute_core_num(); - reduction_array[snrt_cluster_core_idx()] = - calculate_partial_sum(seed0, seed1, Ap, Cp, n_samples_per_core); + if (snrt_is_compute_core()) snrt_mcycle(); + uint32_t n_samples_per_core = N_SAMPLES / N_CORES; + int result = FUNC_PTR(prngs + snrt_cluster_core_idx() * n_seq_per_core, + n_samples_per_core); + if (snrt_is_compute_core()) { + reduction_array[snrt_cluster_core_idx()] = result; + snrt_mcycle(); + } // Synchronize cores snrt_cluster_hw_barrier(); @@ -42,17 +530,25 @@ int main() { // First core in cluster performs the final calculation if (snrt_cluster_core_idx() == 0) { // Reduce partial sums - uint32_t sum = 0; - for (int i = 0; i < snrt_cluster_compute_core_num(); i++) { - sum += reduction_array[i]; + uint32_t hit = 0; + for (int i = 0; i < N_CORES; i++) { + hit += reduction_array[i]; } - // Estimate pi - pi_estimate = estimate_pi(sum, N_SAMPLES); + // Estimate final result and calculate error + double actual_result; + double golden_result; +#if APPLICATION == APPLICATION_PI + actual_result = (double)(4 * hit) / (double)N_SAMPLES; + golden_result = M_PI; +#elif APPLICATION == APPLICATION_POLY + actual_result = 3 * (double)hit / (double)N_SAMPLES; + golden_result = 2.083333333; +#endif // Check result - double err = fabs(pi_estimate - M_PI); - if (err > 0.05) return 1; + double err = fabs(actual_result - golden_result); + if (err > 0.1) return 1; } return 0; diff --git a/sw/apps/montecarlo/pi_estimation/pi_estimation.h b/sw/apps/montecarlo/pi_estimation/pi_estimation.h index 99da3b5328..f10ef8f4db 100644 --- a/sw/apps/montecarlo/pi_estimation/pi_estimation.h +++ b/sw/apps/montecarlo/pi_estimation/pi_estimation.h @@ -1,34 +1,230 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Fabio Cappellini -// Hakim Filali -// Luca Colagrande - -__thread double one = 1.0; - -inline uint32_t calculate_partial_sum(uint32_t seed0, uint32_t seed1, - uint32_t Ap, uint32_t Cp, - unsigned int n_samples) { - uint32_t x1 = seed0; - uint32_t x2 = seed1; - double u1, u2; - unsigned int result = 0; - - for (unsigned int i = 0; i < n_samples; i++) { - u1 = normalize(x1); - u2 = normalize(x2); - x1 = lcg(Ap, Cp, x1); - x2 = lcg(Ap, Cp, x2); - - if ((u1 * u1 + u2 * u2) < one) { - result++; - } - } - return result; -} - -inline double estimate_pi(uint32_t sum, uint32_t n_samples) { - return (double)(4 * sum) / (double)n_samples; -} +#define ASM_LCG_OUTPUTS \ + [ int_x0 ] "+r"(lcg_state_x0), [ int_y0 ] "+r"(lcg_state_y0), \ + [ int_x1 ] "+r"(lcg_state_x1), [ int_y1 ] "+r"(lcg_state_y1), \ + [ int_x2 ] "+r"(lcg_state_x2), [ int_y2 ] "+r"(lcg_state_y2), \ + [ int_x3 ] "+r"(lcg_state_x3), [ int_y3 ] "+r"(lcg_state_y3) + +#define ASM_LCG_INPUTS [ Ap ] "r"(lcg_Ap), [ Cp ] "r"(lcg_Cp) + +#define ASM_XOSHIRO128P_OUTPUTS \ + [ s_0 ] "+r"(xoshiro128p_state_0), \ + [ s_1 ] "+r"(xoshiro128p_state_1), \ + [ s_2 ] "+r"(xoshiro128p_state_2), \ + [ s_3 ] "+r"(xoshiro128p_state_3) + +#define ASM_XOSHIRO128P_INPUTS [ t ] "r"(xoshiro128p_tmp) + +#define EVAL_LCG_UNROLL4 \ + "mul %[int_x0], %[int_x0], %[Ap] \n" \ + "mul %[int_x1], %[int_x1], %[Ap] \n" \ + "mul %[int_x2], %[int_x2], %[Ap] \n" \ + "mul %[int_x3], %[int_x3], %[Ap] \n" \ + "add %[int_x0], %[int_x0], %[Cp] \n" \ + "mul %[int_y0], %[int_y0], %[Ap] \n" \ + "add %[int_x1], %[int_x1], %[Cp] \n" \ + "mul %[int_y1], %[int_y1], %[Ap] \n" \ + "add %[int_x2], %[int_x2], %[Cp] \n" \ + "mul %[int_y2], %[int_y2], %[Ap] \n" \ + "add %[int_x3], %[int_x3], %[Cp] \n" \ + "mul %[int_y3], %[int_y3], %[Ap] \n" \ + "add %[int_y0], %[int_y0], %[Cp] \n" \ + "add %[int_y1], %[int_y1], %[Cp] \n" \ + "add %[int_y2], %[int_y2], %[Cp] \n" \ + "add %[int_y3], %[int_y3], %[Cp] \n" + +#define EVAL_XOSHIRO128P_UNROLL4 \ + "add t0, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add t1, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add t2, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add t3, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add a0, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor 
%[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add a1, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add a2, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" \ + "add a3, %[s_0], %[s_3] \n" \ + "sll %[t], %[s_1], 9 \n" \ + "xor %[s_2], %[s_2], %[s_0] \n" \ + "xor %[s_3], %[s_3], %[s_1] \n" \ + "xor %[s_1],%[s_1], %[s_2] \n" \ + "xor %[s_0],%[s_0], %[s_3] \n" \ + "xor %[s_2], %[s_2], %[t] \n" \ + "sll %[t], %[s_3], 11 \n" \ + "srl %[s_3], %[s_3], 21 \n" \ + "or %[s_3], %[s_3], %[t] \n" + +#define FCVT_UNROLL_8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + "fcvt.d.wu " #out0 ", " #in0 " \n" \ + "fcvt.d.wu " #out1 ", " #in1 " \n" \ + "fcvt.d.wu " #out2 ", " #in2 " \n" \ + "fcvt.d.wu " #out3 ", " #in3 " \n" \ + "fcvt.d.wu " #out4 ", " #in4 " \n" \ + "fcvt.d.wu " #out5 ", " #in5 " \n" \ + "fcvt.d.wu " #out6 ", " #in6 " \n" \ + "fcvt.d.wu " #out7 ", " #in7 " \n" + +#define SW_UNROLL_8(word0, word1, word2, word3, word4, word5, word6, word7, \ + offs0, offs1, offs2, offs3, offs4, offs5, offs6, offs7, \ + addr0, addr1, addr2, addr3, addr4, addr5, addr6, addr7) \ + "sw " #word0 ", " #offs0 "(" #addr0 ") \n" \ + "sw " #word1 ", " #offs1 "(" #addr1 ") \n" \ + "sw " #word2 ", " #offs2 "(" #addr2 ") \n" \ + "sw " #word3 ", " #offs3 "(" #addr3 ") \n" \ + "sw " #word4 ", " #offs4 "(" #addr4 ") \n" \ + "sw " #word5 ", " #offs5 "(" #addr5 ") \n" \ + "sw " #word6 ", " #offs6 "(" #addr6 ") \n" \ + "sw " #word7 ", " #offs7 "(" #addr7 ") \n" + + +#define ASM_PI_CONSTANTS(k_one) \ + [ one ] "f"(k_one) +#define ASM_POLY_CONSTANTS(k_two, k_three) \ + [ two ] "f"(k_two), [ three ] "f"(k_three) + +#define ASM_POLY_CLOBBERS "ft4", "ft5", "ft6", "ft7" + +#define EVAL_X2_PLUS_Y2_UNROLL2(x0, x1, y0, y1, out0, out1) \ + /* x^2 */ \ + "fmul.d " #x0 ", " #x0 ", " #x0 " \n" \ + "fmul.d " #x1 ", " #x1 ", " #x1 " \n" \ + /* x^2 + y^2 */ \ + "fmadd.d " #out0 ", " #y0 ", " #y0 ", " #x0 " \n" \ + "fmadd.d " #out1 ", " #y1 ", " #y1 ", " #x1 " \n" + +#define EVAL_X2_PLUS_Y2_UNROLL4(x0, x1, x2, x3, y0, y1, y2, y3, out0, out1, \ + out2, out3) \ + /* x^2 */ \ + "fmul.d " #x0 ", " #x0 ", " #x0 " \n" \ + "fmul.d " #x1 ", " #x1 ", " #x1 " \n" \ + "fmul.d " #x2 ", " #x2 ", " #x2 " \n" \ + "fmul.d " #x3 ", " #x3 ", " #x3 " \n" \ + /* x^2 + y^2 */ \ + "fmadd.d " #out0 ", " #y0 ", " #y0 ", " #x0 " \n" \ + "fmadd.d " #out1 ", " #y1 ", " #y1 ", " #x1 " \n" \ + "fmadd.d " #out2 ", " #y2 ", " #y2 ", " #x2 " \n" \ + "fmadd.d " #out3 ", " #y3 ", " #y3 ", " #x3 " \n" + +#define EVAL_POLY_UNROLL2(x0, x1, y0, y1, tmp0, tmp1, out0, out1) \ + /* scale y to [0,3] */ \ + "fmul.d " #y0 ", " #y0 ", %[three] \n" \ + "fmul.d " #y1 ", " #y1 ", %[three] \n" \ + /* x^2 */ \ + "fmul.d " #tmp0 ", " #x0 ", " #x0 " \n" \ + "fmul.d " #tmp1 ", " #x1 ", " #x1 " \n" \ + /* x^3 + x^2 */ \ + "fmadd.d " #tmp0 ", " #tmp0 ", " #x0 ", " #tmp0 " \n" \ + "fmadd.d " #tmp1 ", " #tmp1 ", " #x1 ", " #tmp1 " \n" \ + 
/* x^3 + x^2 - x */ \ + "fsub.d " #out0 ", " #tmp0 ", " #x0 " \n" \ + "fsub.d " #out1 ", " #tmp1 ", " #x1 " \n" \ + /* x^3 + x^2 - x + 2 */ \ + "fadd.d " #out0 ", " #out0 ", %[two] \n" \ + "fadd.d " #out1 ", " #out1 ", %[two] \n" \ + +#define EVAL_POLY_UNROLL4(x0, x1, x2, x3, y0, y1, y2, y3, tmp0, tmp1, tmp2, \ + tmp3, out0, out1, out2, out3) \ + /* scale y to [0,3] */ \ + "fmul.d " #y0 ", " #y0 ", %[three] \n" \ + "fmul.d " #y1 ", " #y1 ", %[three] \n" \ + "fmul.d " #y2 ", " #y2 ", %[three] \n" \ + "fmul.d " #y3 ", " #y3 ", %[three] \n" \ + /* x^2 */ \ + "fmul.d " #tmp0 ", " #x0 ", " #x0 " \n" \ + "fmul.d " #tmp1 ", " #x1 ", " #x1 " \n" \ + "fmul.d " #tmp2 ", " #x2 ", " #x2 " \n" \ + "fmul.d " #tmp3 ", " #x3 ", " #x3 " \n" \ + /* x^3 + x^2 */ \ + "fmadd.d " #tmp0 ", " #tmp0 ", " #x0 ", " #tmp0 " \n" \ + "fmadd.d " #tmp1 ", " #tmp1 ", " #x1 ", " #tmp1 " \n" \ + "fmadd.d " #tmp2 ", " #tmp2 ", " #x2 ", " #tmp2 " \n" \ + "fmadd.d " #tmp3 ", " #tmp3 ", " #x3 ", " #tmp3 " \n" \ + /* x^3 + x^2 - x */ \ + "fsub.d " #out0 ", " #tmp0 ", " #x0 " \n" \ + "fsub.d " #out1 ", " #tmp1 ", " #x1 " \n" \ + "fsub.d " #out2 ", " #tmp2 ", " #x2 " \n" \ + "fsub.d " #out3 ", " #tmp3 ", " #x3 " \n" \ + /* x^3 + x^2 - x + 2 */ \ + "fadd.d " #out0 ", " #out0 ", %[two] \n" \ + "fadd.d " #out1 ", " #out1 ", %[two] \n" \ + "fadd.d " #out2 ", " #out2 ", %[two] \n" \ + "fadd.d " #out3 ", " #out3 ", %[two] \n" + +#define FLT_UNROLL_4(lhs0, lhs1, lhs2, lhs3, rhs0, rhs1, rhs2, rhs3, out0, \ + out1, out2, out3) \ + "flt.d " #out0 ", " #lhs0 ", " #rhs0 " \n" \ + "flt.d " #out1 ", " #lhs1 ", " #rhs1 " \n" \ + "flt.d " #out2 ", " #lhs2 ", " #rhs2 " \n" \ + "flt.d " #out3 ", " #lhs3 ", " #rhs3 " \n" + +#define FLT_SSR_UNROLL_2(lhs0, lhs1, rhs0, rhs1, out0, out1) \ + "flt.d.copift " #out0 ", " #lhs0 ", " #rhs0 " \n" \ + "flt.d.copift " #out1 ", " #lhs1 ", " #rhs1 " \n" + +#define FLT_SSR_UNROLL_4(lhs0, lhs1, lhs2, lhs3, rhs0, rhs1, rhs2, rhs3, \ + out0, out1, out2, out3) \ + "flt.d.copift " #out0 ", " #lhs0 ", " #rhs0 " \n" \ + "flt.d.copift " #out1 ", " #lhs1 ", " #rhs1 " \n" \ + "flt.d.copift " #out2 ", " #lhs2 ", " #rhs2 " \n" \ + "flt.d.copift " #out3 ", " #lhs3 ", " #rhs3 " \n" diff --git a/sw/apps/prng/lcg.h b/sw/apps/prng/lcg.h index 19ffb2cbef..7ca08133df 100644 --- a/sw/apps/prng/lcg.h +++ b/sw/apps/prng/lcg.h @@ -6,58 +6,51 @@ // Hakim Filali // Luca Colagrande -#include "snrt.h" - // Numerical Recipes from the "quick and dirty generators" list, Chapter 7.1, // Eq. 7.1.6 parameters from Knuth and H. W. Lewis -#define MAX_UINT_PLUS1 4294967296.0 #define LCG_A 1664525 #define LCG_C 1013904223 -__thread double max_uint_plus_1_inverse = (double)1.0 / (double)MAX_UINT_PLUS1; - -// Calculate A' and C' constants -inline void leapfrog_constants(unsigned int num_sequences, uint32_t a, - uint32_t c, uint32_t* Ap, uint32_t* Cp) { - // Ap = a^k - // Cp = (a^(k-1) + a^(k-2) + ... 
+ a^1 + a^0)*c - uint32_t Ap_tmp = 1; - uint32_t Cp_tmp = 0; - for (unsigned int p = 0; p < num_sequences; p++) { - Cp_tmp += Ap_tmp; - Ap_tmp *= a; - } - Cp_tmp *= c; +typedef struct { + uint32_t state; + uint32_t A; + uint32_t C; +} lcg_t; - // Store temporary variables to outputs - *Ap = Ap_tmp; - *Cp = Cp_tmp; +lcg_t lcg_init_default(uint32_t seed) { + lcg_t lcg = {.state = seed, .A = LCG_A, .C = LCG_C}; + return lcg; } -// Calculate seed for leapfrog method's right sequence of index `sequence_idx` -inline uint32_t right_seed(uint32_t left_seed, uint32_t a, uint32_t c, - unsigned int sequence_idx) { - uint32_t seed = left_seed; - for (unsigned int p = 1; p <= sequence_idx; p++) { - seed = seed * a + c; - } - return seed; +lcg_t lcg_init(uint32_t seed, uint32_t A, uint32_t C) { + lcg_t lcg = {.state = seed, .A = A, .C = C}; + return lcg; } -// Generate next PRN from LCG recurrence equation -inline uint32_t lcg(uint32_t a, uint32_t c, uint32_t previous) { - return previous * a + c; +uint32_t lcg_next(lcg_t* lcg) { + lcg->state = lcg->state * lcg->A + lcg->C; + return lcg->state; } -// Normalize LCG PRN to [0, 1) range -inline double normalize(uint32_t x) { - return (double)x * max_uint_plus_1_inverse; +void lcg_init_n(uint32_t seed, uint32_t A, uint32_t C, uint32_t n, lcg_t* lcg) { + // Calculate Ap and Cp of leapfrog sequences + // Ap = a^k + // Cp = (a^(k-1) + a^(k-2) + ... + a^1 + a^0)*c + uint32_t Ap = 1; + uint32_t Cp = 0; + for (unsigned int p = 0; p < n; p++) { + Cp += Ap; + Ap *= A; + } + Cp *= C; + + // Calculate seeds of leapfrog sequences + lcg_t lcg0 = lcg_init(seed, A, C); + for (unsigned int i = 0; i < n; i++) { + lcg[i] = lcg_init(lcg_next(&lcg0), Ap, Cp); + } } -inline void init_2d_lcg_params(unsigned int num_sequences, uint32_t seed, - uint32_t a, uint32_t c, uint32_t* seed0, - uint32_t* seed1, uint32_t* Ap, uint32_t* Cp) { - *seed0 = right_seed(seed, a, c, 2 * snrt_global_compute_core_idx()); - *seed1 = right_seed(seed, a, c, 2 * snrt_global_compute_core_idx() + 1); - leapfrog_constants(num_sequences, a, c, Ap, Cp); +void lcg_init_n_default(uint32_t seed, uint32_t n, lcg_t* lcg) { + lcg_init_n(seed, LCG_A, LCG_C, n, lcg); } diff --git a/sw/apps/prng/prng.h b/sw/apps/prng/prng.h new file mode 100644 index 0000000000..45f8c07b1c --- /dev/null +++ b/sw/apps/prng/prng.h @@ -0,0 +1,20 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Luca Colagrande + +#include + +#define MAX_UINT_PLUS1 4294967296.0 + +__thread double max_uint_plus_1_inverse = (double)1.0 / (double)MAX_UINT_PLUS1; + +// Normalize integer PRN to [0, 1) range +double rand_int_to_unit_double(uint32_t x) { + return (double)x * max_uint_plus_1_inverse; +} + +#include "lcg.h" +#include "splitmix64.h" +#include "xoshiro128p.h" diff --git a/sw/apps/prng/splitmix64.h b/sw/apps/prng/splitmix64.h new file mode 100644 index 0000000000..9eabae03fd --- /dev/null +++ b/sw/apps/prng/splitmix64.h @@ -0,0 +1,22 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Lannan Jiang +// Luca Colagrande + +typedef struct { + uint64_t state; +} splitmix64_t; + +splitmix64_t splitmix64_init(uint64_t seed) { + splitmix64_t sp64 = {.state = seed}; + return sp64; +} + +uint64_t splitmix64_next(splitmix64_t* sp64) { + uint64_t z = (sp64->state += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); +} diff --git a/sw/apps/prng/xoshiro128p.h b/sw/apps/prng/xoshiro128p.h new file mode 100644 index 0000000000..c01c08a999 --- /dev/null +++ b/sw/apps/prng/xoshiro128p.h @@ -0,0 +1,86 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Lannan Jiang +// Luca Colagrande + +typedef struct { + uint32_t s[4]; +} xoshiro128p_t; + +xoshiro128p_t xoshiro128p_init(uint32_t seed) { + // State initialization uses SplitMix64 generator as suggested by authors + // https://prng.di.unimi.it/ + splitmix64_t sp64 = splitmix64_init(seed); + + // How SplitMix64 is used to initialize Xoshiro follows the Numba + // implementation https://github.com/numba/numba/blob/ + // 5e626ba7d808dde8a9317ced8359c11f41b2cb6a/numba/cuda/random.py#L63-L69 + xoshiro128p_t xoshiro128p; + for (int i = 0; i < 4; i++) + xoshiro128p.s[i] = splitmix64_next(&sp64); + return xoshiro128p; +} + +void xoshiro128p_copy(xoshiro128p_t* target, xoshiro128p_t* source) { + for (int i = 0; i < 4; i++) + target->s[i] = source->s[i]; +} + +uint32_t xoshiro128p_next(xoshiro128p_t* xoshiro128p) { + uint32_t *s = (uint32_t *)&(xoshiro128p->s); + const uint32_t result = s[0] + s[3]; + + const uint32_t t = s[1] << 9; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + + s[3] = (s[3] << 11) | (s[3] >> (32 - 11)); + + return result; +} + +/* This is the jump function for the generator. It is equivalent + to 2^64 calls to next(); it can be used to generate 2^64 + non-overlapping subsequences for parallel computations. */ +void xoshiro128p_jump(xoshiro128p_t* xoshiro128p) { + static const uint32_t JUMP[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b }; + uint32_t *s = (uint32_t *)&(xoshiro128p->s); + + uint32_t s0 = 0; + uint32_t s1 = 0; + uint32_t s2 = 0; + uint32_t s3 = 0; + for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++) + for(int b = 0; b < 32; b++) { + if (JUMP[i] & UINT32_C(1) << b) { + s0 ^= s[0]; + s1 ^= s[1]; + s2 ^= s[2]; + s3 ^= s[3]; + } + xoshiro128p_next(xoshiro128p); + } + + s[0] = s0; + s[1] = s1; + s[2] = s2; + s[3] = s3; +} + +void xoshiro128p_init_n(uint32_t seed, uint32_t n, xoshiro128p_t* xoshiro128p) { + // Every sequence jumps 2^64 steps from the previous one. + // See https://doi.org/10.1145/3460772 for details on subsequence overlap + // probability. 
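+    // (Illustrative note: for n = 3 this yields xoshiro128p[0] starting at
+    // the seeded state, xoshiro128p[1] starting 2^64 outputs further along
+    // the stream, and xoshiro128p[2] another 2^64 outputs beyond that, so
+    // each core can draw far fewer than 2^64 samples without its subsequence
+    // overlapping another core's.)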
+ xoshiro128p[0] = xoshiro128p_init(seed); + for (int i = 1; i < n; i++) { + xoshiro128p_copy(xoshiro128p + i, xoshiro128p + i - 1); + xoshiro128p_jump(xoshiro128p + i); + } +} diff --git a/sw/deps/riscv-opcodes b/sw/deps/riscv-opcodes index 94caf0e0fe..da14f06eaa 160000 --- a/sw/deps/riscv-opcodes +++ b/sw/deps/riscv-opcodes @@ -1 +1 @@ -Subproject commit 94caf0e0fefff1009ba144bccb6d8f7d425ea2f5 +Subproject commit da14f06eaa3760c3d4dc404b6d436b71bbb1be45 diff --git a/sw/snRuntime/api/riscv_decls.h b/sw/snRuntime/api/riscv_decls.h index f0aae17793..06585cb6dd 100644 --- a/sw/snRuntime/api/riscv_decls.h +++ b/sw/snRuntime/api/riscv_decls.h @@ -6,6 +6,12 @@ #include +#define R_TYPE_ENCODE(funct7, rs2, rs1, funct3, rd, opcode) \ + ((funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | \ + (opcode)) + +#define OP_CUSTOM1 0b0101011 + static inline void snrt_wfi(); static inline uint32_t snrt_mcycle(); diff --git a/sw/snRuntime/src/copift.h b/sw/snRuntime/src/copift.h new file mode 100644 index 0000000000..177fd9b3fd --- /dev/null +++ b/sw/snRuntime/src/copift.h @@ -0,0 +1,52 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#define FCMP_FUNCT5 0b10100 +#define FCVT_D_INT_FUNCT5 0b11010 + +#define OP_FP_FMT_D 0b01 + +#define FLT_FUNCT3 0b001 + +#define FCVT_D_WU_RS2 0b00001 + +#define FLT_D_SSR(rd, rs1, rs2) \ + R_TYPE_ENCODE((FCMP_FUNCT5 << 2 | OP_FP_FMT_D), rs2, rs1, FLT_FUNCT3, rd, \ + OP_CUSTOM1) + +#define FCVT_D_WU_SSR(rd, rs1) \ + R_TYPE_ENCODE((FCVT_D_INT_FUNCT5 << 2 | OP_FP_FMT_D), FCVT_D_WU_RS2, rs1, \ + 0b000, rd, OP_CUSTOM1) + +/** + * @brief FLT comparison with writeback to SSR 2. + * @param lval The left-hand-side value in the comparison. + * @param rval The right-hand-side value in the comparison. + */ +inline void snrt_ssr_flt(double lval, double rval) { + register double reg_lval asm("fa0") = lval; // 10 + register double reg_rval asm("fa1") = rval; // 11 + + // flt.d.ssr ft2, lval, rval + asm volatile(".word %[insn]\n" + : + : [ insn ] "i"(FLT_D_SSR(2, 10, 11)), "f"(lval), "f"(rval)); +} + +/** + * @brief Convert an unsigned integer from SSR 0 to a double. + * @return The value converted to double. + */ +inline double snrt_ssr_fcvt() { + register double reg_result asm("fa0"); // 10 + + // fcvt.d.wu fa0, ft0 + asm volatile(".word %[insn]\n" + : "=f"(reg_result) + : [ insn ] "i"(FCVT_D_WU_SSR(10, 0))); + + return reg_result; +} diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 3638053465..17bbbd6fa2 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -9,7 +9,6 @@ #pragma once -#define OP_CUSTOM1 0b0101011 #define XDMA_FUNCT3 0b000 #define DMSRC_FUNCT7 0b0000000 #define DMDST_FUNCT7 0b0000001 @@ -20,10 +19,6 @@ #define DMSTR_FUNCT7 0b0000110 #define DMREP_FUNCT7 0b0000111 -#define R_TYPE_ENCODE(funct7, rs2, rs1, funct3, rd, opcode) \ - ((funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | \ - (opcode)) - /// A DMA transfer identifier. 
typedef uint32_t snrt_dma_txid_t; diff --git a/sw/snRuntime/src/ssr.h b/sw/snRuntime/src/ssr.h index d8858baa4a..4f7fd4331b 100644 --- a/sw/snRuntime/src/ssr.h +++ b/sw/snRuntime/src/ssr.h @@ -11,11 +11,19 @@ * @code{.c} * for (int i = 0; i < b1; i++) * for (int j = 0; j < b0; j++) - * array[i * s1 + j * s0] = 0; + * ptr[i * s1 + j * s0] = 0; * @endcode * - * The configuration functions provided in this file reflect the parameters - * one would define to set up such a loop nest. + * An ISSR (Indirect SSR) stream can be configured to replace a store (or load) + * sequence as could be generated by: + * @code{.c} + * for (int i = 0; i < bound; i++) + * base[idcs[i]] = 0; + * @endcode + * + * The convenience functions provided in this file can be used to set up such + * access patterns. The function argument names reflect the variable names + * presented in these sample code snippets. */ #pragma once @@ -54,13 +62,26 @@ enum snrt_ssr_dim { /** * @brief The SSR configuration registers. */ -enum { - REG_STATUS = 0, /**< SSR status register */ - REG_REPEAT = 1, /**< SSR repeat register */ - REG_BOUNDS = 2, /**< SSR bounds register */ - REG_STRIDES = 6, /**< SSR strides register */ - REG_RPTR = 24, /**< SSR read pointer register */ - REG_WPTR = 28 /**< SSR write pointer register */ +enum snrt_ssr_reg { + SNRT_SSR_REG_STATUS = 0, /**< SSR status register */ + SNRT_SSR_REG_REPEAT = 1, /**< SSR repeat register */ + SNRT_SSR_REG_BOUNDS = 2, /**< SSR bounds register */ + SNRT_SSR_REG_STRIDES = 6, /**< SSR strides register */ + SNRT_SSR_REG_IDX_CFG = 10, /**< SSSR index configuration register */ + SNRT_SSR_REG_IDX_BASE = 11, /**< SSSR base address register */ + SNRT_SSR_REG_RPTR_INDIR = 16, /**< SSSR indirection indices read pointer register */ + SNRT_SSR_REG_RPTR = 24, /**< SSR read pointer register */ + SNRT_SSR_REG_WPTR = 28 /**< SSR write pointer register */ +}; + +/** + * @brief The size of the SSSR indirection indices. + */ +enum snrt_ssr_idxsize { + SNRT_SSR_IDXSIZE_U8 = 0, /**< Unsigned 8-bit integer */ + SNRT_SSR_IDXSIZE_U16 = 1, /**< Unsigned 16-bit integer */ + SNRT_SSR_IDXSIZE_U32 = 2, /**< Unsigned 32-bit integer */ + SNRT_SSR_IDXSIZE_U64 = 3, /**< Unsigned 64-bit integer */ }; /** @@ -91,7 +112,7 @@ inline void snrt_ssr_disable() { * @param dm The SSR index. * @return The value of the register. */ -inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) { +inline uint32_t read_ssr_cfg(enum snrt_ssr_reg reg, uint32_t dm) { uint32_t value; asm volatile("scfgri %[value], %[dm] | %[reg]<<5\n" : [ value ] "=r"(value) @@ -105,7 +126,7 @@ inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) { * @param dm The SSR index. * @param value The value to write. 
*/ -inline void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) { +inline void write_ssr_cfg(enum snrt_ssr_reg reg, uint32_t dm, uint32_t value) { asm volatile("scfgwi %[value], %[dm] | %[reg]<<5\n" ::[value] "r"(value), [ dm ] "i"(dm), [ reg ] "i"(reg)); } @@ -118,9 +139,9 @@ inline void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) { */ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { --b0; - write_ssr_cfg(REG_BOUNDS + 0, dm, b0); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 0, dm, b0); size_t a = 0; - write_ssr_cfg(REG_STRIDES + 0, dm, s0 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 0, dm, s0 - a); a += s0 * b0; } @@ -136,12 +157,12 @@ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0, size_t s1) { --b0; --b1; - write_ssr_cfg(REG_BOUNDS + 0, dm, b0); - write_ssr_cfg(REG_BOUNDS + 1, dm, b1); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 0, dm, b0); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 1, dm, b1); size_t a = 0; - write_ssr_cfg(REG_STRIDES + 0, dm, s0 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 0, dm, s0 - a); a += s0 * b0; - write_ssr_cfg(REG_STRIDES + 1, dm, s1 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 1, dm, s1 - a); a += s1 * b1; } @@ -160,15 +181,15 @@ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, --b0; --b1; --b2; - write_ssr_cfg(REG_BOUNDS + 0, dm, b0); - write_ssr_cfg(REG_BOUNDS + 1, dm, b1); - write_ssr_cfg(REG_BOUNDS + 2, dm, b2); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 0, dm, b0); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 1, dm, b1); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 2, dm, b2); size_t a = 0; - write_ssr_cfg(REG_STRIDES + 0, dm, s0 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 0, dm, s0 - a); a += s0 * b0; - write_ssr_cfg(REG_STRIDES + 1, dm, s1 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 1, dm, s1 - a); a += s1 * b1; - write_ssr_cfg(REG_STRIDES + 2, dm, s2 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 2, dm, s2 - a); a += s2 * b2; } @@ -191,18 +212,18 @@ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, --b1; --b2; --b3; - write_ssr_cfg(REG_BOUNDS + 0, dm, b0); - write_ssr_cfg(REG_BOUNDS + 1, dm, b1); - write_ssr_cfg(REG_BOUNDS + 2, dm, b2); - write_ssr_cfg(REG_BOUNDS + 3, dm, b3); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 0, dm, b0); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 1, dm, b1); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 2, dm, b2); + write_ssr_cfg(SNRT_SSR_REG_BOUNDS + 3, dm, b3); size_t a = 0; - write_ssr_cfg(REG_STRIDES + 0, dm, s0 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 0, dm, s0 - a); a += s0 * b0; - write_ssr_cfg(REG_STRIDES + 1, dm, s1 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 1, dm, s1 - a); a += s1 * b1; - write_ssr_cfg(REG_STRIDES + 2, dm, s2 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 2, dm, s2 - a); a += s2 * b2; - write_ssr_cfg(REG_STRIDES + 3, dm, s3 - a); + write_ssr_cfg(SNRT_SSR_REG_STRIDES + 3, dm, s3 - a); a += s3 * b3; } @@ -212,7 +233,7 @@ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, * @param count The repetition count. 
*/ inline void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) { - write_ssr_cfg(REG_REPEAT, dm, count - 1); + write_ssr_cfg(SNRT_SSR_REG_REPEAT, dm, count - 1); } /** @@ -223,7 +244,7 @@ inline void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) { */ inline void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { - write_ssr_cfg(REG_RPTR + dim, dm, (uintptr_t)ptr); + write_ssr_cfg(SNRT_SSR_REG_RPTR + dim, dm, (uintptr_t)ptr); } /** @@ -234,5 +255,23 @@ inline void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, */ inline void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { - write_ssr_cfg(REG_WPTR + dim, dm, (uintptr_t)ptr); + write_ssr_cfg(SNRT_SSR_REG_WPTR + dim, dm, (uintptr_t)ptr); } + +/** + * @brief Start a streaming indirect read. + * @param dm The SSSR index. + * @param base The base pointer to the data. + * @param ptr The pointer to the indirection indices. + * @param bound The bound of the first (and only) loop. + * @param idxsize The size of the indices. + */ +inline void snrt_issr_read(enum snrt_ssr_dm dm, volatile void *base, + volatile void *idcs, size_t bound, enum + snrt_ssr_idxsize idxsize) { + write_ssr_cfg(SNRT_SSR_REG_IDX_CFG, dm, idxsize & 0xFF); + --bound; + write_ssr_cfg(SNRT_SSR_REG_BOUNDS, dm, bound); + write_ssr_cfg(SNRT_SSR_REG_IDX_BASE, dm, (uintptr_t)base); + write_ssr_cfg(SNRT_SSR_REG_RPTR_INDIR, dm, (uintptr_t)idcs); +} \ No newline at end of file diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 824c8231f4..f169ec6ffd 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -23,6 +23,7 @@ static inline void snrt_init_tls() { // First initialize the DM core's .tdata section from main memory asm volatile("mv %0, tp" : "=r"(tls_ptr) : :); snrt_dma_start_1d((void*)tls_ptr, (void*)(&__tdata_start), size); + snrt_dma_wait_all(); // Then initialize all other cores' .tdata sections from the DM // core's. The offset between the TLS section of successive cores diff --git a/sw/tests/fcvt_d_w_ssr.c b/sw/tests/fcvt_d_w_ssr.c new file mode 100644 index 0000000000..caaa767e82 --- /dev/null +++ b/sw/tests/fcvt_d_w_ssr.c @@ -0,0 +1,66 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+//
+// Authors: Lannan Jiang
+//          Luca Colagrande
+
+#include "snrt.h"
+
+#define LENGTH 64
+
+int main() {
+    // Only compute cores proceed
+    if (snrt_is_dm_core()) return 0;
+
+    // Allocate input and output arrays
+    uint64_t *input = snrt_l1_alloc_compute_core_local(
+        LENGTH * sizeof(uint64_t), sizeof(uint64_t));
+    double *golden_output = snrt_l1_alloc_compute_core_local(
+        LENGTH * sizeof(double), sizeof(double));
+    double *actual_output = snrt_l1_alloc_compute_core_local(
+        LENGTH * sizeof(double), sizeof(double));
+
+    // Initialize first half of input array with positive numbers
+    for (int i = 0; i < LENGTH / 2; i++) {
+        input[i] = snrt_cluster_core_idx() * (LENGTH / 2) + i;
+    }
+
+    // Initialize second half of input array with negative numbers
+    for (int i = LENGTH / 2; i < LENGTH; i++) {
+        input[i] = - (snrt_cluster_core_idx() * (LENGTH / 2) + i);
+    }
+
+    // Calculate golden outputs using the reference fcvt.d.w instruction
+    for (int i = 0; i < LENGTH; i++) {
+        asm volatile("fcvt.d.w %[out], %[in] \n"
+                     : [ out ] "=f"(golden_output[i])
+                     : [ in ] "r"((uint32_t)input[i])
+                     :);
+    }
+    snrt_fpu_fence();
+
+    // Configure SSRs
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, LENGTH, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM2, LENGTH, sizeof(double));
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input);
+    snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, actual_output);
+
+    // Calculate outputs using the fcvt.d.w.copift instruction
+    snrt_ssr_enable();
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fcvt.d.w.copift ft2, ft0 \n"
+        :
+        : [ n_frep ] "r"(LENGTH - 1)
+        : "ft0", "ft1", "ft2", "ft3", "memory");
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+
+    // Compare results
+    uint32_t n_errors = LENGTH;
+    for (int i = 0; i < LENGTH; i++) {
+        if (golden_output[i] == actual_output[i]) n_errors--;
+    }
+    return n_errors;
+}
diff --git a/sw/tests/fcvt_d_wu_ssr.c b/sw/tests/fcvt_d_wu_ssr.c
new file mode 100644
index 0000000000..401fb98b32
--- /dev/null
+++ b/sw/tests/fcvt_d_wu_ssr.c
@@ -0,0 +1,61 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Authors: Lannan Jiang
+//          Luca Colagrande
+
+#include "snrt.h"
+
+#define LENGTH 64
+
+int main() {
+    // Only compute cores proceed
+    if (snrt_is_dm_core()) return 0;
+
+    // Allocate input and output arrays
+    uint64_t *input = snrt_l1_alloc_compute_core_local(
+        LENGTH * sizeof(uint64_t), sizeof(uint64_t));
+    double *golden_output = snrt_l1_alloc_compute_core_local(
+        LENGTH * sizeof(double), sizeof(double));
+    double *actual_output = snrt_l1_alloc_compute_core_local(
+        LENGTH * sizeof(double), sizeof(double));
+
+    // Initialize input array
+    for (int i = 0; i < LENGTH; i++) {
+        input[i] = snrt_cluster_core_idx() * LENGTH + i;
+    }
+
+    // Calculate golden outputs using the reference fcvt.d.wu instruction
+    for (int i = 0; i < LENGTH; i++) {
+        asm volatile("fcvt.d.wu %[out], %[in] \n"
+                     : [ out ] "=f"(golden_output[i])
+                     : [ in ] "r"((uint32_t)input[i])
+                     :);
+    }
+    snrt_fpu_fence();
+
+    // Configure SSRs
+    snrt_ssr_loop_1d(SNRT_SSR_DM0, LENGTH, sizeof(double));
+    snrt_ssr_loop_1d(SNRT_SSR_DM2, LENGTH, sizeof(double));
+    snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input);
+    snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, actual_output);
+
+    // Calculate outputs using the fcvt.d.wu.ssr instruction
+    snrt_ssr_enable();
+    asm volatile(
+        "frep.o %[n_frep], 1, 0, 0 \n"
+        "fcvt.d.wu.ssr ft2, ft0 \n"
+        :
+        : [ n_frep ] "r"(LENGTH - 1)
+        : "ft0", "ft1", "ft2", "ft3", "memory");
+    snrt_ssr_disable();
+    snrt_fpu_fence();
+
+    // Compare results
+    uint32_t n_errors = LENGTH;
+    for (int i = 0; i < LENGTH; i++) {
+        if (golden_output[i] == actual_output[i]) n_errors--;
+    }
+    return n_errors;
+}
diff --git a/sw/tests/flt_d_ssr.c b/sw/tests/flt_d_ssr.c
new file mode 100644
index 0000000000..2a143f3ff6
--- /dev/null
+++ b/sw/tests/flt_d_ssr.c
@@ -0,0 +1,67 @@
+// Copyright 2020 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 +// +// Authors: Lannan Jiang +// Luca Colagrande + +#include "snrt.h" + +#define LENGTH 8 + +int main() { + // Only compute cores proceed + if (snrt_is_dm_core()) return 0; + + // Allocate input and output arrays + double *input = snrt_l1_alloc_compute_core_local(LENGTH * sizeof(double), + sizeof(double)); + uint32_t *golden_output = snrt_l1_alloc_compute_core_local( + LENGTH * sizeof(uint32_t), sizeof(uint32_t)); + uint64_t *actual_output = snrt_l1_alloc_compute_core_local( + LENGTH * sizeof(uint64_t), sizeof(uint64_t)); + + // Initialize input array + input[0] = 0.1; + input[1] = 0.2; + input[2] = 1.3; + input[3] = 1.4; + input[4] = 0.8; + input[5] = 0.99; + input[6] = 1.8; + input[7] = 1.1; + + // Calculate golden outputs using reference flt.d instruction + for (int i = 0; i < LENGTH; i++) { + asm volatile("flt.d %[out], %[in], %[one] \n" + : [ out ] "+r"(golden_output[i]) + : [ one ] "f"(1.0), [ in ] "f"(input[i]) + :); + } + snrt_fpu_fence(); + + // Configure SSRs + snrt_ssr_loop_1d(SNRT_SSR_DM0, LENGTH, sizeof(double)); + snrt_ssr_loop_1d(SNRT_SSR_DM2, LENGTH, sizeof(double)); + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_1D, input); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, actual_output); + + // Calculate outputs using flt.d.ssr instruction + register double reg_one asm("ft3") = 1.0; // 3 + snrt_ssr_enable(); + asm volatile( + "frep.o %[n_frep], 1, 0, 0 \n" + "flt.d.ssr ft2, ft0, ft3 \n" + : + : [ n_frep ] "r"(LENGTH - 1), "f"(reg_one) + : "ft0", "ft1", "ft2", "ft3", "memory"); + snrt_ssr_disable(); + snrt_fpu_fence(); + + // Compare results + uint32_t n_errors = LENGTH; + for (int i = 0; i < LENGTH; i++) { + if (golden_output[i] == (uint32_t)actual_output[i]) n_errors--; + } + return n_errors; +} diff --git a/sw/tests/issr.c b/sw/tests/issr.c new file mode 100644 index 0000000000..f6b1c3e2d2 --- /dev/null +++ b/sw/tests/issr.c @@ -0,0 +1,51 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Luca Colagrande + +#include "snrt.h" + +#define LEN 2048 + +int main() { + + // Only core 0 performs the test + if (snrt_cluster_core_idx() > 0) return 0; + + // Allocate data, index and result arrays + double *data = snrt_l1_alloc_cluster_local(LEN * sizeof(double), sizeof(double)); + double *result = snrt_l1_alloc_cluster_local(LEN * sizeof(double), sizeof(double)); + uint16_t *idcs = snrt_l1_alloc_cluster_local(LEN * sizeof(uint16_t), sizeof(uint16_t)); + + // Initialize data and indirection indices arrays + for (int i = 0; i < LEN; i++) { + data[i] = (double)i; + idcs[i] = (i * 7) % LEN; + } + + // Configure ISSR + snrt_issr_read(SNRT_SSR_DM0, data, idcs, LEN, SNRT_SSR_IDXSIZE_U16); + + // Configure SSR for writeback + snrt_ssr_loop_1d(SNRT_SSR_DM2, LEN, sizeof(double)); + snrt_ssr_write(SNRT_SSR_DM2, SNRT_SSR_1D, result); + + // Read values using ISSR + snrt_ssr_enable(); + asm volatile( + "frep.o %[n_frep], 1, 0, 0 \n" + "fmv.d ft2, ft0 \n" + : : [n_frep] "r"(LEN - 1) : "memory", "ft0", "ft1", "ft2" + ); + snrt_ssr_disable(); + snrt_fpu_fence(); + + // Check results + int n_err = LEN; + for (int i = 0; i < LEN; i++) + if (result[i] == data[idcs[i]]) + n_err--; + + return n_err; +} diff --git a/target/common/common.mk b/target/common/common.mk index 77d0e79d3e..4f04e1666a 100644 --- a/target/common/common.mk +++ b/target/common/common.mk @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 # Makefile invocation -DEBUG ?= OFF # ON to turn on wave logging +DEBUG ?= OFF # ON to turn on wave logging +PL_SIM ?= 0 # 1 for post-layout simulation +VCD_DUMP ?= 0 # 1 to dump VCD traces # Directories SIM_DIR ?= $(shell pwd) @@ -12,7 +14,8 @@ UTIL_DIR ?= $(SNITCH_ROOT)/util LOGS_DIR = $(SIM_DIR)/logs # Files -BENDER_LOCK ?= $(ROOT)/Bender.lock +BENDER_LOCK = $(ROOT)/Bender.lock +BENDER_YML = $(ROOT)/Bender.yml # SEPP packages QUESTA_SEPP ?= @@ -47,28 +50,56 @@ VLT_ROOT ?= ${VERILATOR_ROOT} VLT_JOBS ?= $(shell nproc) VLT_NUM_THREADS ?= 1 -MATCH_END := '/+incdir+/ s/$$/\/*\/*/' -MATCH_BGN := 's/+incdir+//g' -MATCH_DEF := '/+define+/d' -SED_SRCS := sed -e ${MATCH_END} -e ${MATCH_BGN} -e ${MATCH_DEF} - -COMMON_BENDER_FLAGS += -t rtl +COMMON_BENDER_FLAGS += -t rtl -t snitch_cluster +ifeq ($(PL_SIM), 1) +COMMON_BENDER_FLAGS += -t postlayout +endif VSIM_BENDER += $(COMMON_BENDER_FLAGS) -t test -t simulation -t vsim -VSIM_SOURCES = $(shell ${BENDER} script flist-plus ${VSIM_BENDER} | ${SED_SRCS}) VSIM_BUILDDIR ?= work-vsim VSIM_FLAGS += -t 1ps +VOPT_FLAGS = ifeq ($(DEBUG), ON) -VSIM_FLAGS += -do "log -r /*; run -a" -VOPT_FLAGS = +acc +VSIM_FLAGS += -do "log -r /*" +VOPT_FLAGS += +acc +endif + +ifeq ($(PL_SIM), 1) +GATE_LIBS = -L sc7p5mcpp84_12lpplus_base_rvt_c14 \ + -L sc7p5mcpp84_12lpplus_base_lvt_c14 \ + -L sc7p5mcpp84_12lpplus_base_slvt_c14 \ + -L sc7p5mcpp84_12lpplus_base_hvt_c16 \ + -L sc7p5mcpp84_12lpplus_base_rvt_c16 \ + -L sc7p5mcpp84_12lpplus_base_lvt_c16 \ + -L sc7p5mcpp84_12lpplus_base_slvt_c16 \ + -L sc7p5mcpp84_12lpplus_base_hvt_c18 \ + -L sc7p5mcpp84_12lpplus_base_rvt_c18 \ + -L sc7p5mcpp84_12lpplus_hpk_rvt_c14 \ + -L sc7p5mcpp84_12lpplus_hpk_lvt_c14 \ + -L sc7p5mcpp84_12lpplus_hpk_slvt_c14 \ + -L sc7p5mcpp84_12lpplus_hpk_rvt_c16 \ + -L sc7p5mcpp84_12lpplus_hpk_lvt_c16 \ + -L sc7p5mcpp84_12lpplus_hpk_slvt_c16 \ + -L sc7p5mcpp84_12lpplus_hpk_rvt_c18 \ + -L gf12mem_rf_sp_hse_rvt_mvt \ + -L gf12mem_rf_sp_uhse_rvt_mvt \ + -L gf12mem_sram_sp_hse_rvt_mvt +VOPT_FLAGS += -modelsimini $(ROOT)/nonfree/gf12/modelsim/modelsim.ini +VOPT_FLAGS += 
+nospecify +VOPT_FLAGS += $(GATE_LIBS) +VSIM_FLAGS += -modelsimini $(ROOT)/nonfree/gf12/modelsim/modelsim.ini +VSIM_FLAGS += +nospecify +endif + +ifeq ($(VCD_DUMP), 1) +VSIM_FLAGS += -do "source $(ROOT)/nonfree/gf12/modelsim/vcd.tcl" else -VSIM_FLAGS += -do "run -a" +VSIM_FLAGS += -do "run -a" endif # VCS_BUILDDIR should to be the same as the `DEFAULT : ./work-vcs` # in target/snitch_cluster/synopsys_sim.setup VCS_BENDER += $(COMMON_BENDER_FLAGS) -t test -t simulation -t vcs -VCS_SOURCES = $(shell ${BENDER} script flist-plus ${VCS_BENDER} | ${SED_SRCS}) VCS_BUILDDIR := work-vcs # fesvr is being installed here @@ -76,7 +107,6 @@ FESVR ?= ${MKFILE_DIR}work FESVR_VERSION ?= 35d50bc40e59ea1d5566fbd3d9226023821b1bb6 VLT_BENDER += $(COMMON_BENDER_FLAGS) -DCOMMON_CELLS_ASSERTS_OFF -VLT_SOURCES = $(shell ${BENDER} script flist-plus ${VLT_BENDER} | ${SED_SRCS}) VLT_BUILDDIR := $(abspath work-vlt) VLT_FESVR = $(VLT_BUILDDIR)/riscv-isa-sim VLT_FLAGS += --timing @@ -91,7 +121,7 @@ VLT_FLAGS += -Wno-UNSIGNED VLT_FLAGS += -Wno-UNOPTFLAT VLT_FLAGS += -Wno-fatal VLT_FLAGS += --unroll-count 1024 -VLT_FLAGS += --threads $(VLT_NUM_THREADS) +VLT_FLAGS += --threads $(VLT_NUM_THREADS) VLT_CFLAGS += -std=c++20 -pthread VLT_CFLAGS += -I $(VLT_FESVR)/include -I $(TB_DIR) -I ${MKFILE_DIR}test @@ -170,15 +200,6 @@ $(VLT_BUILDDIR)/lib/libfesvr.a: $(VLT_FESVR)/${FESVR_VERSION}_unzip mkdir -p $(dir $@) cp $(dir $<)libfesvr.a $@ -####### -# VCS # -####### -$(VCS_BUILDDIR)/compile.sh: - mkdir -p $(VCS_BUILDDIR) - ${BENDER} script vcs ${VCS_BENDER} --vlog-arg="${VLOGAN_FLAGS}" --vcom-arg="${VHDLAN_FLAGS}" > $@ - chmod +x $@ - $(VCS_SEPP) $@ > $(VCS_BUILDDIR)/compile.log - ######## # Util # ######## diff --git a/target/common/vcs.mk b/target/common/vcs.mk new file mode 100644 index 0000000000..0c477c09f8 --- /dev/null +++ b/target/common/vcs.mk @@ -0,0 +1,37 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +VCS_TOP_MODULE = tb_bin + +$(VCS_BUILDDIR): + mkdir -p $@ + +$(VCS_BUILDDIR)/compile.sh: $(BENDER_YML) $(BENDER_LOCK) | $(VCS_BUILDDIR) + $(BENDER) script vcs $(VCS_BENDER) --vlog-arg="$(VLOGAN_FLAGS)" --vcom-arg="$(VHDLAN_FLAGS)" > $@ + chmod +x $@ + +# Generate dependency file with RTL sources and headers using Verilator +$(VCS_BUILDDIR)/$(VCS_TOP_MODULE).d: $(BENDER_YML) $(BENDER_LOCK) $(GENERATED_RTL_SOURCES) | $(VCS_BUILDDIR) + $(VLT) $(shell $(BENDER) script verilator $(VCS_BENDER)) \ + --Mdir $(VCS_BUILDDIR) --MMD -E --top-module $(VCS_TOP_MODULE) > /dev/null + mv $(VCS_BUILDDIR)/V$(VCS_TOP_MODULE)__ver.d $@ + sed -i 's|^[^:]*:|$(BIN_DIR)/$(TARGET).vcs:|' $@ + +# Run compilation script and create VCS simulation binary +$(BIN_DIR)/$(TARGET).vcs: $(VCS_BUILDDIR)/compile.sh $(TB_CC_SOURCES) $(RTL_CC_SOURCES) work/lib/libfesvr.a | $(BIN_DIR) + $(VCS_SEPP) $< > $(VCS_BUILDDIR)/compile.log + $(VCS) -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o $@ -cc $(CC) -cpp $(CXX) \ + -assert disable_cover -override_timescale=1ns/1ps -full64 $(VCS_TOP_MODULE) $(TB_CC_SOURCES) $(RTL_CC_SOURCES) \ + -CFLAGS "$(TB_CC_FLAGS)" -LDFLAGS "-L$(FESVR)/lib" -lfesvr + +# Clean all build directories and temporary files for VCS simulation +.PHONY: clean-vcs +clean-vcs: clean-work + rm -rf $(BIN_DIR)/$(TARGET).vcs $(VCS_BUILDDIR) vc_hdrs.h + +clean: clean-vcs + +ifneq ($(filter-out clean%,$(MAKECMDGOALS)),) +-include $(VCS_BUILDDIR)/$(VCS_TOP_MODULE).d +endif diff --git a/target/common/verilator.mk b/target/common/verilator.mk index eb02ee346f..518a71de85 100644 --- a/target/common/verilator.mk +++ b/target/common/verilator.mk @@ -2,16 +2,29 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 -$(BIN_DIR)/$(TARGET).vlt: $(VLT_SOURCES) $(TB_CC_SOURCES) $(VLT_CC_SOURCES) $(VLT_BUILDDIR)/lib/libfesvr.a | $(BIN_DIR) +VLT_TOP_MODULE = testharness + +# Generate dependency file with RTL sources and headers using Verilator +$(VLT_BUILDDIR)/$(VLT_TOP_MODULE).d: $(BENDER_YML) $(BENDER_LOCK) $(GENERATED_RTL_SOURCES) | $(VLT_BUILDDIR) + $(VLT) $(shell $(BENDER) script verilator $(VLT_BENDER)) \ + --Mdir $(VLT_BUILDDIR) --MMD -E --top-module $(VLT_TOP_MODULE) > /dev/null + mv $(VLT_BUILDDIR)/V$(VLT_TOP_MODULE)__ver.d $@ + sed -i 's|^[^:]*:|$(BIN_DIR)/$(TARGET).vlt:|' $@ + +$(BIN_DIR)/$(TARGET).vlt: $(TB_CC_SOURCES) $(VLT_CC_SOURCES) $(VLT_BUILDDIR)/lib/libfesvr.a | $(BIN_DIR) $(VLT) $(shell $(BENDER) script verilator $(VLT_BENDER)) \ $(VLT_FLAGS) --Mdir $(VLT_BUILDDIR) \ -CFLAGS "$(VLT_CFLAGS)" \ -LDFLAGS "$(VLT_LDFLAGS)" \ -j $(VLT_JOBS) \ - -o ../$@ --cc --exe --build --top-module testharness $(TB_CC_SOURCES) $(VLT_CC_SOURCES) + -o ../$@ --cc --exe --build --top-module $(VLT_TOP_MODULE) $(TB_CC_SOURCES) $(VLT_CC_SOURCES) .PHONY: clean-vlt clean-vlt: clean-work rm -rf $(BIN_DIR)/$(TARGET).vlt $(VLT_BUILDDIR) clean: clean-vlt + +ifneq ($(filter-out clean%,$(MAKECMDGOALS)),) +-include $(VLT_BUILDDIR)/$(VLT_TOP_MODULE).d +endif diff --git a/target/common/vsim.mk b/target/common/vsim.mk index 38019240b4..d5d05495d1 100644 --- a/target/common/vsim.mk +++ b/target/common/vsim.mk @@ -2,34 +2,47 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 +VSIM_TOP_MODULE = tb_bin + $(VSIM_BUILDDIR): mkdir -p $@ -$(VSIM_BUILDDIR)/compile.vsim.tcl: $(BENDER_LOCK) | $(VSIM_BUILDDIR) +$(VSIM_BUILDDIR)/compile.vsim.tcl: $(BENDER_YML) $(BENDER_LOCK) | $(VSIM_BUILDDIR) $(VLIB) $(dir $@) $(BENDER) script vsim $(VSIM_BENDER) --vlog-arg="$(VLOG_FLAGS) -work $(dir $@) " > $@ echo '$(VLOG) -work $(dir $@) $(TB_CC_SOURCES) $(RTL_CC_SOURCES) -vv -ccflags "$(TB_CC_FLAGS)"' >> $@ echo 'return 0' >> $@ -# Build compilation script and compile all sources for Questasim simulation -$(BIN_DIR)/$(TARGET).vsim: $(VSIM_BUILDDIR)/compile.vsim.tcl $(VSIM_SOURCES) $(TB_SRCS) $(TB_CC_SOURCES) $(RTL_CC_SOURCES) work/lib/libfesvr.a | $(BIN_DIR) +# Intermediate file required to avoid "Argument list too long" errors in Occamy +# when invoking Verilator +$(VSIM_BUILDDIR)/$(TARGET).f: $(BENDER_YML) $(BENDER_LOCK) | $(VSIM_BUILDDIR) + $(BENDER) script verilator $(VSIM_BENDER) > $@ + +# Generate dependency file with RTL sources and headers using Verilator +$(VSIM_BUILDDIR)/$(VSIM_TOP_MODULE).d: $(VSIM_BUILDDIR)/$(TARGET).f $(GENERATED_RTL_SOURCES) | $(VSIM_BUILDDIR) + $(VLT) -f $< --Mdir $(VSIM_BUILDDIR) --MMD -E --top-module $(VSIM_TOP_MODULE) > /dev/null + mv $(VSIM_BUILDDIR)/V$(VSIM_TOP_MODULE)__ver.d $@ + sed -i 's|^[^:]*:|$(BIN_DIR)/$(TARGET).vsim:|' $@ + +# Run compilation script and create Questasim simulation binary +$(BIN_DIR)/$(TARGET).vsim: $(VSIM_BUILDDIR)/compile.vsim.tcl $(TB_CC_SOURCES) $(RTL_CC_SOURCES) work/lib/libfesvr.a | $(BIN_DIR) $(VSIM) -c -do "source $<; quit" | tee $(dir $<)vlog.log @! grep -P "Errors: [1-9]*," $(dir $<)vlog.log - $(VOPT) $(VOPT_FLAGS) -work $(VSIM_BUILDDIR) tb_bin -o tb_bin_opt | tee $(dir $<)vopt.log + $(VOPT) $(VOPT_FLAGS) -work $(VSIM_BUILDDIR) $(VSIM_TOP_MODULE) -o $(VSIM_TOP_MODULE)_opt | tee $(dir $<)vopt.log @! 
grep -P "Errors: [1-9]*," $(dir $<)vopt.log @echo "#!/bin/bash" > $@ @echo 'binary=$$(realpath $$1)' >> $@ @echo 'echo $$binary > .rtlbinary' >> $@ @echo '$(VSIM) +permissive $(VSIM_FLAGS) $$3 -work $(MKFILE_DIR)/$(VSIM_BUILDDIR) -c \ -ldflags "-Wl,-rpath,$(FESVR)/lib -L$(FESVR)/lib -lfesvr -lutil" \ - tb_bin_opt +permissive-off ++$$binary ++$$2' >> $@ + $(VSIM_TOP_MODULE)_opt +permissive-off ++$$binary ++$$2' >> $@ @chmod +x $@ @echo "#!/bin/bash" > $@.gui @echo 'binary=$$(realpath $$1)' >> $@.gui @echo 'echo $$binary > .rtlbinary' >> $@.gui @echo '$(VSIM) +permissive $(VSIM_FLAGS) -work $(MKFILE_DIR)/$(VSIM_BUILDDIR) \ -ldflags "-Wl,-rpath,$(FESVR)/lib -L$(FESVR)/lib -lfesvr -lutil" \ - tb_bin_opt +permissive-off ++$$binary ++$$2' >> $@.gui + $(VSIM_TOP_MODULE)_opt +permissive-off ++$$binary ++$$2' >> $@.gui @chmod +x $@.gui # Clean all build directories and temporary files for Questasim simulation @@ -38,3 +51,7 @@ clean-vsim: clean-work rm -rf $(BIN_DIR)/$(TARGET).vsim $(BIN_DIR)/$(TARGET).vsim.gui $(VSIM_BUILDDIR) vsim.wlf clean: clean-vsim + +ifneq ($(filter-out clean%,$(MAKECMDGOALS)),) +-include $(VSIM_BUILDDIR)/$(VSIM_TOP_MODULE).d +endif diff --git a/target/snitch_cluster/Makefile b/target/snitch_cluster/Makefile index 1a5ab23880..aeb5ccfc6a 100644 --- a/target/snitch_cluster/Makefile +++ b/target/snitch_cluster/Makefile @@ -48,7 +48,8 @@ CLUSTER_GEN_SRC ?= $(wildcard $(ROOT)/util/clustergen/*.py) BIN_DIR ?= bin GENERATED_DIR ?= $(MKFILE_DIR)generated -PERIPH_DIR ?= $(ROOT)/hw/snitch_cluster/src/snitch_cluster_peripheral +HW_DIR = $(ROOT)/hw +PERIPH_DIR = $(HW_DIR)/snitch_cluster/src/snitch_cluster_peripheral # If the configuration file is overriden on the command-line (through # CFG_OVERRIDE) and this file differs from the least recently used @@ -151,9 +152,13 @@ include $(ROOT)/target/snitch_cluster/sw.mk # RTL # ####### +SNITCH_CLUSTER_WRAPPER_TPL = $(HW_DIR)/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +SNITCH_CLUSTER_PKG_TPL = $(HW_DIR)/snitch_cluster/src/snitch_cluster_pkg.sv.tpl + GENERATED_RTL_SOURCES = $(PERIPH_DIR)/snitch_cluster_peripheral_reg_top.sv GENERATED_RTL_SOURCES += $(PERIPH_DIR)/snitch_cluster_peripheral_reg_pkg.sv GENERATED_RTL_SOURCES += $(GENERATED_DIR)/snitch_cluster_wrapper.sv +GENERATED_RTL_SOURCES += $(GENERATED_DIR)/snitch_cluster_pkg.sv .PHONY: rtl clean-rtl @@ -165,16 +170,19 @@ clean-rtl: $(GENERATED_DIR): mkdir -p $@ -$(GENERATED_DIR)/snitch_cluster_wrapper.sv: ${CFG} ${CLUSTER_GEN_PREREQ} | $(GENERATED_DIR) +$(GENERATED_DIR)/snitch_cluster_wrapper.sv: $(CFG) $(CLUSTER_GEN_PREREQ) $(SNITCH_CLUSTER_WRAPPER_TPL) | $(GENERATED_DIR) $(CLUSTER_GEN) -c $< -o $(GENERATED_DIR) --wrapper -$(GENERATED_DIR)/link.ld: ${CFG} ${CLUSTER_GEN_PREREQ} | $(GENERATED_DIR) +$(GENERATED_DIR)/snitch_cluster_pkg.sv: $(CFG) $(CLUSTER_GEN_PREREQ) $(SNITCH_CLUSTER_PKG_TPL) | $(GENERATED_DIR) + $(CLUSTER_GEN) -c $< -o $(GENERATED_DIR) --package + +$(GENERATED_DIR)/link.ld: $(CFG) $(CLUSTER_GEN_PREREQ) | $(GENERATED_DIR) $(CLUSTER_GEN) -c $< -o $(GENERATED_DIR) --linker -$(GENERATED_DIR)/memories.json: ${CFG} ${CLUSTER_GEN_PREREQ} | $(GENERATED_DIR) +$(GENERATED_DIR)/memories.json: $(CFG) $(CLUSTER_GEN_PREREQ) | $(GENERATED_DIR) $(CLUSTER_GEN) -c $< -o $(GENERATED_DIR) --memories -$(GENERATED_DIR)/bootdata.cc: ${CFG} ${CLUSTER_GEN_PREREQ} | $(GENERATED_DIR) +$(GENERATED_DIR)/bootdata.cc: $(CFG) $(CLUSTER_GEN_PREREQ) | $(GENERATED_DIR) $(CLUSTER_GEN) -c $< -o $(GENERATED_DIR) --bootdata # REGGEN regfile @@ -202,18 +210,7 @@ include $(ROOT)/target/common/vsim.mk # VCS # 
####### -.PHONY: clean-vcs - -# Clean all build directories and temporary files for VCS simulation -clean-vcs: clean-work - rm -rf $(BIN_DIR)/$(TARGET).vcs $(VCS_BUILDDIR) vc_hdrs.h - -# Build compilation script and compile all sources for VCS simulation -$(BIN_DIR)/$(TARGET).vcs: ${VCS_SOURCES} ${TB_SRCS} $(TB_CC_SOURCES) $(RTL_CC_SOURCES) $(VCS_BUILDDIR)/compile.sh work/lib/libfesvr.a - mkdir -p $(BIN_DIR) - $(VCS) -Mlib=$(VCS_BUILDDIR) -Mdir=$(VCS_BUILDDIR) -o $(BIN_DIR)/$(TARGET).vcs -cc $(CC) -cpp $(CXX) \ - -assert disable_cover -override_timescale=1ns/1ps -full64 tb_bin $(TB_CC_SOURCES) $(RTL_CC_SOURCES) \ - -CFLAGS "$(TB_CC_FLAGS)" -LDFLAGS "-L${FESVR}/lib" -lfesvr +include $(ROOT)/target/common/vcs.mk ######## # Util # diff --git a/target/snitch_cluster/cfg/copift.hjson b/target/snitch_cluster/cfg/copift.hjson new file mode 100644 index 0000000000..1e7337f0ea --- /dev/null +++ b/target/snitch_cluster/cfg/copift.hjson @@ -0,0 +1,139 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. +{ + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + alias_region_enable: true, + dma_data_width: 512, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + lat_sdotp: 3, + fpu_pipe_config: "BEFORE", + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. + register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + ways: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. 
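+    // The templates below are referenced from the cores lists in the hives
+    // above via { $ref: "#/compute_core_template" } and
+    // { $ref: "#/dma_core_template" }, so a single definition is shared by
+    // all compute cores and all DMA cores, respectively.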
+ compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 4, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 64, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + xdma: true, + xssr: false, + xfrep: false, + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 4, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 64, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} diff --git a/target/snitch_cluster/cfg/default.hjson b/target/snitch_cluster/cfg/default.hjson index 93cb2dbec3..67a96ea433 100644 --- a/target/snitch_cluster/cfg/default.hjson +++ b/target/snitch_cluster/cfg/default.hjson @@ -64,7 +64,7 @@ { icache: { size: 8, // total instruction cache size in kByte - sets: 2, // number of ways + ways: 2, // number of ways cacheline: 256 // word size in bits }, cores: [ @@ -108,11 +108,11 @@ xfdotp: true, xfvec: true, ssr_nr_credits: 4, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, + num_sequencer_instructions: 64, num_dtlb_entries: 1, num_itlb_entries: 1, // SSSR configuration below @@ -135,11 +135,11 @@ xf8alt: false, xfdotp: false, xfvec: false, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, - num_sequencer_instructions: 16, + num_sequencer_instructions: 64, num_dtlb_entries: 1, num_itlb_entries: 1, } diff --git a/target/snitch_cluster/cfg/dma_mchan.hjson b/target/snitch_cluster/cfg/dma_mchan.hjson index ef551a2d21..479c1a2267 100644 --- a/target/snitch_cluster/cfg/dma_mchan.hjson +++ b/target/snitch_cluster/cfg/dma_mchan.hjson @@ -70,7 +70,7 @@ { icache: { size: 8, // total instruction cache size in kByte - sets: 2, // number of ways + ways: 2, // number of ways cacheline: 256 // word size in bits }, cores: [ @@ -114,7 +114,7 @@ xfdotp: true, xfvec: true, ssr_nr_credits: 4, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, @@ -141,7 +141,7 @@ xf8alt: false, xfdotp: false, xfvec: false, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, diff --git a/target/snitch_cluster/cfg/fdiv.hjson b/target/snitch_cluster/cfg/fdiv.hjson index c58409ef7f..84abc4efef 100644 --- a/target/snitch_cluster/cfg/fdiv.hjson +++ b/target/snitch_cluster/cfg/fdiv.hjson @@ -64,7 +64,7 @@ { icache: { size: 8, // total instruction cache size in kByte - sets: 2, // number of ways + ways: 2, // number of ways cacheline: 256 // word size in bits }, cores: [ @@ -108,7 +108,7 @@ xfdotp: true, xfvec: true, ssr_nr_credits: 4, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, @@ -138,7 
+138,7 @@ xf8alt: false, xfdotp: false, xfvec: false, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, diff --git a/target/snitch_cluster/cfg/github-ci.hjson b/target/snitch_cluster/cfg/github-ci.hjson index 3b788754c3..f45276ef5d 100644 --- a/target/snitch_cluster/cfg/github-ci.hjson +++ b/target/snitch_cluster/cfg/github-ci.hjson @@ -69,7 +69,7 @@ { icache: { size: 8, // total instruction cache size in kByte - sets: 2, // number of ways + ways: 2, // number of ways cacheline: 256 // word size in bits }, cores: [ @@ -109,7 +109,7 @@ xfdotp: true, xfvec: true, ssr_nr_credits: 4, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, @@ -139,7 +139,7 @@ xf8alt: false, xfdotp: false, xfvec: false, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, diff --git a/target/snitch_cluster/cfg/omega.hjson b/target/snitch_cluster/cfg/omega.hjson index 65655936a5..bba1689c9a 100644 --- a/target/snitch_cluster/cfg/omega.hjson +++ b/target/snitch_cluster/cfg/omega.hjson @@ -65,7 +65,7 @@ { icache: { size: 8, // total instruction cache size in kByte - sets: 2, // number of ways + ways: 2, // number of ways cacheline: 256 // word size in bits }, cores: [ @@ -109,7 +109,7 @@ xfdotp: true, xfvec: true, ssr_nr_credits: 4, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, @@ -136,7 +136,7 @@ xf8alt: false, xfdotp: false, xfvec: false, - num_int_outstanding_loads: 1, + num_int_outstanding_loads: 4, num_int_outstanding_mem: 4, num_fp_outstanding_loads: 4, num_fp_outstanding_mem: 4, diff --git a/target/snitch_cluster/experiments/copift/.gitignore b/target/snitch_cluster/experiments/copift/.gitignore new file mode 100644 index 0000000000..ab9bf2c826 --- /dev/null +++ b/target/snitch_cluster/experiments/copift/.gitignore @@ -0,0 +1,6 @@ +/build +/power +/results +/runs +/pls.yaml +/pls \ No newline at end of file diff --git a/target/snitch_cluster/experiments/copift/README.md b/target/snitch_cluster/experiments/copift/README.md new file mode 100644 index 0000000000..312ce1e387 --- /dev/null +++ b/target/snitch_cluster/experiments/copift/README.md @@ -0,0 +1,16 @@ +Run RTL experiments: +``` +./experiments.py experiments.yaml sw run perf -j +``` + +Run PLS experiments: +``` +make clean-vsim +make PL_SIM=1 DEBUG=ON VCD_DUMP=1 bin/snitch_cluster.vsim +./experiments.py pls.yaml run power -j --run-dir pls +``` + +Run MATRIX experiments: +``` +./matrix.py none.yaml sw run -j +``` diff --git a/target/snitch_cluster/experiments/copift/experiments.py b/target/snitch_cluster/experiments/copift/experiments.py new file mode 100755 index 0000000000..62ff6a9f8e --- /dev/null +++ b/target/snitch_cluster/experiments/copift/experiments.py @@ -0,0 +1,619 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pathlib import Path +import re +from snitch.target.experiment_utils import ExperimentManager +from snitch.target.common import extend_environment +from snitch.target.SimResults import SimRegion +from statistics import geometric_mean +import yaml + +# Plot parameters +A4_HEIGHT = 11.7 +IEEE_TEXT_WIDTH = 7.244 +IEEE_TWO_COLUMN_SEP = 0.157 +IEEE_COL_WIDTH = (IEEE_TEXT_WIDTH - IEEE_TWO_COLUMN_SEP) / 2 +RESULT_DIR = Path('results') + +# Files +PLS_TESTLIST = Path('pls.yaml') + + +class CopiftExperimentManager(ExperimentManager): + + def derive_axes(self, experiment): + # Monte Carlo kernels + if experiment['app'] == 'pi_estimation': + return { + 'app': experiment['integrand'] + '_' + experiment['prng'], + 'impl': experiment['impl'], + 'length': experiment['n_samples'], + 'batch_size': experiment['batch_size'] + } + # LIBC kernels + else: + return { + 'app': experiment['app'], + 'impl': experiment['impl'], + 'length': experiment['length'], + 'batch_size': experiment['batch_size'] + } + + def derive_cdefines(self, experiment): + # Monte Carlo kernels + if experiment['app'] == 'pi_estimation': + return { + 'APPLICATION': 'APPLICATION_' + experiment['integrand'].upper(), + 'PRNG': 'PRNG_' + experiment['prng'].upper(), + 'N_SAMPLES': experiment['n_samples'], + 'N_CORES': experiment['n_cores'], + 'FUNC_PTR': 'calculate_psum_' + experiment['impl'], + 'BATCH_SIZE': experiment['batch_size'], + } + # LIBC kernels + else: + return { + 'IMPL': 'IMPL_' + experiment['impl'].upper(), + 'LEN': experiment['length'], + 'BATCH_SIZE': experiment['batch_size'] + } + + def derive_env(self, experiment): + if 'vcd_start' in experiment and 'vcd_end' in experiment: + return extend_environment({ + 'VCD_START': str(experiment['vcd_start']), + 'VCD_END': str(experiment['vcd_end']) + }) + else: + return None + + +def get_num_stages(row): + if row['app'] == 'log': + if row['impl'] != 'Baseline': + return 4 + else: + return 3 + elif row['app'] == 'exp': + if row['impl'] != 'Baseline': + return 5 + else: + return 3 + else: + if row['impl'] != 'Baseline': + return 3 + else: + return 1 + + +def get_num_iterations(row): + n_batches = row['length'] // row['batch_size'] + n_stages = get_num_stages(row) + return n_stages + n_batches - 1 + + +def get_rois(row, start_iter=-1, end_iter=-1): + def iter_to_region_idx(iter): + if row['app'] not in ['exp', 'log'] and row['impl'] == 'Baseline': + # Baseline Monte Carlo iterations are smaller than batch size + # (and process exactly only 4 samples) so we adjust it here + return 2 + iter * (row['batch_size'] // 4) + else: + return 2 + iter + + start = iter_to_region_idx(start_iter) + + if end_iter == -1: + end = iter_to_region_idx(get_num_iterations(row) - 1) + else: + end = iter_to_region_idx(end_iter) + + return [SimRegion('hart_0', i) for i in range(start, end)] + + +def get_interval(row, start_iter=-1, end_iter=-1): + rois = get_rois(row, start_iter, end_iter) + tstart = row['results'].get_metric(rois[0], 'tstart') + tend = row['results'].get_metric(rois[-1], 'tend') + return tstart, tend + + +def get_runtime(row, start_iter=-1, end_iter=-1): + interval = get_interval(row, start_iter, end_iter) + return interval[1] - interval[0] + + +def get_ipc(row, start_iter=-1, end_iter=-1): + rois = get_rois(row, start_iter, end_iter) + interval = get_interval(row, start_iter, end_iter) + cycles = interval[1] - interval[0] + 
instructions = 0 + for roi in rois: + instructions += row['results'].get_metric(roi, 'snitch_issues') + instructions += row['results'].get_metric(roi, 'fpss_issues') + return instructions / cycles + + +def fig1(df): + # Calculate IPC for every run + df['ipc'] = df.apply(lambda row: get_ipc(row, 4, 8), axis=1) + + # Pivot the DataFrame to reshape it for plotting + df = df.pivot(index='app', columns='impl', values='ipc').reset_index() + + # Prepare data for plotting + apps = df['app'] + x = np.arange(len(apps)) + width = 0.35 + + # Create the plot + _, ax = plt.subplots() + ax.axhline(1, color='black', linewidth=0.5, zorder=1) + cmap = mpl.colormaps['plasma'] + base_bars = ax.bar(x - width/2, df['Baseline'], width, label='Base', color=cmap(0.48)) + copift_bars = ax.bar(x + width/2, df['COPIFT'], width, label='COPIFT', color=cmap(0.82)) + ax.bar_label(base_bars, label_type='center', fmt='{:.2f}', rotation=90, color='white') + ax.bar_label(copift_bars, label_type='center', fmt='{:.2f}', rotation=90, color='white') + + # Draw speedup lines + ipc_increases = df['COPIFT'] / df['Baseline'] + for i in range(len(apps)): + # Get the coordinates for the line + base_x = x[i] - width/2 + base_y = df['Baseline'][i] + copift_y = df['COPIFT'][i] + label_y = (base_y + copift_y) / 2 + + # Draw vertical line + bottom_margin, top_margin = 18, 24 + label_y_pt = ax.transData.transform((0, label_y))[1] + label_y_plus_margin, label_y_minus_margin = [point[1] for point in ax.transData.inverted().transform( + [(0, label_y_pt + top_margin), (0, label_y_pt - bottom_margin)])] + ax.plot([base_x, base_x], [base_y, label_y_minus_margin], color='black', linewidth=0.5) + ax.plot([base_x, base_x], [label_y_plus_margin, copift_y], color='black', linewidth=0.5) + + # Draw horizontal line + ax.plot([base_x - width/2, base_x + width/2], [copift_y, copift_y], color='black', linewidth=0.5) + + # Draw IPC improvement label in middle of vertical line + ipc_increase = ipc_increases[i] + ax.text(base_x, label_y, f"{ipc_increase:.1f}x", ha='center', va='center') + + # Draw expected IPC values + exp_ipc_increases = [1.28, 1.4, 1.78, 1.9, 1.63, 1.84] + exp_copift_ipc = df['Baseline'] * exp_ipc_increases + for i in range(len(apps)): + # Get the coordinates for the line + base_x = x[i] + width/2 + copift_y = df['COPIFT'][i] + exp_copift_y = exp_copift_ipc[i] + + # Draw vertical line + ax.plot([base_x, base_x], [copift_y, exp_copift_y], color='black', linewidth=0.5, linestyle='--') + + # Draw horizontal line + ax.plot([base_x - width/2, base_x + width/2], [exp_copift_y, exp_copift_y], color='black', linewidth=0.5, linestyle='--') + + # Customize the plot + ax.set_axisbelow(True) + ax.grid(color='gainsboro', which='both', axis='y', linewidth=0.5) + ax.set_xlabel('') + ax.set_ylabel('(a) IPC') + ax.set_xticks([]) + ax.set_xticklabels([]) + ax.legend(title='', ncols=2) + plt.tight_layout() + + # Display the plot + file = RESULT_DIR / 'plot1.pdf' + file.parent.mkdir(parents=True, exist_ok=True) + plt.gcf().set_size_inches(IEEE_COL_WIDTH, 0.09 * A4_HEIGHT) + plt.gcf().subplots_adjust( + left=0.1, + bottom=0.04, + right=1, + top=1 + ) + plt.savefig(file) + + # Return metrics + df_poly_lcg = df[df['app'] == 'poly_lcg'] + poly_lcg_increase = (df_poly_lcg['COPIFT'] / df_poly_lcg['Baseline']).item() + return { + 'GeomeanIPCIncrease': '{:.2f}'.format(geometric_mean(ipc_increases)), + 'PeakIPC': '{:.2f}'.format(df['COPIFT'].max()), + 'GeomeanIPC': '{:.2f}'.format(geometric_mean(df['COPIFT'])), + 'PolyLCGIPCIncrease': 
'{:.2f}'.format(poly_lcg_increase), + } + + +def fig2(df): + + # Prepare data for plotting + apps = df['app'].unique() + x = np.arange(len(apps)) + width = 0.35 + + # Create the plot + base_powers = df[df['impl'] == 'Baseline']['total_power'].values + copift_powers = df[df['impl'] == 'COPIFT']['total_power'].values + _, ax = plt.subplots() + cmap = mpl.colormaps['plasma'] + base_bars = ax.bar(x - width/2, base_powers, width, label='Base', color=cmap(0.48)) + copift_bars = ax.bar(x + width/2, copift_powers, width, label='COPIFT', color=cmap(0.82)) + ax.bar_label(base_bars, label_type='center', fmt='{:.2f}', rotation=90, color='white') + ax.bar_label(copift_bars, label_type='center', fmt='{:.2f}', rotation=90, color='white') + + # Draw speedup lines + power_increases = copift_powers / base_powers + for i in range(len(apps)): + # Get the coordinates for the line + base_x = x[i] - width/2 + base_y = base_powers[i] + copift_y = copift_powers[i] + + # Draw vertical line + ax.plot([base_x, base_x], [base_y, copift_y], color='black', linewidth=0.5) + + # Draw horizontal line + ax.plot([base_x - width/2, base_x + width/2], [copift_y, copift_y], color='black', linewidth=0.5) + + # Draw label on top of horizontal line + ax.text(base_x, copift_y + 2, f"{power_increases[i]:.2f}x", ha='center', va='center') + + # Customize the plot + ax.set_axisbelow(True) + ax.grid(color='gainsboro', which='both', axis='y', linewidth=0.5) + ax.set_xlabel('') + ax.set_ylabel('(b) Power [mW]') + ax.set_ylim(0, copift_powers.max() * 1.2) + ax.set_xticks([]) + ax.set_xticklabels([]) + ax.legend(title='', ncols=2) + plt.tight_layout() + + # Display the plot + file = RESULT_DIR / 'plot2.pdf' + file.parent.mkdir(parents=True, exist_ok=True) + plt.gcf().set_size_inches(IEEE_COL_WIDTH, 0.09 * A4_HEIGHT) + plt.gcf().subplots_adjust( + left=0.1, + bottom=0.04, + right=1, + top=1 + ) + plt.savefig(file) + + # Return metrics + return { + 'MaxPowerIncrease': '{:.2f}'.format(power_increases.max()), + 'GeomeanPowerIncrease': '{:.2f}'.format(geometric_mean(power_increases)), + } + + +def fig3(df): + df['runtime'] = df.apply(lambda row: get_runtime(row, 4, 8), axis=1) + + # Prepare data for plotting + apps = df['app'].unique() + x = np.arange(len(apps)) + width = 0.35 + base_runtimes = df[df['impl'] == 'Baseline']['runtime'].values + copift_runtimes = df[df['impl'] == 'COPIFT']['runtime'].values + base_powers = df[df['impl'] == 'Baseline']['total_power'].values + copift_powers = df[df['impl'] == 'COPIFT']['total_power'].values + speedup = base_runtimes / copift_runtimes + energy_saving = (base_powers * base_runtimes) / (copift_powers * copift_runtimes) + + # Create the plot + _, ax = plt.subplots() + ax.axhline(1, color='black', linewidth=0.5, zorder=1) + cmap = mpl.colormaps['plasma'] + speedup_bars = ax.bar(x - width/2, speedup, width, label='Speedup', color=cmap(0.48)) + energy_saving_bars = ax.bar(x + width/2, energy_saving, width, label='Energy improvement', color=cmap(0.82)) + + # Add labels to the bars + ax.bar_label(speedup_bars, label_type='center', fmt='{:.2f}', rotation=90, color='white') + ax.bar_label(energy_saving_bars, label_type='center', fmt='{:.2f}', rotation=90, color='white') + + # Draw expected speedup values + exp_speedups = [1.14, 1.26, 1.39, 1.55, 1.6, 2.21] + for i in range(len(apps)): + # Get the coordinates for the line + base_x = x[i] - width/2 + speedup_y = speedup[i] + exp_speedup_y = exp_speedups[i] + + # Draw vertical line + ax.plot([base_x, base_x], [speedup_y, exp_speedup_y], color='black', 
linewidth=0.5, linestyle='--') + + # Draw horizontal line + ax.plot([base_x - width/2, base_x + width/2], [exp_speedup_y, exp_speedup_y], color='black', linewidth=0.5, linestyle='--') + + # Customize the plot + ax.set_axisbelow(True) + ax.grid(color='gainsboro', which='both', axis='y', linewidth=0.5) + ax.set_xlabel('') + ax.set_ylabel('(c) Speedup/Energy impr.') + ax.set_xticks(x) + ax.set_xticklabels(apps, rotation=30) + ax.legend(title='', ncols=2) + plt.tight_layout() + + # Display the plot + file = RESULT_DIR / 'plot3.pdf' + file.parent.mkdir(parents=True, exist_ok=True) + plt.gcf().set_size_inches(IEEE_COL_WIDTH, 0.131 * A4_HEIGHT) + plt.gcf().subplots_adjust( + left=0.1, + bottom=0.34, + right=1, + top=0.98 + ) + plt.savefig(file) + + # Return metrics + return { + 'PeakSpeedup': '{:.2f}'.format(speedup.max()), + 'GeomeanSpeedup': '{:.2f}'.format(geometric_mean(speedup)), + 'PeakEnergySaving': '{:.2f}'.format(energy_saving.max()), + 'GeomeanEnergySaving': '{:.2f}'.format(geometric_mean(energy_saving)), + } + + +def group_power_breakdown(df): + import fnmatch + patterns = [ + '*i_snitch_shared_muldiv*', + '*gen_core_1*i_snitch_fp_ss*', + '*gen_core_1*i_idma_inst64*', + '*i_snitch_icache*', + '*i_data_mem*', + '*i_axi_dma_xbar*', + '*gen_core_0*i_snitch_fp_ss*' + ] + power_summary = [] + for pattern in patterns: + matching_entries = df[df['name'].apply(lambda x: fnmatch.fnmatch(x, pattern))] + int_power_sum = matching_entries['int_power'].sum() + switch_power_sum = matching_entries['switch_power'].sum() + leak_power_sum = matching_entries['leak_power'].sum() + total_power_sum = matching_entries['total_power'].sum() + power_summary.append({ + 'name': pattern, + 'int_power': int_power_sum, + 'switch_power': switch_power_sum, + 'leak_power': leak_power_sum, + 'total_power': total_power_sum + }) + return pd.DataFrame(power_summary) + + +def fig4(df, app): + print(df) + + num_vars = 8 + legend_labels = ['Baseline', 'COPIFT'] + labels = ['Muldiv', 'FPSS 1', 'DMA 1', 'I$', 'TCDM', 'DMA XBAR', 'FPSS 0', 'Clock'][1:num_vars] + + # Get power breakdown for both implementations + base_df = df[(df['app'] == app) & (df['impl'] == 'Baseline')] + copift_df = df[(df['app'] == app) & (df['impl'] == 'COPIFT')] + base_breakdown_df = base_df.iloc[0]['power_breakdown'] + copift_breakdown_df = copift_df.iloc[0]['power_breakdown'] + base_breakdown_df = group_power_breakdown(base_breakdown_df) + copift_breakdown_df = group_power_breakdown(copift_breakdown_df) + base_breakdown_df = base_breakdown_df._append( + {'name': 'clock_network', 'total_power': base_df.iloc[0]['clock_power']}, + ignore_index=True) + copift_breakdown_df = copift_breakdown_df._append( + {'name': 'clock_network', 'total_power': copift_df.iloc[0]['clock_power']}, + ignore_index=True) + base_breakdown_df = base_breakdown_df.iloc[1:num_vars] + copift_breakdown_df = copift_breakdown_df.iloc[1:num_vars] + print(base_breakdown_df) + print(copift_breakdown_df) + + # Compute angle for each axis in the plot (in radians) + angles = np.linspace(0, 2 * np.pi, num_vars - 1, endpoint=False).tolist() + angles += angles[:1] + + # Values for each simulation + values1 = base_breakdown_df['total_power'].tolist() + values1 += values1[:1] + values2 = copift_breakdown_df['total_power'].tolist() + values2 += values2[:1] + + # Initialize the radar plot + _, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True)) + + # Plot data for the first simulation + ax.plot(angles, values1, color='blue', linewidth=2, label=legend_labels[0]) + ax.fill(angles, values1, 
color='skyblue', alpha=0.25) + + # Plot data for the second simulation + ax.plot(angles, values2, color='red', linewidth=2, label=legend_labels[1]) + ax.fill(angles, values2, color='salmon', alpha=0.25) + + # Fix the axis to go in the correct direction and start at 12 o'clock + ax.set_theta_offset(np.pi / 2) + ax.set_theta_direction(-1) + + # Draw axis per variable and add labels + ax.set_thetagrids(np.degrees(angles[:-1]), labels) + + # Set the range for the radial axis + # max_value = max(max(values1), max(values2)) + # ax.set_ylim(0, max_value * 1.1) + ax.set_yscale('log') + + # Add legend and title + ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1)) + + plt.tight_layout() + plt.show() + + +def dump_pls_testlist(testlist, df): + vcd_interval = df.apply(lambda row: get_interval(row, 4, 8), axis=1) + + for i, experiment in enumerate(testlist['experiments']): + vcd_start, vcd_end = vcd_interval.iloc[i] + experiment['vcd_start'] = vcd_start + experiment['vcd_end'] = vcd_end + + with open(PLS_TESTLIST, 'w') as f: + yaml.dump(testlist, f, sort_keys=False) + + +def latex_metrics(metrics): + # Auxiliary function to format a metric as a LaTeX command + def latex_metric(name, value): + return f"\\newcommand{{\\Result{name}}}{{{value}}}\n" + + # Create file + with open(RESULT_DIR / 'metrics.tex', 'w') as f: + [f.write(latex_metric(name, value)) for name, value in metrics.items()] + + +def global_plot_settings(): + # Change global plot settings for export + # plt.rcParams['font.family'] = 'Latin Modern Roman' + plt.rcParams['font.size'] = 6 + plt.rcParams['xtick.major.size'] = 3 + plt.rcParams['xtick.major.pad'] = 2 + plt.rcParams['axes.labelpad'] = 2 + plt.rcParams['axes.linewidth'] = 0.5 + plt.rcParams['xtick.major.width'] = 0.5 + plt.rcParams['xtick.minor.width'] = 0.5 + plt.rcParams['ytick.major.width'] = 0.5 + plt.rcParams['ytick.minor.width'] = 0.5 + plt.rcParams['patch.linewidth'] = 0.5 + plt.rcParams['lines.linewidth'] = 1 + plt.rcParams['legend.handletextpad'] = 0.5 + plt.rcParams['legend.columnspacing'] = 1 + # Use Latex backend for rendering + # plt.rcParams['text.usetex'] = True + # plt.rcParams['text.latex.preamble'] = r'\usepackage[T1]{fontenc}\usepackage{lmodern}' + + +def parse_summary_power_report(report_dir): + total_power_regex = r'Total Power\s*=\s*([0-9.]+)\s*' + clock_power_regex = r'clock_network\s+' + clock_power_regex += r'[0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+\s+' # Internal power + clock_power_regex += r'[0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+\s+' # Switching power + clock_power_regex += r'[0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+\s+' # Leakage power + clock_power_regex += r'([0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+)\s+' # Total power + with open(report_dir / 'power.rpt', 'r') as file: + for line in file: + match = re.search(total_power_regex, line) + if match: + total_power = float(match.group(1)) * 1000 + match = re.search(clock_power_regex, line) + if match: + clock_power = float(match.group(1)) * 1000 + return total_power, clock_power + + +def parse_hierarchical_power_report(report_dir): + regex = r'^(\s*)' # Indentation + regex += r'(\S+)\s+' # Instance name + regex += r'(\([^)]+\))?\s+' # Module name + regex += r'([0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+)\s+' # Internal power + regex += r'([0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+)\s+' # Switching power + regex += r'([0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+)\s+' # Leakage power + regex += r'([0-9]*\.?[0-9]+[eE]?[+-]?[0-9]+)\s+' # Total power + regex += r'([0-9]*\.?[0-9]+)' + data = [] + stack = [] + level_indent = 2 + previous_indent = 0 + previous_name = '' + with 
open(report_dir / 'power_hier.rpt', 'r') as file: + for line in file: + line = line.rstrip() + match = re.match(regex, line) + if match: + indent = len(match.group(1)) + name = match.group(2) + int_power = float(match.group(4)) * 1000 + switch_power = float(match.group(5)) * 1000 + leak_power = float(match.group(6)) * 1000 + total_power = float(match.group(7)) * 1000 + percentage = float(match.group(8)) + + # Flatten hierarchy + if indent > previous_indent: + stack.append(previous_name) + elif indent < previous_indent: + levels_up = (previous_indent - indent) // level_indent + for _ in range(levels_up): + if stack: + stack.pop() + full_name = '_'.join(stack + [name]) + previous_indent = indent + previous_name = name + + data.append({ + 'name': full_name, + 'int_power': int_power, + 'switch_power': switch_power, + 'leak_power': leak_power, + 'total_power': total_power, + 'percentage': percentage + }) + return pd.DataFrame(data) + + +def main(): + manager = CopiftExperimentManager() + manager.run() + df = manager.get_results() + + data = [] + for experiment in manager.experiments: + report_dir = manager.derive_dir(Path('power'), experiment) + total_power, clock_power = parse_summary_power_report(report_dir) + data.append({ + 'total_power': total_power, + 'clock_power': clock_power, + 'power_breakdown': parse_hierarchical_power_report(report_dir) + }) + power_df = pd.DataFrame(data) + df = pd.concat([df, power_df], axis=1) + + # Rename implementations for consistency + df['impl'] = df['impl'].replace({ + 'issr': 'COPIFT', + 'optimized': 'COPIFT', + 'baseline': 'Baseline' + }) + + global_plot_settings() + + dump_pls_testlist(manager.yaml, df) + + # Reorder columns according to Base IR + desired_order = ['pi_xoshiro128p', 'poly_xoshiro128p', 'pi_lcg', 'poly_lcg', 'log', 'exp'] + df['app'] = pd.Categorical(df['app'], categories=desired_order, ordered=True) + df = df.sort_values('app').reset_index(drop=True) + + metrics = {} + metrics.update(fig1(df)) + metrics.update(fig2(df)) + metrics.update(fig3(df)) + # fig4(df, 'exp') + latex_metrics(metrics) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/experiments/copift/experiments.yaml b/target/snitch_cluster/experiments/copift/experiments.yaml new file mode 100644 index 0000000000..ecc5c21e23 --- /dev/null +++ b/target/snitch_cluster/experiments/copift/experiments.yaml @@ -0,0 +1,79 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +experiments: + - app: pi_estimation + integrand: pi + prng: lcg + impl: baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi_estimation + integrand: pi + prng: lcg + impl: optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: exp + impl: baseline + length: 4096 + batch_size: 128 + - app: exp + impl: optimized + length: 4096 + batch_size: 128 + - app: log + impl: baseline + length: 4096 + batch_size: 256 + - app: log + impl: issr + length: 4096 + batch_size: 256 + - app: pi_estimation + integrand: poly + prng: lcg + impl: baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi_estimation + integrand: poly + prng: lcg + impl: optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi_estimation + integrand: poly + prng: xoshiro128p + impl: baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi_estimation + integrand: poly + prng: xoshiro128p + impl: optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi_estimation + integrand: pi + prng: xoshiro128p + impl: baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi_estimation + integrand: pi + prng: xoshiro128p + impl: optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 diff --git a/target/snitch_cluster/experiments/copift/matrix.py b/target/snitch_cluster/experiments/copift/matrix.py new file mode 100755 index 0000000000..5f39f8aaa0 --- /dev/null +++ b/target/snitch_cluster/experiments/copift/matrix.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from experiments import * +import matplotlib as mpl +import matplotlib.pyplot as plt + + +def fig1(df): + + # Filter data + copift_df = df[df['impl'] != 'Baseline'] + copift_df['ipc'] = copift_df.apply(lambda row: get_ipc(row, 0, -1), axis=1) + copift_df = copift_df.pivot(index='length', columns='batch_size', values='ipc') + + # Get convergence points + lengths = copift_df.index + batch_sizes = copift_df.columns + convergence_points = [] + for x, batch_size in enumerate(batch_sizes): + for y, length in enumerate(lengths): + ipc = copift_df.loc[length, batch_size] + asymptote = copift_df.loc[lengths[-1], batch_size] + if ipc > 0.995 * asymptote: + convergence_points.append((x, y)) + break + + # Get optimum points + lengths = copift_df.index + batch_sizes = copift_df.columns + optimum_points = [] + for y, length in enumerate(lengths): + x_opt = 0 + x_max_val = 0 + for x, batch_size in enumerate(batch_sizes): + x_val = copift_df.loc[length, batch_size] + if x_val > x_max_val: + x_opt = x + x_max_val = x_val + optimum_points.append((x_opt, y)) + + print(optimum_points) + + # Create the plot + fig, ax = plt.subplots() + cmap = mpl.colormaps['plasma'] + cmap = mpl.colors.ListedColormap(cmap(np.linspace(0.15, 0.85, 128))) + im = ax.pcolor(copift_df, cmap=cmap) + ax.set_yticks(np.arange(0.5, len(lengths), 1), lengths) + ax.set_xticks(np.arange(0.5, len(batch_sizes), 1), batch_sizes) + fig.colorbar(im) + plt.tight_layout() + + # Annotate convergence points + for (x, y) in convergence_points: + if (x, y) not in optimum_points: + ax.text(x + 0.5, y + 0.5, '>99.5%', ha="center", va="center", color="w") + else: + ax.text(x + 0.5, y + 0.5, 'both', ha="center", va="center", color="w") + + # Annotate optimum points + for (x, y) in optimum_points: + if (x, y) 
not in convergence_points: + ax.text(x + 0.5, y + 0.5, 'peak', ha="center", va="center", color="w") + + # Display the plot + file = RESULT_DIR / 'plot4.pdf' + file.parent.mkdir(parents=True, exist_ok=True) + plt.gcf().set_size_inches(IEEE_COL_WIDTH, 0.099 * A4_HEIGHT) + plt.gcf().subplots_adjust( + left=0.12, + bottom=0.13, + right=1.05, + top=1 + ) + plt.savefig(file) + + +def main(): + + experiments = [] + for batch_size in [32, 48, 64, 96, 128, 192, 256]: + for n_samples in [768, 1536, 3072, 6144, 12288, 24576, 49152, 98304]: + experiments.append({ + 'app': 'pi_estimation', + 'integrand': 'poly', + 'prng': 'lcg', + 'impl': 'optimized', + 'n_cores': 1, + 'n_samples': n_samples, + 'batch_size': batch_size, + }) + experiments.append({ + 'app': 'pi_estimation', + 'integrand': 'poly', + 'prng': 'lcg', + 'impl': 'baseline', + 'n_cores': 1, + 'n_samples': n_samples, + 'batch_size': batch_size, + }) + + manager = CopiftExperimentManager(experiments=experiments) + manager.run() + df = manager.get_results() + + # Rename implementations for consistency + df['impl'] = df['impl'].replace({ + 'issr': 'COPIFT', + 'optimized': 'COPIFT', + 'baseline': 'Baseline' + }) + + global_plot_settings() + + fig1(df) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index 6083a8ee1c..b6ec83a241 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -70,6 +70,12 @@ APPS += sw/apps/correlation APPS += sw/apps/covariance APPS += sw/apps/doitgen APPS += sw/apps/kmeans +APPS += sw/apps/exp +APPS += sw/apps/log +APPS += sw/apps/tutorial +APPS += sw/apps/copift_queue/fpqueue +APPS += sw/apps/copift_queue/inqueue +APPS += sw/apps/copift_queue/fpq_frep # Include Makefile from each app subdirectory $(foreach app,$(APPS), \ diff --git a/target/snitch_cluster/sw/apps/copift_queue/fpq_frep/app.mk b/target/snitch_cluster/sw/apps/copift_queue/fpq_frep/app.mk new file mode 100644 index 0000000000..cd570084ea --- /dev/null +++ b/target/snitch_cluster/sw/apps/copift_queue/fpq_frep/app.mk @@ -0,0 +1,6 @@ +APP := fpq_frep +$(APP)_BUILD_DIR := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/build +SRCS := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/src/$(APP).c +$(APP)_INCDIRS := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/data + +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/copift_queue/fpq_frep/src/fpq_frep.c b/target/snitch_cluster/sw/apps/copift_queue/fpq_frep/src/fpq_frep.c new file mode 100644 index 0000000000..1a9c28630b --- /dev/null +++ b/target/snitch_cluster/sw/apps/copift_queue/fpq_frep/src/fpq_frep.c @@ -0,0 +1,74 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
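+//
+// Exercises the core-to-FPU queue (fpq) in combination with frep: with the
+// queue CSR (0x7C4) enabled, each integer write to t6 feeds one
+// fcvt.d.w/fadd.d iteration of the frep body, accumulating the values
+// input, input+1, ..., input+LENGTH-1. The test passes if the accumulated
+// sum matches the golden arithmetic series and t6 in the integer register
+// file still reads 0 after the queue is disabled.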
+// SPDX-License-Identifier: Apache-2.0 + +#include "snrt.h" +// #include + +#define LENGTH 64 + +int main() { + if (snrt_cluster_core_idx() > 0) return 0; + + __uint32_t input = 21, t6_out; + double output; + + printf("Starting Test\n"); + printf("Input: %d\n", input); + + asm volatile( + ".data \n" + "zero_val: .double 0.0 \n" + + ".text \n" + // Initialize registers to 0 + "mv t6, x0 \n" + "fcvt.d.w ft3, t6 \n" + "fcvt.d.w ft2, t6 \n" + "fcvt.d.w ft1, t6 \n" + + // Create FP->IN dependency + "fmv.x.w t2, ft1\n" + "mv t0, t2\n" + + // Input + "mv t1, %[in] \n" + "add t2, t1, %[frep_len] \n" + + // Enable Queues + "csrrsi x0, 0x7C4, 0x1\n" + + // Initial data in queue + "mv t6, t1 \n" + + // outer loop, repeat frep_len+1 times, next 2 instrs in loopbody, don't stagger + "frep.o %[frep_len], 2, 0, 0 \n" + "fcvt.d.w ft1, t6 \n" + "fadd.d ft2, ft2, ft1 \n" + + // Fill queue + "1: \n" + "addi t1, t1, 1 \n" + "mv t6, t1 \n" + "bne t1, t2, 1b \n" + + // Create FP->IN dependency + "fmv.x.w t6, ft2\n" + "mv t0, t6\n" + + // Disable Queues + "csrrci x0, 0x7C4, 0x1\n" + + "mv %[t6_out], t6 \n" + "fadd.d %[fp_out], ft2, ft3 \n" + : [ fp_out ] "=f"(output), [ t6_out ] "=r"(t6_out) + : [ in ] "r"(input), [frep_len] "r"(LENGTH-1) + : "t2"); + + printf("Finished Test\n"); + int golden_sum = input*LENGTH+(LENGTH-1)*LENGTH/2; + printf("Sum Obtained: %d vs Golden Sum: %d\n", (int)output, golden_sum); + printf("Value in t6: %d\n", t6_out); + if((int)output == golden_sum && t6_out == 0) return 0; + else return 1; +} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/copift_queue/fpqueue/app.mk b/target/snitch_cluster/sw/apps/copift_queue/fpqueue/app.mk new file mode 100644 index 0000000000..3273c34c4a --- /dev/null +++ b/target/snitch_cluster/sw/apps/copift_queue/fpqueue/app.mk @@ -0,0 +1,6 @@ +APP := fpqueue +$(APP)_BUILD_DIR := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/build +SRCS := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/src/$(APP).c +$(APP)_INCDIRS := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/data + +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/copift_queue/fpqueue/src/fpqueue.c b/target/snitch_cluster/sw/apps/copift_queue/fpqueue/src/fpqueue.c new file mode 100644 index 0000000000..6281dfa692 --- /dev/null +++ b/target/snitch_cluster/sw/apps/copift_queue/fpqueue/src/fpqueue.c @@ -0,0 +1,43 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
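+//
+// Minimal test of the core-to-FPU queue (fpq): with the queue CSR (0x7C4)
+// enabled, the integer core's write to t6 is pushed into the queue and the
+// FP subsystem pops it through fcvt.d.wu. The test passes if the converted
+// value equals the input and t6 in the integer register file still reads 0.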
+// SPDX-License-Identifier: Apache-2.0 + +#include "snrt.h" + +int main() { + if (snrt_cluster_core_idx() > 0) return 0; + + __uint32_t input, t6_rf; + double fpq_read; + + input = 6; + printf("\n----------Starting SW Test----------\n"); + printf("Input: %d\n", input); + + asm volatile( + "mv t6, x0 \n" // INCC: Write RF t6 + + "csrrsi x0, 0x7C4, 0x1\n" // Enable queue + + "mv t6, %[input] \n" // INCC: Write into fpq + "fcvt.d.wu %[fpq_read], t6 \n" // FPSS: Read from fpq + + // Create FP->IN dependency + "fadd.d ft2, ft2, %[fpq_read] \n" + "fmv.x.w t6, ft2\n" + "mv t0, t6\n" + + "csrrci x0, 0x7C4, 0x1\n" // Disable queue + + "add %[t6_rf], t6, x0 \n" // INCC: Read RF t6 + : [ fpq_read ] "=f"(fpq_read), [ t6_rf ] "=r"(t6_rf) + : [ input ] "r"(input) + :); + printf("-----Finished asm-----\n"); + printf("fcvt\'ed value from fpq: %f\n", fpq_read); + printf("value in t6 RF: %d\n", t6_rf); + printf("----------Finished SW Test----------\n\n"); + + if((int)fpq_read == input && t6_rf == 0) return 0; + else return 1; +} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/copift_queue/inqueue/app.mk b/target/snitch_cluster/sw/apps/copift_queue/inqueue/app.mk new file mode 100644 index 0000000000..a26d1b7163 --- /dev/null +++ b/target/snitch_cluster/sw/apps/copift_queue/inqueue/app.mk @@ -0,0 +1,6 @@ +APP := inqueue +$(APP)_BUILD_DIR := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/build +SRCS := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/src/$(APP).c +$(APP)_INCDIRS := $(ROOT)/target/snitch_cluster/sw/apps/copift_queue/$(APP)/data + +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/copift_queue/inqueue/src/inqueue.c b/target/snitch_cluster/sw/apps/copift_queue/inqueue/src/inqueue.c new file mode 100644 index 0000000000..8e1fd27044 --- /dev/null +++ b/target/snitch_cluster/sw/apps/copift_queue/inqueue/src/inqueue.c @@ -0,0 +1,38 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "snrt.h" + +int main() { + if (snrt_cluster_core_idx() > 0) return 0; + + double input; + __uint32_t inq_read, t6_rf; + + input = 6.0; + printf("\n----------Starting SW Test----------\n"); + printf("input: %f\n", input); + + asm volatile( + "mv t6, x0 \n" // INCC: Write RF t6 + + "csrrsi x0, 0x7C4, 0x1\n" // Enable queue + + "fcvt.w.d t2, %[input] \n" // FPSS: Write into inq + "mv %[inq_read], t6 \n" // INCC: Read from inq + + "csrrci x0, 0x7C4, 0x1\n" // Disable queue + + "add %[t6_rf], t6, x0 \n" // INCC: Read RF t6 + : [ inq_read ] "=r"(inq_read), [ t6_rf ] "=r"(t6_rf) + : [ input ] "f"(input) + :); + printf("-----Finished asm-----\n"); + printf("fcvt\'ed value from inq: %d\n", inq_read); + printf("value in t6 RF: %d\n", t6_rf); + printf("----------Finished SW Test----------\n\n"); + + if((int)inq_read == input && t6_rf == 0) return 0; + else return 1; +} \ No newline at end of file diff --git a/target/snitch_cluster/sw/apps/exp/app.mk b/target/snitch_cluster/sw/apps/exp/app.mk new file mode 100644 index 0000000000..b4d08306ad --- /dev/null +++ b/target/snitch_cluster/sw/apps/exp/app.mk @@ -0,0 +1,11 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := exp +SRCS := $(ROOT)/sw/apps/$(APP)/main.c +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build + +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/log/app.mk b/target/snitch_cluster/sw/apps/log/app.mk new file mode 100644 index 0000000000..6b0d0be1ad --- /dev/null +++ b/target/snitch_cluster/sw/apps/log/app.mk @@ -0,0 +1,11 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP := log +SRCS := $(ROOT)/sw/apps/$(APP)/main.c +$(APP)_BUILD_DIR ?= $(ROOT)/target/snitch_cluster/sw/apps/$(APP)/build + +include $(ROOT)/target/snitch_cluster/sw/apps/common.mk diff --git a/target/snitch_cluster/sw/apps/montecarlo/experiments/experiments.py b/target/snitch_cluster/sw/apps/montecarlo/experiments/experiments.py new file mode 100755 index 0000000000..0d91cebe88 --- /dev/null +++ b/target/snitch_cluster/sw/apps/montecarlo/experiments/experiments.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import json5 +from mako.template import Template +from pathlib import Path +import plot +import yaml + +from snitch.util.sim import sim_utils +from snitch.target.build import build, annotate_traces, build_visual_trace +from snitch.target.run import get_parser, run_simulations, SIMULATORS +from snitch.target.SimResults import SimResults + +ACTIONS = ['build', 'run', 'traces', 'visual-trace', 'plot', 'all'] +BUILD_DIR = Path('build').absolute() +RUN_DIR = Path('runs').absolute() + + +def funcptr_axis(funcptr): + if funcptr == 'calculate_psum_naive': + return 'naive' + elif funcptr == 'calculate_psum_baseline': + return 'baseline' + elif funcptr == 'calculate_psum_optimized': + return 'optimized' + else: + raise ValueError(f'Unknown function pointer: {funcptr}') + + +def get_axis(experiments, axis): + vals = [experiment['axes'][axis] for experiment in experiments] + return list(dict.fromkeys(vals)) + + +def get_experiment_name(axes): + return '/'.join([str(val) for val in axes.values()]) + + +def get_experiment_run_dir(axes): + return RUN_DIR / get_experiment_name(axes) + + +def get_simulation_results(axes): + return SimResults(get_experiment_run_dir(axes)) + + +def main(): + # Parse args + parser = get_parser() + parser.add_argument('actions', nargs='*', choices=ACTIONS, help='List of actions') + args = parser.parse_args() + + # Get experiments from experiments file + experiments_path = Path(args.testlist).absolute() + with open(experiments_path, 'r') as f: + experiments = yaml.safe_load(f)['experiments'] + + # Fill experiment information + for experiment in experiments: + experiment['axes'] = { + 'app': experiment['app'], + 'prng': experiment['prng'], + 'impl': funcptr_axis(experiment['funcptr']), + 'n_cores': experiment['n_cores'], + 'n_samples': experiment['n_samples'], + 'batch_size': experiment['batch_size'], + } + experiment['name'] = get_experiment_name(experiment['axes']) + experiment['elf'] = BUILD_DIR / experiment['name'] / 'pi_estimation.elf' + experiment['run_dir'] = get_experiment_run_dir(experiment['axes']) + + # Build all experiments + if 'build' in args.actions or 'all' in args.actions: + for experiment in experiments: + defines 
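+#
+# Driver script for the Monte Carlo experiments: for every entry in the
+# experiments YAML it can build the pi_estimation binary with the
+# corresponding defines, run the simulations, annotate the traces, render
+# the ROI specification and build the visual traces, and plot the results,
+# depending on the actions given on the command line.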
= { + 'APPLICATION': ('application_' + experiment['app']).upper(), + 'PRNG': ('prng_' + experiment['prng']).upper(), + 'N_SAMPLES': experiment['n_samples'], + 'N_CORES': experiment['n_cores'], + 'FUNC_PTR': experiment['funcptr'], + 'BATCH_SIZE': experiment['batch_size'], + } + experiment['build_dir'] = experiment['elf'].parent + build('pi_estimation', experiment['build_dir'], defines=defines) + + # Run all experiments + if 'run' in args.actions or 'all' in args.actions: + simulations = sim_utils.get_simulations(experiments, SIMULATORS[args.simulator], RUN_DIR) + run_simulations(simulations, args) + + # Annotate traces + if 'traces' in args.actions or 'all' in args.actions: + for experiment in experiments: + annotate_traces(experiment['run_dir']) + + # Build visual traces + if 'visual-trace' in args.actions or 'all' in args.actions: + for experiment in experiments: + + # Render ROI specification template + with open('roi.json', 'r') as f: + spec = f.read() + spec_template = Template(spec) + spec_data = spec_template.render(**experiment) + spec_data = json5.loads(spec_data) + rendered_spec = experiment['run_dir'] / 'roi_spec.json' + with open(rendered_spec, 'w') as f: + json5.dump(spec_data, f, indent=4) + + # Build visual trace + build_visual_trace(experiment['run_dir'], rendered_spec) + + # Plot + if 'plot' in args.actions or 'all' in args.actions: + plot.main(experiments) + + +if __name__ == '__main__': + main() diff --git a/target/snitch_cluster/sw/apps/montecarlo/experiments/experiments.yaml b/target/snitch_cluster/sw/apps/montecarlo/experiments/experiments.yaml new file mode 100644 index 0000000000..f9b2632c51 --- /dev/null +++ b/target/snitch_cluster/sw/apps/montecarlo/experiments/experiments.yaml @@ -0,0 +1,151 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +experiments: + - app: pi + prng: lcg + funcptr: calculate_psum_naive + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: lcg + funcptr: calculate_psum_naive + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: lcg + funcptr: calculate_psum_baseline + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: lcg + funcptr: calculate_psum_baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: lcg + funcptr: calculate_psum_optimized + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: lcg + funcptr: calculate_psum_optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: lcg + funcptr: calculate_psum_naive + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: lcg + funcptr: calculate_psum_naive + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: lcg + funcptr: calculate_psum_baseline + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: lcg + funcptr: calculate_psum_baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: lcg + funcptr: calculate_psum_optimized + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: lcg + funcptr: calculate_psum_optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: xoshiro128p + funcptr: calculate_psum_naive + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: xoshiro128p + funcptr: calculate_psum_naive + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: xoshiro128p + funcptr: calculate_psum_baseline + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: xoshiro128p + funcptr: calculate_psum_baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: xoshiro128p + funcptr: calculate_psum_optimized + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: pi + prng: xoshiro128p + funcptr: calculate_psum_optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: xoshiro128p + funcptr: calculate_psum_naive + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: xoshiro128p + funcptr: calculate_psum_naive + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: xoshiro128p + funcptr: calculate_psum_baseline + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: xoshiro128p + funcptr: calculate_psum_baseline + n_cores: 1 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: xoshiro128p + funcptr: calculate_psum_optimized + n_cores: 8 + n_samples: 8192 + batch_size: 256 + - app: poly + prng: xoshiro128p + funcptr: calculate_psum_optimized + n_cores: 1 + n_samples: 8192 + batch_size: 256 diff --git a/target/snitch_cluster/sw/apps/montecarlo/experiments/plot.py b/target/snitch_cluster/sw/apps/montecarlo/experiments/plot.py new file mode 100644 index 0000000000..9f0235afec --- /dev/null +++ b/target/snitch_cluster/sw/apps/montecarlo/experiments/plot.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import pandas as pd + +from experiments import get_simulation_results, get_axis +from snitch.target.SimResults import SimRegion + + +def calculate_ipc_throughput(results, impl, n_cores, n_samples, batch_size): + # In baseline implementation, the IPC is that of the single PSUM region. + # We need to be careful and calculate it correctly across all cores. + if impl == 'baseline': + print(f'\nbaseline {n_cores}') + cycles = 0 + issues = 0 + n_samples_per_core = n_samples // n_cores + for hartid in range(n_cores): + hartid = 'hart_' + str(hartid) + cycles += results.get_metric(SimRegion(hartid, 'psum'), 'cycles') + snitch_issues = results.get_metric(SimRegion(hartid, 'psum'), 'snitch_issues') + fpss_issues = results.get_metric(SimRegion(hartid, 'psum'), 'fpss_issues') + issues += snitch_issues + fpss_issues + print(f'{hartid}: {snitch_issues / n_samples_per_core} snitch issues, ' + f'{fpss_issues / n_samples_per_core} fpss issues, ' + f'{cycles / n_samples_per_core} cycles') + ipc = issues / cycles + throughput = 1000 * n_samples / cycles + # In optimized implementation, the IPC needs to be calculated from a steady-state batch. + # We need to be careful and calculate it correctly across all cores. + elif impl == 'optimized': + print(f'\noptimized {n_cores}') + n_samples_per_batch = batch_size * n_cores + n_batches = n_samples // n_samples_per_batch + assert n_batches >= 4, 'Not enough batches to reach steady state' + cycles = 0 + issues = 0 + for hartid in range(n_cores): + hartid = 'hart_' + str(hartid) + # Measure steady-state iteration (i.e. iteration 3) + cycles += results.get_metric(SimRegion(hartid, 'iteration', 3), 'cycles') + snitch_issues = results.get_metric(SimRegion(hartid, 'iteration', 3), 'snitch_issues') + fpss_issues = results.get_metric(SimRegion(hartid, 'iteration', 3), 'fpss_issues') + print(f'{hartid}: {snitch_issues / batch_size} snitch issues, ' + f'{fpss_issues / batch_size} fpss issues, {cycles / batch_size} cycles') + issues += snitch_issues + fpss_issues + ipc = issues / cycles + throughput = 1000 * n_samples_per_batch / cycles + else: + raise ValueError(f'Unsupported implementation: {impl}') + return ipc, throughput + + +def plot1(experiments): + # Extract data + data = [] + apps = get_axis(experiments, 'app') + prngs = get_axis(experiments, 'prng') + n_samples = get_axis(experiments, 'n_samples')[0] + batch_size = get_axis(experiments, 'batch_size')[0] + all_implementations = get_axis(experiments, 'impl') + all_n_cores = get_axis(experiments, 'n_cores') + for app in apps: + for prng in prngs: + for impl in all_implementations: + if impl != 'naive': + for n_cores in all_n_cores: + results = get_simulation_results({ + 'app': app, + 'prng': prng, + 'impl': impl, + 'n_cores': n_cores, + 'n_samples': n_samples, + 'batch_size': batch_size, + }) + ipc, throughput = calculate_ipc_throughput(results, impl, n_cores, n_samples, + batch_size) + data.append({'app': app, 'prng': prng, 'impl': impl + str(n_cores), 'metric': 'ipc', 'value': ipc}) + data.append({'app': app, 'prng': prng, 'impl': impl + str(n_cores), 'metric': 'throughput', 'value': throughput}) + df = pd.DataFrame(data) + + for prng in prngs: + for app in apps: + print(f'\n{app} {prng}') + df_app = df[df['app'] == app] + df_prng = df_app[df_app['prng'] == prng] + pivot_df = df_prng.pivot(index='metric', columns='impl', values='value') + pivot_df.index.name = None + pivot_df.columns.name = None + print(pivot_df) + + +def main(experiments): + 
plot1(experiments) diff --git a/target/snitch_cluster/sw/apps/montecarlo/experiments/roi.json b/target/snitch_cluster/sw/apps/montecarlo/experiments/roi.json new file mode 100644 index 0000000000..1f8df71781 --- /dev/null +++ b/target/snitch_cluster/sw/apps/montecarlo/experiments/roi.json @@ -0,0 +1,27 @@ +[ + <% n_batches = (n_samples // n_cores) // batch_size %> + <% n_iterations = n_batches + 2 %> + + // Compute cores +% for j in range(n_cores): + { + "thread": "${f'hart_{j}'}", + + // ROIs in optimized implementation (shows individual batches) + % if funcptr == 'calculate_psum_optimized': + "roi": [ + {"idx": 1, "label": "init"}, + % for i in range(n_iterations): + {"idx": ${2 + i}, "label": "iteration"}, + % endfor + ] + // ROIs in other implementations (show computation as a whole) + % else: + "roi": [ + {"idx": 1, "label": "init"}, + {"idx": 2, "label": "psum"}, + ] + % endif + }, +% endfor +] \ No newline at end of file diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index 65a2f80674..6f41fd02ec 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -3,103 +3,110 @@ # SPDX-License-Identifier: Apache-2.0 runs: - - elf: tests/build/alias.elf + - elf: ./tests/build/alias.elf simulators: [vsim, vcs, verilator] # banshee does not model alias regions - - elf: tests/build/atomics.elf + - elf: ./tests/build/atomics.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x4 - - elf: tests/build/barrier.elf - - elf: tests/build/data_mover.elf - - elf: tests/build/dma_empty_transfer.elf - - elf: tests/build/dma_simple.elf - - elf: tests/build/event_unit.elf - - elf: tests/build/fence_i.elf - - elf: tests/build/fp8_comparison_scalar.elf + - elf: ./tests/build/barrier.elf + - elf: ./tests/build/data_mover.elf + - elf: ./tests/build/dma_empty_transfer.elf + - elf: ./tests/build/dma_simple.elf + - elf: ./tests/build/event_unit.elf + - elf: ./tests/build/fence_i.elf + - elf: ./tests/build/fp8_comparison_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with segfault - - elf: tests/build/fp8_comparison_vector.elf + - elf: ./tests/build/fp8_comparison_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with segfault - - elf: tests/build/fp8_computation_scalar.elf + - elf: ./tests/build/fp8_computation_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with JIT issue - - elf: tests/build/fp8_computation_vector.elf + - elf: ./tests/build/fp8_computation_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6 - - elf: tests/build/fp8alt_comparison_scalar.elf + - elf: ./tests/build/fp8alt_comparison_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with segfault - - elf: tests/build/fp8alt_comparison_vector.elf + - elf: ./tests/build/fp8alt_comparison_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 - - elf: tests/build/fp8alt_computation_scalar.elf + - elf: ./tests/build/fp8alt_computation_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with JIT issue - - elf: tests/build/fp8alt_computation_vector.elf + - elf: ./tests/build/fp8alt_computation_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x12 - - elf: tests/build/fp16_comparison_scalar.elf + - elf: ./tests/build/fp16_comparison_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 - - elf: tests/build/fp16_comparison_vector.elf + - elf: ./tests/build/fp16_comparison_vector.elf simulators: [vsim, 
vcs, verilator] # banshee fails with exit code 0x10 - - elf: tests/build/fp16_computation_scalar.elf + - elf: ./tests/build/fp16_computation_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with JIT issue - - elf: tests/build/fp16_computation_vector.elf + - elf: ./tests/build/fp16_computation_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x6 - - elf: tests/build/fp16alt_comparison_scalar.elf + - elf: ./tests/build/fp16alt_comparison_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 - - elf: tests/build/fp16alt_comparison_vector.elf + - elf: ./tests/build/fp16alt_comparison_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x10 - - elf: tests/build/fp16alt_computation_scalar.elf + - elf: ./tests/build/fp16alt_computation_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with JIT issue - - elf: tests/build/fp16alt_computation_vector.elf + - elf: ./tests/build/fp16alt_computation_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x16 - - elf: tests/build/fp32_comparison_scalar.elf - - elf: tests/build/fp32_comparison_vector.elf - - elf: tests/build/fp32_computation_scalar.elf + - elf: ./tests/build/fp32_comparison_scalar.elf + - elf: ./tests/build/fp32_comparison_vector.elf + - elf: ./tests/build/fp32_computation_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 - - elf: tests/build/fp32_computation_vector.elf + - elf: ./tests/build/fp32_computation_vector.elf simulators: [vsim, vcs, verilator] # banshee fails with exit code 0x2 - - elf: tests/build/fp32_conversions_scalar.elf + - elf: ./tests/build/fp32_conversions_scalar.elf simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction - # - elf: tests/build/fp64_conversions_scalar.elf + # - elf: ./tests/build/fp64_conversions_scalar.elf # simulators: [vsim, vcs, verilator] - - elf: tests/build/interrupt_local.elf - - elf: tests/build/multi_cluster.elf - - elf: tests/build/openmp_parallel.elf - - elf: tests/build/openmp_for_static_schedule.elf - - elf: tests/build/openmp_double_buffering.elf - - elf: tests/build/perf_cnt.elf + - elf: ./tests/build/interrupt_local.elf + - elf: ./tests/build/multi_cluster.elf + - elf: ./tests/build/openmp_parallel.elf + - elf: ./tests/build/openmp_for_static_schedule.elf + - elf: ./tests/build/openmp_double_buffering.elf + - elf: ./tests/build/perf_cnt.elf simulators: [vsim, vcs, verilator] # banshee does not have HW performance counters - - elf: tests/build/printf_simple.elf - - elf: tests/build/printf_fmtint.elf - - elf: tests/build/simple.elf - - elf: tests/build/tls.elf - - elf: tests/build/varargs_1.elf - - elf: tests/build/varargs_2.elf - - elf: tests/build/zero_mem.elf - - elf: tests/build/non_null_exitcode.elf + - elf: ./tests/build/printf_simple.elf + - elf: ./tests/build/printf_fmtint.elf + - elf: ./tests/build/simple.elf + - elf: ./tests/build/tls.elf + - elf: ./tests/build/varargs_1.elf + - elf: ./tests/build/varargs_2.elf + - elf: ./tests/build/zero_mem.elf + - elf: ./tests/build/non_null_exitcode.elf retcode: 56 - - elf: tests/build/caq.elf - - elf: tests/build/caq_frep.elf + - elf: ./tests/build/issr.elf + - elf: ./tests/build/caq.elf + - elf: ./tests/build/caq_frep.elf simulators: [vsim, vcs, verilator] # banshee does not model FREP timing - - elf: apps/blas/axpy/build/axpy.elf + - elf: ./tests/build/flt_d_ssr.elf + simulators: [vsim, vcs, verilator] # banshee does not model COPIFT extensions + - elf: 
./tests/build/fcvt_d_wu_ssr.elf + simulators: [vsim, vcs, verilator] # banshee does not model COPIFT extensions + - elf: ./tests/build/fcvt_d_w_ssr.elf + simulators: [vsim, vcs, verilator] # banshee does not model COPIFT extensions + - elf: ./apps/blas/axpy/build/axpy.elf cmd: [../../../sw/blas/axpy/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/blas/gemm/build/gemm.elf + - elf: ./apps/blas/gemm/build/gemm.elf cmd: [../../../sw/blas/gemm/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/blas/dot/build/dot.elf + - elf: ./apps/blas/dot/build/dot.elf cmd: [../../../sw/blas/dot/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/blas/syrk/build/syrk.elf + - elf: ./apps/blas/syrk/build/syrk.elf cmd: [../../../sw/blas/syrk/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/dnn/batchnorm/build/batchnorm.elf - - elf: apps/dnn/maxpool/build/maxpool.elf - # - elf: apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results + - elf: ./apps/dnn/batchnorm/build/batchnorm.elf + - elf: ./apps/dnn/maxpool/build/maxpool.elf + # - elf: ./apps/dnn/conv2d/build/conv2d.elf # Fails with wrong results # cmd: [../../../sw/dnn/conv2d/scripts/verify.py, "${sim_bin}", "${elf}"] - # - elf: apps/dnn/fusedconv/build/fusedconv.elf # Fails with wrong results + # - elf: ./apps/dnn/fusedconv/build/fusedconv.elf # Fails with wrong results # cmd: [../../../sw/dnn/fusedconv/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/dnn/concat/build/concat.elf + - elf: ./apps/dnn/concat/build/concat.elf cmd: [../../../sw/dnn/concat/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/dnn/fused_concat_linear/build/fused_concat_linear.elf + - elf: ./apps/dnn/fused_concat_linear/build/fused_concat_linear.elf cmd: [../../../sw/dnn/fused_concat_linear/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/dnn/transpose/build/transpose.elf + - elf: ./apps/dnn/transpose/build/transpose.elf cmd: [../../../sw/dnn/transpose/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/montecarlo/pi_estimation/build/pi_estimation.elf - - elf: apps/atax/build/atax.elf + - elf: ./apps/montecarlo/pi_estimation/build/pi_estimation.elf + - elf: ./apps/atax/build/atax.elf cmd: [../../../sw/apps/atax/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/covariance/build/covariance.elf + - elf: ./apps/covariance/build/covariance.elf cmd: [../../../sw/apps/covariance/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/doitgen/build/doitgen.elf + - elf: ./apps/doitgen/build/doitgen.elf cmd: [../../../sw/apps/doitgen/scripts/verify.py, "${sim_bin}", "${elf}"] - - elf: apps/blas/gemv/build/gemv.elf + - elf: ./apps/blas/gemv/build/gemv.elf cmd: [../../../sw/blas/gemv/scripts/verify.py, "${sim_bin}", "${elf}"] diff --git a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h index 426b623ed4..4cd1d37412 100644 --- a/target/snitch_cluster/sw/runtime/rtl/src/snrt.h +++ b/target/snitch_cluster/sw/runtime/rtl/src/snrt.h @@ -24,6 +24,7 @@ #include "alloc_v2.h" #include "cls.h" #include "cluster_interrupts.h" +#include "copift.h" #include "dm.h" #include "dma.h" #include "dump.h" diff --git a/target/snitch_cluster/sw/toolchain.mk b/target/snitch_cluster/sw/toolchain.mk index 9cadb8c447..62829d477e 100644 --- a/target/snitch_cluster/sw/toolchain.mk +++ b/target/snitch_cluster/sw/toolchain.mk @@ -30,7 +30,7 @@ RISCV_CFLAGS := -mcpu=snitch RISCV_CFLAGS += -menable-experimental-extensions RISCV_CFLAGS += -mabi=ilp32d RISCV_CFLAGS += -mcmodel=medany -# RISCV_CFLAGS += -mno-fdiv # Not 
supported by Clang +RISCV_CFLAGS += -mno-fdiv # RISCV_CFLAGS += -ffast-math RISCV_CFLAGS += -fno-builtin-printf RISCV_CFLAGS += -fno-builtin-sqrtf @@ -38,6 +38,7 @@ RISCV_CFLAGS += -fno-common RISCV_CFLAGS += -fopenmp RISCV_CFLAGS += -ftls-model=local-exec RISCV_CFLAGS += -O3 +RISCV_CFLAGS += -Wno-error=int-conversion ifeq ($(DEBUG), ON) RISCV_CFLAGS += -g endif diff --git a/target/snitch_cluster/test/testharness.sv b/target/snitch_cluster/test/testharness.sv index dbde824efc..c6ae6eb714 100644 --- a/target/snitch_cluster/test/testharness.sv +++ b/target/snitch_cluster/test/testharness.sv @@ -32,7 +32,11 @@ module testharness import snitch_cluster_pkg::*; ( .hart_base_id_i (CfgBaseHartId), .cluster_base_addr_i (CfgClusterBaseAddr), .clk_d2_bypass_i (1'b0), +`ifdef TARGET_POSTLAYOUT + .sram_cfgs_i (snitch_cluster_pkg::sram_cfgs_t'('1)), +`else .sram_cfgs_i (snitch_cluster_pkg::sram_cfgs_t'('0)), +`endif .narrow_in_req_i (narrow_in_req), .narrow_in_resp_o (narrow_in_resp), .narrow_out_req_o (narrow_out_req), diff --git a/target/snitch_cluster/util/SimResults.py b/target/snitch_cluster/util/SimResults.py new file mode 100644 index 0000000000..8fa6349cc4 --- /dev/null +++ b/target/snitch_cluster/util/SimResults.py @@ -0,0 +1,141 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +import functools +import json +from pathlib import Path +import re + + +class SimRegion(): + """A region in the simulation results. + + A region is identified by a thread and a region ID. The ID is the + index of the region in the thread's trace. Alternatively, if a + label was associated to the region using a region-of-interest (ROI) + specification, this can also be used as an identifier. In case of + multiple regions with the same label (as e.g. in a loop) you need to + provide an additional index, to select the specific occurrence of the + label. + """ + def __init__(self, thread, id, occurrence=0): + # Validate thread identifier + pattern = r'^(hart|dma)_[0-9]+$' + assert re.match(pattern, str(thread)), \ + f"Thread '{thread}' does not match pattern '(hart|dma)_#'" + + # Save arguments + self.thread = thread + self.id = id + self.occurrence = occurrence + + +class MissingRegionError(Exception): + + def __init__(self, region): + self.region = region + + def __str__(self): + if isinstance(self.region.id, int): + return f"Region {self.region.id} not found in thread {self.region.thread}." + else: + return f"Region {self.region.id} (occurrence {self.region.occurrence}) not found " \ + f"in thread {self.region.thread}." + + +class SimResults(): + + def __init__(self, sim_dir, source=None): + """Initializes a simulation results object from the run directory. + """ + self.sim_dir = Path(sim_dir) + self.roi_json = Path(self.sim_dir) / 'logs' / 'roi.json' + self.perf_json = Path(self.sim_dir) / 'logs' / 'perf.json' + + # Get data from ROI file, if available, and perf file otherwise + if source is not None: + assert source in ['perf', 'roi'], f'Invalid source {source}.' 
+ self.source = source + else: + self.source = 'perf' + if self.roi_json.exists(): + self.source = 'roi' + + @functools.cached_property + def performance_data(self): + """Returns all performance data logged during simulation.""" + source = self.perf_json if self.source == 'perf' else self.roi_json + with open(source, 'r') as f: + return json.load(f) + + def get_metric(self, region, metric): + """Get a performance metric from a simulation region. + + Args: + region: The region to extract the metric from. Should be an + instance of the `SimRegion` class. + metric: The name of the metric to extract. + """ + # Get region index: trivial if SimRegion is already defined by its + # index, otherwise search for the region with the given label + # and occurrence. + id = None + if isinstance(region.id, int): + id = region.id + else: + if self.source == 'perf': + raise ValueError('Regions can only be identified by string labels if a ROI file' + ' is available.') + cnt = 0 + for i, reg in enumerate(self.performance_data[region.thread]): + if reg['label'] == region.id: + if cnt == region.occurrence: + id = i + break + else: + cnt += 1 + if id is None: + raise MissingRegionError(region) + + # Get metric + thread_data = self.performance_data[region.thread] + if id < len(thread_data): + if metric in ['tstart', 'tend'] or self.source == 'perf': + return thread_data[id][metric] + else: + return thread_data[id]['attrs'][metric] + else: + raise MissingRegionError(region) + + def get_metrics(self, regions, metric): + """Get a performance metric from multiple simulation regions. + + Args: + regions: A list of regions to extract the metric from. Should + be a list of `SimRegion` instances. + metric: The name of the metric to extract. + + Returns: + A list of metrics corresponding to each region in the input list. + """ + metrics = [] + for region in regions: + metrics.append(self.get_metric(region, metric)) + return metrics + + def get_timespan(self, start_region, end_region=None): + """Get the timespan between two regions. + + Args: + start_region: The region to start from. + end_region: The region to end at. If not provided, the start + region is used. 
+ """ + if end_region is None: + end_region = start_region + start_time = self.get_metric(start_region, 'tstart') + end_time = self.get_metric(end_region, 'tend') + return end_time - start_time diff --git a/target/snitch_cluster/util/build.py b/target/snitch_cluster/util/build.py index f8a084ac93..9c8a574326 100755 --- a/target/snitch_cluster/util/build.py +++ b/target/snitch_cluster/util/build.py @@ -31,11 +31,12 @@ """ import argparse -import os from pathlib import Path -import subprocess +from termcolor import colored import yaml +from snitch.target import common + def parser(): # Argument parsing @@ -58,25 +59,24 @@ def parser(): return parser -def extend_environment(vars, env=None): - if env is None: - env = os.environ.copy() - env.update(vars) - return env - - # Build software target with a specific data configuration -def build(target, cfg): - # Define configuration-specific variables for build system - env = extend_environment({ - f'{target}_DATA_CFG': cfg, - f'{target}_BUILD_DIR': Path(f'build/{cfg.stem}').resolve() - }) +def build(target, build_dir, data_cfg=None, defines=None): + # Define variables for build system + vars = { + 'DEBUG': 'ON', + f'{target}_BUILD_DIR': build_dir, + } + if data_cfg is not None: + vars[f'{target}_DATA_CFG'] = data_cfg + if defines: + cflags = ' '.join([f'-D{name}={value}' for name, value in defines.items()]) + vars[f'{target}_RISCV_CFLAGS'] = cflags - # Build the software - mk_dir = Path(__file__).resolve().parent.parent - subprocess.run(['make', '-C', mk_dir, 'DEBUG=ON', target], - check=True, env=env) + # Build software + print(colored('Build app', 'black', attrs=['bold']), colored(target, 'cyan', attrs=['bold']), + colored('in', 'black', attrs=['bold']), colored(build_dir, 'cyan', attrs=['bold'])) + env = common.extend_environment(vars) + common.make(target, env=env) # Create test specification for a specific configuration @@ -100,6 +100,25 @@ def dump_testlist(tests, outfile): yaml.dump({'runs': tests}, f) +def annotate_traces(run_dir): + print(colored('Annotate traces', 'black', attrs=['bold']), + colored(run_dir, 'cyan', attrs=['bold'])) + vars = {'SIM_DIR': run_dir} + flags = ['-j'] + common.make('annotate', vars, flags=flags) + + +def build_visual_trace(run_dir, roi_spec): + print(colored('Build visual trace', 'black', attrs=['bold']), + colored(run_dir / 'logs/trace.json', 'cyan', attrs=['bold'])) + vars = { + 'SIM_DIR': run_dir, + 'ROI_SPEC': roi_spec + } + flags = ['-j'] + common.make('visual-trace', vars, flags=flags) + + def main(): # Parse arguments @@ -108,7 +127,8 @@ def main(): # Build software for cfg in cfgs: - build(args.target, cfg) + build_dir = Path(f'build/{cfg.stem}').resolve() + build(args.target, build_dir, data_cfg=cfg) # Build testlist tests = [create_test(args.target, cfg, args.testlist_cmd) for cfg in cfgs] diff --git a/target/snitch_cluster/util/common.py b/target/snitch_cluster/util/common.py new file mode 100644 index 0000000000..f2f6df8923 --- /dev/null +++ b/target/snitch_cluster/util/common.py @@ -0,0 +1,39 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +from pathlib import Path +import os +import subprocess + +MK_DIR = Path(__file__).resolve().parent.parent + + +def extend_environment(vars, env=None): + if env is None: + env = os.environ.copy() + env.update(vars) + return env + + +def run(cmd, env=None, dry_run=False, sync=True): + cmd = [str(arg) for arg in cmd] + if dry_run: + print(' '.join(cmd)) + return None + else: + if sync: + return subprocess.run(cmd, env=env) + else: + return subprocess.Popen(cmd, env=env) + + +def make(target, vars={}, flags=[], dir=MK_DIR, env=None, dry_run=False, sync=True): + var_assignments = [f'{key}={value}' for key, value in vars.items()] + cmd = ['make', *var_assignments, target] + if dir is not None: + cmd.extend(['-C', dir]) + cmd.extend(flags) + return run(cmd, env=env, dry_run=dry_run, sync=sync) diff --git a/target/snitch_cluster/util/experiment_utils.py b/target/snitch_cluster/util/experiment_utils.py new file mode 100644 index 0000000000..4bf38dd57a --- /dev/null +++ b/target/snitch_cluster/util/experiment_utils.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande +"""Convenience functions to run software experiments in RTL simulation. +""" + +from copy import deepcopy +import json5 +from mako.template import Template +import pandas as pd +from pathlib import Path +from snitch.target.SimResults import SimResults +from snitch.target import run, build, common +from snitch.util.sim import sim_utils +from termcolor import colored +import yaml + +ACTIONS = ['sw', 'run', 'traces', 'annotate', 'perf', 'visual-trace', 'power', 'all', 'none'] +SNITCH_ROOT = Path(__file__).parent.parent.parent.parent + + +class ExperimentManager: + + def __init__(self, experiments=None, actions=None): + """Initializes the class from the command-line arguments.""" + self.args = self.parser().parse_args() + if actions is not None: + self.actions = actions + else: + self.actions = self.args.actions + + # Save directory + self.dir = Path.cwd() + self.run_dir = self.dir / self.args.run_dir + + # Get experiments + if experiments is not None: + self.experiments = experiments + else: + experiments_path = Path(self.args.testlist).absolute() + with open(experiments_path, 'r') as f: + self.yaml = yaml.safe_load(f) + + self.experiments = deepcopy(self.yaml['experiments']) + + # Derive experiment information + for experiment in self.experiments: + self.derive_experiment_info(experiment) + + def parser(self): + parser = run.get_parser() + parser.add_argument('actions', nargs='*', default='none', choices=ACTIONS, help='List of actions') + return parser + + def derive_axes(self, experiment): + return experiment.copy() + + def derive_name(self, experiment): + return '/'.join([str(val) for val in experiment['axes'].values()]) + + def derive_elf(self, experiment): + return self.dir / 'build' / experiment['name'] / (experiment['app'] + '.elf') + + def derive_dir(self, base, experiment): + return base / experiment['name'] + + def derive_env(self, experiment): + return None + + def derive_experiment_info(self, experiment): + experiment['axes'] = self.derive_axes(experiment) + experiment['name'] = self.derive_name(experiment) + experiment['elf'] = self.derive_elf(experiment) + experiment['run_dir'] = self.derive_dir(self.run_dir, experiment) + + def derive_cdefines(self, experiment): + return {} + + 
def run(self): + # Build software + if 'sw' in self.actions or 'all' in self.actions: + for experiment in self.experiments: + defines = self.derive_cdefines(experiment) + build.build(experiment['app'], experiment['elf'].parent, defines=defines) + + # Run experiments + if 'run' in self.actions or 'all' in self.actions: + simulations = sim_utils.get_simulations( + self.experiments, + run.SIMULATORS[self.args.simulator], + self.run_dir + ) + for i, experiment in enumerate(self.experiments): + simulations[i].env = self.derive_env(experiment) + run.run_simulations(simulations, self.args) + + # Generate traces + if 'traces' in self.actions or 'all' in self.actions: + for experiment in self.experiments: + print(colored('Generate traces', 'black', attrs=['bold']), + colored(experiment['run_dir'], 'cyan', attrs=['bold'])) + vars = {'SIM_DIR': experiment['run_dir']} + flags = ['-j'] + common.make('traces', vars, flags=flags) + + # Annotate traces + if 'annotate' in self.actions or 'all' in self.actions: + for experiment in self.experiments: + build.annotate_traces(experiment['run_dir']) + + # Generate joint performance dump + if 'perf' in self.actions or 'all' in self.actions: + processes = [] + for experiment in self.experiments: + print( + colored('Generate performance dump', 'black', attrs=['bold']), + colored(experiment['run_dir'], 'cyan', attrs=['bold']) + ) + vars = {'SIM_DIR': experiment['run_dir']} + flags = ['-j'] + process = common.make('perf', vars, flags=flags, sync=False) + processes.append(process) + + # Wait for all processes to complete + for i, process in enumerate(processes): + return_code = process.wait() + if return_code != 0: + raise Exception(f'Failed to generate performance dump for experiment {i}') + + # Build visual traces + if 'visual-trace' in self.actions or 'all' in self.actions: + + # Check for existence of a ROI specification + roi = self.dir / 'roi.json' + if roi.exists(): + for experiment in self.experiments: + + # Render ROI specification template + with open(roi, 'r') as f: + spec = f.read() + spec_template = Template(spec) + spec_data = spec_template.render(**experiment['axes']) + spec_data = json5.loads(spec_data) + rendered_spec = experiment['run_dir'] / 'roi_spec.json' + with open(rendered_spec, 'w') as f: + json5.dump(spec_data, f, indent=4) + + # Build visual trace + build.build_visual_trace(experiment['run_dir'], rendered_spec) + + # Estimate power + if 'power' in self.actions or 'all' in self.actions: + processes = [] + for experiment in self.experiments: + power_dir = self.derive_dir(self.dir / 'power', experiment) + print( + colored('Estimate power', 'black', attrs=['bold']), + colored(power_dir, 'cyan', attrs=['bold']) + ) + vars = { + 'SIM_DIR': experiment['run_dir'], + 'POWER_REPDIR': power_dir + } + dir = SNITCH_ROOT / 'nonfree' + process = common.make('power', vars, dir=dir, sync=False) + processes.append(process) + + # Wait for all processes to complete + for i, process in enumerate(processes): + return_code = process.wait() + if return_code != 0: + raise Exception(f'Failed to estimate power for experiment {i}') + + def get_results(self, source=None): + """Returns a DataFrame of SimResults objects.""" + + # Create the DataFrame + df = pd.DataFrame(self.experiments) + + # Create SimResults objects from 'run_dir' column + results = df['run_dir'].apply(lambda run_dir: SimResults(run_dir, source=source)) + results.rename('results', inplace=True) + + # Expand the 'axes' column into separate columns + axes = df['axes'].apply(pd.Series) + + # Combine 
'axes' and 'results' into a new DataFrame + df = pd.concat([axes, results], axis=1) + + # If desired, reset the index + df = df.reset_index(drop=True) + + return df diff --git a/target/snitch_cluster/util/run.py b/target/snitch_cluster/util/run.py index 12cb671883..7218ac281d 100755 --- a/target/snitch_cluster/util/run.py +++ b/target/snitch_cluster/util/run.py @@ -13,28 +13,21 @@ from pathlib import Path import sys -sys.path.append(str(Path(__file__).parent / '../../../util/sim')) -import sim_utils # noqa: E402 -from Simulator import QuestaSimulator, VCSSimulator, VerilatorSimulator, \ - BansheeSimulator # noqa: E402 - +from snitch.util.sim import sim_utils, Simulator +TARGET_DIR = Path(__file__).parent.resolve() / '../' SIMULATORS = { - 'vsim': QuestaSimulator(Path(__file__).parent.resolve() / '../bin/snitch_cluster.vsim'), - 'vcs': VCSSimulator(Path(__file__).parent.resolve() / '../bin/snitch_cluster.vcs'), - 'verilator': VerilatorSimulator(Path(__file__).parent.resolve() / '../bin/snitch_cluster.vlt'), - 'banshee': BansheeSimulator(Path(__file__).parent.resolve() / '../src/banshee.yaml') + 'vsim': Simulator.QuestaSimulator(TARGET_DIR / 'bin/snitch_cluster.vsim'), + 'vcs': Simulator.VCSSimulator(TARGET_DIR / 'bin/snitch_cluster.vcs'), + 'verilator': Simulator.VerilatorSimulator(TARGET_DIR / 'bin/snitch_cluster.vlt'), + 'banshee': Simulator.BansheeSimulator(TARGET_DIR / 'src/banshee.yaml') } -def parser(): +def get_parser(): return sim_utils.parser('vsim', SIMULATORS.keys()) -def get_simulations(args): - return sim_utils.get_simulations(args.testlist, SIMULATORS[args.simulator], args.run_dir) - - def run_simulations(simulations, args): return sim_utils.run_simulations(simulations, n_procs=args.n_procs, @@ -45,8 +38,16 @@ def run_simulations(simulations, args): def main(): - args = parser().parse_args() - simulations = get_simulations(args) + # Parse args + args = get_parser().parse_args() + testlist = args.testlist + simulator = SIMULATORS[args.simulator] + run_dir = args.run_dir + + # Get simulations + simulations = sim_utils.get_simulations_from_file(testlist, simulator, run_dir) + + # Run simulations return run_simulations(simulations, args) diff --git a/util/clustergen.py b/util/clustergen.py index b349747ee8..c700ebbad2 100755 --- a/util/clustergen.py +++ b/util/clustergen.py @@ -49,6 +49,9 @@ def main(): parser.add_argument("--wrapper", action="store_true", help="Generate Snitch cluster wrapper") + parser.add_argument("--package", + action="store_true", + help="Generate Snitch cluster package") parser.add_argument("--linker", action="store_true", help="Generate linker script") @@ -89,6 +92,10 @@ def main(): with open(outdir / "snitch_cluster_wrapper.sv", "w") as f: f.write(cluster_tb.render_wrapper()) + if args.package: + with open(outdir / "snitch_cluster_pkg.sv", "w") as f: + f.write(cluster_tb.render_package()) + if args.linker: with open(outdir / "link.ld", "w") as f: f.write(cluster_tb.render_linker_script()) diff --git a/util/clustergen/cluster.py b/util/clustergen/cluster.py index 7c48ee92b3..e0e7c877e9 100644 --- a/util/clustergen/cluster.py +++ b/util/clustergen/cluster.py @@ -144,8 +144,8 @@ class SnitchCluster(Generator): Instance of a Snitch cluster. 
""" files = { - 'cfg': "src/snitch_cfg.sv.tpl", - 'wrapper': "src/snitch_cluster_wrapper.sv.tpl" + 'wrapper': "src/snitch_cluster_wrapper.sv.tpl", + 'package': "src/snitch_cluster_pkg.sv.tpl" } def __init__(self, cfg, pma_cfg): @@ -179,6 +179,13 @@ def render_wrapper(self): to_sv_hex=to_sv_hex, disclaimer=self.DISCLAIMER) + def render_package(self): + """Render the cluster package""" + cfg_template = self.templates.get_template(self.files['package']) + return cfg_template.render_unicode(cfg=self.cfg, + to_sv_hex=to_sv_hex, + disclaimer=self.DISCLAIMER) + def add_mem(self, words, width, @@ -247,7 +254,7 @@ def calc_cache_sizes(self): cl_bytes = self.cfg['hives'][i]['icache']['cacheline'] // 8 self.cfg['hives'][i]['icache']['depth'] = self.cfg['hives'][i][ 'icache']['size'] * 1024 // self.cfg['hives'][i]['icache'][ - 'sets'] // cl_bytes + 'ways'] // cl_bytes # tag width self.tag_width = self.cfg['addr_width'] - clog2( hive['icache']['cacheline'] // 8) - clog2(hive['icache']['depth']) + 3 @@ -381,6 +388,9 @@ def __init__(self, cfg): def render_wrapper(self): return self.cluster.render_wrapper() + def render_package(self): + return self.cluster.render_package() + def render_linker_script(self): """Generate a linker script for the cluster testbench""" cfg_template = self.templates.get_template("test/link.ld.tpl") diff --git a/util/clustergen/schema/snitch_cluster.schema.json b/util/clustergen/schema/snitch_cluster.schema.json index f9d5831ca3..1473b14800 100644 --- a/util/clustergen/schema/snitch_cluster.schema.json +++ b/util/clustergen/schema/snitch_cluster.schema.json @@ -373,7 +373,7 @@ "description": "Detailed configuration of the current Hive's instruction cache.", "default": { "size": 8, - "sets": 2, + "ways": 2, "cacheline": 128 }, "properties": { @@ -381,7 +381,7 @@ "type": "number", "description": "Total instruction cache size in KiByte." }, - "sets": { + "ways": { "type": "number", "description": "Number of ways." }, diff --git a/util/container/Dockerfile b/util/container/Dockerfile index 4cd5c8ea50..f3df751768 100644 --- a/util/container/Dockerfile +++ b/util/container/Dockerfile @@ -95,9 +95,11 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/doc/* # Copy all needed files to install the package +# (.dockerignore is used to filter only the necessary files) COPY pyproject.toml . COPY sw ./sw COPY util ./util +COPY target/snitch_cluster/util ./target/snitch_cluster/util # Create and activate virtual environment ARG VIRTUAL_ENV="/root/.venvs/snitch_cluster" diff --git a/util/container/README.md b/util/container/README.md index 05a1369262..83ddaeb64e 100644 --- a/util/container/README.md +++ b/util/container/README.md @@ -35,7 +35,7 @@ sudo docker buildx build -t ghcr.io/pulp-platform/snitch_cluster:main -f util/co To run the container in interactive mode: ```shell -docker run -it -v :/repo -w /repo ghcr.io/pulp-platform/snitch_cluster:main +docker run -it --entrypoint /bin/bash -v :/repo -w /repo ghcr.io/pulp-platform/snitch_cluster:main ``` ## Limitations diff --git a/util/generate-opcodes.sh b/util/generate-opcodes.sh index 82f35f391d..f2337e6032 100755 --- a/util/generate-opcodes.sh +++ b/util/generate-opcodes.sh @@ -8,12 +8,12 @@ set -e ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) RISCV_OPCODES=$ROOT/sw/deps/riscv-opcodes -OPCODES=(opcodes-pseudo opcodes-rv32i opcodes-rv64i opcodes-rv32m opcodes-rv64m opcodes-rv32a opcodes-rv64a opcodes-rv32h opcodes-rv64h opcodes-rv32f opcodes-rv64f opcodes-rv32d opcodes-rv64d opcodes-rv32q opcodes-rv64q opcodes-system opcodes-custom opcodes-rv32b_CUSTOM opcodes-dma_CUSTOM opcodes-frep_CUSTOM opcodes-ssr_CUSTOM opcodes-flt-occamy_CUSTOM opcodes-rvv-pseudo) +OPCODES=(opcodes-pseudo opcodes-rv32i opcodes-rv64i opcodes-rv32m opcodes-rv64m opcodes-rv32a opcodes-rv64a opcodes-rv32h opcodes-rv64h opcodes-rv32f opcodes-rv64f opcodes-rv32d opcodes-rv64d opcodes-rv32q opcodes-rv64q opcodes-system opcodes-custom opcodes-rv32b_CUSTOM opcodes-dma_CUSTOM opcodes-frep_CUSTOM opcodes-ssr_CUSTOM opcodes-copift_CUSTOM opcodes-flt-occamy_CUSTOM opcodes-rvv-pseudo) ####### # RTL # ####### -OPCODES+=(opcodes-ipu_CUSTOM) +# OPCODES+=(opcodes-ipu_CUSTOM) INSTR_SV=$ROOT/hw/snitch/src/riscv_instr.sv cat > $INSTR_SV <<- EOM diff --git a/util/lint/.yamllint.yml b/util/lint/.yamllint.yml index 7cf910fc39..a5e6e098f1 100644 --- a/util/lint/.yamllint.yml +++ b/util/lint/.yamllint.yml @@ -9,7 +9,7 @@ rules: comments: min-spaces-from-content: 1 line-length: - max: 90 + max: 100 allow-non-breakable-words: true allow-non-breakable-inline-mappings: true ignore: | diff --git a/util/sim/Simulation.py b/util/sim/Simulation.py index 5dd98ee35a..119439e054 100644 --- a/util/sim/Simulation.py +++ b/util/sim/Simulation.py @@ -42,7 +42,9 @@ def __init__(self, elf=None, dry_run=False, retcode=0, run_dir=None, name=None): self.cmd = [] self.log = None self.process = None + self.interrupted = False self.expected_retcode = int(retcode) + self.env = None def launch(self, dry_run=None): """Launch the simulation. @@ -72,7 +74,8 @@ def launch(self, dry_run=None): # Launch simulation subprocess with open(self.log, 'w') as f: self.process = subprocess.Popen(self.cmd, stdout=f, stderr=subprocess.STDOUT, - cwd=self.run_dir, universal_newlines=True) + cwd=self.run_dir, universal_newlines=True, + env=self.env) def launched(self): """Return whether the simulation was launched.""" @@ -86,7 +89,7 @@ def completed(self): if self.dry_run: return True elif self.process: - return self.process.poll() is not None + return self.process.poll() is not None and not self.interrupted else: return False diff --git a/util/sim/Simulator.py b/util/sim/Simulator.py index a5d7e19093..4d5cd28839 100644 --- a/util/sim/Simulator.py +++ b/util/sim/Simulator.py @@ -4,7 +4,7 @@ # # Luca Colagrande -from Simulation import QuestaSimulation, VCSSimulation, VerilatorSimulation, BansheeSimulation +from snitch.util.sim import Simulation class Simulator(object): @@ -111,7 +111,7 @@ def __init__(self, binary): Arguments: binary: The VCS simulation binary. """ - super().__init__(binary, name='vcs', simulation_cls=VCSSimulation) + super().__init__(binary, name='vcs', simulation_cls=Simulation.VCSSimulation) class QuestaSimulator(RTLSimulator): @@ -128,7 +128,7 @@ def __init__(self, binary): Arguments: binary: The QuestaSim simulation binary. """ - super().__init__(binary, name='vsim', simulation_cls=QuestaSimulation) + super().__init__(binary, name='vsim', simulation_cls=Simulation.QuestaSimulation) class VerilatorSimulator(RTLSimulator): @@ -145,7 +145,7 @@ def __init__(self, binary): Arguments: binary: The Verilator simulation binary. 
""" - super().__init__(binary, name='verilator', simulation_cls=VerilatorSimulation) + super().__init__(binary, name='verilator', simulation_cls=Simulation.VerilatorSimulation) class BansheeSimulator(Simulator): @@ -161,7 +161,7 @@ def __init__(self, cfg): Arguments: cfg: A Banshee config file. """ - super().__init__(name='banshee', simulation_cls=BansheeSimulation) + super().__init__(name='banshee', simulation_cls=Simulation.BansheeSimulation) self.cfg = cfg def supports(self, test): diff --git a/util/sim/sim_utils.py b/util/sim/sim_utils.py index 9844fa3633..46f9c0eb8b 100755 --- a/util/sim/sim_utils.py +++ b/util/sim/sim_utils.py @@ -128,9 +128,11 @@ def _resolve_relative_path(base_path, s): Args: s: The input string - base_path: The base path + base_path: The base path. If None, then just return s. """ try: + if base_path is None: + return s base_path = Path(base_path).resolve() # Get the absolute path of the base directory input_path = Path(s) if input_path.is_absolute() or not s.startswith(("./", "../")): @@ -148,12 +150,11 @@ def _resolve_relative_path(base_path, s): return s -def get_simulations(testlist, simulator, run_dir=None): - """Create simulation objects from a test list file. +def get_simulations(tests, simulator, run_dir=None, base_path=None): + """Create simulation objects from a list of tests. Args: - testlist: Path to a test list file. A test list file is a YAML - file describing a set of tests. + tests: A list of tests. simulator: The simulator to use to run the tests. A test run on a specific simulator defines a simulation. run_dir: A directory under which all tests should be run. If @@ -166,24 +167,50 @@ def get_simulations(testlist, simulator, run_dir=None): `simulator`. This object defines a simulation of the test on that particular `simulator`. """ - # Get tests from test list file - testlist_path = Path(testlist).absolute() - with open(testlist_path, 'r') as f: - tests = yaml.safe_load(f)['runs'] - # Convert relative paths in testlist file to absolute paths + # Convert relative paths in testlist to absolute paths for test in tests: - test['elf'] = testlist_path.parent / test['elf'] + test['elf'] = _resolve_relative_path(base_path, test['elf']) if 'cmd' in test: - test['cmd'] = [_resolve_relative_path(testlist_path.parent, arg) for arg in test['cmd']] + test['cmd'] = [_resolve_relative_path(base_path, arg) for arg in test['cmd']] + # Create simulation object for every test which supports the specified simulator simulations = [simulator.get_simulation(test) for test in tests if simulator.supports(test)] + # Set simulation run directory if run_dir is not None: for sim in simulations: sim.run_dir = Path(run_dir) / sim.testname + return simulations +def get_simulations_from_file(file, simulator, run_dir=None): + """Create simulation objects from a test list file. + + Args: + file: Path to a test list file. A test list file is a YAML + file describing a set of tests. + simulator: The simulator to use to run the tests. A test run on + a specific simulator defines a simulation. + run_dir: A directory under which all tests should be run. If + provided, a unique subdirectory for each test will be + created under this directory, based on the test name. + + Returns: + A list of `Simulation` objects. The list contains a + `Simulation` object for every test which supports the given + `simulator`. This object defines a simulation of the test on + that particular `simulator`. 
+ """ + # Get tests from test list file + testlist_path = Path(file).absolute() + with open(testlist_path, 'r') as f: + tests = yaml.safe_load(f)['runs'] + + # Get simulations from test list + return get_simulations(tests, simulator, run_dir, base_path=testlist_path.parent) + + def print_summary(sims, early_exit=False, dry_run=False): """Print a summary of the simulation suite's exit status. @@ -255,7 +282,7 @@ def get_living_subprocesses(): iterations = 0 while get_living_subprocesses() and iterations < 10: living_subprocs = get_living_subprocesses() - print(f'{len(living_subprocs)} living subprocesses of {ppid}\n{living_subprocs}') + print(f'{len(living_subprocs)} living subprocesses of {ppid}') for proc in living_subprocs: try: os.kill(proc.info['pid'], signal.SIGKILL) @@ -268,6 +295,8 @@ def get_living_subprocesses(): if get_living_subprocesses(): print('THERE ARE STILL LIVING SUBPROCESSES') print(get_living_subprocesses()) + else: + print('All subprocesses successfully killed') def get_unique_run_dir(sim, prefix=None): @@ -336,6 +365,8 @@ def run_simulations(simulations, n_procs=1, dry_run=None, early_exit=False, break time.sleep(POLL_PERIOD) except KeyboardInterrupt: + for sim in running_sims: + sim.interrupted = True early_exit_requested = True # Clean up after early exit diff --git a/util/trace/gen_trace.py b/util/trace/gen_trace.py index 9583a17585..54d7bf4e6a 100755 --- a/util/trace/gen_trace.py +++ b/util/trace/gen_trace.py @@ -43,6 +43,7 @@ from collections import deque, defaultdict from pathlib import Path import traceback +from typing import Optional from itertools import tee, islice, chain from functools import lru_cache @@ -67,6 +68,7 @@ 'snitch_load_latency', 'snitch_fseq_offloads', 'fseq_issues', 'fpss_issues', 'fpss_fpu_issues', 'fpss_load_latency', 'fpss_fpu_latency') +PERF_EVAL_KEYS_DECIMAL = ('tstart', 'tend', 'cycles') # -------------------- Architectural constants and enums -------------------- @@ -507,12 +509,12 @@ def flt_fmt(flt: float, width: int = 6) -> str: # -------------------- Literal formatting -------------------- -def int_lit(num: int, size: int = 2, force_hex: bool = False) -> str: +def int_lit(num: int, size: int = 2, as_hex: Optional[bool] = None) -> str: width = (8 * int(2**size)) size_mask = (0x1 << width) - 1 num = num & size_mask # num is unsigned num_signed = c_int32(c_uint32(num).value).value - if force_hex or abs(num_signed) > MAX_SIGNED_INT_LIT: + if as_hex is True or abs(num_signed) > MAX_SIGNED_INT_LIT and as_hex is not False: return '0x{0:0{1}x}'.format(num, width // 4) else: return str(num_signed) @@ -742,7 +744,7 @@ def annotate_snitch(extras: dict, gpr_wb_info: dict, perf_metrics: list, annot_fseq_offl: bool = False, - force_hex_addr: bool = True, + int_as_hex: Optional[bool] = None, permissive: bool = False) -> str: # Compound annotations in datapath order ret = [] @@ -770,10 +772,10 @@ def annotate_snitch(extras: dict, csr_addr) cycles_past = extras['opb'] if csr_name == 'mcycle': - perf_metrics[-1]['tend'] = sim_time / 1000 + perf_metrics[-1]['tend'] = sim_time // 1000 perf_metrics[-1]['end'] = cycles_past perf_metrics.append(defaultdict(int)) - perf_metrics[-1]['tstart'] = sim_time / 1000 + perf_metrics[-1]['tstart'] = sim_time // 1000 perf_metrics[-1]['start'] = cycles_past + 2 ret.append('{} = {}'.format(csr_name, int_lit(cycles_past))) # Load / Store @@ -782,12 +784,12 @@ def annotate_snitch(extras: dict, gpr_wb_info[extras['rd']].appendleft(cycle) ret.append('{:<3} <~~ {}[{}]'.format( REG_ABI_NAMES_I[extras['rd']], 
LS_SIZES[extras['ls_size']], - int_lit(extras['alu_result'], force_hex=force_hex_addr))) + int_lit(extras['alu_result'], as_hex=int_as_hex))) elif extras['is_store']: perf_metrics[-1]['snitch_stores'] += 1 ret.append('{} ~~> {}[{}]'.format( int_lit(extras['gpr_rdata_1']), LS_SIZES[extras['ls_size']], - int_lit(extras['alu_result'], force_hex=force_hex_addr))) + int_lit(extras['alu_result'], as_hex=int_as_hex))) # Branches: all reg-reg ops elif extras['is_branch']: ret.append( @@ -830,7 +832,7 @@ def annotate_fpu( perf_metrics: list, # Everything FPU does may have been issued in a previous section curr_sec: int = -1, - force_hex_addr: bool = True, + int_as_hex: Optional[bool] = None, permissive: bool = False) -> str: ret = [] # On issuing of instruction @@ -857,13 +859,13 @@ def annotate_fpu( fpr_wb_info[extras['rd']].appendleft((LS_TO_FLOAT[s], vlen, cycle)) ret.append('{:<4} <~~ {}[{}]'.format( REG_ABI_NAMES_F[extras['rd']], LS_SIZES[s], - int_lit(extras['lsu_qaddr'], force_hex=force_hex_addr))) + int_lit(extras['lsu_qaddr'], as_hex=int_as_hex))) if extras['is_store']: perf_metrics[curr_sec]['fpss_stores'] += 1 _, val = flt_oper(insn, extras, 1) ret.append('{} ~~> {}[{}]'.format( val, LS_SIZES[s], - int_lit(extras['lsu_qaddr'], force_hex=force_hex_addr))) + int_lit(extras['lsu_qaddr'], as_hex=int_as_hex))) # On FLOP completion if extras['fpu_out_hs']: perf_metrics[-1]['fpss_fpu_issues'] += 1 @@ -914,7 +916,7 @@ def annotate_insn( tuple = None, # Previous timestamp (keeps this method stateless) annot_fseq_offl: bool = False, # Annotate whenever core offloads to CPU on own line - force_hex_addr: bool = True, + int_as_hex: Optional[bool] = None, permissive: bool = True, dma_trans: list = [] ) -> (str, tuple, bool @@ -943,7 +945,7 @@ def annotate_insn( if extras['source'] == TRACE_SRCES['snitch']: annot = annotate_snitch(extras, time_info[0], time_info[1], int(pc_str, 16), gpr_wb_info, perf_metrics, - annot_fseq_offl, force_hex_addr, permissive) + annot_fseq_offl, int_as_hex, permissive) if extras['fpu_offload']: perf_metrics[-1]['snitch_fseq_offloads'] += 1 fseq_info['fpss_pcs'].appendleft( @@ -986,7 +988,7 @@ def annotate_insn( fseq_pc_str[-4:], *fseq_annot)) annot_list.append( annotate_fpu(extras, insn, time_info[1], fpr_wb_info, perf_metrics, - fseq_info['curr_sec'], force_hex_addr, + fseq_info['curr_sec'], int_as_hex, permissive)) annot = ', '.join(annot_list) else: @@ -1069,7 +1071,8 @@ def fmt_perf_metrics(perf_metrics: list, idx: int, omit_keys: bool = True): elif isinstance(val, float): val_str = flt_fmt(val, 4) else: - val_str = int_lit(val) + as_hex = False if key in PERF_EVAL_KEYS_DECIMAL else None + val_str = int_lit(val, as_hex=as_hex) ret.append('{:<40}{:>10}'.format(key, val_str)) return '\n'.join(ret) @@ -1184,12 +1187,12 @@ def main(): False, time_info, args.offl, - not args.saddr, + None if args.saddr else True, args.permissive, dma_trans, ) if perf_metrics[0]['start'] is None: - perf_metrics[0]['tstart'] = time_info[0] / 1000 + perf_metrics[0]['tstart'] = time_info[0] // 1000 perf_metrics[0]['start'] = time_info[1] if not empty: print(ann_insn, file=file) @@ -1203,7 +1206,7 @@ def main(): print(message, file=sys.stderr) else: break # Nothing more in pipe, EOF - perf_metrics[-1]['tend'] = time_info[0] / 1000 + perf_metrics[-1]['tend'] = time_info[0] // 1000 perf_metrics[-1]['end'] = time_info[1] # Compute metrics eval_perf_metrics(perf_metrics)
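
Illustrative only, not part of the patch: a minimal sketch of how the SimResults/SimRegion API added in target/snitch_cluster/util/SimResults.py could be queried. The run directory path and the 'init'/'iteration' labels are assumptions (they mirror the Monte Carlo experiments and roi.json above).

# Minimal sketch, assuming a hypothetical run directory and ROI labels from roi.json above.
from snitch.target.SimResults import SimRegion, SimResults

results = SimResults('runs/pi/lcg/optimized/8/8192/256')  # path is hypothetical
# Region addressed by its numeric index in hart_0's trace
cycles = results.get_metric(SimRegion('hart_0', 2), 'cycles')
# Region addressed by ROI label; occurrence=3 selects the fourth 'iteration'
# (string labels require a roi.json, i.e. source == 'roi')
steady = results.get_metric(SimRegion('hart_0', 'iteration', occurrence=3), 'cycles')
# Timespan (tend - tstart) between two regions
span = results.get_timespan(SimRegion('hart_0', 'init'),
                            SimRegion('hart_0', 'iteration', occurrence=3))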
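
Illustrative only, not part of the patch: a sketch of an experiment script built on the new ExperimentManager from target/snitch_cluster/util/experiment_utils.py. The import path (snitch.target.experiment_utils), the subclass name and the overridden define are assumptions based on the module layout above.

# Hypothetical experiment driver; import path and override are assumptions.
from snitch.target.experiment_utils import ExperimentManager

class MyExperiments(ExperimentManager):
    def derive_cdefines(self, experiment):
        # Turn a selected experiment axis into a -D compiler define for the 'sw' action
        return {'N_CORES': experiment['axes']['n_cores']}

if __name__ == '__main__':
    manager = MyExperiments()   # parses CLI args (testlist, actions, simulator, run_dir, ...)
    manager.run()               # executes the requested actions: sw, run, traces, perf, ...
    df = manager.get_results()  # DataFrame: one row per experiment, axes columns + SimResults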
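
Illustrative only, not part of the patch: behavior of the tri-state as_hex parameter that replaces force_hex in gen_trace.py's int_lit(). The decimal outputs assume 12 is below MAX_SIGNED_INT_LIT, whose value is defined elsewhere in gen_trace.py.

# None = automatic (hex only above MAX_SIGNED_INT_LIT); True/False force the base.
int_lit(12)                        # '12'           (small magnitude, printed as decimal)
int_lit(12, as_hex=True)           # '0x0000000c'   (hex forced, 32-bit width)
int_lit(0x80000000, as_hex=False)  # '-2147483648'  (decimal forced, signed reinterpretation)
int_lit(0x80000000)                # '0x80000000'   (large magnitude, automatic hex)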