diff --git a/.github/workflows/benchmark_hpu_hlapi.yml b/.github/workflows/benchmark_hpu_hlapi.yml
deleted file mode 100644
index 9f8a5584d2..0000000000
--- a/.github/workflows/benchmark_hpu_hlapi.yml
+++ /dev/null
@@ -1,98 +0,0 @@
-# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
-name: Hpu Hlapi Benchmarks
-
-on:
- workflow_dispatch:
-
-env:
- CARGO_TERM_COLOR: always
- RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
- ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
- RUST_BACKTRACE: "full"
- RUST_MIN_STACK: "8388608"
-
-permissions: {}
-
-jobs:
- hlapi-benchmarks-hpu:
- name: Execute HLAPI benchmarks for HPU backend
- runs-on: v80-desktop
- concurrency:
- group: ${{ github.workflow }}_${{ github.ref }}
- cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
- timeout-minutes: 1440 # 24 hours
- steps:
- # Needed as long as hw_regmap repository is private
- - name: Configure SSH
- uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
- with:
- ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-
- - name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
- with:
- fetch-depth: 0
- persist-credentials: 'false'
- lfs: true
- token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
- - name: Get benchmark details
- run: |
- COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
- {
- echo "BENCH_DATE=$(date --iso-8601=seconds)";
- echo "COMMIT_DATE=${COMMIT_DATE}";
- echo "COMMIT_HASH=$(git describe --tags --dirty)";
- } >> "${GITHUB_ENV}"
- env:
- SHA: ${{ github.sha }}
-
- - name: Install rust
- uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
- with:
- toolchain: nightly
-
- - name: Checkout Slab repo
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
- with:
- repository: zama-ai/slab
- path: slab
- persist-credentials: 'false'
- token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
- - name: Run benchmarks
- run: |
- make pull_hpu_files
- export V80_SERIAL_NUMBER=XFL12E4XJXWK
- source /opt/xilinx/Vivado/2024.2/settings64.sh
- make bench_hlapi_erc20_hpu
- make bench_hlapi_hpu
-
- - name: Parse results
- run: |
- python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
- --database tfhe_rs \
- --hardware "hpu_x1" \
- --backend hpu \
- --project-version "${COMMIT_HASH}" \
- --branch "${REF_NAME}" \
- --commit-date "${COMMIT_DATE}" \
- --bench-date "${BENCH_DATE}" \
- --walk-subdirs
- env:
- REF_NAME: ${{ github.ref_name }}
-
- - name: Upload parsed results artifact
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
- with:
- name: ${{ github.sha }}_hlapi_benchmarks
- path: ${{ env.RESULTS_FILENAME }}
-
- - name: Send data to Slab
- shell: bash
- run: |
- python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
- --slab-url "${SLAB_URL}"
- env:
- JOB_SECRET: ${{ secrets.JOB_SECRET }}
- SLAB_URL: ${{ secrets.SLAB_URL }}
diff --git a/.github/workflows/benchmark_hpu_integer.yml b/.github/workflows/benchmark_hpu_integer.yml
index 612b3c7f5c..e09bbb51ee 100644
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -3,15 +3,6 @@ name: Hpu Integer Benchmarks
on:
workflow_dispatch:
- inputs:
- bench_type:
- description: "Benchmarks type"
- type: choice
- default: both
- options:
- - latency
- - throughput
- - both
env:
CARGO_TERM_COLOR: always
@@ -23,46 +14,13 @@ env:
permissions: {}
jobs:
- prepare-matrix:
- name: Prepare operations matrix
- runs-on: v80-desktop
- outputs:
- bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
- steps:
- - name: Set benchmark types
- if: github.event_name == 'workflow_dispatch'
- run: |
- if [[ -z $INPUTS_BENCH_TYPE || "${INPUTS_BENCH_TYPE}" == "both" ]]; then
- echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
- else
- echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
- fi
- env:
- INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}
-
- - name: Default benchmark type
- if: github.event_name != 'workflow_dispatch'
- run: |
- echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
-
-
- - name: Set benchmark types output
- id: set_bench_type
- run: | # zizmor: ignore[template-injection] this env variable is safe
- echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
-
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
- needs: prepare-matrix
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
- strategy:
- max-parallel: 1
- matrix:
- bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
@@ -105,11 +63,8 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
- export V80_SERIAL_NUMBER=XFL12E4XJXWK
- source /opt/xilinx/Vivado/2024.2/settings64.sh
- make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
- env:
- BENCH_TYPE: ${{ matrix.bench_type }}
+ make bench_integer_hpu
+ make bench_hlapi_erc20_hpu
- name: Parse results
run: |
@@ -121,16 +76,14 @@ jobs:
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
- --walk-subdirs \
- --bench-type "${BENCH_TYPE}"
+ --walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
- name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
+ name: ${{ github.sha }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
diff --git a/.github/workflows/cargo_test_ntt.yml b/.github/workflows/cargo_test_ntt.yml
index da945bd612..dc01d7723b 100644
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -11,7 +11,6 @@ env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
- SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -32,7 +31,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
- persist-credentials: "false"
+ persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
@@ -46,46 +45,18 @@ jobs:
- tfhe-ntt/**
- '.github/workflows/cargo_test_ntt.yml'
- setup-instance:
- needs: should-run
- if: needs.should-run.outputs.ntt_test == 'true'
- runs-on: ubuntu-latest
- outputs:
- matrix_os: ${{ steps.set-os-matrix.outputs.matrix_os }}
- runner-name: ${{ steps.start-remote-instance.outputs.label }}
- steps:
- - name: Start remote instance
- id: start-remote-instance
- if: env.SECRETS_AVAILABLE == 'true'
- uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
- with:
- mode: start
- github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
- slab-url: ${{ secrets.SLAB_BASE_URL }}
- job-secret: ${{ secrets.JOB_SECRET }}
- backend: aws
- profile: cpu-small
-
- - name: Set os matrix
- id: set-os-matrix
- env:
- SLAB_INSTANCE: ${{ steps.start-remote-instance.outputs.label }}
- run: |
- INSTANCE_TO_USE="${SLAB_INSTANCE:-ubuntu-latest}"
- echo "matrix_os=[\"${INSTANCE_TO_USE}\", \"macos-latest\", \"windows-latest\"]" >> "$GITHUB_OUTPUT"
-
cargo-tests-ntt:
- needs: [should-run, setup-instance]
+ needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
+ os: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
- persist-credentials: "false"
+ persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -101,16 +72,16 @@ jobs:
run: make test_ntt_no_std
cargo-tests-ntt-nightly:
- needs: [should-run, setup-instance]
+ needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
+ os: [ ubuntu-latest, macos-latest, windows-latest ]
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
- persist-credentials: "false"
+ persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -126,7 +97,7 @@ jobs:
run: make test_ntt_no_std_nightly
cargo-tests-ntt-successful:
- needs: [should-run, cargo-tests-ntt, cargo-tests-ntt-nightly]
+ needs: [ should-run, cargo-tests-ntt, cargo-tests-ntt-nightly ]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
@@ -149,28 +120,3 @@ jobs:
run: |
echo "Some tfhe-ntt tests failed"
exit 1
-
- teardown-instance:
- name: Teardown instance (cargo-tests-ntt-successful)
- if: ${{ always() && needs.setup-instance.result == 'success' }}
- needs: [setup-instance, cargo-tests-ntt-successful]
- runs-on: ubuntu-latest
- steps:
- - name: Stop remote instance
- id: stop-instance
- if: env.SECRETS_AVAILABLE == 'true'
- uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
- with:
- mode: stop
- github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
- slab-url: ${{ secrets.SLAB_BASE_URL }}
- job-secret: ${{ secrets.JOB_SECRET }}
- label: ${{ needs.setup-instance.outputs.runner-name }}
-
- - name: Slack Notification
- if: ${{ failure() }}
- continue-on-error: true
- uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
- env:
- SLACK_COLOR: ${{ job.status }}
- SLACK_MESSAGE: "Instance teardown (cargo-tests-ntt) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
diff --git a/.github/workflows/coprocessor-benchmark-gpu.yml b/.github/workflows/coprocessor-benchmark-gpu.yml
index 6d4482c74e..eebae996e8 100644
--- a/.github/workflows/coprocessor-benchmark-gpu.yml
+++ b/.github/workflows/coprocessor-benchmark-gpu.yml
@@ -100,7 +100,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+ uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
path: tfhe-rs
persist-credentials: false
@@ -111,7 +111,7 @@ jobs:
ls
- name: Checkout fhevm
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+ uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -162,10 +162,10 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
- uses: foundry-rs/foundry-toolchain@82dee4ba654bd2146511f85f0d013af94670c4de
+ uses: foundry-rs/foundry-toolchain@de808b1eea699e761c404bda44ba8f21aba30b2c
- name: Cache cargo
- uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
+ uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: |
~/.cargo/registry
@@ -175,7 +175,7 @@ jobs:
restore-keys: ${{ runner.os }}-cargo-
- name: Login to GitHub Container Registry
- uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
+ uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: ghcr.io
username: ${{ github.actor }}
@@ -186,7 +186,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
- name: Use Node.js
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+ uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2
with:
node-version: 20.x
@@ -257,7 +257,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
diff --git a/Cargo.toml b/Cargo.toml
index 8a8630d70f..17e9009327 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,7 +33,6 @@ rand = "0.8"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.100"
-getrandom = "0.2.8"
[profile.bench]
lto = "fat"
diff --git a/Makefile b/Makefile
index b2144c8aad..a9e9683e5e 100644
--- a/Makefile
+++ b/Makefile
@@ -1312,11 +1312,11 @@ bench_signed_integer_gpu: install_rs_check_toolchain
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
- source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
+ source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
- --features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
+ --features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
@@ -1497,13 +1497,11 @@ bench_hlapi_gpu: install_rs_check_toolchain
--bench hlapi \
--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --
-.PHONY: bench_hlapi_hpu # Run benchmarks for HLAPI operations on HPU
+.PHONY: bench_hlapi_hpu # Run benchmarks for integer operations on HPU
bench_hlapi_hpu: install_rs_check_toolchain
- source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
- RUSTFLAGS="$(RUSTFLAGS)" \
- cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+ RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi \
- --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
+ --features=integer,hpu,hpu-v80,internal-keycache,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
bench_hlapi_erc20: install_rs_check_toolchain
@@ -1531,11 +1529,11 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
- source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
+ source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
- --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
+ --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain
diff --git a/README.md b/README.md
index c927f0618f..cc03588dad 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ production-ready library for all the advanced features of TFHE.
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
- **Size-efficient public key encryption**
- **Ciphertext and server key compression** for efficient data transfer
-- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
+- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
```
> [!Note]
-> Note: You need to use Rust version >= 1.84 to compile TFHE-rs.
+> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
> [!Note]
> Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
@@ -147,7 +147,7 @@ To run this code, use the following command:
> [!Note]
> Note that when running code that uses `TFHE-rs`, it is highly recommended
-to run in release mode with cargo's `--release` flag to have the best performances possible.
+to run in release mode with cargo's `--release` flag to have the best performance possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
@@ -201,9 +201,11 @@ When a new update is published in the Lattice Estimator, we update parameters ac
### Security model
-By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
+By default, the parameter sets used in the High-Level API with the x86 CPU backend have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).
+The default parameters used in the High-Level API with the GPU backend are chosen for the IND-CPA security model, with a bootstrapping failure probability fixed at $p_{error} \le 2^{-128}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters chosen in consideration of the IND-CPA^D security model [2].
+
[1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
[2] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf
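For illustration only (this snippet is not part of the patch): a minimal sketch of where the parameter choice discussed above enters user code, assuming a recent `tfhe` release with the `integer` feature enabled. `ConfigBuilder::default()` selects the default parameter sets described in this section; a custom set (for example one chosen under the IND-CPA model) would be supplied through the same builder instead.

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    // Default configuration: the default parameter sets described above
    // (failure probability <= 2^-128). A custom, IND-CPA-oriented parameter
    // set would be plugged into the builder here instead.
    let config = ConfigBuilder::default().build();

    let (client_key, server_key) = generate_keys(config);
    set_server_key(server_key);

    // Small end-to-end check that the chosen configuration works.
    let a = FheUint8::encrypt(17u8, &client_key);
    let b = FheUint8::encrypt(25u8, &client_key);
    let sum: u8 = (a + b).decrypt(&client_key);
    assert_eq!(sum, 42);
}
```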
@@ -242,7 +244,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
#### FAQ
**Is Zama’s technology free to use?**
>Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
->
+>
>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).
**What do I need to do if I want to use Zama’s technology for commercial purposes?**
diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index c833104f12..1cd6c313b9 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -119,8 +119,6 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
int cuda_get_number_of_gpus();
-int cuda_get_number_of_sms();
-
void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
index 464a20a0d4..973193c4f8 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -115,10 +115,8 @@ template struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
- auto active_gpu_count =
- get_active_gpu_count(num_blocks_to_decompress, gpu_count);
- decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index ed9831d331..bb4766c592 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -320,15 +320,10 @@ template struct int_radix_lut {
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
-  std::vector<Torus *> lwe_aligned_vec;
uint32_t *gpu_indexes;
bool gpu_memory_allocated;
- cudaEvent_t event_scatter_in;
- cudaEvent_t *event_scatter_out;
- cudaEvent_t event_broadcast;
-
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
@@ -347,6 +342,7 @@ template struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -363,21 +359,10 @@ template struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
- // We create the events only if we have multiple GPUs
- if (active_gpu_count > 1) {
- event_scatter_in = cuda_create_event(gpu_indexes[0]);
- event_broadcast = cuda_create_event(gpu_indexes[0]);
-
- event_scatter_out =
- (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
- for (int i = 0; i < active_gpu_count; i++) {
- event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
- }
- }
-
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -396,6 +381,8 @@ template struct int_radix_lut {
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
+
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -511,8 +498,11 @@ template struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
+
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
+
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -569,6 +559,7 @@ template struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -585,19 +576,10 @@ template struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
- // We create the events only if we have multiple GPUs
- if (active_gpu_count > 1) {
- event_scatter_in = cuda_create_event(gpu_indexes[0]);
- event_broadcast = cuda_create_event(gpu_indexes[0]);
-
- event_scatter_out =
- (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
- for (int i = 0; i < active_gpu_count; i++) {
- event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
- }
- }
+
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -613,8 +595,11 @@ template struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
+
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
+
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -665,9 +650,11 @@ template struct int_radix_lut {
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks,
size_tracker, allocate_gpu_memory);
- multi_gpu_copy_array_from_cpu_async(
- streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
- h_lwe_indexes_in, num_radix_blocks, allocate_gpu_memory);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
+ lwe_trivial_indexes_vec, lwe_trivial_indexes,
+ num_radix_blocks, allocate_gpu_memory);
+
// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async(
@@ -724,87 +711,29 @@ template struct int_radix_lut {
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
- // We only do broadcast if there are more than 1 active GPU
- if (active_gpu_count > 1) {
- int active_device = cuda_get_device();
+ int active_device = cuda_get_device();
- uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
+ uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
- auto src_lut = lut_vec[0];
- auto src_lut_indexes = lut_indexes_vec[0];
-
- cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
- for (uint i = 0; i < active_gpu_count; i++) {
- if (gpu_indexes[i] != gpu_indexes[0]) {
- cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
- auto dst_lut = lut_vec[i];
- auto dst_lut_indexes = lut_indexes_vec[i];
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
- gpu_indexes[i], gpu_memory_allocated);
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
- streams[i], gpu_indexes[i], gpu_memory_allocated);
- }
- }
- // Ensure the device set at the end of this method is the same as it was
- // set at the beginning
- cuda_set_device(active_device);
- }
- }
- // Broadcast luts from device gpu_indexes[0] to all active gpus
- void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t new_active_gpu_count,
- bool broadcast_lut_values = true) {
- // We only do broadcast if there are more than 1 active GPU
- if (new_active_gpu_count > 1) {
- int active_device = cuda_get_device();
-
- uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
-
- auto src_lut = lut_vec[0];
- auto src_lut_indexes = lut_indexes_vec[0];
- if (active_gpu_count > 1)
- cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
- for (uint i = 0; i < new_active_gpu_count; i++) {
- if (gpu_indexes[i] != gpu_indexes[0]) {
- cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
- if (broadcast_lut_values) {
- auto dst_lut = lut_vec[i];
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
- streams[i], gpu_indexes[i], gpu_memory_allocated);
- }
- auto dst_lut_indexes = lut_indexes_vec[i];
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
- streams[i], gpu_indexes[i], gpu_memory_allocated);
- }
- }
- // Ensure the device set at the end of this method is the same as it was
- // set at the beginning
- cuda_set_device(active_device);
- }
- }
+ auto src_lut = lut_vec[0];
+ auto src_lut_indexes = lut_indexes_vec[0];
- void allocate_lwe_vector_for_non_trivial_indexes(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t active_gpu_count, uint64_t max_num_radix_blocks,
- uint64_t &size_tracker, bool allocate_gpu_memory) {
- // We need to create the auxiliary array only in GPU 0
- lwe_aligned_vec.resize(active_gpu_count);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
- uint64_t size_tracker_on_array_i = 0;
- auto inputs_on_gpu = std::max(
- THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(max_num_radix_blocks, i, active_gpu_count));
- Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
- inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
- streams[0], gpu_indexes[0], size_tracker_on_array_i,
- allocate_gpu_memory);
- lwe_aligned_vec[i] = d_array;
- size_tracker += size_tracker_on_array_i;
+ if (gpu_indexes[i] != gpu_indexes[0]) {
+ auto dst_lut = lut_vec[i];
+ auto dst_lut_indexes = lut_indexes_vec[i];
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
+ gpu_indexes[i], gpu_memory_allocated);
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
+ streams[i], gpu_indexes[i], gpu_memory_allocated);
+ }
}
+ // Ensure the device set at the end of this method is the same as it was set
+ // at the beginning
+ cuda_set_device(active_device);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -816,6 +745,7 @@ template struct int_radix_lut {
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
+
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -862,23 +792,6 @@ template struct int_radix_lut {
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
- if (active_gpu_count > 1) {
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_synchronize_stream(streams[i], gpu_indexes[i]);
- cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
- }
- cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
- cuda_event_destroy(event_broadcast, gpu_indexes[0]);
- free(event_scatter_out);
- }
- if (lwe_aligned_vec.size() > 0) {
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_drop_with_size_tracking_async(lwe_aligned_vec[i], streams[0],
- gpu_indexes[0],
- gpu_memory_allocated);
- }
- lwe_aligned_vec.clear();
- }
}
free(h_lut_indexes);
free(degrees);
@@ -927,8 +840,6 @@ template struct int_noise_squashing_lut {
bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
-  std::vector<Torus *> lwe_aligned_scatter_vec;
- std::vector<__uint128_t *> lwe_aligned_gather_vec;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,
@@ -1070,10 +981,7 @@ template struct int_noise_squashing_lut {
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
- if (lwe_aligned_gather_vec.size() > 0) {
- multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_gather_vec);
- multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_scatter_vec);
- }
+
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
@@ -1136,10 +1044,7 @@ template struct int_bit_extract_luts_buffer {
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
-
- auto active_gpu_count =
- get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1165,9 +1070,6 @@ template struct int_bit_extract_luts_buffer {
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);
- lut->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, active_gpu_count,
- num_radix_blocks * bits_per_block, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes_in);
@@ -1309,9 +1211,7 @@ template struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
- auto active_gpu_count_mux =
- get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
- mux_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_mux);
+ mux_lut->broadcast_lut(streams, gpu_indexes);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
@@ -1321,10 +1221,7 @@ template struct int_shift_and_rotate_buffer {
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
- auto active_gpu_count_cleaning =
- get_active_gpu_count(num_radix_blocks, gpu_count);
- cleaning_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_cleaning);
+ cleaning_lut->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1413,8 +1310,8 @@ template struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(2, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ lut->broadcast_lut(streams, gpu_indexes);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async(
@@ -1549,11 +1446,9 @@ template struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
- luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
- message_acc->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ luts_array->broadcast_lut(streams, gpu_indexes);
+ luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes);
+ message_acc->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1662,8 +1557,9 @@ template struct int_sum_ciphertexts_vec_memory {
uint32_t total_messages = 0;
current_columns.next_accumulation(total_ciphertexts, total_messages,
_needs_processing);
- uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
+
if (!mem_reuse) {
+ uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
@@ -1671,11 +1567,6 @@ template struct int_sum_ciphertexts_vec_memory {
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
- auto active_gpu_count =
- get_active_gpu_count(this->max_total_blocks_in_vec, gpu_count);
- luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
- size_tracker, true);
}
}
if (allocated_luts_message_carry) {
@@ -1703,9 +1594,7 @@ template struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
- auto active_gpu_count_mc = get_active_gpu_count(pbs_count, gpu_count);
- luts_message_carry->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_mc);
+ luts_message_carry->broadcast_lut(streams, gpu_indexes);
}
}
int_sum_ciphertexts_vec_memory(
@@ -1725,7 +1614,6 @@ template struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
-
setup_index_buffers(streams, gpu_indexes, size_tracker);
// because we setup_lut in host function for sum_ciphertexts to save memory
// the size_tracker is topped up here to have a max bound on the used memory
@@ -1773,9 +1661,6 @@ template struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
- this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
- size_tracker, allocate_gpu_memory);
setup_index_buffers(streams, gpu_indexes, size_tracker);
}
@@ -1859,9 +1744,8 @@ template struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(num_seq_luts, gpu_count);
- lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes);
free(h_seq_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1916,8 +1800,8 @@ template struct int_hs_group_prop_memory {
lut_hillis_steele->get_degree(0), lut_hillis_steele->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_hillis_steele, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_groups, gpu_count);
- lut_hillis_steele->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ lut_hillis_steele->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2093,9 +1977,8 @@ template struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array_first_step->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2356,9 +2239,7 @@ template struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array_second_step->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ luts_array_second_step->broadcast_lut(streams, gpu_indexes);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -2377,17 +2258,14 @@ template struct int_prop_simu_group_carries_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
- uint32_t const *gpu_indexes, uint32_t gpu_count,
- Torus *new_lut_indexes, Torus *new_scalars,
- uint32_t new_num_blocks) {
+ uint32_t const *gpu_indexes, Torus *new_lut_indexes,
+ Torus *new_scalars, uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
- auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
- // We just need to update the lut indexes so we use false here
- luts_array_second_step->broadcast_lut(streams, gpu_indexes,
- new_active_gpu_count, false);
+
+ luts_array_second_step->broadcast_lut(streams, gpu_indexes);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
@@ -2552,9 +2430,7 @@ template struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
}
// For the final cleanup in case of overflow or carry (it seems that I can)
@@ -2623,9 +2499,7 @@ template struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
}
- auto active_gpu_count =
- get_active_gpu_count(num_radix_blocks + 1, gpu_count);
- lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut_message_extract->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -2820,23 +2694,19 @@ template struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array_first_step->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
- uint32_t const *gpu_indexes, uint32_t gpu_count,
- Torus *new_lut_indexes, uint32_t new_num_blocks) {
+ uint32_t const *gpu_indexes, Torus *new_lut_indexes,
+ uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
- auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
- // We just need to update the lut indexes so we use false here
- luts_array_first_step->broadcast_lut(streams, gpu_indexes,
- new_active_gpu_count, false);
+ luts_array_first_step->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2935,9 +2805,8 @@ template struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
- active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut_message_extract->broadcast_lut(streams, gpu_indexes);
if (compute_overflow) {
lut_borrow_flag = new int_radix_lut<Torus>(
@@ -2953,7 +2822,8 @@ template struct int_borrow_prop_memory {
lut_borrow_flag->get_degree(0), lut_borrow_flag->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_borrow_flag, gpu_memory_allocated);
- lut_borrow_flag->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ lut_borrow_flag->broadcast_lut(streams, gpu_indexes);
}
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -2981,15 +2851,15 @@ template struct int_borrow_prop_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
- uint32_t const *gpu_indexes, uint32_t gpu_count,
+ uint32_t const *gpu_indexes,
Torus *first_indexes_for_div,
Torus *second_indexes_for_div, Torus *scalars_for_div,
uint32_t new_num_blocks) {
shifted_blocks_borrow_state_mem->update_lut_indexes(
- streams, gpu_indexes, gpu_count, first_indexes_for_div, new_num_blocks);
+ streams, gpu_indexes, first_indexes_for_div, new_num_blocks);
prop_simu_group_carries_mem->update_lut_indexes(
- streams, gpu_indexes, gpu_count, second_indexes_for_div,
- scalars_for_div, new_num_blocks);
+ streams, gpu_indexes, second_indexes_for_div, scalars_for_div,
+ new_num_blocks);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -3120,10 +2990,7 @@ template struct int_mul_memory {
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
-
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3196,8 +3063,8 @@ template struct int_mul_memory {
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
- auto active_gpu_count = get_active_gpu_count(total_block_count, gpu_count);
- luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ luts_array->broadcast_lut(streams, gpu_indexes);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3329,8 +3196,7 @@ template struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -3414,15 +3280,13 @@ template struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (auto &buffer : lut_buffers_bivariate) {
buffer->release(streams, gpu_indexes, gpu_count);
delete buffer;
@@ -3520,9 +3384,7 @@ template struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -3547,9 +3409,7 @@ template struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -3588,9 +3448,7 @@ template struct int_arithmetic_scalar_shift_buffer {
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}
@@ -3598,7 +3456,6 @@ template struct int_arithmetic_scalar_shift_buffer {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
@@ -3705,13 +3562,9 @@ template struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
- auto active_gpu_count_pred =
- get_active_gpu_count(2 * num_radix_blocks, gpu_count);
- predicate_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_pred);
- auto active_gpu_count_msg =
- get_active_gpu_count(num_radix_blocks, gpu_count);
- message_extract_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_msg);
+
+ predicate_lut->broadcast_lut(streams, gpu_indexes);
+ message_extract_lut->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3783,8 +3636,7 @@ template struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(max_chunks, gpu_count);
- is_max_value->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_max_value->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3844,8 +3696,7 @@ template struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- operator_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ operator_lut->broadcast_lut(streams, gpu_indexes);
// f(x) -> x == 0
Torus total_modulus = params.message_modulus * params.carry_modulus;
@@ -3863,7 +3714,7 @@ template struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
- is_non_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_non_zero_lut->broadcast_lut(streams, gpu_indexes);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -3882,8 +3733,8 @@ template struct int_comparison_eq_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
- scalar_comparison_luts->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ scalar_comparison_luts->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3954,8 +3805,8 @@ template struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -4142,7 +3993,8 @@ template struct int_comparison_buffer {
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
- identity_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ identity_lut->broadcast_lut(streams, gpu_indexes);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -4159,7 +4011,7 @@ template struct int_comparison_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
- is_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_zero_lut->broadcast_lut(streams, gpu_indexes);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -4241,8 +4093,8 @@ template struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- signed_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ signed_lut->broadcast_lut(streams, gpu_indexes);
}
}
@@ -4291,7 +4143,6 @@ template struct int_comparison_buffer {
delete signed_msb_lut;
delete tmp_trivial_sign_block;
}
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
@@ -4458,23 +4309,17 @@ template struct unsigned_int_div_rem_memory {
streams, gpu_indexes, gpu_count, params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
- streams[0], gpu_indexes[0], masking_luts_1[i]->get_lut(0, 0),
- masking_luts_1[i]->get_degree(0),
- masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
- params.polynomial_size, params.message_modulus, params.carry_modulus,
- lut_f_masking, gpu_memory_allocated);
- auto active_gpu_count1 = get_active_gpu_count(1, gpu_count);
- masking_luts_1[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count1);
+    int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};
-    generate_device_accumulator<Torus>(
- streams[0], gpu_indexes[0], masking_luts_2[i]->get_lut(0, 0),
- masking_luts_2[i]->get_degree(0),
- masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
- params.polynomial_size, params.message_modulus, params.carry_modulus,
- lut_f_masking, gpu_memory_allocated);
- auto active_gpu_count2 = get_active_gpu_count(num_blocks, gpu_count);
- masking_luts_2[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count2);
+ for (int j = 0; j < 2; j++) {
+      generate_device_accumulator<Torus>(
+ streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
+ luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+ params.glwe_dimension, params.polynomial_size,
+ params.message_modulus, params.carry_modulus, lut_f_masking,
+ gpu_memory_allocated);
+ luts[j]->broadcast_lut(streams, gpu_indexes);
+ }
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -4494,14 +4339,13 @@ template struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
- auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
- luts[j]->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ luts[j]->broadcast_lut(streams, gpu_indexes);
}
// Give name to closures to improve readability
@@ -4537,8 +4381,7 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
- zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
@@ -4547,8 +4390,7 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
- zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -4575,8 +4417,7 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
- zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_happened[1]->get_lut(0, 0),
@@ -4585,12 +4426,10 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
- zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes);
// merge_overflow_flags_luts
   merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
- auto active_gpu_count_for_bits = get_active_gpu_count(1, gpu_count);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -4607,8 +4446,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
- merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_for_bits);
+ merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes);
}
}
@@ -4864,7 +4702,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
delete[] merge_overflow_flags_luts;
// release sub streams
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -4926,7 +4763,7 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+
switch (op) {
case BITAND:
case BITOR:
@@ -4953,7 +4790,7 @@ template <typename Torus> struct int_bitop_buffer {
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
@@ -4983,7 +4820,7 @@ template <typename Torus> struct int_bitop_buffer {
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
}
}
}
@@ -5268,10 +5105,7 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
- auto active_gpu_count_cmp =
- get_active_gpu_count(1, gpu_count); // only 1 block needed
- compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_cmp);
+ compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes);
}
}
@@ -5313,7 +5147,6 @@ template <typename Torus> struct int_div_rem_memory {
delete compare_signed_bits_lut;
// release sub streams
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -5942,7 +5775,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+
   this->univ_lut_mem = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -5981,7 +5814,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
- univ_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ univ_lut_mem->broadcast_lut(streams, gpu_indexes);
}
auto generate_bi_lut_lambda =
@@ -6000,7 +5833,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
- biv_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ biv_lut_mem->broadcast_lut(streams, gpu_indexes);
}
this->tmp_ct = new CudaRadixCiphertextFFI;
@@ -6218,8 +6051,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
cuda_memcpy_async_to_gpu(luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
- auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
- luts->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ luts->broadcast_lut(streams, gpu_indexes);
free(h_corrections);
}
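
Across these hunks, broadcast_lut resolves the number of participating GPUs from state the LUT object already holds instead of taking a caller-computed count from get_active_gpu_count. A rough, hypothetical sketch of that kind of contract (not the actual int_radix_lut API; broadcast_to_active_gpus, per_gpu_dest and bytes are invented names):

    #include <cuda_runtime.h>
    #include <cstddef>
    #include <cstdint>

    // Sketch only: copy a buffer owned by gpu_indexes[0] to the same-sized
    // buffer owned by every other active GPU, one async peer copy per device,
    // each enqueued on that device's stream.
    inline void broadcast_to_active_gpus(cudaStream_t const *streams,
                                         uint32_t const *gpu_indexes,
                                         uint32_t active_gpu_count,
                                         void *const *per_gpu_dest,
                                         const void *src_on_gpu0, size_t bytes) {
      for (uint32_t g = 1; g < active_gpu_count; ++g) {
        cudaMemcpyPeerAsync(per_gpu_dest[g], gpu_indexes[g], src_on_gpu0,
                            gpu_indexes[0], bytes, streams[g]);
      }
    }
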
diff --git a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
index ca543e6a20..bcbe409101 100644
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -232,13 +232,8 @@ template <typename Torus> struct zk_expand_mem {
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
- message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);
- message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
- allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index c6f0997d86..c5464743e9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,88 +1,15 @@
#include "device.h"
#include
#include
-#include <mutex>
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
  return static_cast<uint32_t>(device);
}
-std::mutex pool_mutex;
-bool mem_pools_enabled = false;
-
-// We use memory pools to reduce some overhead of memory allocations due
-// to our scratch/release pattern. This function is the simplest way of using
-// mempools, it modifies the default memory pool to use a threshold of 5% of the
-// free memory:
-// - Enabled opportunistic reuse to maximize reuse in malloc/free patterns
-// - Prevent memory from being released back to the OS too soon if is within
-// our threshold
-// - Warm up the pool by allocating and freeing a large block of memory
-// This function is called only once, the first time a GPU is set, and it
-// configures all the GPUs available.
-// We have measured an improvement of around 10% in our integer operations,
-// especially the ones involving many allocations.
-// We tested more complex configurations of mempools, but they did not yield
-// better results.
-void cuda_setup_mempool(uint32_t caller_gpu_index) {
- if (!mem_pools_enabled) {
- pool_mutex.lock();
- if (mem_pools_enabled)
- return; // If mem pools are already enabled, we don't need to do anything
-
- // We do it only once for all GPUs
- mem_pools_enabled = true;
- uint32_t num_gpus = cuda_get_number_of_gpus();
- for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
- cuda_set_device(gpu_index);
-
- size_t total_mem, free_mem;
- check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
-
- // If we have more than 5% of free memory, we can set up the mempool
- uint64_t mem_pool_threshold = total_mem / 20; // 5% of total memory
- mem_pool_threshold =
- mem_pool_threshold - (mem_pool_threshold % 1024); // Align to 1KB
- if (mem_pool_threshold < free_mem) {
- // Get default memory pool
- cudaMemPool_t default_pool;
- check_cuda_error(cudaDeviceGetDefaultMemPool(&default_pool, gpu_index));
-
- // Enable opportunistic reuse
- int reuse = 1;
- check_cuda_error(cudaMemPoolSetAttribute(
- default_pool, cudaMemPoolReuseAllowOpportunistic, &reuse));
-
- // Prevent memory from being released back to the OS too soon
- check_cuda_error(cudaMemPoolSetAttribute(
- default_pool, cudaMemPoolAttrReleaseThreshold,
- &mem_pool_threshold));
-
- // Warm up the pool by allocating and freeing a large block
- cudaStream_t stream;
- stream = cuda_create_stream(gpu_index);
- void *warmup_ptr = nullptr;
- warmup_ptr = cuda_malloc_async(mem_pool_threshold, stream, gpu_index);
- cuda_drop_async(warmup_ptr, stream, gpu_index);
-
- // Sync to ensure pool is grown
- cuda_synchronize_stream(stream, gpu_index);
-
- // Clean up
- cuda_destroy_stream(stream, gpu_index);
- }
- }
- // We return to the original gpu_index
- cuda_set_device(caller_gpu_index);
- pool_mutex.unlock();
- }
-}
void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
- // Mempools are initialized only once in all the GPUS available
- cuda_setup_mempool(gpu_index);
}
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
@@ -402,13 +329,6 @@ int cuda_get_number_of_gpus() {
return num_gpus;
}
-int cuda_get_number_of_sms() {
- int num_sms = 0;
- check_cuda_error(
- cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
- return num_sms;
-}
-
/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
cuda_set_device(gpu_index);
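
The deleted cuda_setup_mempool relied only on stock CUDA memory-pool attributes. For reference, a minimal standalone sketch of that configuration (error handling omitted; the ~5% threshold mirrors the removed code, and configure_default_mempool is an illustrative name):

    #include <cuda_runtime.h>
    #include <cstdint>

    // Sketch only: keep up to ~5% of device memory cached in the default pool
    // and allow opportunistic reuse across streams.
    inline void configure_default_mempool(int device) {
      cudaSetDevice(device);

      size_t free_mem = 0, total_mem = 0;
      cudaMemGetInfo(&free_mem, &total_mem);

      cudaMemPool_t pool;
      cudaDeviceGetDefaultMemPool(&pool, device);

      int reuse = 1;
      cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowOpportunistic, &reuse);

      uint64_t threshold = total_mem / 20; // ~5%, as in the deleted helper
      cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
    }
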
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
index ac0b56a798..d3dba45bc7 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -148,8 +148,7 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
- auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
- is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_max_value_lut->broadcast_lut(streams, gpu_indexes);
}
lut = is_max_value_lut;
}
@@ -168,10 +167,7 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
- auto active_gpu_count_is_max =
- get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
- is_max_value_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_is_max, false);
+ is_max_value_lut->broadcast_lut(streams, gpu_indexes);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
@@ -503,9 +499,7 @@ __host__ void tree_sign_reduction(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true);
-
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ last_lut->broadcast_lut(streams, gpu_indexes);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
index 01e04888ad..917337b69b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -363,17 +363,14 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in,
- gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
+ extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+ lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
@@ -391,20 +388,13 @@ host_integer_decompress(cudaStream_t const *streams,
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
- lwe_after_pbs_vec, lut->lwe_indexes_out,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
+ encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j],
- gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
} else {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
index fb7cac01a7..6d75d7f2a8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -311,8 +311,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
- streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
- scalar_indexes, merged_interesting_remainder->num_radix_blocks);
+ streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
+ merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 5abf0e7005..d298fd9937 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -567,20 +567,16 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
- PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
+ (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
+ big_lwe_dimension + 1);
+
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -599,20 +595,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
- PUSH_RANGE("gather")
- multi_gpu_gather_lwe_async(
- streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
- lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count,
+ (Torus *)lwe_array_out->ptr,
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes,
+ num_radix_blocks, big_lwe_dimension + 1);
+
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -683,19 +674,16 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
- PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
+ (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
+ big_lwe_dimension + 1);
+
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -714,22 +702,15 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
- PUSH_RANGE("gather")
multi_gpu_gather_many_lut_lwe_async(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
num_many_lut);
- POP_RANGE()
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -814,17 +795,13 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
- PUSH_RANGE("scatter")
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
+ (Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
+ big_lwe_dimension + 1);
+
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -843,20 +820,15 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
- PUSH_RANGE("gather")
- multi_gpu_gather_lwe_async(
- streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
- lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count,
+ (Torus *)(lwe_array_out->ptr),
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes,
+ num_radix_blocks, big_lwe_dimension + 1);
+
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -1028,6 +1000,7 @@ void generate_device_accumulator_no_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
+
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1131,7 +1104,8 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
- // copy host lut and lut_indexes_vec to device
+ cuda_synchronize_stream(stream, gpu_index);
+ // copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1163,6 +1137,7 @@ void generate_device_accumulator_with_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
+
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1693,7 +1668,6 @@ __host__ void reduce_signs(
"than the number of blocks to operate on")
auto diff_buffer = mem_ptr->diff_buffer;
- auto active_gpu_count = mem_ptr->active_gpu_count;
auto params = mem_ptr->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1723,7 +1697,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function, true);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
while (num_sign_blocks > 2) {
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1754,7 +1728,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
@@ -1774,7 +1748,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1800,8 +1774,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- (*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -1838,8 +1811,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- (*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -1876,8 +1848,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- (*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -2392,10 +2363,9 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec,
- lut->active_gpu_count, lwe_array_out->num_radix_blocks,
- lut->input_big_lwe_dimension + 1);
+ (InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count,
+ lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
execute_keyswitch_async(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
@@ -2418,8 +2388,8 @@ __host__ void integer_radix_apply_noise_squashing_kb(
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec, nullptr,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
- lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
+ lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
+ big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
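
These hunks drop the event-based ordering between GPU 0 and the worker streams in favour of plain stream synchronization. The removed idiom is the standard CUDA record/wait pattern; a self-contained sketch (fan_out_fan_in and its arguments are illustrative, not backend API):

    #include <cuda_runtime.h>

    // Sketch only: workers wait for everything already queued on the primary
    // stream, then the primary waits for each worker, without blocking the
    // host in between.
    inline void fan_out_fan_in(cudaStream_t primary, cudaStream_t const *workers,
                               int n) {
      cudaEvent_t ready;
      cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
      cudaEventRecord(ready, primary);
      for (int i = 0; i < n; ++i)
        cudaStreamWaitEvent(workers[i], ready, 0);

      // ... enqueue per-worker work on workers[i] here ...

      for (int i = 0; i < n; ++i) {
        cudaEvent_t done;
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
        cudaEventRecord(done, workers[i]);
        cudaStreamWaitEvent(primary, done, 0);
        cudaEventDestroy(done); // released once the recorded work completes
      }
      cudaEventDestroy(ready);
    }
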
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 7ce5e3847c..ad50af6e90 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -415,10 +415,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
+ Torus *h_lwe_indexes_in_pinned;
+ Torus *h_lwe_indexes_out_pinned;
+ cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
+ total_ciphertexts * sizeof(Torus));
+ cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
+ total_ciphertexts * sizeof(Torus));
+ for (uint32_t i = 0; i < total_ciphertexts; i++) {
+ h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
+ h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
+ }
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
+ total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
+ total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ for (uint32_t i = 0; i < total_ciphertexts; i++) {
+ luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
+ luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
+ }
+ cudaFreeHost(h_lwe_indexes_in_pinned);
+ cudaFreeHost(h_lwe_indexes_out_pinned);
- // we just need to broadcast the indexes
- luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
- false);
+ luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb(
@@ -470,9 +491,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
- // we just need to broadcast the indexes
- luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
- false);
+ Torus *h_lwe_indexes_in_pinned;
+ Torus *h_lwe_indexes_out_pinned;
+ cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
+ num_blocks_in_apply_lut * sizeof(Torus));
+ cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
+ num_blocks_in_apply_lut * sizeof(Torus));
+ for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
+ h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
+ h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
+ }
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
+ num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
+ num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
+ luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
+ luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
+ }
+ cudaFreeHost(h_lwe_indexes_in_pinned);
+ cudaFreeHost(h_lwe_indexes_out_pinned);
+
+ luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb(
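
The block added above stages device-resident LWE indexes through page-locked host buffers before mirroring them into the regular host arrays. The same pattern in isolation (read_back is an illustrative name):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <vector>

    // Sketch only: read `count` 64-bit words back from the device through a
    // page-locked staging buffer, then hand them over as a regular host vector.
    inline std::vector<uint64_t> read_back(const uint64_t *d_src, size_t count,
                                           cudaStream_t stream) {
      uint64_t *pinned = nullptr;
      cudaMallocHost(reinterpret_cast<void **>(&pinned), count * sizeof(uint64_t));

      cudaMemcpyAsync(pinned, d_src, count * sizeof(uint64_t),
                      cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream); // staging buffer is valid only after this

      std::vector<uint64_t> host(pinned, pinned + count);
      cudaFreeHost(pinned);
      return host;
    }
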
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
index bd666fdb8a..eb79720172 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -48,10 +48,7 @@ void host_integer_grouped_oprf(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
if (!lut->using_trivial_lwe_indexes) {
PANIC("lut->using_trivial_lwe_indexes should be true");
@@ -59,8 +56,8 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
+ seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+ active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
execute_pbs_async(
@@ -75,18 +72,12 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
- lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, num_blocks_to_process,
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ for (uint32_t i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
index 87bd09a528..b9c0373a32 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -47,8 +47,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
- auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index 3fa42f4c5d..4bc90783a1 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -154,8 +154,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -254,8 +253,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -288,8 +286,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ one_block_lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -436,8 +434,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -543,8 +540,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ signed_msb_lut->broadcast_lut(streams, gpu_indexes);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice(
@@ -592,8 +588,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ one_block_lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -823,11 +819,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
}
- auto active_gpu_count =
- get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
- // We use false cause we only will broadcast the indexes
- scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
- active_gpu_count, false);
+ scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index be64884fe2..ea4564049d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -38,19 +38,6 @@ void multi_gpu_copy_array_async(cudaStream_t const *streams,
gpu_indexes[i], gpu_memory_allocated);
}
}
-/// Copy an array residing on one CPU to all active gpus
-template <typename Torus>
-void multi_gpu_copy_array_from_cpu_async(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
- uint32_t elements_per_gpu, bool gpu_memory_allocated) {
- dest.resize(gpu_count);
- for (uint i = 0; i < gpu_count; i++) {
- cuda_memcpy_with_size_tracking_async_to_gpu(
- dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
- gpu_indexes[i], gpu_memory_allocated);
- }
-}
/// Allocates the input/output vector for all devices
/// Initializes also the related indexing and initializes it to the trivial
/// index
@@ -106,35 +93,6 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}
-// This function reads lwes using the indexes and place them in a single aligned
-// array. This function is needed before communication to perform a single
-// contiguous data movement. Each block handles one lwe.
-template <typename Torus>
-__global__ void align_with_indexes(Torus *d_packed_vector,
- Torus const *d_vector,
- Torus const *d_indexes, int lwe_size) {
-
- int output_offset = blockIdx.x * lwe_size;
- int input_offset = d_indexes[blockIdx.x] * lwe_size;
- for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
- d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
- }
-}
-
-// This function takes the aligned array after communication and places it in
-// the corresponding indexes. Each block handles one lwe.
-template <typename Torus>
-__global__ void realign_with_indexes(Torus *d_vector,
- Torus const *d_packed_vector,
- Torus const *d_indexes, int lwe_size) {
-
- int input_offset = blockIdx.x * lwe_size;
- int output_offset = d_indexes[blockIdx.x] * lwe_size;
- for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
- d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
- }
-}
-
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -144,15 +102,15 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
                                  uint32_t gpu_count, std::vector<Torus *> &dest,
- Torus const *src, Torus const *d_src_indexes,
+ Torus const *src, Torus const *h_src_indexes,
bool is_trivial_index,
-                                 std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
if (max_active_gpu_count < gpu_count)
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -169,28 +127,18 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);
} else {
- if (aligned_vec.size() == 0)
- PANIC("Cuda error: auxiliary arrays should be setup!");
-
- if (d_src_indexes == nullptr)
+ if (h_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
+ auto src_indexes = h_src_indexes + gpu_offset;
- cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
- cuda_set_device(gpu_indexes[0]);
- align_with_indexes<<>>(
- aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
- lwe_size);
- check_cuda_error(cudaGetLastError());
- cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
- cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
-
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
- streams[i], gpu_indexes[i], true);
+ for (uint j = 0; j < inputs_on_gpu; j++) {
+ auto d_dest = dest[i] + j * lwe_size;
+ auto d_src = src + src_indexes[j] * lwe_size;
- cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
- cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
- cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
+ true);
+ }
}
}
}
@@ -202,8 +150,7 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
                                 Torus *dest, const std::vector<Torus *> &src,
- Torus *d_dest_indexes, bool is_trivial_index,
-                                std::vector<Torus *> &aligned_vec,
+ Torus *h_dest_indexes, bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
@@ -221,27 +168,19 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
- if (aligned_vec.size() == 0)
- PANIC("Cuda error: auxiliary arrays should be setup!");
- if (d_dest_indexes == nullptr)
+ if (h_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
- cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
-
- cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
- cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
+ auto dest_indexes = h_dest_indexes + gpu_offset;
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
- streams[i], gpu_indexes[i], true);
+ for (uint j = 0; j < inputs_on_gpu; j++) {
+ auto d_dest = dest + dest_indexes[j] * lwe_size;
+ auto d_src = src[i] + j * lwe_size;
- cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
- cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
- cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
- cuda_set_device(gpu_indexes[0]);
- realign_with_indexes<<>>(
- dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);
- check_cuda_error(cudaGetLastError());
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
+ true);
+ }
}
}
}
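
The rewritten scatter/gather helpers issue one async copy per LWE, driven by host-resident index tables, instead of packing on the device with the removed align/realign kernels. A condensed sketch of that index-driven scatter (the even per-GPU split, the types, and cudaMemcpyDefault as the copy kind are simplifying assumptions):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <vector>

    // Sketch only: scatter rows of `src` (each row has `row_len` elements)
    // across devices, picking source rows through a host-side index table and
    // issuing one async copy per row on the receiving device's stream.
    template <typename T>
    void scatter_rows_by_index(const T *src, const uint64_t *h_indexes,
                               const std::vector<T *> &dest,
                               cudaStream_t const *streams,
                               uint32_t const *gpu_indexes,
                               uint32_t rows_per_gpu, uint32_t gpu_count,
                               uint32_t row_len) {
      uint32_t offset = 0;
      for (uint32_t g = 0; g < gpu_count; ++g, offset += rows_per_gpu) {
        cudaSetDevice(gpu_indexes[g]);
        for (uint32_t j = 0; j < rows_per_gpu; ++j) {
          const T *row = src + h_indexes[offset + j] * row_len;
          cudaMemcpyAsync(dest[g] + j * row_len, row, row_len * sizeof(T),
                          cudaMemcpyDefault, streams[g]);
        }
      }
    }
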
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 72c0434b3b..6f02c58a4a 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -88,8 +88,6 @@ extern "C" {
pub fn cuda_get_number_of_gpus() -> i32;
- pub fn cuda_get_number_of_sms() -> i32;
-
pub fn cuda_synchronize_device(gpu_index: u32);
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
index 0760956ac9..d84545c541 100644
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
@@ -15,7 +15,7 @@
[rtl]
bpip_use = true
- bpip_use_opportunism = false
+ bpip_use_opportunism = true
bpip_timeout = 100_000
[board]
@@ -35,21 +35,13 @@
bsk_pc = [
{Hbm={pc=8}},
- {Hbm={pc=10}},
{Hbm={pc=12}},
- {Hbm={pc=14}},
{Hbm={pc=24}},
- {Hbm={pc=26}},
{Hbm={pc=28}},
- {Hbm={pc=30}},
{Hbm={pc=40}},
- {Hbm={pc=42}},
{Hbm={pc=44}},
- {Hbm={pc=46}},
{Hbm={pc=56}},
- {Hbm={pc=58}},
- {Hbm={pc=60}},
- {Hbm={pc=62}}
+ {Hbm={pc=60}}
]
ksk_pc = [
@@ -78,7 +70,7 @@
#implementation = "Ilp"
implementation = "Llt"
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
- min_batch_size = 9
+ min_batch_size = 11
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
@@ -95,8 +87,8 @@
custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
[firmware.op_cfg.default]
- fill_batch_fifo = false
- min_batch_size = true
+ fill_batch_fifo = true
+ min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml
index f57f27147c..4afc095ab6 100644
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml
@@ -49,15 +49,3 @@ offset= 0x10
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
-
-[section.hpu_reset]
-description="Used to control the HPU soft reset"
-offset= 0x100
-
-[section.hpu_reset.register.trigger]
- description="A soft reset for the whole HPU reconfigurable logic"
- owner="Kernel"
- read_access="Read"
- write_access="WriteNotify"
- field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"}
- field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}
diff --git a/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu b/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
index 18114dc0a6..d7091cff2a 100644
--- a/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
+++ b/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
-size 84230351
+oid sha256:1d1afb554756df4d8b39bee33ded2dda19c23a6f9d8e2b242092efd35cf1cc19
+size 83281321
diff --git a/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py b/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
index a69d87899c..edc6eb411f 100644
--- a/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
+++ b/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
@@ -31,22 +31,14 @@ def __init__(self, d):
self.__dict__ = d
def args(self):
- try:
- return f'R{self.rid} @{hex(self.slot["Addr"])}'
- except:
- # It can happen that an IOP is not translated by the FW
- return f'R{self.rid} @{self.slot}'
+ return f'R{self.rid} @{hex(self.slot["Addr"])}'
class ST(BaseInstruction):
def __init__(self, d):
self.__dict__ = d
def args(self):
- try:
- return f'@{hex(self.slot["Addr"])} R{self.rid}'
- except:
- # It can happen that an IOP is not translated by the FW
- return f'@{self.slot} R{self.rid}'
+ return f'@{hex(self.slot["Addr"])} R{self.rid}'
class MAC(BaseInstruction):
def __init__(self, d):
diff --git a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
index 3f4f70b4ac..25e5f4d9a4 100644
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -176,18 +176,6 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
imm: 1,
};
-pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to have this dynamic
-pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
- dst: [VarMode::Native; SIMD_N],
- src: [VarMode::Native; 2 * SIMD_N],
- imm: 0,
-};
-pub const IOP_2NCT_F_3NCT: ConstIOpProto<{ 2 * SIMD_N }, { 3 * SIMD_N }> = ConstIOpProto {
- dst: [VarMode::Native; 2 * SIMD_N],
- src: [VarMode::Native; 3 * SIMD_N],
- imm: 0,
-};
-
use crate::iop;
use arg::IOpFormat;
use lazy_static::lazy_static;
@@ -239,6 +227,4 @@ iop!(
[IOP_CT_F_CT -> "LEAD1", opcode::LEAD1],
[IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
[IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
- [IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
- [IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
);
diff --git a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
index 13da895694..34cde3c0df 100644
--- a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
@@ -87,10 +87,6 @@ pub const LEAD1: u8 = 0x85;
pub const TRAIL0: u8 = 0x86;
pub const TRAIL1: u8 = 0x87;
-// SIMD for maximum throughput
-pub const ADD_SIMD: u8 = 0xF0;
-pub const ERC_20_SIMD: u8 = 0xF1;
-//
// Utility operations
// Used to handle real clone of ciphertext already uploaded in the Hpu memory
pub const MEMCPY: u8 = 0xFF;
diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
index 700ee37c56..e08576a2d6 100644
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
@@ -72,9 +72,6 @@ crate::impl_fw!("Ilp" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
- // SIMD Implementations
- ADD_SIMD => fw_impl::llt::iop_add_simd;
- ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
#[instrument(level = "trace", skip(prog))]
diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
index 44103c2248..173dc96e39 100644
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
@@ -57,16 +57,16 @@ crate::impl_fw!("Llt" [
OVF_SSUB => fw_impl::ilp::iop_overflow_ssub;
OVF_MULS => fw_impl::ilp::iop_overflow_muls;
- BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
- BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
- BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
+ BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
+ BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
+ BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
- CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
- CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
- CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
- CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
- CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
- CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
+ CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
+ CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
+ CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
+ CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
+ CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
+ CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
@@ -81,10 +81,6 @@ crate::impl_fw!("Llt" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
-
- // SIMD Implementations
- ADD_SIMD => fw_impl::llt::iop_add_simd;
- ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
// ----------------------------------------------------------------------------
@@ -106,17 +102,6 @@ pub fn iop_add(prog: &mut Program) {
iop_addx(prog, dst, src_a, src_b);
}
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_add_simd(prog: &mut Program) {
- // Add Comment header
- prog.push_comment("ADD_SIMD Operand::Dst Operand::Src Operand::Src".to_string());
- simd(
- prog,
- crate::asm::iop::SIMD_N,
- fw_impl::llt::iop_add_ripple_rtl,
- );
-}
-
pub fn iop_adds(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
@@ -204,7 +189,7 @@ pub fn iop_mul(prog: &mut Program) {
// Add Comment header
prog.push_comment("MUL Operand::Dst Operand::Src Operand::Src".to_string());
-
+  // Defer to the generic mulx implementation
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
@@ -220,50 +205,29 @@ pub fn iop_muls(prog: &mut Program) {
// Add Comment header
prog.push_comment("MULS Operand::Dst Operand::Src Operand::Immediat".to_string());
-
+  // Defer to the generic mulx implementation
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
- // Add Comment header
- prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
- iop_erc_20_rtl(prog, 0).add_to_prog(prog);
-}
-
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_simd(prog: &mut Program) {
- // Add Comment header
- prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
- simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
-}
-
-// ----------------------------------------------------------------------------
-// Helper Functions
-// ----------------------------------------------------------------------------
-
/// Implement erc_20 fund xfer
/// Targeted algorithm is as follow:
/// 1. Check that from has enough funds
/// 2. Compute real_amount to xfer (i.e. amount or 0)
/// 3. Compute new amount (from - new_amount, to + new_amount)
-///
-/// The input operands are:
-/// (from[0], to[0], amount[0], ..., from[N-1], to[N-1], amount[N-1])
-/// The output operands are:
-/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
-/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
+pub fn iop_erc_20(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
- let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
- let dst_to = prog.iop_template_var(OperandKind::Dst, 2 * batch_index + 1);
+ let dst_from = prog.iop_template_var(OperandKind::Dst, 0);
+ let dst_to = prog.iop_template_var(OperandKind::Dst, 1);
// Src -> Operand
- let src_from = prog.iop_template_var(OperandKind::Src, 3 * batch_index);
- let src_to = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 1);
+ let src_from = prog.iop_template_var(OperandKind::Src, 0);
+ let src_to = prog.iop_template_var(OperandKind::Src, 1);
// Src Amount -> Operand
- let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
+ let src_amount = prog.iop_template_var(OperandKind::Src, 2);
+
+ // Add Comment header
+ prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
// TODO: Make this a parameter or sweep this
// All these little parameters would be very handy to write an
@@ -272,7 +236,7 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
let kogge_blk_w = 10;
let ripple = true;
- {
+ let tree = {
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
let lut = pbs_by_name!("IfFalseZeroed");
@@ -309,26 +273,13 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
}
- }
-}
-
-/// A SIMD implementation of add for maximum throughput
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
- // Allocate metavariables:
- let dst = prog.iop_template_var(OperandKind::Dst, i);
- let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
- let src_b = prog.iop_template_var(OperandKind::Src, 2 * i + 1);
-
- // Convert MetaVarCell in VarCell for Rtl analysis
- let a = VarCell::from_vec(src_a);
- let b = VarCell::from_vec(src_b);
- let d = VarCell::from_vec(dst);
-
- // Do a + b with the ripple carry adder
- kogge::ripple_add(d, a, b, None)
+ };
+ tree.add_to_prog(prog);
}
+// ----------------------------------------------------------------------------
+// Helper Functions
+// ----------------------------------------------------------------------------
fn iop_addx(
prog: &mut Program,
dst: Vec<MetaVarCell>,
@@ -362,181 +313,11 @@ fn iop_subx(
.add_to_prog(prog);
}
-/// Generic mul operation for massively parallel HPUs
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_mulx_par(
- prog: &mut Program,
- dst: Vec<MetaVarCell>,
- src_a: Vec<MetaVarCell>,
- src_b: Vec<MetaVarCell>,
-) -> Rtl {
- let props = prog.params();
- let tfhe_params: asm::DigitParameters = props.clone().into();
- let blk_w = props.blk_w();
-
- // Transform metavars into RTL vars
- let mut dst = VarCell::from_vec(dst);
- let src_a = VarCell::from_vec(src_a);
- let src_b = VarCell::from_vec(src_b);
- let max_deg = VarDeg {
- deg: props.max_val(),
- nu: props.nu,
- };
-
- let pbs_mul_lsb = pbs_by_name!("MultCarryMsgLsb");
- let pbs_mul_msb = pbs_by_name!("MultCarryMsgMsb");
- let max_carry = (props.max_msg() * props.max_msg()) >> props.msg_w;
- let max_msg = props.max_msg();
-
- let mut mul_map: HashMap<usize, Vec<VarCellDeg>> = HashMap::new();
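- // Each digit pair (i, j) yields a partial product whose message part feeds column i+j and whose
- // carry part feeds column i+j+1, each extracted with a dedicated PBS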
- itertools::iproduct!(0..blk_w, 0..blk_w).for_each(|(i, j)| {
- let pp = src_a[i].mac(tfhe_params.msg_range(), &src_b[j]);
- let lsb = pp.single_pbs(&pbs_mul_lsb);
- let msb = pp.single_pbs(&pbs_mul_msb);
- mul_map
- .entry(i + j)
- .or_default()
- .push(VarCellDeg::new(max_msg, lsb));
- mul_map
- .entry(i + j + 1)
- .or_default()
- .push(VarCellDeg::new(max_carry, msb));
- });
-
- let mut pp: Vec = (0..dst.len())
- .map(|i| mul_map.remove(&i).unwrap().into())
- .collect();
-
- // Dadda-tree-style reduction: compress every column until no column holds more than one element
- while pp.iter().any(|x| x.len() > 1) {
- trace!(
- target: "llt::mul",
- "pp length: {:?}",
- pp.iter().map(|x| x.len()).collect::<Vec<_>>()
- );
- for c in (0..dst.len()).rev() {
- let mut col_len = pp[c].len();
- let mut reduced = Vec::new();
- let mut chunks = pp[c].deg_chunks(&max_deg).peekable();
- let max_col = if c == (dst.len() - 1) {
- 0
- } else {
- dst.len() - 1
- };
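- // Reduce this column only while it is longer than the last column; the last column itself is
- // compared against column 0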
-
- while chunks.peek().is_some() && col_len > pp[max_col].len() {
- let mut chunk = chunks.next().unwrap();
- let chunk_len = chunk.len();
- col_len -= chunk.len();
-
- // sum the chunk
- while chunk.len() > 1 {
- chunk = chunk
- .chunks(2)
- .map(|chunk| match chunk.len() {
- 1 => chunk[0].clone(),
- 2 => &chunk[0] + &chunk[1],
- _ => panic!("Invalid chunk size"),
- })
- .collect()
- }
-
- // Bootstrap when the chunk has reached the degree budget (or is a lone element); the extracted carry is pushed into the next column
- let element = chunk
- .into_iter()
- .next()
- .map(|sum| {
- assert!(sum.deg.nu <= props.nu);
- if sum.deg == max_deg || chunk_len == 1 {
- let (data, carry) = sum.bootstrap(&props);
- if let (Some(carry), Some(elm)) = (carry, pp.get_mut(c + 1)) {
- elm.push(carry);
- }
- data
- } else {
- sum
- }
- })
- .unwrap();
-
- reduced.push(element);
- }
-
- pp[c] = reduced
- .into_iter()
- .chain(chunks.flatten())
- .collect::<Vec<_>>()
- .into();
- }
- }
-
- trace!(
- target: "llt::mul",
- "final pp: {:?}", pp
- );
-
- // Extract carry and message and do carry propagation
- let mut a: Vec