diff --git a/.github/workflows/benchmark_hpu_hlapi.yml b/.github/workflows/benchmark_hpu_hlapi.yml
deleted file mode 100644
index 9f8a5584d2..0000000000
--- a/.github/workflows/benchmark_hpu_hlapi.yml
+++ /dev/null
@@ -1,98 +0,0 @@
-# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
-name: Hpu Hlapi Benchmarks
-
-on:
- workflow_dispatch:
-
-env:
- CARGO_TERM_COLOR: always
- RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
- ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
- RUST_BACKTRACE: "full"
- RUST_MIN_STACK: "8388608"
-
-permissions: {}
-
-jobs:
- hlapi-benchmarks-hpu:
- name: Execute HLAPI benchmarks for HPU backend
- runs-on: v80-desktop
- concurrency:
- group: ${{ github.workflow }}_${{ github.ref }}
- cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
- timeout-minutes: 1440 # 24 hours
- steps:
- # Needed as long as hw_regmap repository is private
- - name: Configure SSH
- uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
- with:
- ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-
- - name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
- with:
- fetch-depth: 0
- persist-credentials: 'false'
- lfs: true
- token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
- - name: Get benchmark details
- run: |
- COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
- {
- echo "BENCH_DATE=$(date --iso-8601=seconds)";
- echo "COMMIT_DATE=${COMMIT_DATE}";
- echo "COMMIT_HASH=$(git describe --tags --dirty)";
- } >> "${GITHUB_ENV}"
- env:
- SHA: ${{ github.sha }}
-
- - name: Install rust
- uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
- with:
- toolchain: nightly
-
- - name: Checkout Slab repo
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
- with:
- repository: zama-ai/slab
- path: slab
- persist-credentials: 'false'
- token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
- - name: Run benchmarks
- run: |
- make pull_hpu_files
- export V80_SERIAL_NUMBER=XFL12E4XJXWK
- source /opt/xilinx/Vivado/2024.2/settings64.sh
- make bench_hlapi_erc20_hpu
- make bench_hlapi_hpu
-
- - name: Parse results
- run: |
- python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
- --database tfhe_rs \
- --hardware "hpu_x1" \
- --backend hpu \
- --project-version "${COMMIT_HASH}" \
- --branch "${REF_NAME}" \
- --commit-date "${COMMIT_DATE}" \
- --bench-date "${BENCH_DATE}" \
- --walk-subdirs
- env:
- REF_NAME: ${{ github.ref_name }}
-
- - name: Upload parsed results artifact
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
- with:
- name: ${{ github.sha }}_hlapi_benchmarks
- path: ${{ env.RESULTS_FILENAME }}
-
- - name: Send data to Slab
- shell: bash
- run: |
- python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
- --slab-url "${SLAB_URL}"
- env:
- JOB_SECRET: ${{ secrets.JOB_SECRET }}
- SLAB_URL: ${{ secrets.SLAB_URL }}
diff --git a/.github/workflows/benchmark_hpu_integer.yml b/.github/workflows/benchmark_hpu_integer.yml
index 612b3c7f5c..e09bbb51ee 100644
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -3,15 +3,6 @@ name: Hpu Integer Benchmarks
on:
workflow_dispatch:
- inputs:
- bench_type:
- description: "Benchmarks type"
- type: choice
- default: both
- options:
- - latency
- - throughput
- - both
env:
CARGO_TERM_COLOR: always
@@ -23,46 +14,13 @@ env:
permissions: {}
jobs:
- prepare-matrix:
- name: Prepare operations matrix
- runs-on: v80-desktop
- outputs:
- bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
- steps:
- - name: Set benchmark types
- if: github.event_name == 'workflow_dispatch'
- run: |
- if [[ -z $INPUTS_BENCH_TYPE || "${INPUTS_BENCH_TYPE}" == "both" ]]; then
- echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
- else
- echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
- fi
- env:
- INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}
-
- - name: Default benchmark type
- if: github.event_name != 'workflow_dispatch'
- run: |
- echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
-
-
- - name: Set benchmark types output
- id: set_bench_type
- run: | # zizmor: ignore[template-injection] this env variable is safe
- echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
-
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
- needs: prepare-matrix
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
- strategy:
- max-parallel: 1
- matrix:
- bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
@@ -105,11 +63,8 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
- export V80_SERIAL_NUMBER=XFL12E4XJXWK
- source /opt/xilinx/Vivado/2024.2/settings64.sh
- make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
- env:
- BENCH_TYPE: ${{ matrix.bench_type }}
+ make bench_integer_hpu
+ make bench_hlapi_erc20_hpu
- name: Parse results
run: |
@@ -121,16 +76,14 @@ jobs:
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
- --walk-subdirs \
- --bench-type "${BENCH_TYPE}"
+ --walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
- name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
+ name: ${{ github.sha }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
diff --git a/.github/workflows/cargo_test_ntt.yml b/.github/workflows/cargo_test_ntt.yml
index da945bd612..dc01d7723b 100644
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -11,7 +11,6 @@ env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
- SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -32,7 +31,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
- persist-credentials: "false"
+ persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
@@ -46,46 +45,18 @@ jobs:
- tfhe-ntt/**
- '.github/workflows/cargo_test_ntt.yml'
- setup-instance:
- needs: should-run
- if: needs.should-run.outputs.ntt_test == 'true'
- runs-on: ubuntu-latest
- outputs:
- matrix_os: ${{ steps.set-os-matrix.outputs.matrix_os }}
- runner-name: ${{ steps.start-remote-instance.outputs.label }}
- steps:
- - name: Start remote instance
- id: start-remote-instance
- if: env.SECRETS_AVAILABLE == 'true'
- uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
- with:
- mode: start
- github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
- slab-url: ${{ secrets.SLAB_BASE_URL }}
- job-secret: ${{ secrets.JOB_SECRET }}
- backend: aws
- profile: cpu-small
-
- - name: Set os matrix
- id: set-os-matrix
- env:
- SLAB_INSTANCE: ${{ steps.start-remote-instance.outputs.label }}
- run: |
- INSTANCE_TO_USE="${SLAB_INSTANCE:-ubuntu-latest}"
- echo "matrix_os=[\"${INSTANCE_TO_USE}\", \"macos-latest\", \"windows-latest\"]" >> "$GITHUB_OUTPUT"
-
cargo-tests-ntt:
- needs: [should-run, setup-instance]
+ needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
+ os: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
- persist-credentials: "false"
+ persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -101,16 +72,16 @@ jobs:
run: make test_ntt_no_std
cargo-tests-ntt-nightly:
- needs: [should-run, setup-instance]
+ needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
- os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
+ os: [ ubuntu-latest, macos-latest, windows-latest ]
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
- persist-credentials: "false"
+ persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -126,7 +97,7 @@ jobs:
run: make test_ntt_no_std_nightly
cargo-tests-ntt-successful:
- needs: [should-run, cargo-tests-ntt, cargo-tests-ntt-nightly]
+ needs: [ should-run, cargo-tests-ntt, cargo-tests-ntt-nightly ]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
@@ -149,28 +120,3 @@ jobs:
run: |
echo "Some tfhe-ntt tests failed"
exit 1
-
- teardown-instance:
- name: Teardown instance (cargo-tests-ntt-successful)
- if: ${{ always() && needs.setup-instance.result == 'success' }}
- needs: [setup-instance, cargo-tests-ntt-successful]
- runs-on: ubuntu-latest
- steps:
- - name: Stop remote instance
- id: stop-instance
- if: env.SECRETS_AVAILABLE == 'true'
- uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
- with:
- mode: stop
- github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
- slab-url: ${{ secrets.SLAB_BASE_URL }}
- job-secret: ${{ secrets.JOB_SECRET }}
- label: ${{ needs.setup-instance.outputs.runner-name }}
-
- - name: Slack Notification
- if: ${{ failure() }}
- continue-on-error: true
- uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
- env:
- SLACK_COLOR: ${{ job.status }}
- SLACK_MESSAGE: "Instance teardown (cargo-tests-ntt) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
diff --git a/.github/workflows/coprocessor-benchmark-gpu.yml b/.github/workflows/coprocessor-benchmark-gpu.yml
index 6d4482c74e..eebae996e8 100644
--- a/.github/workflows/coprocessor-benchmark-gpu.yml
+++ b/.github/workflows/coprocessor-benchmark-gpu.yml
@@ -100,7 +100,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+ uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
path: tfhe-rs
persist-credentials: false
@@ -111,7 +111,7 @@ jobs:
ls
- name: Checkout fhevm
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+ uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -162,10 +162,10 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
- uses: foundry-rs/foundry-toolchain@82dee4ba654bd2146511f85f0d013af94670c4de
+ uses: foundry-rs/foundry-toolchain@de808b1eea699e761c404bda44ba8f21aba30b2c
- name: Cache cargo
- uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
+ uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: |
~/.cargo/registry
@@ -175,7 +175,7 @@ jobs:
restore-keys: ${{ runner.os }}-cargo-
- name: Login to GitHub Container Registry
- uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
+ uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: ghcr.io
username: ${{ github.actor }}
@@ -186,7 +186,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
- name: Use Node.js
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+ uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2
with:
node-version: 20.x
@@ -257,7 +257,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
diff --git a/Cargo.toml b/Cargo.toml
index 8a8630d70f..17e9009327 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,7 +33,6 @@ rand = "0.8"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.100"
-getrandom = "0.2.8"
[profile.bench]
lto = "fat"
diff --git a/Makefile b/Makefile
index b2144c8aad..a9e9683e5e 100644
--- a/Makefile
+++ b/Makefile
@@ -1312,11 +1312,11 @@ bench_signed_integer_gpu: install_rs_check_toolchain
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
- source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
+ source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
- --features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
+ --features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
@@ -1497,13 +1497,11 @@ bench_hlapi_gpu: install_rs_check_toolchain
--bench hlapi \
--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --
-.PHONY: bench_hlapi_hpu # Run benchmarks for HLAPI operations on HPU
+.PHONY: bench_hlapi_hpu # Run benchmarks for integer operations on HPU
bench_hlapi_hpu: install_rs_check_toolchain
- source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
- RUSTFLAGS="$(RUSTFLAGS)" \
- cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+ RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi \
- --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
+ --features=integer,hpu,hpu-v80,internal-keycache,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
bench_hlapi_erc20: install_rs_check_toolchain
@@ -1531,11 +1529,11 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
- source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
+ source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
- --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
+ --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain
diff --git a/README.md b/README.md
index c927f0618f..cc03588dad 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ production-ready library for all the advanced features of TFHE.
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
- **Size-efficient public key encryption**
- **Ciphertext and server key compression** for efficient data transfer
-- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
+- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
```
> [!Note]
-> Note: You need to use Rust version >= 1.84 to compile TFHE-rs.
+> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
> [!Note]
> Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
@@ -147,7 +147,7 @@ To run this code, use the following command:
> [!Note]
> Note that when running code that uses `TFHE-rs`, it is highly recommended
-to run in release mode with cargo's `--release` flag to have the best performances possible.
+to run in release mode with cargo's `--release` flag to have the best performance possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
@@ -201,9 +201,11 @@ When a new update is published in the Lattice Estimator, we update parameters ac
### Security model
-By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
+By default, the parameter sets used in the High-Level API with the x86 CPU backend have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).
+The default parameters used in the High-Level API with the GPU backend are chosen for the IND-CPA security model, with a bootstrapping failure probability fixed at $p_{error} \le 2^{-128}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters chosen in consideration of the IND-CPA^D security model [2].
+
[1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
[2] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf
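For illustration only (this snippet is not part of the patch): a minimal sketch of where the parameter choice discussed above enters user code, assuming a recent `tfhe` release with the `integer` feature enabled. `ConfigBuilder::default()` selects the default parameter sets described in this section; a custom set (for example one chosen under the IND-CPA model) would be supplied through the same builder instead.

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    // Default configuration: the default parameter sets described above
    // (failure probability <= 2^-128). A custom, IND-CPA-oriented parameter
    // set would be plugged into the builder here instead.
    let config = ConfigBuilder::default().build();

    let (client_key, server_key) = generate_keys(config);
    set_server_key(server_key);

    // Small end-to-end check that the chosen configuration works.
    let a = FheUint8::encrypt(17u8, &client_key);
    let b = FheUint8::encrypt(25u8, &client_key);
    let sum: u8 = (a + b).decrypt(&client_key);
    assert_eq!(sum, 42);
}
```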
@@ -242,7 +244,7 @@ This software is distributed under the **BSD-3-Clause-Clear** license. Read [thi
#### FAQ
**Is Zama’s technology free to use?**
>Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
->
+>
>Everything we do is open source and we are very transparent on what it means for our users, you can read more about how we monetize our open source products at Zama in [this blogpost](https://www.zama.ai/post/open-source).
**What do I need to do if I want to use Zama’s technology for commercial purposes?**
diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index c833104f12..1cd6c313b9 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -119,8 +119,6 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
int cuda_get_number_of_gpus();
-int cuda_get_number_of_sms();
-
void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
index 464a20a0d4..973193c4f8 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -115,10 +115,8 @@ template struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
- auto active_gpu_count =
- get_active_gpu_count(num_blocks_to_decompress, gpu_count);
- decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index ed9831d331..bb4766c592 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -320,15 +320,10 @@ template struct int_radix_lut {
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
-  std::vector<Torus *> lwe_aligned_vec;
uint32_t *gpu_indexes;
bool gpu_memory_allocated;
- cudaEvent_t event_scatter_in;
- cudaEvent_t *event_scatter_out;
- cudaEvent_t event_broadcast;
-
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
@@ -347,6 +342,7 @@ template struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -363,21 +359,10 @@ template struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
- // We create the events only if we have multiple GPUs
- if (active_gpu_count > 1) {
- event_scatter_in = cuda_create_event(gpu_indexes[0]);
- event_broadcast = cuda_create_event(gpu_indexes[0]);
-
- event_scatter_out =
- (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
- for (int i = 0; i < active_gpu_count; i++) {
- event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
- }
- }
-
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -396,6 +381,8 @@ template struct int_radix_lut {
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
+
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -511,8 +498,11 @@ template struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
+
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
+
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -569,6 +559,7 @@ template struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -585,19 +576,10 @@ template struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
- // We create the events only if we have multiple GPUs
- if (active_gpu_count > 1) {
- event_scatter_in = cuda_create_event(gpu_indexes[0]);
- event_broadcast = cuda_create_event(gpu_indexes[0]);
-
- event_scatter_out =
- (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
- for (int i = 0; i < active_gpu_count; i++) {
- event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
- }
- }
+
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -613,8 +595,11 @@ template struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
+
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
+
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -665,9 +650,11 @@ template struct int_radix_lut {
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks,
size_tracker, allocate_gpu_memory);
- multi_gpu_copy_array_from_cpu_async(
- streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
- h_lwe_indexes_in, num_radix_blocks, allocate_gpu_memory);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
+ lwe_trivial_indexes_vec, lwe_trivial_indexes,
+ num_radix_blocks, allocate_gpu_memory);
+
// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async(
@@ -724,87 +711,29 @@ template struct int_radix_lut {
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
- // We only do broadcast if there are more than 1 active GPU
- if (active_gpu_count > 1) {
- int active_device = cuda_get_device();
+ int active_device = cuda_get_device();
- uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
+ uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
- auto src_lut = lut_vec[0];
- auto src_lut_indexes = lut_indexes_vec[0];
-
- cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
- for (uint i = 0; i < active_gpu_count; i++) {
- if (gpu_indexes[i] != gpu_indexes[0]) {
- cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
- auto dst_lut = lut_vec[i];
- auto dst_lut_indexes = lut_indexes_vec[i];
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
- gpu_indexes[i], gpu_memory_allocated);
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
- streams[i], gpu_indexes[i], gpu_memory_allocated);
- }
- }
- // Ensure the device set at the end of this method is the same as it was
- // set at the beginning
- cuda_set_device(active_device);
- }
- }
- // Broadcast luts from device gpu_indexes[0] to all active gpus
- void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t new_active_gpu_count,
- bool broadcast_lut_values = true) {
- // We only do broadcast if there are more than 1 active GPU
- if (new_active_gpu_count > 1) {
- int active_device = cuda_get_device();
-
- uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
-
- auto src_lut = lut_vec[0];
- auto src_lut_indexes = lut_indexes_vec[0];
- if (active_gpu_count > 1)
- cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
- for (uint i = 0; i < new_active_gpu_count; i++) {
- if (gpu_indexes[i] != gpu_indexes[0]) {
- cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
- if (broadcast_lut_values) {
- auto dst_lut = lut_vec[i];
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
- streams[i], gpu_indexes[i], gpu_memory_allocated);
- }
- auto dst_lut_indexes = lut_indexes_vec[i];
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
- streams[i], gpu_indexes[i], gpu_memory_allocated);
- }
- }
- // Ensure the device set at the end of this method is the same as it was
- // set at the beginning
- cuda_set_device(active_device);
- }
- }
+ auto src_lut = lut_vec[0];
+ auto src_lut_indexes = lut_indexes_vec[0];
- void allocate_lwe_vector_for_non_trivial_indexes(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
- uint32_t active_gpu_count, uint64_t max_num_radix_blocks,
- uint64_t &size_tracker, bool allocate_gpu_memory) {
- // We need to create the auxiliary array only in GPU 0
- lwe_aligned_vec.resize(active_gpu_count);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
- uint64_t size_tracker_on_array_i = 0;
- auto inputs_on_gpu = std::max(
- THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(max_num_radix_blocks, i, active_gpu_count));
- Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
- inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
- streams[0], gpu_indexes[0], size_tracker_on_array_i,
- allocate_gpu_memory);
- lwe_aligned_vec[i] = d_array;
- size_tracker += size_tracker_on_array_i;
+ if (gpu_indexes[i] != gpu_indexes[0]) {
+ auto dst_lut = lut_vec[i];
+ auto dst_lut_indexes = lut_indexes_vec[i];
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
+ gpu_indexes[i], gpu_memory_allocated);
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
+ streams[i], gpu_indexes[i], gpu_memory_allocated);
+ }
}
+ // Ensure the device set at the end of this method is the same as it was set
+ // at the beginning
+ cuda_set_device(active_device);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -816,6 +745,7 @@ template struct int_radix_lut {
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
+
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -862,23 +792,6 @@ template struct int_radix_lut {
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
- if (active_gpu_count > 1) {
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_synchronize_stream(streams[i], gpu_indexes[i]);
- cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
- }
- cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
- cuda_event_destroy(event_broadcast, gpu_indexes[0]);
- free(event_scatter_out);
- }
- if (lwe_aligned_vec.size() > 0) {
- for (uint i = 0; i < active_gpu_count; i++) {
- cuda_drop_with_size_tracking_async(lwe_aligned_vec[i], streams[0],
- gpu_indexes[0],
- gpu_memory_allocated);
- }
- lwe_aligned_vec.clear();
- }
}
free(h_lut_indexes);
free(degrees);
@@ -927,8 +840,6 @@ template struct int_noise_squashing_lut {
bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
-  std::vector<Torus *> lwe_aligned_scatter_vec;
- std::vector<__uint128_t *> lwe_aligned_gather_vec;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,
@@ -1070,10 +981,7 @@ template struct int_noise_squashing_lut {
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
- if (lwe_aligned_gather_vec.size() > 0) {
- multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_gather_vec);
- multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_scatter_vec);
- }
+
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
@@ -1136,10 +1044,7 @@ template struct int_bit_extract_luts_buffer {
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
-
- auto active_gpu_count =
- get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1165,9 +1070,6 @@ template struct int_bit_extract_luts_buffer {
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);
- lut->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, active_gpu_count,
- num_radix_blocks * bits_per_block, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes_in);
@@ -1309,9 +1211,7 @@ template struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
- auto active_gpu_count_mux =
- get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
- mux_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_mux);
+ mux_lut->broadcast_lut(streams, gpu_indexes);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
@@ -1321,10 +1221,7 @@ template struct int_shift_and_rotate_buffer {
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
- auto active_gpu_count_cleaning =
- get_active_gpu_count(num_radix_blocks, gpu_count);
- cleaning_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_cleaning);
+ cleaning_lut->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1413,8 +1310,8 @@ template struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(2, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ lut->broadcast_lut(streams, gpu_indexes);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async(
@@ -1549,11 +1446,9 @@ template struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
- luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
- message_acc->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ luts_array->broadcast_lut(streams, gpu_indexes);
+ luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes);
+ message_acc->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1662,8 +1557,9 @@ template struct int_sum_ciphertexts_vec_memory {
uint32_t total_messages = 0;
current_columns.next_accumulation(total_ciphertexts, total_messages,
_needs_processing);
- uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
+
if (!mem_reuse) {
+ uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
@@ -1671,11 +1567,6 @@ template struct int_sum_ciphertexts_vec_memory {
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
- auto active_gpu_count =
- get_active_gpu_count(this->max_total_blocks_in_vec, gpu_count);
- luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
- size_tracker, true);
}
}
if (allocated_luts_message_carry) {
@@ -1703,9 +1594,7 @@ template struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
- auto active_gpu_count_mc = get_active_gpu_count(pbs_count, gpu_count);
- luts_message_carry->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_mc);
+ luts_message_carry->broadcast_lut(streams, gpu_indexes);
}
}
int_sum_ciphertexts_vec_memory(
@@ -1725,7 +1614,6 @@ template struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
-
setup_index_buffers(streams, gpu_indexes, size_tracker);
// because we setup_lut in host function for sum_ciphertexts to save memory
// the size_tracker is topped up here to have a max bound on the used memory
@@ -1773,9 +1661,6 @@ template struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
- this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
- size_tracker, allocate_gpu_memory);
setup_index_buffers(streams, gpu_indexes, size_tracker);
}
@@ -1859,9 +1744,8 @@ template struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(num_seq_luts, gpu_count);
- lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes);
free(h_seq_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1916,8 +1800,8 @@ template struct int_hs_group_prop_memory {
lut_hillis_steele->get_degree(0), lut_hillis_steele->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_hillis_steele, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_groups, gpu_count);
- lut_hillis_steele->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ lut_hillis_steele->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2093,9 +1977,8 @@ template struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array_first_step->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2356,9 +2239,7 @@ template struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array_second_step->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ luts_array_second_step->broadcast_lut(streams, gpu_indexes);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -2377,17 +2258,14 @@ template struct int_prop_simu_group_carries_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
- uint32_t const *gpu_indexes, uint32_t gpu_count,
- Torus *new_lut_indexes, Torus *new_scalars,
- uint32_t new_num_blocks) {
+ uint32_t const *gpu_indexes, Torus *new_lut_indexes,
+ Torus *new_scalars, uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
- auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
- // We just need to update the lut indexes so we use false here
- luts_array_second_step->broadcast_lut(streams, gpu_indexes,
- new_active_gpu_count, false);
+
+ luts_array_second_step->broadcast_lut(streams, gpu_indexes);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
@@ -2552,9 +2430,7 @@ template struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
}
// For the final cleanup in case of overflow or carry (it seems that I can)
@@ -2623,9 +2499,7 @@ template struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
}
- auto active_gpu_count =
- get_active_gpu_count(num_radix_blocks + 1, gpu_count);
- lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut_message_extract->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -2820,23 +2694,19 @@ template struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- luts_array_first_step->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
- uint32_t const *gpu_indexes, uint32_t gpu_count,
- Torus *new_lut_indexes, uint32_t new_num_blocks) {
+ uint32_t const *gpu_indexes, Torus *new_lut_indexes,
+ uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
- auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
- // We just need to update the lut indexes so we use false here
- luts_array_first_step->broadcast_lut(streams, gpu_indexes,
- new_active_gpu_count, false);
+ luts_array_first_step->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2935,9 +2805,8 @@ template struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
- active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut_message_extract->broadcast_lut(streams, gpu_indexes);
if (compute_overflow) {
lut_borrow_flag = new int_radix_lut<Torus>(
@@ -2953,7 +2822,8 @@ template struct int_borrow_prop_memory {
lut_borrow_flag->get_degree(0), lut_borrow_flag->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_borrow_flag, gpu_memory_allocated);
- lut_borrow_flag->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ lut_borrow_flag->broadcast_lut(streams, gpu_indexes);
}
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -2981,15 +2851,15 @@ template struct int_borrow_prop_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
- uint32_t const *gpu_indexes, uint32_t gpu_count,
+ uint32_t const *gpu_indexes,
Torus *first_indexes_for_div,
Torus *second_indexes_for_div, Torus *scalars_for_div,
uint32_t new_num_blocks) {
shifted_blocks_borrow_state_mem->update_lut_indexes(
- streams, gpu_indexes, gpu_count, first_indexes_for_div, new_num_blocks);
+ streams, gpu_indexes, first_indexes_for_div, new_num_blocks);
prop_simu_group_carries_mem->update_lut_indexes(
- streams, gpu_indexes, gpu_count, second_indexes_for_div,
- scalars_for_div, new_num_blocks);
+ streams, gpu_indexes, second_indexes_for_div, scalars_for_div,
+ new_num_blocks);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -3120,10 +2990,7 @@ template struct int_mul_memory {
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
-
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3196,8 +3063,8 @@ template struct int_mul_memory {
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
- auto active_gpu_count = get_active_gpu_count(total_block_count, gpu_count);
- luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ luts_array->broadcast_lut(streams, gpu_indexes);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3329,8 +3196,7 @@ template struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -3414,15 +3280,13 @@ template struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (auto &buffer : lut_buffers_bivariate) {
buffer->release(streams, gpu_indexes, gpu_count);
delete buffer;
@@ -3520,9 +3384,7 @@ template struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -3547,9 +3409,7 @@ template struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -3588,9 +3448,7 @@ template struct int_arithmetic_scalar_shift_buffer {
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}
@@ -3598,7 +3456,6 @@ template struct int_arithmetic_scalar_shift_buffer {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
@@ -3705,13 +3562,9 @@ template struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
- auto active_gpu_count_pred =
- get_active_gpu_count(2 * num_radix_blocks, gpu_count);
- predicate_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_pred);
- auto active_gpu_count_msg =
- get_active_gpu_count(num_radix_blocks, gpu_count);
- message_extract_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_msg);
+
+ predicate_lut->broadcast_lut(streams, gpu_indexes);
+ message_extract_lut->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3783,8 +3636,7 @@ template struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(max_chunks, gpu_count);
- is_max_value->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_max_value->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3844,8 +3696,7 @@ template struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- operator_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ operator_lut->broadcast_lut(streams, gpu_indexes);
// f(x) -> x == 0
Torus total_modulus = params.message_modulus * params.carry_modulus;
@@ -3863,7 +3714,7 @@ template struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
- is_non_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_non_zero_lut->broadcast_lut(streams, gpu_indexes);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -3882,8 +3733,8 @@ template struct int_comparison_eq_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
- scalar_comparison_luts->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+
+ scalar_comparison_luts->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3954,8 +3805,8 @@ template struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -4142,7 +3993,8 @@ template struct int_comparison_buffer {
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
- identity_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ identity_lut->broadcast_lut(streams, gpu_indexes);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -4159,7 +4011,7 @@ template struct int_comparison_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
- is_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_zero_lut->broadcast_lut(streams, gpu_indexes);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -4241,8 +4093,8 @@ template struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- signed_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ signed_lut->broadcast_lut(streams, gpu_indexes);
}
}
@@ -4291,7 +4143,6 @@ template struct int_comparison_buffer {
delete signed_msb_lut;
delete tmp_trivial_sign_block;
}
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
@@ -4458,23 +4309,17 @@ template struct unsigned_int_div_rem_memory {
streams, gpu_indexes, gpu_count, params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
- streams[0], gpu_indexes[0], masking_luts_1[i]->get_lut(0, 0),
- masking_luts_1[i]->get_degree(0),
- masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
- params.polynomial_size, params.message_modulus, params.carry_modulus,
- lut_f_masking, gpu_memory_allocated);
- auto active_gpu_count1 = get_active_gpu_count(1, gpu_count);
- masking_luts_1[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count1);
+    int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};
-    generate_device_accumulator<Torus>(
- streams[0], gpu_indexes[0], masking_luts_2[i]->get_lut(0, 0),
- masking_luts_2[i]->get_degree(0),
- masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
- params.polynomial_size, params.message_modulus, params.carry_modulus,
- lut_f_masking, gpu_memory_allocated);
- auto active_gpu_count2 = get_active_gpu_count(num_blocks, gpu_count);
- masking_luts_2[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count2);
+ for (int j = 0; j < 2; j++) {
+      generate_device_accumulator<Torus>(
+ streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
+ luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+ params.glwe_dimension, params.polynomial_size,
+ params.message_modulus, params.carry_modulus, lut_f_masking,
+ gpu_memory_allocated);
+ luts[j]->broadcast_lut(streams, gpu_indexes);
+ }
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -4494,14 +4339,13 @@ template struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
- auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
- luts[j]->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ luts[j]->broadcast_lut(streams, gpu_indexes);
}
// Give name to closures to improve readability
@@ -4537,8 +4381,7 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
- zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
@@ -4547,8 +4390,7 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
- zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -4575,8 +4417,7 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
- zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_happened[1]->get_lut(0, 0),
@@ -4585,12 +4426,10 @@ template struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
- zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes);
// merge_overflow_flags_luts
   merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
- auto active_gpu_count_for_bits = get_active_gpu_count(1, gpu_count);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -4607,8 +4446,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
- merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_for_bits);
+ merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes);
}
}
@@ -4864,7 +4702,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
delete[] merge_overflow_flags_luts;
// release sub streams
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -4926,7 +4763,7 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+
switch (op) {
case BITAND:
case BITOR:
@@ -4953,7 +4790,7 @@ template <typename Torus> struct int_bitop_buffer {
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
@@ -4983,7 +4820,7 @@ template <typename Torus> struct int_bitop_buffer {
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
}
}
}
@@ -5268,10 +5105,7 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
- auto active_gpu_count_cmp =
- get_active_gpu_count(1, gpu_count); // only 1 block needed
- compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_cmp);
+ compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes);
}
}
@@ -5313,7 +5147,6 @@ template <typename Torus> struct int_div_rem_memory {
delete compare_signed_bits_lut;
// release sub streams
- cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -5942,7 +5775,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+
   this->univ_lut_mem = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -5981,7 +5814,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
- univ_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ univ_lut_mem->broadcast_lut(streams, gpu_indexes);
}
auto generate_bi_lut_lambda =
@@ -6000,7 +5833,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
- biv_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ biv_lut_mem->broadcast_lut(streams, gpu_indexes);
}
this->tmp_ct = new CudaRadixCiphertextFFI;
@@ -6218,8 +6051,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
cuda_memcpy_async_to_gpu(luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
- auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
- luts->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ luts->broadcast_lut(streams, gpu_indexes);
free(h_corrections);
}
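
Across these hunks, broadcast_lut resolves the number of participating GPUs from state the LUT object already holds instead of taking a caller-computed count from get_active_gpu_count. A rough, hypothetical sketch of that kind of contract (not the actual int_radix_lut API; broadcast_to_active_gpus, per_gpu_dest and bytes are invented names):

    #include <cuda_runtime.h>
    #include <cstddef>
    #include <cstdint>

    // Sketch only: copy a buffer owned by gpu_indexes[0] to the same-sized
    // buffer owned by every other active GPU, one async peer copy per device,
    // each enqueued on that device's stream.
    inline void broadcast_to_active_gpus(cudaStream_t const *streams,
                                         uint32_t const *gpu_indexes,
                                         uint32_t active_gpu_count,
                                         void *const *per_gpu_dest,
                                         const void *src_on_gpu0, size_t bytes) {
      for (uint32_t g = 1; g < active_gpu_count; ++g) {
        cudaMemcpyPeerAsync(per_gpu_dest[g], gpu_indexes[g], src_on_gpu0,
                            gpu_indexes[0], bytes, streams[g]);
      }
    }
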
diff --git a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
index ca543e6a20..bcbe409101 100644
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -232,13 +232,8 @@ template <typename Torus> struct zk_expand_mem {
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
- auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
- message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
- active_gpu_count);
+ message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);
- message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
- streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
- allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu
index c6f0997d86..c5464743e9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,88 +1,15 @@
#include "device.h"
#include
#include
-#include <mutex>
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
  return static_cast<uint32_t>(device);
}
-std::mutex pool_mutex;
-bool mem_pools_enabled = false;
-
-// We use memory pools to reduce some overhead of memory allocations due
-// to our scratch/release pattern. This function is the simplest way of using
-// mempools, it modifies the default memory pool to use a threshold of 5% of the
-// free memory:
-// - Enabled opportunistic reuse to maximize reuse in malloc/free patterns
-// - Prevent memory from being released back to the OS too soon if is within
-// our threshold
-// - Warm up the pool by allocating and freeing a large block of memory
-// This function is called only once, the first time a GPU is set, and it
-// configures all the GPUs available.
-// We have measured an improvement of around 10% in our integer operations,
-// especially the ones involving many allocations.
-// We tested more complex configurations of mempools, but they did not yield
-// better results.
-void cuda_setup_mempool(uint32_t caller_gpu_index) {
- if (!mem_pools_enabled) {
- pool_mutex.lock();
- if (mem_pools_enabled)
- return; // If mem pools are already enabled, we don't need to do anything
-
- // We do it only once for all GPUs
- mem_pools_enabled = true;
- uint32_t num_gpus = cuda_get_number_of_gpus();
- for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
- cuda_set_device(gpu_index);
-
- size_t total_mem, free_mem;
- check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
-
- // If we have more than 5% of free memory, we can set up the mempool
- uint64_t mem_pool_threshold = total_mem / 20; // 5% of total memory
- mem_pool_threshold =
- mem_pool_threshold - (mem_pool_threshold % 1024); // Align to 1KB
- if (mem_pool_threshold < free_mem) {
- // Get default memory pool
- cudaMemPool_t default_pool;
- check_cuda_error(cudaDeviceGetDefaultMemPool(&default_pool, gpu_index));
-
- // Enable opportunistic reuse
- int reuse = 1;
- check_cuda_error(cudaMemPoolSetAttribute(
- default_pool, cudaMemPoolReuseAllowOpportunistic, &reuse));
-
- // Prevent memory from being released back to the OS too soon
- check_cuda_error(cudaMemPoolSetAttribute(
- default_pool, cudaMemPoolAttrReleaseThreshold,
- &mem_pool_threshold));
-
- // Warm up the pool by allocating and freeing a large block
- cudaStream_t stream;
- stream = cuda_create_stream(gpu_index);
- void *warmup_ptr = nullptr;
- warmup_ptr = cuda_malloc_async(mem_pool_threshold, stream, gpu_index);
- cuda_drop_async(warmup_ptr, stream, gpu_index);
-
- // Sync to ensure pool is grown
- cuda_synchronize_stream(stream, gpu_index);
-
- // Clean up
- cuda_destroy_stream(stream, gpu_index);
- }
- }
- // We return to the original gpu_index
- cuda_set_device(caller_gpu_index);
- pool_mutex.unlock();
- }
-}
void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
- // Mempools are initialized only once in all the GPUS available
- cuda_setup_mempool(gpu_index);
}
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
@@ -402,13 +329,6 @@ int cuda_get_number_of_gpus() {
return num_gpus;
}
-int cuda_get_number_of_sms() {
- int num_sms = 0;
- check_cuda_error(
- cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
- return num_sms;
-}
-
/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
cuda_set_device(gpu_index);
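
The deleted cuda_setup_mempool relied only on stock CUDA memory-pool attributes. For reference, a minimal standalone sketch of that configuration (error handling omitted; the ~5% threshold mirrors the removed code, and configure_default_mempool is an illustrative name):

    #include <cuda_runtime.h>
    #include <cstdint>

    // Sketch only: keep up to ~5% of device memory cached in the default pool
    // and allow opportunistic reuse across streams.
    inline void configure_default_mempool(int device) {
      cudaSetDevice(device);

      size_t free_mem = 0, total_mem = 0;
      cudaMemGetInfo(&free_mem, &total_mem);

      cudaMemPool_t pool;
      cudaDeviceGetDefaultMemPool(&pool, device);

      int reuse = 1;
      cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowOpportunistic, &reuse);

      uint64_t threshold = total_mem / 20; // ~5%, as in the deleted helper
      cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
    }
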
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
index ac0b56a798..d3dba45bc7 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -148,8 +148,7 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
- auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
- is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ is_max_value_lut->broadcast_lut(streams, gpu_indexes);
}
lut = is_max_value_lut;
}
@@ -168,10 +167,7 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
- auto active_gpu_count_is_max =
- get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
- is_max_value_lut->broadcast_lut(streams, gpu_indexes,
- active_gpu_count_is_max, false);
+ is_max_value_lut->broadcast_lut(streams, gpu_indexes);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
@@ -503,9 +499,7 @@ __host__ void tree_sign_reduction(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true);
-
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ last_lut->broadcast_lut(streams, gpu_indexes);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
index 01e04888ad..917337b69b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -363,17 +363,14 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in,
- gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
+ extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+ lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
@@ -391,20 +388,13 @@ host_integer_decompress(cudaStream_t const *streams,
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
- lwe_after_pbs_vec, lut->lwe_indexes_out,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
+ encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j],
- gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
} else {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
index fb7cac01a7..6d75d7f2a8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -311,8 +311,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
- streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
- scalar_indexes, merged_interesting_remainder->num_radix_blocks);
+ streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
+ merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 5abf0e7005..d298fd9937 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -567,20 +567,16 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
- PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
+ (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
+ big_lwe_dimension + 1);
+
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -599,20 +595,15 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
- PUSH_RANGE("gather")
- multi_gpu_gather_lwe_async(
- streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
- lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count,
+ (Torus *)lwe_array_out->ptr,
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes,
+ num_radix_blocks, big_lwe_dimension + 1);
+
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -683,19 +674,16 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
- PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
+ (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
+ big_lwe_dimension + 1);
+
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -714,22 +702,15 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
- PUSH_RANGE("gather")
multi_gpu_gather_many_lut_lwe_async(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
num_many_lut);
- POP_RANGE()
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -814,17 +795,13 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
- PUSH_RANGE("scatter")
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
- lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
+ (Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
+ big_lwe_dimension + 1);
+
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -843,20 +820,15 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
- PUSH_RANGE("gather")
- multi_gpu_gather_lwe_async(
- streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
- lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
- POP_RANGE()
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count,
+ (Torus *)(lwe_array_out->ptr),
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes,
+ num_radix_blocks, big_lwe_dimension + 1);
+
+ /// Synchronize all GPUs
+ for (uint i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -1028,6 +1000,7 @@ void generate_device_accumulator_no_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
+
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1131,7 +1104,8 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
- // copy host lut and lut_indexes_vec to device
+ cuda_synchronize_stream(stream, gpu_index);
+ // copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1163,6 +1137,7 @@ void generate_device_accumulator_with_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
+
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1693,7 +1668,6 @@ __host__ void reduce_signs(
"than the number of blocks to operate on")
auto diff_buffer = mem_ptr->diff_buffer;
- auto active_gpu_count = mem_ptr->active_gpu_count;
auto params = mem_ptr->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1723,7 +1697,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function, true);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
while (num_sign_blocks > 2) {
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1754,7 +1728,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
@@ -1774,7 +1748,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1800,8 +1774,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- (*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -1838,8 +1811,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- (*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -1876,8 +1848,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
- auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
- (*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -2392,10 +2363,9 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec,
- lut->active_gpu_count, lwe_array_out->num_radix_blocks,
- lut->input_big_lwe_dimension + 1);
+ (InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
+ lut->using_trivial_lwe_indexes, lut->active_gpu_count,
+ lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
execute_keyswitch_async(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
@@ -2418,8 +2388,8 @@ __host__ void integer_radix_apply_noise_squashing_kb(
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec, nullptr,
- lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
- lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
+ lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
+ big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
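
These hunks drop the event-based ordering between GPU 0 and the worker streams in favour of plain stream synchronization. The removed idiom is the standard CUDA record/wait pattern; a self-contained sketch (fan_out_fan_in and its arguments are illustrative, not backend API):

    #include <cuda_runtime.h>

    // Sketch only: workers wait for everything already queued on the primary
    // stream, then the primary waits for each worker, without blocking the
    // host in between.
    inline void fan_out_fan_in(cudaStream_t primary, cudaStream_t const *workers,
                               int n) {
      cudaEvent_t ready;
      cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
      cudaEventRecord(ready, primary);
      for (int i = 0; i < n; ++i)
        cudaStreamWaitEvent(workers[i], ready, 0);

      // ... enqueue per-worker work on workers[i] here ...

      for (int i = 0; i < n; ++i) {
        cudaEvent_t done;
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
        cudaEventRecord(done, workers[i]);
        cudaStreamWaitEvent(primary, done, 0);
        cudaEventDestroy(done); // released once the recorded work completes
      }
      cudaEventDestroy(ready);
    }
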
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 7ce5e3847c..ad50af6e90 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -415,10 +415,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
+ Torus *h_lwe_indexes_in_pinned;
+ Torus *h_lwe_indexes_out_pinned;
+ cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
+ total_ciphertexts * sizeof(Torus));
+ cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
+ total_ciphertexts * sizeof(Torus));
+ for (uint32_t i = 0; i < total_ciphertexts; i++) {
+ h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
+ h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
+ }
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
+ total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
+ total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ for (uint32_t i = 0; i < total_ciphertexts; i++) {
+ luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
+ luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
+ }
+ cudaFreeHost(h_lwe_indexes_in_pinned);
+ cudaFreeHost(h_lwe_indexes_out_pinned);
- // we just need to broadcast the indexes
- luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
- false);
+ luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb(
@@ -470,9 +491,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
- // we just need to broadcast the indexes
- luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
- false);
+ Torus *h_lwe_indexes_in_pinned;
+ Torus *h_lwe_indexes_out_pinned;
+ cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
+ num_blocks_in_apply_lut * sizeof(Torus));
+ cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
+ num_blocks_in_apply_lut * sizeof(Torus));
+ for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
+ h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
+ h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
+ }
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
+ num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_memcpy_async_to_cpu(
+ h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
+ num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+ for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
+ luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
+ luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
+ }
+ cudaFreeHost(h_lwe_indexes_in_pinned);
+ cudaFreeHost(h_lwe_indexes_out_pinned);
+
+ luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb(
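
The block added above stages device-resident LWE indexes through page-locked host buffers before mirroring them into the regular host arrays. The same pattern in isolation (read_back is an illustrative name):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <vector>

    // Sketch only: read `count` 64-bit words back from the device through a
    // page-locked staging buffer, then hand them over as a regular host vector.
    inline std::vector<uint64_t> read_back(const uint64_t *d_src, size_t count,
                                           cudaStream_t stream) {
      uint64_t *pinned = nullptr;
      cudaMallocHost(reinterpret_cast<void **>(&pinned), count * sizeof(uint64_t));

      cudaMemcpyAsync(pinned, d_src, count * sizeof(uint64_t),
                      cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream); // staging buffer is valid only after this

      std::vector<uint64_t> host(pinned, pinned + count);
      cudaFreeHost(pinned);
      return host;
    }
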
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
index bd666fdb8a..eb79720172 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -48,10 +48,7 @@ void host_integer_grouped_oprf(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
- cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
- }
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
if (!lut->using_trivial_lwe_indexes) {
PANIC("lut->using_trivial_lwe_indexes should be true");
@@ -59,8 +56,8 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
- seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
+ seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
+ active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
execute_pbs_async(
@@ -75,18 +72,12 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
- lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
- lut->lwe_aligned_vec, num_blocks_to_process,
+ lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+ lut->using_trivial_lwe_indexes, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
- // other gpus record their events
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
- }
- // GPU 0 waits for all
- for (int j = 1; j < active_gpu_count; j++) {
- cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
- gpu_indexes[0]);
+ for (uint32_t i = 0; i < active_gpu_count; i++) {
+ cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
index 87bd09a528..b9c0373a32 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -47,8 +47,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
- auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index 3fa42f4c5d..4bc90783a1 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -154,8 +154,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -254,8 +253,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -288,8 +286,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ one_block_lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -436,8 +434,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_bivariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -543,8 +540,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+ signed_msb_lut->broadcast_lut(streams, gpu_indexes);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice(
@@ -592,8 +588,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
- auto active_gpu_count = get_active_gpu_count(1, gpu_count);
- one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
+
+ one_block_lut->broadcast_lut(streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -823,11 +819,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
}
- auto active_gpu_count =
- get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
- // We use false cause we only will broadcast the indexes
- scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
- active_gpu_count, false);
+ scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);
integer_radix_apply_univariate_lookup_table_kb(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
index be64884fe2..ea4564049d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -38,19 +38,6 @@ void multi_gpu_copy_array_async(cudaStream_t const *streams,
gpu_indexes[i], gpu_memory_allocated);
}
}
-/// Copy an array residing on one CPU to all active gpus
-template <typename Torus>
-void multi_gpu_copy_array_from_cpu_async(
- cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
- uint32_t elements_per_gpu, bool gpu_memory_allocated) {
- dest.resize(gpu_count);
- for (uint i = 0; i < gpu_count; i++) {
- cuda_memcpy_with_size_tracking_async_to_gpu(
- dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
- gpu_indexes[i], gpu_memory_allocated);
- }
-}
/// Allocates the input/output vector for all devices
/// Initializes also the related indexing and initializes it to the trivial
/// index
@@ -106,35 +93,6 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}
-// This function reads lwes using the indexes and place them in a single aligned
-// array. This function is needed before communication to perform a single
-// contiguous data movement. Each block handles one lwe.
-template <typename Torus>
-__global__ void align_with_indexes(Torus *d_packed_vector,
- Torus const *d_vector,
- Torus const *d_indexes, int lwe_size) {
-
- int output_offset = blockIdx.x * lwe_size;
- int input_offset = d_indexes[blockIdx.x] * lwe_size;
- for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
- d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
- }
-}
-
-// This function takes the aligned array after communication and places it in
-// the corresponding indexes. Each block handles one lwe.
-template <typename Torus>
-__global__ void realign_with_indexes(Torus *d_vector,
- Torus const *d_packed_vector,
- Torus const *d_indexes, int lwe_size) {
-
- int input_offset = blockIdx.x * lwe_size;
- int output_offset = d_indexes[blockIdx.x] * lwe_size;
- for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
- d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
- }
-}
-
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -144,15 +102,15 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
                                  uint32_t gpu_count, std::vector<Torus *> &dest,
- Torus const *src, Torus const *d_src_indexes,
+ Torus const *src, Torus const *h_src_indexes,
bool is_trivial_index,
-                                 std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
if (max_active_gpu_count < gpu_count)
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
+ cuda_synchronize_stream(streams[0], gpu_indexes[0]);
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -169,28 +127,18 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);
} else {
- if (aligned_vec.size() == 0)
- PANIC("Cuda error: auxiliary arrays should be setup!");
-
- if (d_src_indexes == nullptr)
+ if (h_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
+ auto src_indexes = h_src_indexes + gpu_offset;
- cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
- cuda_set_device(gpu_indexes[0]);
- align_with_indexes<<>>(
- aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
- lwe_size);
- check_cuda_error(cudaGetLastError());
- cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
- cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
-
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
- streams[i], gpu_indexes[i], true);
+ for (uint j = 0; j < inputs_on_gpu; j++) {
+ auto d_dest = dest[i] + j * lwe_size;
+ auto d_src = src + src_indexes[j] * lwe_size;
- cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
- cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
- cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
+ true);
+ }
}
}
}
@@ -202,8 +150,7 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
                                 Torus *dest, const std::vector<Torus *> &src,
- Torus *d_dest_indexes, bool is_trivial_index,
-                                std::vector<Torus *> &aligned_vec,
+ Torus *h_dest_indexes, bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
@@ -221,27 +168,19 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
- if (aligned_vec.size() == 0)
- PANIC("Cuda error: auxiliary arrays should be setup!");
- if (d_dest_indexes == nullptr)
+ if (h_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
- cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
-
- cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
- cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
+ auto dest_indexes = h_dest_indexes + gpu_offset;
- cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
- aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
- streams[i], gpu_indexes[i], true);
+ for (uint j = 0; j < inputs_on_gpu; j++) {
+ auto d_dest = dest + dest_indexes[j] * lwe_size;
+ auto d_src = src[i] + j * lwe_size;
- cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
- cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
- cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
- cuda_set_device(gpu_indexes[0]);
- realign_with_indexes<<>>(
- dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);
- check_cuda_error(cudaGetLastError());
+ cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+ d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
+ true);
+ }
}
}
}
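
The rewritten scatter/gather helpers issue one async copy per LWE, driven by host-resident index tables, instead of packing on the device with the removed align/realign kernels. A condensed sketch of that index-driven scatter (the even per-GPU split, the types, and cudaMemcpyDefault as the copy kind are simplifying assumptions):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <vector>

    // Sketch only: scatter rows of `src` (each row has `row_len` elements)
    // across devices, picking source rows through a host-side index table and
    // issuing one async copy per row on the receiving device's stream.
    template <typename T>
    void scatter_rows_by_index(const T *src, const uint64_t *h_indexes,
                               const std::vector<T *> &dest,
                               cudaStream_t const *streams,
                               uint32_t const *gpu_indexes,
                               uint32_t rows_per_gpu, uint32_t gpu_count,
                               uint32_t row_len) {
      uint32_t offset = 0;
      for (uint32_t g = 0; g < gpu_count; ++g, offset += rows_per_gpu) {
        cudaSetDevice(gpu_indexes[g]);
        for (uint32_t j = 0; j < rows_per_gpu; ++j) {
          const T *row = src + h_indexes[offset + j] * row_len;
          cudaMemcpyAsync(dest[g] + j * row_len, row, row_len * sizeof(T),
                          cudaMemcpyDefault, streams[g]);
        }
      }
    }
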
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 72c0434b3b..6f02c58a4a 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -88,8 +88,6 @@ extern "C" {
pub fn cuda_get_number_of_gpus() -> i32;
- pub fn cuda_get_number_of_sms() -> i32;
-
pub fn cuda_synchronize_device(gpu_index: u32);
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
index 0760956ac9..d84545c541 100644
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml
@@ -15,7 +15,7 @@
[rtl]
bpip_use = true
- bpip_use_opportunism = false
+ bpip_use_opportunism = true
bpip_timeout = 100_000
[board]
@@ -35,21 +35,13 @@
bsk_pc = [
{Hbm={pc=8}},
- {Hbm={pc=10}},
{Hbm={pc=12}},
- {Hbm={pc=14}},
{Hbm={pc=24}},
- {Hbm={pc=26}},
{Hbm={pc=28}},
- {Hbm={pc=30}},
{Hbm={pc=40}},
- {Hbm={pc=42}},
{Hbm={pc=44}},
- {Hbm={pc=46}},
{Hbm={pc=56}},
- {Hbm={pc=58}},
- {Hbm={pc=60}},
- {Hbm={pc=62}}
+ {Hbm={pc=60}}
]
ksk_pc = [
@@ -78,7 +70,7 @@
#implementation = "Ilp"
implementation = "Llt"
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
- min_batch_size = 9
+ min_batch_size = 11
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
@@ -95,8 +87,8 @@
custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
[firmware.op_cfg.default]
- fill_batch_fifo = false
- min_batch_size = true
+ fill_batch_fifo = true
+ min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml
index f57f27147c..4afc095ab6 100644
--- a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml
+++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml
@@ -49,15 +49,3 @@ offset= 0x10
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
-
-[section.hpu_reset]
-description="Used to control the HPU soft reset"
-offset= 0x100
-
-[section.hpu_reset.register.trigger]
- description="A soft reset for the whole HPU reconfigurable logic"
- owner="Kernel"
- read_access="Read"
- write_access="WriteNotify"
- field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"}
- field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}
diff --git a/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu b/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
index 18114dc0a6..d7091cff2a 100644
--- a/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
+++ b/backends/tfhe-hpu-backend/config_store/v80_archives/psi64.hpu
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
-size 84230351
+oid sha256:1d1afb554756df4d8b39bee33ded2dda19c23a6f9d8e2b242092efd35cf1cc19
+size 83281321
diff --git a/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py b/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
index a69d87899c..edc6eb411f 100644
--- a/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
+++ b/backends/tfhe-hpu-backend/python/lib/isctrace/fmt.py
@@ -31,22 +31,14 @@ def __init__(self, d):
self.__dict__ = d
def args(self):
- try:
- return f'R{self.rid} @{hex(self.slot["Addr"])}'
- except:
- # It can happen that an IOP is not translated by the FW
- return f'R{self.rid} @{self.slot}'
+ return f'R{self.rid} @{hex(self.slot["Addr"])}'
class ST(BaseInstruction):
def __init__(self, d):
self.__dict__ = d
def args(self):
- try:
- return f'@{hex(self.slot["Addr"])} R{self.rid}'
- except:
- # It can happen that an IOP is not translated by the FW
- return f'@{self.slot} R{self.rid}'
+ return f'@{hex(self.slot["Addr"])} R{self.rid}'
class MAC(BaseInstruction):
def __init__(self, d):
diff --git a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
index 3f4f70b4ac..25e5f4d9a4 100644
--- a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs
@@ -176,18 +176,6 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
imm: 1,
};
-pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to have this dynamic
-pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
- dst: [VarMode::Native; SIMD_N],
- src: [VarMode::Native; 2 * SIMD_N],
- imm: 0,
-};
-pub const IOP_2NCT_F_3NCT: ConstIOpProto<{ 2 * SIMD_N }, { 3 * SIMD_N }> = ConstIOpProto {
- dst: [VarMode::Native; 2 * SIMD_N],
- src: [VarMode::Native; 3 * SIMD_N],
- imm: 0,
-};
-
use crate::iop;
use arg::IOpFormat;
use lazy_static::lazy_static;
@@ -239,6 +227,4 @@ iop!(
[IOP_CT_F_CT -> "LEAD1", opcode::LEAD1],
[IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
[IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
- [IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
- [IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
);
diff --git a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
index 13da895694..34cde3c0df 100644
--- a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
+++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs
@@ -87,10 +87,6 @@ pub const LEAD1: u8 = 0x85;
pub const TRAIL0: u8 = 0x86;
pub const TRAIL1: u8 = 0x87;
-// SIMD for maximum throughput
-pub const ADD_SIMD: u8 = 0xF0;
-pub const ERC_20_SIMD: u8 = 0xF1;
-//
// Utility operations
// Used to handle real clone of ciphertext already uploaded in the Hpu memory
pub const MEMCPY: u8 = 0xFF;
diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
index 700ee37c56..e08576a2d6 100644
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs
@@ -72,9 +72,6 @@ crate::impl_fw!("Ilp" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
- // SIMD Implementations
- ADD_SIMD => fw_impl::llt::iop_add_simd;
- ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
#[instrument(level = "trace", skip(prog))]
diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
index 44103c2248..173dc96e39 100644
--- a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
+++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs
@@ -57,16 +57,16 @@ crate::impl_fw!("Llt" [
OVF_SSUB => fw_impl::ilp::iop_overflow_ssub;
OVF_MULS => fw_impl::ilp::iop_overflow_muls;
- BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
- BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
- BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
+ BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
+ BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
+ BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
- CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
- CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
- CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
- CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
- CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
- CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
+ CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
+ CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
+ CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
+ CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
+ CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
+ CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
@@ -81,10 +81,6 @@ crate::impl_fw!("Llt" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
-
- // SIMD Implementations
- ADD_SIMD => fw_impl::llt::iop_add_simd;
- ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
// ----------------------------------------------------------------------------
@@ -106,17 +102,6 @@ pub fn iop_add(prog: &mut Program) {
iop_addx(prog, dst, src_a, src_b);
}
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_add_simd(prog: &mut Program) {
- // Add Comment header
- prog.push_comment("ADD_SIMD Operand::Dst Operand::Src Operand::Src".to_string());
- simd(
- prog,
- crate::asm::iop::SIMD_N,
- fw_impl::llt::iop_add_ripple_rtl,
- );
-}
-
pub fn iop_adds(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
@@ -204,7 +189,7 @@ pub fn iop_mul(prog: &mut Program) {
// Add Comment header
prog.push_comment("MUL Operand::Dst Operand::Src Operand::Src".to_string());
-
+  // Defer to the generic mulx implementation
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
@@ -220,50 +205,29 @@ pub fn iop_muls(prog: &mut Program) {
// Add Comment header
prog.push_comment("MULS Operand::Dst Operand::Src Operand::Immediat".to_string());
-
+  // Defer to the generic mulx implementation
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20(prog: &mut Program) {
- // Add Comment header
- prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
- iop_erc_20_rtl(prog, 0).add_to_prog(prog);
-}
-
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_simd(prog: &mut Program) {
- // Add Comment header
- prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
- simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
-}
-
-// ----------------------------------------------------------------------------
-// Helper Functions
-// ----------------------------------------------------------------------------
-
/// Implement erc_20 fund xfer
/// Targeted algorithm is as follow:
/// 1. Check that from has enough funds
/// 2. Compute real_amount to xfer (i.e. amount or 0)
/// 3. Compute new amount (from - new_amount, to + new_amount)
-///
-/// The input operands are:
-/// (from[0], to[0], amount[0], ..., from[N-1], to[N-1], amount[N-1])
-/// The output operands are:
-/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
-/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
-pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
+pub fn iop_erc_20(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
- let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
- let dst_to = prog.iop_template_var(OperandKind::Dst, 2 * batch_index + 1);
+ let dst_from = prog.iop_template_var(OperandKind::Dst, 0);
+ let dst_to = prog.iop_template_var(OperandKind::Dst, 1);
// Src -> Operand
- let src_from = prog.iop_template_var(OperandKind::Src, 3 * batch_index);
- let src_to = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 1);
+ let src_from = prog.iop_template_var(OperandKind::Src, 0);
+ let src_to = prog.iop_template_var(OperandKind::Src, 1);
// Src Amount -> Operand
- let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
+ let src_amount = prog.iop_template_var(OperandKind::Src, 2);
+
+ // Add Comment header
+ prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
// TODO: Make this a parameter or sweep this
// All these little parameters would be very handy to write an
@@ -272,7 +236,7 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
let kogge_blk_w = 10;
let ripple = true;
- {
+ let tree = {
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
let lut = pbs_by_name!("IfFalseZeroed");
@@ -309,26 +273,13 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
}
- }
-}
-
-/// A SIMD implementation of add for maximum throughput
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
- // Allocate metavariables:
- let dst = prog.iop_template_var(OperandKind::Dst, i);
- let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
- let src_b = prog.iop_template_var(OperandKind::Src, 2 * i + 1);
-
- // Convert MetaVarCell in VarCell for Rtl analysis
- let a = VarCell::from_vec(src_a);
- let b = VarCell::from_vec(src_b);
- let d = VarCell::from_vec(dst);
-
- // Do a + b with the ripple carry adder
- kogge::ripple_add(d, a, b, None)
+ };
+ tree.add_to_prog(prog);
}
+// ----------------------------------------------------------------------------
+// Helper Functions
+// ----------------------------------------------------------------------------
fn iop_addx(
prog: &mut Program,
dst: Vec<MetaVarCell>,
@@ -362,181 +313,11 @@ fn iop_subx(
.add_to_prog(prog);
}
-/// Generic mul operation for massively parallel HPUs
-#[instrument(level = "trace", skip(prog))]
-pub fn iop_mulx_par(
- prog: &mut Program,
- dst: Vec<MetaVarCell>,
- src_a: Vec<MetaVarCell>,
- src_b: Vec<MetaVarCell>,
-) -> Rtl {
- let props = prog.params();
- let tfhe_params: asm::DigitParameters = props.clone().into();
- let blk_w = props.blk_w();
-
- // Transform metavars into RTL vars
- let mut dst = VarCell::from_vec(dst);
- let src_a = VarCell::from_vec(src_a);
- let src_b = VarCell::from_vec(src_b);
- let max_deg = VarDeg {
- deg: props.max_val(),
- nu: props.nu,
- };
-
- let pbs_mul_lsb = pbs_by_name!("MultCarryMsgLsb");
- let pbs_mul_msb = pbs_by_name!("MultCarryMsgMsb");
- let max_carry = (props.max_msg() * props.max_msg()) >> props.msg_w;
- let max_msg = props.max_msg();
-
- let mut mul_map: HashMap<usize, Vec<VarCellDeg>> = HashMap::new();
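- // Each digit pair (i, j) yields a partial product whose message part feeds column i+j and whose
- // carry part feeds column i+j+1, each extracted with a dedicated PBS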
- itertools::iproduct!(0..blk_w, 0..blk_w).for_each(|(i, j)| {
- let pp = src_a[i].mac(tfhe_params.msg_range(), &src_b[j]);
- let lsb = pp.single_pbs(&pbs_mul_lsb);
- let msb = pp.single_pbs(&pbs_mul_msb);
- mul_map
- .entry(i + j)
- .or_default()
- .push(VarCellDeg::new(max_msg, lsb));
- mul_map
- .entry(i + j + 1)
- .or_default()
- .push(VarCellDeg::new(max_carry, msb));
- });
-
- let mut pp: Vec = (0..dst.len())
- .map(|i| mul_map.remove(&i).unwrap().into())
- .collect();
-
- // Dadda-tree-style reduction: compress every column until no column holds more than one element
- while pp.iter().any(|x| x.len() > 1) {
- trace!(
- target: "llt::mul",
- "pp length: {:?}",
- pp.iter().map(|x| x.len()).collect::<Vec<_>>()
- );
- for c in (0..dst.len()).rev() {
- let mut col_len = pp[c].len();
- let mut reduced = Vec::new();
- let mut chunks = pp[c].deg_chunks(&max_deg).peekable();
- let max_col = if c == (dst.len() - 1) {
- 0
- } else {
- dst.len() - 1
- };
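- // Reduce this column only while it is longer than the last column; the last column itself is
- // compared against column 0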
-
- while chunks.peek().is_some() && col_len > pp[max_col].len() {
- let mut chunk = chunks.next().unwrap();
- let chunk_len = chunk.len();
- col_len -= chunk.len();
-
- // sum the chunk
- while chunk.len() > 1 {
- chunk = chunk
- .chunks(2)
- .map(|chunk| match chunk.len() {
- 1 => chunk[0].clone(),
- 2 => &chunk[0] + &chunk[1],
- _ => panic!("Invalid chunk size"),
- })
- .collect()
- }
-
- // Bootstrap when the chunk has reached the degree budget (or is a lone element); the extracted carry is pushed into the next column
- let element = chunk
- .into_iter()
- .next()
- .map(|sum| {
- assert!(sum.deg.nu <= props.nu);
- if sum.deg == max_deg || chunk_len == 1 {
- let (data, carry) = sum.bootstrap(&props);
- if let (Some(carry), Some(elm)) = (carry, pp.get_mut(c + 1)) {
- elm.push(carry);
- }
- data
- } else {
- sum
- }
- })
- .unwrap();
-
- reduced.push(element);
- }
-
- pp[c] = reduced
- .into_iter()
- .chain(chunks.flatten())
- .collect::<Vec<_>>()
- .into();
- }
- }
-
- trace!(
- target: "llt::mul",
- "final pp: {:?}", pp
- );
-
- // Extract carry and message and do carry propagation
- let mut a: Vec