diff --git a/Cargo.toml b/Cargo.toml index 16e66caaf..7acdd2d7a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -88,8 +88,8 @@ opt-level = 3 [profile.release] lto = "thin" -#[patch."ssh://git@github.com/scroll-tech/ceno-gpu.git"] -#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" } +# [patch."ssh://git@github.com/scroll-tech/ceno-gpu.git"] +# ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" } #[patch."https://github.com/scroll-tech/gkr-backend"] #ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" } diff --git a/build-scripts/conditional-patch.sh b/build-scripts/conditional-patch.sh index 1b02e76b2..1ccbdcfe2 100755 --- a/build-scripts/conditional-patch.sh +++ b/build-scripts/conditional-patch.sh @@ -7,7 +7,7 @@ WORKSPACE_CARGO="Cargo.toml" # Workspace dependency declarations LOCAL_DEP='ceno_gpu = { path = "utils/cuda_hal", package = "cuda_hal" }' -REMOTE_DEP='ceno_gpu = { git = "ssh://git@github.com/scroll-tech/ceno-gpu.git", package = "cuda_hal", branch = "dev/integrate-into-ceno-as-dep" }' +REMOTE_DEP='ceno_gpu = { git = "ssh://git@github.com/scroll-tech/ceno-gpu.git", package = "cuda_hal", branch = "main", default-features = false, features = \["bb31"\] }' if [ "$1" = "enable-gpu" ]; then echo "Switching to GPU mode (using remote implementation)..." diff --git a/ceno_zkvm/src/scheme/cpu/mod.rs b/ceno_zkvm/src/scheme/cpu/mod.rs index 414cf1068..9b0020116 100644 --- a/ceno_zkvm/src/scheme/cpu/mod.rs +++ b/ceno_zkvm/src/scheme/cpu/mod.rs @@ -543,7 +543,7 @@ impl> MainSumcheckProver> { #[allow(clippy::type_complexity)] - #[tracing::instrument(skip_all, name = "table_witness", fields(profiling_3), level = "trace")] + #[tracing::instrument(skip_all, name = "table_witness", fields(profiling_2), level = "trace")] fn table_witness<'a>( &self, input: &ProofInput<'a, CpuBackend< as ProverBackend>::E, PCS>>, @@ -551,7 +551,7 @@ impl> MainSumcheckProver as ProverBackend>::E], ) -> Vec as ProverBackend>::MultilinearPoly<'a>>> { // main constraint: lookup denominator and numerator record witness inference - let record_span = entered_span!("record"); + let span = entered_span!("witness_infer", profiling_2 = true); let records: Vec> = cs .r_table_expressions .par_iter() @@ -581,7 +581,7 @@ impl> MainSumcheckProver> DeviceTransporter( &self, - mles: Vec>, + mles: &[MultilinearExtension<'a, E>], ) -> Vec> { - mles.into_iter().map(|mle| mle.into()).collect_vec() + mles.iter().map(|mle| mle.clone().into()).collect_vec() } } diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs index 6c7d608d7..023686e36 100644 --- a/ceno_zkvm/src/scheme/gpu/mod.rs +++ b/ceno_zkvm/src/scheme/gpu/mod.rs @@ -9,7 +9,7 @@ use crate::{ }, structs::{ComposedConstrainSystem, PointAndEval, TowerProofs}, }; -use ceno_gpu::gl64::GpuPolynomialExt; +use ceno_gpu::bb31::GpuPolynomialExt; use ff_ext::{ExtensionField, GoldilocksExt2}; use gkr_iop::{ gkr::{ @@ -109,9 +109,7 @@ impl> TraceCommitter as ProverBackend>::PcsData, PCS::Commitment, ) { - if std::any::TypeId::of::() - != std::any::TypeId::of::() - { + if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); } @@ -140,11 +138,14 @@ impl> TraceCommitter> = + let traces_gl64: Vec> = unsafe { std::mem::transmute(vec_traces) }; let span = entered_span!("[gpu] batch_commit", profiling_2 = true); - let pcs_data = cuda_hal.basefold.batch_commit(traces_gl64).unwrap(); + let pcs_data = cuda_hal + .basefold + .batch_commit(&cuda_hal, traces_gl64) + .unwrap(); exit_span!(span); let span = entered_span!("[gpu] get_pure_commitment", profiling_2 = true); @@ -184,9 +185,9 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( input: &ProofInput<'_, GpuBackend>>, records: &[ArcMultilinearExtensionGpu<'_, E>], challenges: &[E; 2], - cuda_hal: &CudaHalGL64, - prod_buffers: &'buf mut Vec>, - logup_buffers: &'buf mut Vec>, + cuda_hal: &CudaHalBB31, + prod_buffers: &'buf mut Vec>, + logup_buffers: &'buf mut Vec>, ) -> Result< ( Vec>, @@ -195,7 +196,7 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( String, > { use crate::scheme::constants::{NUM_FANIN, NUM_FANIN_LOGUP}; - use ceno_gpu::{CudaHal as _, gl64::GpuPolynomialExt}; + use ceno_gpu::{CudaHal as _, bb31::GpuPolynomialExt}; use p3::field::FieldAlgebra; let ComposedConstrainSystem { @@ -205,6 +206,14 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( input.num_instances << composed_cs.rotation_vars().unwrap_or(0); let chip_record_alpha = challenges[0]; + // TODO: safety ? + let records = unsafe { + std::mem::transmute::< + &[ArcMultilinearExtensionGpu<'_, E>], + &[ArcMultilinearExtensionGpu<'static, E>], + >(records) + }; + // Parse records into different categories (same as build_tower_witness) let num_reads = cs.r_expressions.len() + cs.r_table_expressions.len(); let num_writes = cs.w_expressions.len() + cs.w_table_expressions.len(); @@ -230,10 +239,11 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( let gpu_chunks = cuda_hal .tower .masked_mle_split_to_chunks( + &cuda_hal, wit.as_ceno_gpu_ext(), num_instances_with_rotation, NUM_FANIN, - GL64Ext::ONE, + BB31Ext::ONE, ) .map_err(|e| format!("GPU masked_mle_split_to_chunks failed for r_set: {:?}", e))?; r_set_gpu_chunks.push(gpu_chunks); @@ -244,10 +254,11 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( let gpu_chunks = cuda_hal .tower .masked_mle_split_to_chunks( + &cuda_hal, wit.as_ceno_gpu_ext(), num_instances_with_rotation, NUM_FANIN, - GL64Ext::ONE, + BB31Ext::ONE, ) .map_err(|e| format!("GPU masked_mle_split_to_chunks failed for w_set: {:?}", e))?; w_set_gpu_chunks.push(gpu_chunks); @@ -261,10 +272,11 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( let gpu_chunks = cuda_hal .tower .masked_mle_split_to_chunks( + &cuda_hal, wit.as_ceno_gpu_ext(), num_instances_with_rotation, NUM_FANIN_LOGUP, - GL64Ext::ONE, + BB31Ext::ONE, ) .map_err(|e| format!("GPU masked_mle_split_to_chunks failed for lk_n: {:?}", e))?; lk_numerator_gpu_chunks.push(gpu_chunks); @@ -272,13 +284,14 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( for wit in lk_d_wit.iter() { // For GPU backend, E must be GoldilocksExt2. This is ensured by the caller. - let chip_record_alpha_gl: GL64Ext = unsafe { - assert_eq!(std::mem::size_of::(), std::mem::size_of::()); + let chip_record_alpha_gl: BB31Ext = unsafe { + assert_eq!(std::mem::size_of::(), std::mem::size_of::()); std::mem::transmute_copy(&chip_record_alpha) }; let gpu_chunks = cuda_hal .tower .masked_mle_split_to_chunks( + &cuda_hal, wit.as_ceno_gpu_ext(), num_instances_with_rotation, NUM_FANIN_LOGUP, @@ -349,10 +362,10 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>( .into_iter() .map(|lk_d_chunks| { let nv = lk_d_chunks[0].num_vars(); - let p1_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, GL64Ext::ONE) + let p1_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE) .map_err(|e| format!("Failed to create p1 GPU polynomial with scalar: {:?}", e)) .unwrap(); - let p2_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, GL64Ext::ONE) + let p2_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE) .map_err(|e| format!("Failed to create p2 GPU polynomial with scalar: {:?}", e)) .unwrap(); // Use [1, 1, q1, q2] format for the last layer @@ -431,9 +444,7 @@ impl> TowerProver() - != std::any::TypeId::of::() - { + if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); } @@ -444,8 +455,8 @@ impl> TowerProver> = Vec::new(); - let mut _logup_buffers: Vec> = Vec::new(); + let mut _prod_buffers: Vec> = Vec::new(); + let mut _logup_buffers: Vec> = Vec::new(); // Call build_tower_witness_gpu which will allocate buffers and build GPU specs let span = entered_span!("build_tower_witness", profiling_2 = true); @@ -471,8 +482,8 @@ impl> TowerProver>> BasicTranscript - let basic_tr: &mut BasicTranscript = - unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript) }; + let basic_tr: &mut BasicTranscript = + unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript) }; let input = ceno_gpu::TowerInput { prod_specs: prod_gpu, @@ -497,6 +508,8 @@ impl> TowerProver> MainSumcheckProver> for GpuProver> { + #[allow(clippy::type_complexity)] + #[tracing::instrument(skip_all, name = "table_witness", fields(profiling_2), level = "trace")] fn table_witness<'a>( &self, input: &ProofInput<'a, GpuBackend>, @@ -517,6 +530,7 @@ impl> MainSumcheckProver> MainSumcheckProver> MainSumcheckProver = unsafe { std::mem::transmute(coeffs) }; + let coeffs_gl64: Vec = unsafe { std::mem::transmute(coeffs) }; (coeffs_gl64, indices, size_info) }) .fold( @@ -569,9 +582,11 @@ impl> MainSumcheckProver> = + let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = unsafe { std::mem::transmute(layer_witin) }; let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec(); @@ -595,6 +610,7 @@ impl> MainSumcheckProver> MainSumcheckProver> OpeningProver + 'static), ) -> PCS::Proof { - if std::any::TypeId::of::() - != std::any::TypeId::of::() - { + if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); } @@ -787,21 +800,24 @@ impl> OpeningProver = unsafe { std::mem::transmute(prover_param) }; let rounds_gl64: Vec<_> = rounds .iter() .map(|(commitment, point_eval_pairs)| { let commitment_gl64: &BasefoldCommitmentWithWitnessGpu< - GL64Base, - BufferImpl, + BB31Base, + BufferImpl, + GpuDigestLayer, + GpuMatrix<'static>, + GpuPolynomial<'static>, > = unsafe { std::mem::transmute(*commitment) }; let point_eval_pairs_gl64: Vec<_> = point_eval_pairs .iter() .map(|(point, evals)| { - let point_gl64: &Vec = unsafe { std::mem::transmute(point) }; - let evals_gl64: &Vec = unsafe { std::mem::transmute(evals) }; + let point_gl64: &Vec = unsafe { std::mem::transmute(point) }; + let evals_gl64: &Vec = unsafe { std::mem::transmute(evals) }; (point_gl64.clone(), evals_gl64.clone()) }) .collect(); @@ -809,10 +825,10 @@ impl> OpeningProver() == std::any::TypeId::of::() { + let gpu_proof = if std::any::TypeId::of::() == std::any::TypeId::of::() { let transcript_any = transcript as &mut dyn std::any::Any; let basic_transcript = transcript_any - .downcast_mut::>() + .downcast_mut::>() .expect("Type should match"); let cuda_hal = get_cuda_hal().unwrap(); @@ -842,22 +858,27 @@ impl> DeviceTransporter as ProverBackend>::Pcs, >, >, - ) -> DeviceProvingKey> { + ) -> DeviceProvingKey<'_, GpuBackend> { let pcs_data_original = pk.fixed_commit_wd.clone().unwrap(); // assert pcs match - let is_pcs_match = - std::mem::size_of::>() - == std::mem::size_of::(); + let is_pcs_match = std::mem::size_of::>() + == std::mem::size_of::(); assert!(is_pcs_match, "pcs mismatch"); // 1. transmute from PCS::CommitmentWithWitness to BasefoldCommitmentWithWitness - let basefold_commitment: &mpcs::BasefoldCommitmentWithWitness = + let basefold_commitment: &mpcs::BasefoldCommitmentWithWitness = unsafe { std::mem::transmute_copy(&pcs_data_original.as_ref()) }; - // 2. convert from BasefoldCommitmentWithWitness to BasefoldCommitmentWithWitness + // 2. convert from BasefoldCommitmentWithWitness to BasefoldCommitmentWithWitness let cuda_hal = get_cuda_hal().unwrap(); - let pcs_data_basefold = - convert_ceno_to_gpu_basefold_commitment(&cuda_hal, basefold_commitment); + let pcs_data_basefold = convert_ceno_to_gpu_basefold_commitment::< + CudaHalBB31, + BB31Ext, + BB31Base, + GpuDigestLayer, + GpuMatrix, + GpuPolynomial, + >(&cuda_hal, basefold_commitment); let pcs_data: as ProverBackend>::PcsData = unsafe { std::mem::transmute_copy(&pcs_data_basefold) }; std::mem::forget(pcs_data_basefold); @@ -878,7 +899,7 @@ impl> DeviceTransporter( &self, - mles: Vec>, + mles: &[MultilinearExtension<'a, E>], ) -> Vec> { let cuda_hal = get_cuda_hal().unwrap(); mles.iter() diff --git a/ceno_zkvm/src/scheme/hal.rs b/ceno_zkvm/src/scheme/hal.rs index 85cb5ce45..17ad6b92a 100644 --- a/ceno_zkvm/src/scheme/hal.rs +++ b/ceno_zkvm/src/scheme/hal.rs @@ -167,7 +167,7 @@ pub trait DeviceTransporter { fn transport_mles<'a>( &self, - mles: Vec>, + mles: &[MultilinearExtension<'a, PB::E>], ) -> Vec>>; } diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs index 1a1c4f17e..e1094d77f 100644 --- a/ceno_zkvm/src/scheme/prover.rs +++ b/ceno_zkvm/src/scheme/prover.rs @@ -200,6 +200,10 @@ impl< ]; tracing::debug!("global challenges in prover: {:?}", challenges); + let public_input_span = entered_span!("public_input", profiling_1 = true); + let public_input = self.device.transport_mles(&pi); + exit_span!(public_input_span); + let main_proofs_span = entered_span!("main_proofs", profiling_1 = true); let (points, evaluations) = self.pk.circuit_pks.iter().enumerate().try_fold( (vec![], vec![]), @@ -216,24 +220,29 @@ impl< return Ok::<(Vec<_>, Vec>), ZKVMError>((points, evaluations)); } transcript.append_field_element(&E::BaseField::from_canonical_u64(index as u64)); + // TODO: add an enum for circuit type either in constraint_system or vk let witness_mle = witness_mles .drain(..cs.num_witin()) .map(|mle| mle.into()) .collect_vec(); - let structural_witness = self.device.transport_mles( - structural_wits - .remove(circuit_name) - .map(|(sw, _)| sw) - .unwrap_or(vec![]), - ); + + let structural_witness_span = + entered_span!("structural_witness", profiling_2 = true); + let structural_mles = structural_wits + .remove(circuit_name) + .map(|(sw, _)| sw) + .unwrap_or(vec![]); + let structural_witness = self.device.transport_mles(&structural_mles); + exit_span!(structural_witness_span); + let fixed = fixed_mles.drain(..cs.num_fixed()).collect_vec(); - let public_input = self.device.transport_mles(pi.clone()); + let mut input = ProofInput { witness: witness_mle, fixed, structural_witness, - public_input, + public_input: public_input.clone(), num_instances, }; @@ -327,6 +336,8 @@ impl< let log2_num_instances = input.log2_num_instances(); let num_var_with_rotation = log2_num_instances + cs.rotation_vars().unwrap_or(0); + // println!("create_chip_proof: {}", name); + // build main witness let (records, is_padded) = build_main_witness::(&self.device, cs, &input, challenges); @@ -346,6 +357,7 @@ impl< // 1. prove the main constraints among witness polynomials // 2. prove the relation between last layer in the tower and read/write/logup records + let span = entered_span!("prove_main_constraints", profiling_2 = true); let (input_opening_point, evals, main_sumcheck_proofs, gkr_iop_proof) = self .device .prove_main_constraints(rt_tower, &input, cs, challenges, transcript)?; @@ -353,6 +365,7 @@ impl< wits_in_evals, fixed_in_evals, } = evals; + exit_span!(span); // evaluate pi if there is instance query let mut pi_in_evals: HashMap = HashMap::new(); diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs index 194b77060..c8b67929e 100644 --- a/ceno_zkvm/src/scheme/utils.rs +++ b/ceno_zkvm/src/scheme/utils.rs @@ -16,7 +16,6 @@ use itertools::Itertools; use mpcs::PolynomialCommitmentScheme; pub use multilinear_extensions::wit_infer_by_expr; use multilinear_extensions::{ - macros::{entered_span, exit_span}, mle::{ArcMultilinearExtension, FieldType, IntoMLE, MultilinearExtension}, util::ceil_log2, }; @@ -297,6 +296,12 @@ pub(crate) fn infer_tower_product_witness( wit_layers } +#[tracing::instrument( + skip_all, + name = "build_main_witness", + fields(profiling_2), + level = "trace" +)] pub fn build_main_witness< 'a, E: ExtensionField, @@ -439,7 +444,6 @@ pub fn gkr_witness< // generate all layer witness from input to output for (i, layer) in circuit.layers.iter().rev().enumerate() { tracing::debug!("generating input {i} layer with layer name {}", layer.name); - let span = entered_span!("per_layer_gen_witness", profiling_2 = true); // process in_evals to prepare layer witness // This should assume the input of the first layer is the phase1 witness of the circuit. let current_layer_wits = layer @@ -486,7 +490,6 @@ pub fn gkr_witness< } other => unimplemented!("{:?}", other), }); - exit_span!(span); } layer_wits.reverse(); diff --git a/gkr_iop/src/cpu/mod.rs b/gkr_iop/src/cpu/mod.rs index 85e058b2b..9b2864150 100644 --- a/gkr_iop/src/cpu/mod.rs +++ b/gkr_iop/src/cpu/mod.rs @@ -7,6 +7,7 @@ use ff_ext::ExtensionField; use itertools::izip; use mpcs::{PolynomialCommitmentScheme, SecurityLevel, SecurityLevel::Conjecture100bits}; use multilinear_extensions::{ + macros::{entered_span, exit_span}, mle::{ArcMultilinearExtension, MultilinearExtension, Point}, wit_infer_by_monomial_expr, }; @@ -111,12 +112,13 @@ impl> pub_io_evals: &[Arc< as ProverBackend>::MultilinearPoly<'a>>], challenges: &[E], ) -> Vec as ProverBackend>::MultilinearPoly<'a>>> { + let span = entered_span!("witness_infer", profiling_2 = true); let out_evals: Vec<_> = layer .out_sel_and_eval_exprs .iter() .flat_map(|(sel_type, out_eval)| izip!(iter::repeat(sel_type), out_eval.iter())) .collect(); - layer + let res = layer .exprs_with_selector_out_eval_monomial_form .par_iter() .zip_eq(layer.expr_names.par_iter()) @@ -141,10 +143,13 @@ impl> EvalExpression::Partition(_, _) => unimplemented!(), } }) - .collect::>() + .collect::>(); + exit_span!(span); + res } } +#[tracing::instrument(skip_all, name = "layer_witness", fields(profiling_2), level = "trace")] pub fn layer_witness<'a, E>( layer: &Layer, layer_wits: &[ArcMultilinearExtension<'a, E>], @@ -154,12 +159,13 @@ pub fn layer_witness<'a, E>( where E: ExtensionField, { + let span = entered_span!("witness_infer", profiling_2 = true); let out_evals: Vec<_> = layer .out_sel_and_eval_exprs .iter() .flat_map(|(sel_type, out_eval)| izip!(iter::repeat(sel_type), out_eval.iter())) .collect(); - layer + let res = layer .exprs_with_selector_out_eval_monomial_form .par_iter() .zip_eq(layer.expr_names.par_iter()) @@ -184,5 +190,7 @@ where EvalExpression::Partition(_, _) => unimplemented!(), } }) - .collect::>() + .collect::>(); + exit_span!(span); + res } diff --git a/gkr_iop/src/gkr/layer/gpu/mod.rs b/gkr_iop/src/gkr/layer/gpu/mod.rs index 022d0e98f..d9380c511 100644 --- a/gkr_iop/src/gkr/layer/gpu/mod.rs +++ b/gkr_iop/src/gkr/layer/gpu/mod.rs @@ -54,15 +54,18 @@ impl> LinearLayerProver, transcript: &mut impl transcript::Transcript, ) -> crate::gkr::layer::sumcheck_layer::LayerProof { + let span = entered_span!("LinearLayerProver", profiling_2 = true); let cpu_wits: Vec>> = wit .0 .into_iter() .map(|gpu_mle| Arc::new(gpu_mle.inner_to_mle())) .collect(); let cpu_wit = LayerWitness::>(cpu_wits); - > as LinearLayerProver>>::prove( + let res = > as LinearLayerProver>>::prove( layer, cpu_wit, out_point, transcript, - ) + ); + exit_span!(span); + res } } @@ -77,20 +80,23 @@ impl> SumcheckLayerProver< challenges: &[ as ProverBackend>::E], transcript: &mut impl Transcript< as ProverBackend>::E>, ) -> LayerProof< as ProverBackend>::E> { + let span = entered_span!("SumcheckLayerProver", profiling_2 = true); let cpu_wits: Vec>> = wit .0 .into_iter() .map(|gpu_mle| Arc::new(gpu_mle.inner_to_mle())) .collect(); let cpu_wit = LayerWitness::>(cpu_wits); - > as SumcheckLayerProver>>::prove( + let res = > as SumcheckLayerProver>>::prove( layer, num_threads, max_num_variables, cpu_wit, challenges, transcript, - ) + ); + exit_span!(span); + res } } @@ -111,6 +117,7 @@ impl> ZerocheckLayerProver LayerProof< as ProverBackend>::E>, Point< as ProverBackend>::E>, ) { + let span = entered_span!("ZerocheckLayerProver", profiling_2 = true); let num_threads = 1; // VP builder for GPU: do not use _num_threads assert_eq!(challenges.len(), 2); @@ -163,7 +170,6 @@ impl> ZerocheckLayerProver ) .collect_vec(); - let span = entered_span!("IOPProverState::prove", profiling_4 = true); let cuda_hal = get_cuda_hal().unwrap(); let eqs_gpu = layer .out_sel_and_eval_exprs @@ -222,11 +228,11 @@ impl> ZerocheckLayerProver .unwrap_or(0); // Convert types for GPU function Call - let basic_tr: &mut BasicTranscript = - unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript) }; - let term_coefficients_gl64: Vec = + let basic_tr: &mut BasicTranscript = + unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript) }; + let term_coefficients_gl64: Vec = unsafe { std::mem::transmute(term_coefficients) }; - let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = + let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = unsafe { std::mem::transmute(all_witins_gpu) }; let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec(); let (proof_gpu, evals_gpu, challenges_gpu) = cuda_hal @@ -247,13 +253,12 @@ impl> ZerocheckLayerProver // convert back to E: ExtensionField let proof_gpu_e = - unsafe { std::mem::transmute::, IOPProof>(proof_gpu) }; - let evals_gpu_e = unsafe { std::mem::transmute::, Vec>(evals_gpu) }; + unsafe { std::mem::transmute::, IOPProof>(proof_gpu) }; + let evals_gpu_e = unsafe { std::mem::transmute::, Vec>(evals_gpu) }; let row_challenges_e = - unsafe { std::mem::transmute::, Vec>(row_challenges) }; + unsafe { std::mem::transmute::, Vec>(row_challenges) }; exit_span!(span); - ( LayerProof { main: SumcheckLayerProof { @@ -292,7 +297,7 @@ pub(crate) fn prove_rotation_gpu> = rotated_mles_gpu .iter() @@ -344,10 +349,10 @@ pub(crate) fn prove_rotation_gpu = - unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript) }; - let term_coefficients_gl64: Vec = unsafe { std::mem::transmute(term_coefficients) }; - let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = + let basic_tr: &mut BasicTranscript = + unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript) }; + let term_coefficients_gl64: Vec = unsafe { std::mem::transmute(term_coefficients) }; + let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = unsafe { std::mem::transmute(mle_gpu_ref) }; let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec(); // gpu prover @@ -367,14 +372,14 @@ pub(crate) fn prove_rotation_gpu, IOPProof>(proof_gpu) }; - let mut evals_gpu_e = unsafe { std::mem::transmute::, Vec>(evals_gpu) }; - let row_challenges_e = unsafe { std::mem::transmute::, Vec>(row_challenges) }; + let proof_gpu_e = unsafe { std::mem::transmute::, IOPProof>(proof_gpu) }; + let mut evals_gpu_e = unsafe { std::mem::transmute::, Vec>(evals_gpu) }; + let row_challenges_e = unsafe { std::mem::transmute::, Vec>(row_challenges) }; // skip selector/eq as verifier can derive itself evals_gpu_e.truncate(raw_rotation_exprs.len() * 2); exit_span!(span); - let span = entered_span!("rotation derived left/right eval", profiling_4 = true); + let span = entered_span!("rotation derived left/right eval", profiling_3 = true); let bh = BooleanHypercube::new(rotation_cyclic_group_log2); let (left_point, right_point) = bh.get_rotation_points(&row_challenges_e); let evals = evals_gpu_e diff --git a/gkr_iop/src/gkr/layer/gpu/utils.rs b/gkr_iop/src/gkr/layer/gpu/utils.rs index 156b76e0f..726b1112c 100644 --- a/gkr_iop/src/gkr/layer/gpu/utils.rs +++ b/gkr_iop/src/gkr/layer/gpu/utils.rs @@ -60,15 +60,13 @@ pub fn extract_mle_relationships_from_monomial_terms<'a, E: ExtensionField>( (term_coefficients, mle_indices_per_term, mle_size_info) } -pub fn build_eq_x_r_with_sel_gpu<'a, E: ExtensionField>( - hal: &'a CudaHalGL64, +pub fn build_eq_x_r_with_sel_gpu( + hal: &CudaHalBB31, point: &Point, num_instances: usize, selector: &SelectorType, -) -> MultilinearExtensionGpu<'a, E> { - if std::any::TypeId::of::() - != std::any::TypeId::of::() - { +) -> MultilinearExtensionGpu<'static, E> { + if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); } @@ -89,58 +87,75 @@ pub fn build_eq_x_r_with_sel_gpu<'a, E: ExtensionField>( GpuFieldType::Unreachable => panic!("Unreachable GpuFieldType"), }; let indices_u32 = indices.iter().map(|x| *x as u32).collect_vec(); - ordered_sparse32_selector_gpu(&hal.inner, &mut eq_buf.buf, &indices_u32, num_instances) - .unwrap(); + ordered_sparse32_selector_gpu::( + &hal.inner, + &mut eq_buf.buf, + &indices_u32, + num_instances, + ) + .unwrap(); eq_buf } else { - let point_gl64: &Point = unsafe { std::mem::transmute(point) }; + let point_gl64: &Point = unsafe { std::mem::transmute(point) }; let mut gpu_output = hal.alloc_ext_elems_on_device(eq_len).unwrap(); let gpu_points = hal.alloc_ext_elems_from_host(point_gl64).unwrap(); - build_mle_as_ceno(&hal.inner, &gpu_points, &mut gpu_output, num_instances).unwrap(); + build_mle_as_ceno::( + &hal.inner, + &gpu_points, + &mut gpu_output, + num_instances, + ) + .unwrap(); GpuPolynomialExt::new(gpu_output, point.len()) }; let mle_gl64 = MultilinearExtensionGpu::from_ceno_gpu_ext(eq_mle); unsafe { - std::mem::transmute::, MultilinearExtensionGpu<'a, E>>( - mle_gl64, - ) + std::mem::transmute::< + MultilinearExtensionGpu<'static, BB31Ext>, + MultilinearExtensionGpu<'static, E>, + >(mle_gl64) } } -pub fn build_eq_x_r_gpu<'a, E: ExtensionField>( - hal: &'a CudaHalGL64, +pub fn build_eq_x_r_gpu( + hal: &CudaHalBB31, point: &Point, -) -> MultilinearExtensionGpu<'a, E> { - if std::any::TypeId::of::() - != std::any::TypeId::of::() - { +) -> MultilinearExtensionGpu<'static, E> { + if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); } let eq_len = 1 << point.len(); // type eq - let point_gl64: &Point = unsafe { std::mem::transmute(point) }; + let point_gl64: &Point = unsafe { std::mem::transmute(point) }; let eq_mle = { let mut gpu_output = hal.alloc_ext_elems_on_device(eq_len).unwrap(); let gpu_points = hal.alloc_ext_elems_from_host(point_gl64).unwrap(); - build_mle_as_ceno(&hal.inner, &gpu_points, &mut gpu_output, eq_len).unwrap(); + build_mle_as_ceno::( + &hal.inner, + &gpu_points, + &mut gpu_output, + eq_len, + ) + .unwrap(); GpuPolynomialExt::new(gpu_output, point.len()) }; let mle_gl64 = MultilinearExtensionGpu::from_ceno_gpu_ext(eq_mle); unsafe { - std::mem::transmute::, MultilinearExtensionGpu<'a, E>>( - mle_gl64, - ) + std::mem::transmute::< + MultilinearExtensionGpu<'static, BB31Ext>, + MultilinearExtensionGpu<'static, E>, + >(mle_gl64) } } -pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentScheme>( - cuda_hal: &'a CudaHalGL64, +pub fn build_rotation_mles_gpu>( + cuda_hal: &CudaHalBB31, raw_rotation_exprs: &[(Expression, Expression)], wit: &LayerWitness>, bh: &BooleanHypercube, rotation_cyclic_group_log2: usize, -) -> Vec> { +) -> Vec> { raw_rotation_exprs .iter() .map(|rotation_expr| match rotation_expr { @@ -158,10 +173,16 @@ pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentS _ => panic!("unimplemented input mle"), }; let mut output_buf = cuda_hal.alloc_elems_on_device(input_buf.len()).unwrap(); - rotation_next_base_mle_gpu( + + // Safety: GPU buffers are actually 'static lifetime. We only read from input_buf + // during the GPU kernel execution, which completes synchronously before returning. + let input_buf_static: &BufferImpl<'static, BB31Base> = + unsafe { std::mem::transmute(input_buf) }; + + rotation_next_base_mle_gpu::( &cuda_hal.inner, &mut output_buf, - input_buf, + input_buf_static, &rotation_index, cyclic_group_size, ) @@ -172,8 +193,8 @@ pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentS )); unsafe { std::mem::transmute::< - MultilinearExtensionGpu, - MultilinearExtensionGpu<'_, E>, + MultilinearExtensionGpu<'static, BB31Ext>, + MultilinearExtensionGpu<'static, E>, >(output_mle) } } @@ -182,32 +203,34 @@ pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentS .collect::>() } -pub fn build_rotation_selector_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentScheme>( - cuda_hal: &'a CudaHalGL64, +pub fn build_rotation_selector_gpu>( + cuda_hal: &CudaHalBB31, wit: &LayerWitness>, rt: &Point, bh: &BooleanHypercube, rotation_cyclic_subgroup_size: usize, rotation_cyclic_group_log2: usize, -) -> MultilinearExtensionGpu<'a, E> { +) -> MultilinearExtensionGpu<'static, E> { let total_len = wit[0].evaluations_len(); // Take first mle just to retrieve total length assert!(total_len.is_power_of_two()); let mut output_buf = cuda_hal.alloc_ext_elems_on_device(total_len).unwrap(); + let eq = build_eq_x_r_gpu(cuda_hal, rt); - let eq_buf = match &eq.mle { + let eq_buf_owned = match eq.mle { GpuFieldType::Base(_) => panic!("should be ext field"), - GpuFieldType::Ext(mle) => mle.evaluations(), + GpuFieldType::Ext(mle) => mle.buf, GpuFieldType::Unreachable => panic!("Unreachable GpuFieldType"), }; + let rotation_index = bh .into_iter() .take(rotation_cyclic_subgroup_size) .map(|x| x as u32) .collect_vec(); - rotation_selector_gpu( + rotation_selector_gpu::( &cuda_hal.inner, &mut output_buf, - eq_buf, + &eq_buf_owned, &rotation_index, 1 << rotation_cyclic_group_log2, rotation_cyclic_subgroup_size, @@ -218,8 +241,9 @@ pub fn build_rotation_selector_gpu<'a, E: ExtensionField, PCS: PolynomialCommitm total_len.ilog2() as usize, )); unsafe { - std::mem::transmute::, MultilinearExtensionGpu<'_, E>>( - output_mle, - ) + std::mem::transmute::< + MultilinearExtensionGpu<'static, BB31Ext>, + MultilinearExtensionGpu<'static, E>, + >(output_mle) } } diff --git a/gkr_iop/src/gpu/mod.rs b/gkr_iop/src/gpu/mod.rs index 20de4e819..14a371a7b 100644 --- a/gkr_iop/src/gpu/mod.rs +++ b/gkr_iop/src/gpu/mod.rs @@ -4,7 +4,10 @@ use crate::{ }; use ff_ext::ExtensionField; use mpcs::{PolynomialCommitmentScheme, SecurityLevel}; -use multilinear_extensions::mle::{FieldType, MultilinearExtension, Point}; +use multilinear_extensions::{ + macros::{entered_span, exit_span}, + mle::{FieldType, MultilinearExtension, Point}, +}; use p3::field::TwoAdicField; use std::{rc::Rc, sync::Arc}; use witness::RowMajorMatrix; @@ -17,37 +20,43 @@ use std::marker::PhantomData; pub mod gpu_prover { pub use ceno_gpu::{ BasefoldCommitmentWithWitness as BasefoldCommitmentWithWitnessGpu, Buffer, CudaHal, - gl64::{ - CudaHalGL64, GpuFieldType, GpuPolynomial, GpuPolynomialExt, buffer::BufferImpl, - build_mle_as_ceno, convert_ceno_to_gpu_basefold_commitment, - ordered_sparse32_selector_gpu, rotation_next_base_mle_gpu, rotation_selector_gpu, + bb31::{ + CudaHalBB31, GpuDigestLayer, GpuFieldType, GpuMatrix, GpuPolynomial, GpuPolynomialExt, + }, + common::{ + basefold::utils::convert_ceno_to_gpu_basefold_commitment, + buffer::BufferImpl, + mle::{ + build_mle_as_ceno, ordered_sparse32_selector_gpu, rotation_next_base_mle_gpu, + rotation_selector_gpu, + }, }, }; use cudarc::driver::{CudaDevice, DriverError}; use once_cell::sync::Lazy; use std::sync::{Arc, Mutex, MutexGuard}; - pub type GL64Base = p3::goldilocks::Goldilocks; - pub type GL64Ext = ff_ext::GoldilocksExt2; + pub type BB31Base = p3::babybear::BabyBear; + pub type BB31Ext = ff_ext::BabyBearExt4; pub static CUDA_DEVICE: Lazy, DriverError>> = Lazy::new(|| CudaDevice::new(0)); #[allow(clippy::type_complexity)] pub static CUDA_HAL: Lazy< - Result>, Box>, + Result>, Box>, > = Lazy::new(|| { let device = CUDA_DEVICE .as_ref() .map_err(|e| format!("Device init failed: {:?}", e))?; device.bind_to_thread()?; - CudaHalGL64::new() + CudaHalBB31::new() .map(|hal| Arc::new(Mutex::new(hal))) .map_err(|e| Box::new(e) as Box) }); - pub fn get_cuda_hal() -> Result, String> { + pub fn get_cuda_hal() -> Result, String> { let device = CUDA_DEVICE .as_ref() .map_err(|e| format!("Device not available: {:?}", e))?; @@ -146,7 +155,7 @@ impl<'a, E: ExtensionField> MultilinearPolynomial for MultilinearExtensionGpu impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { /// Get reference to internal GPU polynomial - pub fn inner(&self) -> &GpuFieldType { + pub fn inner(&self) -> &GpuFieldType<'_> { &self.mle } @@ -180,12 +189,12 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { } /// Create GPU version from CPU version of MultilinearExtension - pub fn from_ceno(cuda_hal: &CudaHalGL64, mle: &MultilinearExtension<'a, E>) -> Self { + pub fn from_ceno(cuda_hal: &CudaHalBB31, mle: &MultilinearExtension<'a, E>) -> Self { // check type of mle match mle.evaluations { FieldType::Base(_) => { let mle_vec_ref = mle.get_base_field_vec(); - let mle_vec_ref_gl64: &[GL64Base] = unsafe { std::mem::transmute(mle_vec_ref) }; + let mle_vec_ref_gl64: &[BB31Base] = unsafe { std::mem::transmute(mle_vec_ref) }; let mle_gpu = GpuPolynomial::from_ceno_vec(cuda_hal, mle_vec_ref_gl64, mle.num_vars()) .unwrap(); @@ -196,7 +205,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { } FieldType::Ext(_) => { let mle_vec_ref = mle.get_ext_field_vec(); - let mle_vec_ref_gl64_ext: &[GL64Ext] = unsafe { std::mem::transmute(mle_vec_ref) }; + let mle_vec_ref_gl64_ext: &[BB31Ext] = unsafe { std::mem::transmute(mle_vec_ref) }; let mle_gpu = GpuPolynomialExt::from_ceno_vec(cuda_hal, mle_vec_ref_gl64_ext, mle.num_vars()) .unwrap(); @@ -231,7 +240,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { } /// get inner poly reference with base field claim - pub fn as_ceno_gpu_base(&self) -> &GpuPolynomial { + pub fn as_ceno_gpu_base(&self) -> &GpuPolynomial<'_> { match &self.mle { GpuFieldType::Base(poly) => poly, GpuFieldType::Ext(_) => panic!("poly in ext field"), @@ -240,7 +249,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> { } /// get inner poly reference with ext field claim - pub fn as_ceno_gpu_ext(&self) -> &GpuPolynomialExt { + pub fn as_ceno_gpu_ext(&self) -> &GpuPolynomialExt<'_> { match &self.mle { GpuFieldType::Base(_) => panic!("poly in base field"), GpuFieldType::Ext(poly) => poly, @@ -286,8 +295,13 @@ impl> ProverBackend for Gp type MultilinearPoly<'a> = MultilinearExtensionGpu<'a, E>; type Matrix = RowMajorMatrix; #[cfg(feature = "gpu")] - type PcsData = - BasefoldCommitmentWithWitnessGpu>; + type PcsData = BasefoldCommitmentWithWitnessGpu< + E::BaseField, + BufferImpl<'static, E::BaseField>, + GpuDigestLayer, + GpuMatrix<'static>, + GpuPolynomial<'static>, + >; #[cfg(not(feature = "gpu"))] type PcsData = >::CommitmentWithWitness; @@ -320,15 +334,15 @@ where impl> ProtocolWitnessGeneratorProver> for GpuProver> { + #[tracing::instrument(skip_all, name = "layer_witness", fields(profiling_2), level = "trace")] fn layer_witness<'a>( layer: &Layer, layer_wits: &[Arc< as ProverBackend>::MultilinearPoly<'a>>], pub_io_evals: &[Arc< as ProverBackend>::MultilinearPoly<'a>>], challenges: &[E], ) -> Vec as ProverBackend>::MultilinearPoly<'a>>> { - if std::any::TypeId::of::() - != std::any::TypeId::of::() - { + let span = entered_span!("preprocess", profiling_2 = true); + if std::any::TypeId::of::() != std::any::TypeId::of::() { panic!("GPU backend only supports Goldilocks base field"); } @@ -369,7 +383,7 @@ impl> &pub_io_evals, challenges, ); - let coeffs_gl64: Vec = unsafe { std::mem::transmute(coeffs) }; + let coeffs_gl64: Vec = unsafe { std::mem::transmute(coeffs) }; (coeffs_gl64, indices, size_info) }) .fold( @@ -390,10 +404,12 @@ impl> .as_ref() .unwrap() .0; + exit_span!(span); + let span = entered_span!("witness_infer", profiling_2 = true); // process & transmute poly let all_witins_gpu = layer_wits.iter().map(|mle| mle.as_ref()).collect_vec(); - let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = + let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu> = unsafe { std::mem::transmute(all_witins_gpu) }; let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec(); @@ -411,13 +427,14 @@ impl> cuda_hal .witness_infer .wit_infer_by_monomial_expr( - &cuda_hal, + &*cuda_hal, all_witins_gpu_type_gl64, &term_coefficients, &mle_indices_per_term, &mut next_witness_buf, ) .unwrap(); + exit_span!(span); // recover it back and interleaving with default gpu let mut next_iter = next_witness_buf.into_iter(); diff --git a/utils/cuda_hal/src/lib.rs b/utils/cuda_hal/src/lib.rs index 1ecfe9a9b..619c6fc80 100644 --- a/utils/cuda_hal/src/lib.rs +++ b/utils/cuda_hal/src/lib.rs @@ -12,16 +12,16 @@ compile_error!( ); // Minimal stub exports to satisfy basic compilation when gpu feature is disabled -pub mod gl64 { - pub struct CudaHalGL64; +pub mod bb31 { + pub struct CudaHalBB31; - impl CudaHalGL64 { + impl CudaHalBB31 { pub fn new() -> Result> { Err("GPU placeholder: real implementation required".into()) } } - pub fn convert_ceno_to_gpu_basefold_commitment(_hal: &CudaHalGL64, _commitment: &T) -> T { + pub fn convert_ceno_to_gpu_basefold_commitment(_hal: &CudaHalBB31, _commitment: &T) -> T { panic!("GPU placeholder: real implementation required") }