diff --git a/Cargo.toml b/Cargo.toml
index 16e66caaf..7acdd2d7a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -88,8 +88,8 @@ opt-level = 3
 [profile.release]
 lto = "thin"
 
-#[patch."ssh://git@github.com/scroll-tech/ceno-gpu.git"]
-#ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" }
+# [patch."ssh://git@github.com/scroll-tech/ceno-gpu.git"]
+# ceno_gpu = { path = "../ceno-gpu/cuda_hal", package = "cuda_hal" }
 
 #[patch."https://github.com/scroll-tech/gkr-backend"]
 #ff_ext = { path = "../gkr-backend/crates/ff_ext", package = "ff_ext" }
diff --git a/build-scripts/conditional-patch.sh b/build-scripts/conditional-patch.sh
index 1b02e76b2..1ccbdcfe2 100755
--- a/build-scripts/conditional-patch.sh
+++ b/build-scripts/conditional-patch.sh
@@ -7,7 +7,7 @@ WORKSPACE_CARGO="Cargo.toml"
 
 # Workspace dependency declarations
 LOCAL_DEP='ceno_gpu = { path = "utils/cuda_hal", package = "cuda_hal" }'
-REMOTE_DEP='ceno_gpu = { git = "ssh://git@github.com/scroll-tech/ceno-gpu.git", package = "cuda_hal", branch = "dev/integrate-into-ceno-as-dep" }'
+REMOTE_DEP='ceno_gpu = { git = "ssh://git@github.com/scroll-tech/ceno-gpu.git", package = "cuda_hal", branch = "main", default-features = false, features = \["bb31"\] }'
 
 if [ "$1" = "enable-gpu" ]; then
     echo "Switching to GPU mode (using remote implementation)..."
diff --git a/ceno_zkvm/src/scheme/cpu/mod.rs b/ceno_zkvm/src/scheme/cpu/mod.rs
index 414cf1068..9b0020116 100644
--- a/ceno_zkvm/src/scheme/cpu/mod.rs
+++ b/ceno_zkvm/src/scheme/cpu/mod.rs
@@ -543,7 +543,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<C
     for CpuProver<CpuBackend<E, PCS>>
 {
     #[allow(clippy::type_complexity)]
-    #[tracing::instrument(skip_all, name = "table_witness", fields(profiling_3), level = "trace")]
+    #[tracing::instrument(skip_all, name = "table_witness", fields(profiling_2), level = "trace")]
     fn table_witness<'a>(
         &self,
         input: &ProofInput<'a, CpuBackend<<CpuBackend<E, PCS> as ProverBackend>::E, PCS>>,
@@ -551,7 +551,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<C
         challenges: &[<CpuBackend<E, PCS> as ProverBackend>::E],
     ) -> Vec<Arc<<CpuBackend<E, PCS> as ProverBackend>::MultilinearPoly<'a>>> {
         // main constraint: lookup denominator and numerator record witness inference
-        let record_span = entered_span!("record");
+        let span = entered_span!("witness_infer", profiling_2 = true);
         let records: Vec<ArcMultilinearExtension<'_, E>> = cs
             .r_table_expressions
             .par_iter()
@@ -581,7 +581,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<C
                 )
             })
             .collect();
-        exit_span!(record_span);
+        exit_span!(span);
         records
     }
 
@@ -774,9 +774,9 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> DeviceTransporter<Cp
 
     fn transport_mles<'a>(
         &self,
-        mles: Vec<MultilinearExtension<'a, E>>,
+        mles: &[MultilinearExtension<'a, E>],
     ) -> Vec<ArcMultilinearExtension<'a, E>> {
-        mles.into_iter().map(|mle| mle.into()).collect_vec()
+        mles.iter().map(|mle| mle.clone().into()).collect_vec()
     }
 }
 
diff --git a/ceno_zkvm/src/scheme/gpu/mod.rs b/ceno_zkvm/src/scheme/gpu/mod.rs
index 6c7d608d7..023686e36 100644
--- a/ceno_zkvm/src/scheme/gpu/mod.rs
+++ b/ceno_zkvm/src/scheme/gpu/mod.rs
@@ -9,7 +9,7 @@ use crate::{
     },
     structs::{ComposedConstrainSystem, PointAndEval, TowerProofs},
 };
-use ceno_gpu::gl64::GpuPolynomialExt;
+use ceno_gpu::bb31::GpuPolynomialExt;
 use ff_ext::{ExtensionField, GoldilocksExt2};
 use gkr_iop::{
     gkr::{
@@ -109,9 +109,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TraceCommitter<GpuBa
         <GpuBackend<E, PCS> as ProverBackend>::PcsData,
         PCS::Commitment,
     ) {
-        if std::any::TypeId::of::<E::BaseField>()
-            != std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
-        {
-            panic!("GPU backend only supports Goldilocks base field");
+        if std::any::TypeId::of::<E::BaseField>() != std::any::TypeId::of::<BB31Base>() {
+            panic!("GPU backend only supports BabyBear (BB31) base field");
         }
 
@@ -140,11 +138,14 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TraceCommitter<GpuBa
             let cuda_hal = get_cuda_hal().unwrap();
             exit_span!(span);
 
-            let traces_gl64: Vec<witness::RowMajorMatrix<p3::goldilocks::Goldilocks>> =
+            let traces_gl64: Vec<witness::RowMajorMatrix<BB31Base>> =
                 unsafe { std::mem::transmute(vec_traces) };
 
             let span = entered_span!("[gpu] batch_commit", profiling_2 = true);
-            let pcs_data = cuda_hal.basefold.batch_commit(traces_gl64).unwrap();
+            let pcs_data = cuda_hal
+                .basefold
+                .batch_commit(&cuda_hal, traces_gl64)
+                .unwrap();
             exit_span!(span);
 
             let span = entered_span!("[gpu] get_pure_commitment", profiling_2 = true);
@@ -184,9 +185,9 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
     input: &ProofInput<'_, GpuBackend<E, impl PolynomialCommitmentScheme<E>>>,
     records: &[ArcMultilinearExtensionGpu<'_, E>],
     challenges: &[E; 2],
-    cuda_hal: &CudaHalGL64,
-    prod_buffers: &'buf mut Vec<BufferImpl<GL64Ext>>,
-    logup_buffers: &'buf mut Vec<BufferImpl<GL64Ext>>,
+    cuda_hal: &CudaHalBB31,
+    prod_buffers: &'buf mut Vec<BufferImpl<BB31Ext>>,
+    logup_buffers: &'buf mut Vec<BufferImpl<BB31Ext>>,
 ) -> Result<
     (
         Vec<ceno_gpu::GpuProverSpec<'buf>>,
@@ -195,7 +196,7 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
     String,
 > {
     use crate::scheme::constants::{NUM_FANIN, NUM_FANIN_LOGUP};
-    use ceno_gpu::{CudaHal as _, gl64::GpuPolynomialExt};
+    use ceno_gpu::{CudaHal as _, bb31::GpuPolynomialExt};
     use p3::field::FieldAlgebra;
 
     let ComposedConstrainSystem {
@@ -205,6 +206,14 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
         input.num_instances << composed_cs.rotation_vars().unwrap_or(0);
     let chip_record_alpha = challenges[0];
 
+    // TODO(safety): erases the MLE lifetime to 'static; verify no result outlives the real borrow.
+    let records = unsafe {
+        std::mem::transmute::<
+            &[ArcMultilinearExtensionGpu<'_, E>],
+            &[ArcMultilinearExtensionGpu<'static, E>],
+        >(records)
+    };
+
     // Parse records into different categories (same as build_tower_witness)
     let num_reads = cs.r_expressions.len() + cs.r_table_expressions.len();
     let num_writes = cs.w_expressions.len() + cs.w_table_expressions.len();
@@ -230,10 +239,11 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
         let gpu_chunks = cuda_hal
             .tower
             .masked_mle_split_to_chunks(
+                &cuda_hal,
                 wit.as_ceno_gpu_ext(),
                 num_instances_with_rotation,
                 NUM_FANIN,
-                GL64Ext::ONE,
+                BB31Ext::ONE,
             )
             .map_err(|e| format!("GPU masked_mle_split_to_chunks failed for r_set: {:?}", e))?;
         r_set_gpu_chunks.push(gpu_chunks);
@@ -244,10 +254,11 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
         let gpu_chunks = cuda_hal
             .tower
             .masked_mle_split_to_chunks(
+                &cuda_hal,
                 wit.as_ceno_gpu_ext(),
                 num_instances_with_rotation,
                 NUM_FANIN,
-                GL64Ext::ONE,
+                BB31Ext::ONE,
             )
             .map_err(|e| format!("GPU masked_mle_split_to_chunks failed for w_set: {:?}", e))?;
         w_set_gpu_chunks.push(gpu_chunks);
@@ -261,10 +272,11 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
         let gpu_chunks = cuda_hal
             .tower
             .masked_mle_split_to_chunks(
+                &cuda_hal,
                 wit.as_ceno_gpu_ext(),
                 num_instances_with_rotation,
                 NUM_FANIN_LOGUP,
-                GL64Ext::ONE,
+                BB31Ext::ONE,
             )
             .map_err(|e| format!("GPU masked_mle_split_to_chunks failed for lk_n: {:?}", e))?;
         lk_numerator_gpu_chunks.push(gpu_chunks);
@@ -272,13 +284,14 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
 
     for wit in lk_d_wit.iter() {
-        // For GPU backend, E must be GoldilocksExt2. This is ensured by the caller.
+        // For GPU backend, E must be BB31Ext. This is ensured by the caller.
-        let chip_record_alpha_gl: GL64Ext = unsafe {
-            assert_eq!(std::mem::size_of::<E>(), std::mem::size_of::<GL64Ext>());
+        let chip_record_alpha_gl: BB31Ext = unsafe {
+            assert_eq!(std::mem::size_of::<E>(), std::mem::size_of::<BB31Ext>());
             std::mem::transmute_copy(&chip_record_alpha)
         };
         let gpu_chunks = cuda_hal
             .tower
             .masked_mle_split_to_chunks(
+                &cuda_hal,
                 wit.as_ceno_gpu_ext(),
                 num_instances_with_rotation,
                 NUM_FANIN_LOGUP,
@@ -349,10 +362,10 @@ fn build_tower_witness_gpu<'buf, E: ExtensionField>(
             .into_iter()
             .map(|lk_d_chunks| {
                 let nv = lk_d_chunks[0].num_vars();
-                let p1_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, GL64Ext::ONE)
+                let p1_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE)
                     .map_err(|e| format!("Failed to create p1 GPU polynomial with scalar: {:?}", e))
                     .unwrap();
-                let p2_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, GL64Ext::ONE)
+                let p2_gpu = GpuPolynomialExt::new_with_scalar(&cuda_hal.inner, nv, BB31Ext::ONE)
                     .map_err(|e| format!("Failed to create p2 GPU polynomial with scalar: {:?}", e))
                     .unwrap();
                 // Use [1, 1, q1, q2] format for the last layer
@@ -431,9 +444,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TowerProver<GpuBacke
         'a: 'b,
         'b: 'c,
     {
-        if std::any::TypeId::of::<E::BaseField>()
-            != std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
-        {
-            panic!("GPU backend only supports Goldilocks base field");
+        if std::any::TypeId::of::<E::BaseField>() != std::any::TypeId::of::<BB31Base>() {
+            panic!("GPU backend only supports BabyBear (BB31) base field");
         }
 
@@ -444,8 +455,8 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TowerProver<GpuBacke
         let r_set_len = cs.r_expressions.len() + cs.r_table_expressions.len();
 
         // GPU optimization: Use build_tower_witness_gpu which handles buffer allocation internally
-        let mut _prod_buffers: Vec<ceno_gpu::gl64::buffer::BufferImpl<GL64Ext>> = Vec::new();
-        let mut _logup_buffers: Vec<ceno_gpu::gl64::buffer::BufferImpl<GL64Ext>> = Vec::new();
+        let mut _prod_buffers: Vec<BufferImpl<BB31Ext>> = Vec::new();
+        let mut _logup_buffers: Vec<BufferImpl<BB31Ext>> = Vec::new();
 
         // Call build_tower_witness_gpu which will allocate buffers and build GPU specs
         let span = entered_span!("build_tower_witness", profiling_2 = true);
@@ -471,8 +482,8 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TowerProver<GpuBacke
         exit_span!(span);
 
         // transcript >>> BasicTranscript<E>
-        let basic_tr: &mut BasicTranscript<GoldilocksExt2> =
-            unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript<GoldilocksExt2>) };
+        let basic_tr: &mut BasicTranscript<BB31Ext> =
+            unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript<BB31Ext>) };
 
         let input = ceno_gpu::TowerInput {
             prod_specs: prod_gpu,
@@ -497,6 +508,8 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TowerProver<GpuBacke
 impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<GpuBackend<E, PCS>>
     for GpuProver<GpuBackend<E, PCS>>
 {
+    #[allow(clippy::type_complexity)]
+    #[tracing::instrument(skip_all, name = "table_witness", fields(profiling_2), level = "trace")]
     fn table_witness<'a>(
         &self,
         input: &ProofInput<'a, GpuBackend<E, PCS>>,
@@ -517,6 +530,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
                 .all(|(r, w)| r.table_spec.len == w.table_spec.len)
         );
 
+        let span = entered_span!("preprocess", profiling_2 = true);
         let layer_witin = input
             .witness
             .iter()
@@ -528,7 +542,6 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
         let num_vars = input.witness[0].num_vars();
 
         // main constraint: lookup denominator and numerator record witness inference
-        let record_span = entered_span!("record");
         let (num_non_zero_expr, term_coefficients, mle_indices_per_term, _) = cs
             .r_table_expressions
             .iter()
@@ -555,7 +568,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
                     &[],
                     challenges,
                 );
-                let coeffs_gl64: Vec<GL64Ext> = unsafe { std::mem::transmute(coeffs) };
+                let coeffs_gl64: Vec<BB31Ext> = unsafe { std::mem::transmute(coeffs) };
                 (coeffs_gl64, indices, size_info)
             })
             .fold(
@@ -569,9 +582,11 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
                     (num_non_zero_expr, coeff_acc, indices_acc, size_acc)
                 },
             );
+        exit_span!(span);
 
+        let span = entered_span!("witness_infer", profiling_2 = true);
         let cuda_hal = get_cuda_hal().unwrap();
-        let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<GL64Ext>> =
+        let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<BB31Ext>> =
             unsafe { std::mem::transmute(layer_witin) };
         let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec();
 
@@ -595,6 +610,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
                 &mut next_witness_buf,
             )
             .unwrap();
+        exit_span!(span);
 
         let next_mles = next_witness_buf
             .into_iter()
@@ -605,7 +621,6 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> MainSumcheckProver<G
             })
             .collect_vec();
 
-        exit_span!(record_span);
         next_mles
     }
 
@@ -744,9 +759,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> OpeningProver<GpuBac
         num_instances: &[(usize, usize)],
         transcript: &mut (impl Transcript<E> + 'static),
     ) -> PCS::Proof {
-        if std::any::TypeId::of::<E::BaseField>()
-            != std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
-        {
-            panic!("GPU backend only supports Goldilocks base field");
+        if std::any::TypeId::of::<E::BaseField>() != std::any::TypeId::of::<BB31Base>() {
+            panic!("GPU backend only supports BabyBear (BB31) base field");
         }
 
@@ -787,21 +800,24 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> OpeningProver<GpuBac
         // Type conversions using unsafe transmute
         let prover_param = &self.backend.pp;
         let pp_gl64: &mpcs::basefold::structure::BasefoldProverParams<
-            GL64Ext,
+            BB31Ext,
             mpcs::BasefoldRSParams,
         > = unsafe { std::mem::transmute(prover_param) };
         let rounds_gl64: Vec<_> = rounds
             .iter()
             .map(|(commitment, point_eval_pairs)| {
                 let commitment_gl64: &BasefoldCommitmentWithWitnessGpu<
-                    GL64Base,
-                    BufferImpl<GL64Base>,
+                    BB31Base,
+                    BufferImpl<BB31Base>,
+                    GpuDigestLayer,
+                    GpuMatrix<'static>,
+                    GpuPolynomial<'static>,
                 > = unsafe { std::mem::transmute(*commitment) };
                 let point_eval_pairs_gl64: Vec<_> = point_eval_pairs
                     .iter()
                     .map(|(point, evals)| {
-                        let point_gl64: &Vec<GL64Ext> = unsafe { std::mem::transmute(point) };
-                        let evals_gl64: &Vec<GL64Ext> = unsafe { std::mem::transmute(evals) };
+                        let point_gl64: &Vec<BB31Ext> = unsafe { std::mem::transmute(point) };
+                        let evals_gl64: &Vec<BB31Ext> = unsafe { std::mem::transmute(evals) };
                         (point_gl64.clone(), evals_gl64.clone())
                     })
                     .collect();
@@ -809,10 +825,10 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> OpeningProver<GpuBac
             })
             .collect();
 
-        let gpu_proof = if std::any::TypeId::of::<E>() == std::any::TypeId::of::<GoldilocksExt2>() {
+        let gpu_proof = if std::any::TypeId::of::<E>() == std::any::TypeId::of::<BB31Ext>() {
             let transcript_any = transcript as &mut dyn std::any::Any;
             let basic_transcript = transcript_any
-                .downcast_mut::<BasicTranscript<GoldilocksExt2>>()
+                .downcast_mut::<BasicTranscript<BB31Ext>>()
                 .expect("Type should match");
 
             let cuda_hal = get_cuda_hal().unwrap();
@@ -842,22 +858,27 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> DeviceTransporter<Gp
                 <GpuBackend<E, PCS> as ProverBackend>::Pcs,
             >,
         >,
-    ) -> DeviceProvingKey<GpuBackend<E, PCS>> {
+    ) -> DeviceProvingKey<'_, GpuBackend<E, PCS>> {
         let pcs_data_original = pk.fixed_commit_wd.clone().unwrap();
 
         // assert pcs match
-        let is_pcs_match =
-            std::mem::size_of::<mpcs::BasefoldCommitmentWithWitness<GoldilocksExt2>>()
-                == std::mem::size_of::<PCS::CommitmentWithWitness>();
+        let is_pcs_match = std::mem::size_of::<mpcs::BasefoldCommitmentWithWitness<BB31Ext>>()
+            == std::mem::size_of::<PCS::CommitmentWithWitness>();
         assert!(is_pcs_match, "pcs mismatch");
 
         // 1. transmute from PCS::CommitmentWithWitness to BasefoldCommitmentWithWitness<E>
-        let basefold_commitment: &mpcs::BasefoldCommitmentWithWitness<GoldilocksExt2> =
+        let basefold_commitment: &mpcs::BasefoldCommitmentWithWitness<BB31Ext> =
             unsafe { std::mem::transmute_copy(&pcs_data_original.as_ref()) };
-        // 2. convert from BasefoldCommitmentWithWitness<E> to BasefoldCommitmentWithWitness<GL64Base>
+        // 2. convert from BasefoldCommitmentWithWitness<E> to BasefoldCommitmentWithWitness<BB31Base>
         let cuda_hal = get_cuda_hal().unwrap();
-        let pcs_data_basefold =
-            convert_ceno_to_gpu_basefold_commitment(&cuda_hal, basefold_commitment);
+        let pcs_data_basefold = convert_ceno_to_gpu_basefold_commitment::<
+            CudaHalBB31,
+            BB31Ext,
+            BB31Base,
+            GpuDigestLayer,
+            GpuMatrix,
+            GpuPolynomial,
+        >(&cuda_hal, basefold_commitment);
         let pcs_data: <GpuBackend<E, PCS> as ProverBackend>::PcsData =
             unsafe { std::mem::transmute_copy(&pcs_data_basefold) };
         std::mem::forget(pcs_data_basefold);
@@ -878,7 +899,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> DeviceTransporter<Gp
 
     fn transport_mles<'a>(
         &self,
-        mles: Vec<MultilinearExtension<'a, E>>,
+        mles: &[MultilinearExtension<'a, E>],
     ) -> Vec<ArcMultilinearExtensionGpu<'a, E>> {
         let cuda_hal = get_cuda_hal().unwrap();
         mles.iter()
diff --git a/ceno_zkvm/src/scheme/hal.rs b/ceno_zkvm/src/scheme/hal.rs
index 85cb5ce45..17ad6b92a 100644
--- a/ceno_zkvm/src/scheme/hal.rs
+++ b/ceno_zkvm/src/scheme/hal.rs
@@ -167,7 +167,7 @@ pub trait DeviceTransporter<PB: ProverBackend> {
 
     fn transport_mles<'a>(
         &self,
-        mles: Vec<MultilinearExtension<'a, PB::E>>,
+        mles: &[MultilinearExtension<'a, PB::E>],
     ) -> Vec<Arc<PB::MultilinearPoly<'a>>>;
 }
 
diff --git a/ceno_zkvm/src/scheme/prover.rs b/ceno_zkvm/src/scheme/prover.rs
index 1a1c4f17e..e1094d77f 100644
--- a/ceno_zkvm/src/scheme/prover.rs
+++ b/ceno_zkvm/src/scheme/prover.rs
@@ -200,6 +200,10 @@ impl<
         ];
         tracing::debug!("global challenges in prover: {:?}", challenges);
 
+        let public_input_span = entered_span!("public_input", profiling_1 = true);
+        let public_input = self.device.transport_mles(&pi);
+        exit_span!(public_input_span);
+
         let main_proofs_span = entered_span!("main_proofs", profiling_1 = true);
         let (points, evaluations) = self.pk.circuit_pks.iter().enumerate().try_fold(
             (vec![], vec![]),
@@ -216,24 +220,29 @@ impl<
                     return Ok::<(Vec<_>, Vec<Vec<_>>), ZKVMError>((points, evaluations));
                 }
                 transcript.append_field_element(&E::BaseField::from_canonical_u64(index as u64));
+
                 // TODO: add an enum for circuit type either in constraint_system or vk
                 let witness_mle = witness_mles
                     .drain(..cs.num_witin())
                     .map(|mle| mle.into())
                     .collect_vec();
-                let structural_witness = self.device.transport_mles(
-                    structural_wits
-                        .remove(circuit_name)
-                        .map(|(sw, _)| sw)
-                        .unwrap_or(vec![]),
-                );
+
+                let structural_witness_span =
+                    entered_span!("structural_witness", profiling_2 = true);
+                let structural_mles = structural_wits
+                    .remove(circuit_name)
+                    .map(|(sw, _)| sw)
+                    .unwrap_or(vec![]);
+                let structural_witness = self.device.transport_mles(&structural_mles);
+                exit_span!(structural_witness_span);
+
                 let fixed = fixed_mles.drain(..cs.num_fixed()).collect_vec();
-                let public_input = self.device.transport_mles(pi.clone());
+
                 let mut input = ProofInput {
                     witness: witness_mle,
                     fixed,
                     structural_witness,
-                    public_input,
+                    public_input: public_input.clone(),
                     num_instances,
                 };
 
@@ -327,6 +336,8 @@ impl<
         let log2_num_instances = input.log2_num_instances();
         let num_var_with_rotation = log2_num_instances + cs.rotation_vars().unwrap_or(0);
 
+        // println!("create_chip_proof: {}", name);
+
         // build main witness
         let (records, is_padded) =
             build_main_witness::<E, PCS, PB, PD>(&self.device, cs, &input, challenges);
@@ -346,6 +357,7 @@ impl<
 
         // 1. prove the main constraints among witness polynomials
         // 2. prove the relation between last layer in the tower and read/write/logup records
+        let span = entered_span!("prove_main_constraints", profiling_2 = true);
         let (input_opening_point, evals, main_sumcheck_proofs, gkr_iop_proof) = self
             .device
             .prove_main_constraints(rt_tower, &input, cs, challenges, transcript)?;
@@ -353,6 +365,7 @@ impl<
             wits_in_evals,
             fixed_in_evals,
         } = evals;
+        exit_span!(span);
 
         // evaluate pi if there is instance query
         let mut pi_in_evals: HashMap<usize, E> = HashMap::new();
diff --git a/ceno_zkvm/src/scheme/utils.rs b/ceno_zkvm/src/scheme/utils.rs
index 194b77060..c8b67929e 100644
--- a/ceno_zkvm/src/scheme/utils.rs
+++ b/ceno_zkvm/src/scheme/utils.rs
@@ -16,7 +16,6 @@ use itertools::Itertools;
 use mpcs::PolynomialCommitmentScheme;
 pub use multilinear_extensions::wit_infer_by_expr;
 use multilinear_extensions::{
-    macros::{entered_span, exit_span},
     mle::{ArcMultilinearExtension, FieldType, IntoMLE, MultilinearExtension},
     util::ceil_log2,
 };
@@ -297,6 +296,12 @@ pub(crate) fn infer_tower_product_witness<E: ExtensionField>(
     wit_layers
 }
 
+#[tracing::instrument(
+    skip_all,
+    name = "build_main_witness",
+    fields(profiling_2),
+    level = "trace"
+)]
 pub fn build_main_witness<
     'a,
     E: ExtensionField,
@@ -439,7 +444,6 @@ pub fn gkr_witness<
     // generate all layer witness from input to output
     for (i, layer) in circuit.layers.iter().rev().enumerate() {
         tracing::debug!("generating input {i} layer with layer name {}", layer.name);
-        let span = entered_span!("per_layer_gen_witness", profiling_2 = true);
         // process in_evals to prepare layer witness
         // This should assume the input of the first layer is the phase1 witness of the circuit.
         let current_layer_wits = layer
@@ -486,7 +490,6 @@ pub fn gkr_witness<
                 }
                 other => unimplemented!("{:?}", other),
             });
-        exit_span!(span);
     }
     layer_wits.reverse();
 
diff --git a/gkr_iop/src/cpu/mod.rs b/gkr_iop/src/cpu/mod.rs
index 85e058b2b..9b2864150 100644
--- a/gkr_iop/src/cpu/mod.rs
+++ b/gkr_iop/src/cpu/mod.rs
@@ -7,6 +7,7 @@ use ff_ext::ExtensionField;
 use itertools::izip;
 use mpcs::{PolynomialCommitmentScheme, SecurityLevel, SecurityLevel::Conjecture100bits};
 use multilinear_extensions::{
+    macros::{entered_span, exit_span},
     mle::{ArcMultilinearExtension, MultilinearExtension, Point},
     wit_infer_by_monomial_expr,
 };
@@ -111,12 +112,13 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>
         pub_io_evals: &[Arc<<CpuBackend<E, PCS> as ProverBackend>::MultilinearPoly<'a>>],
         challenges: &[E],
     ) -> Vec<Arc<<CpuBackend<E, PCS> as ProverBackend>::MultilinearPoly<'a>>> {
+        let span = entered_span!("witness_infer", profiling_2 = true);
         let out_evals: Vec<_> = layer
             .out_sel_and_eval_exprs
             .iter()
             .flat_map(|(sel_type, out_eval)| izip!(iter::repeat(sel_type), out_eval.iter()))
             .collect();
-        layer
+        let res = layer
             .exprs_with_selector_out_eval_monomial_form
             .par_iter()
             .zip_eq(layer.expr_names.par_iter())
@@ -141,10 +143,13 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>
                     EvalExpression::Partition(_, _) => unimplemented!(),
                 }
             })
-            .collect::<Vec<_>>()
+            .collect::<Vec<_>>();
+        exit_span!(span);
+        res
     }
 }
 
+#[tracing::instrument(skip_all, name = "layer_witness", fields(profiling_2), level = "trace")]
 pub fn layer_witness<'a, E>(
     layer: &Layer<E>,
     layer_wits: &[ArcMultilinearExtension<'a, E>],
@@ -154,12 +159,13 @@ pub fn layer_witness<'a, E>(
 where
     E: ExtensionField,
 {
+    let span = entered_span!("witness_infer", profiling_2 = true);
     let out_evals: Vec<_> = layer
         .out_sel_and_eval_exprs
         .iter()
         .flat_map(|(sel_type, out_eval)| izip!(iter::repeat(sel_type), out_eval.iter()))
         .collect();
-    layer
+    let res = layer
         .exprs_with_selector_out_eval_monomial_form
         .par_iter()
         .zip_eq(layer.expr_names.par_iter())
@@ -184,5 +190,7 @@ where
                 EvalExpression::Partition(_, _) => unimplemented!(),
             }
         })
-        .collect::<Vec<_>>()
+        .collect::<Vec<_>>();
+    exit_span!(span);
+    res
 }
diff --git a/gkr_iop/src/gkr/layer/gpu/mod.rs b/gkr_iop/src/gkr/layer/gpu/mod.rs
index 022d0e98f..d9380c511 100644
--- a/gkr_iop/src/gkr/layer/gpu/mod.rs
+++ b/gkr_iop/src/gkr/layer/gpu/mod.rs
@@ -54,15 +54,18 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> LinearLayerProver<Gp
         out_point: &multilinear_extensions::mle::Point<E>,
         transcript: &mut impl transcript::Transcript<E>,
     ) -> crate::gkr::layer::sumcheck_layer::LayerProof<E> {
+        let span = entered_span!("LinearLayerProver", profiling_2 = true);
         let cpu_wits: Vec<Arc<MultilinearExtension<'_, E>>> = wit
             .0
             .into_iter()
             .map(|gpu_mle| Arc::new(gpu_mle.inner_to_mle()))
             .collect();
         let cpu_wit = LayerWitness::<CpuBackend<E, PCS>>(cpu_wits);
-        <CpuProver<CpuBackend<E, PCS>> as LinearLayerProver<CpuBackend<E, PCS>>>::prove(
+        let res = <CpuProver<CpuBackend<E, PCS>> as LinearLayerProver<CpuBackend<E, PCS>>>::prove(
             layer, cpu_wit, out_point, transcript,
-        )
+        );
+        exit_span!(span);
+        res
     }
 }
 
@@ -77,20 +80,23 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> SumcheckLayerProver<
         challenges: &[<GpuBackend<E, PCS> as ProverBackend>::E],
         transcript: &mut impl Transcript<<GpuBackend<E, PCS> as ProverBackend>::E>,
     ) -> LayerProof<<GpuBackend<E, PCS> as ProverBackend>::E> {
+        let span = entered_span!("SumcheckLayerProver", profiling_2 = true);
         let cpu_wits: Vec<Arc<MultilinearExtension<'_, E>>> = wit
             .0
             .into_iter()
             .map(|gpu_mle| Arc::new(gpu_mle.inner_to_mle()))
             .collect();
         let cpu_wit = LayerWitness::<CpuBackend<E, PCS>>(cpu_wits);
-        <CpuProver<CpuBackend<E, PCS>> as SumcheckLayerProver<CpuBackend<E, PCS>>>::prove(
+        let res = <CpuProver<CpuBackend<E, PCS>> as SumcheckLayerProver<CpuBackend<E, PCS>>>::prove(
             layer,
             num_threads,
             max_num_variables,
             cpu_wit,
             challenges,
             transcript,
-        )
+        );
+        exit_span!(span);
+        res
     }
 }
 
@@ -111,6 +117,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZerocheckLayerProver
         LayerProof<<GpuBackend<E, PCS> as ProverBackend>::E>,
         Point<<GpuBackend<E, PCS> as ProverBackend>::E>,
     ) {
+        let span = entered_span!("ZerocheckLayerProver", profiling_2 = true);
         let num_threads = 1; // VP builder for GPU: do not use _num_threads
 
         assert_eq!(challenges.len(), 2);
@@ -163,7 +170,6 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZerocheckLayerProver
         )
         .collect_vec();
 
-        let span = entered_span!("IOPProverState::prove", profiling_4 = true);
         let cuda_hal = get_cuda_hal().unwrap();
         let eqs_gpu = layer
             .out_sel_and_eval_exprs
@@ -222,11 +228,11 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZerocheckLayerProver
             .unwrap_or(0);
 
         // Convert types for GPU function Call
-        let basic_tr: &mut BasicTranscript<GL64Ext> =
-            unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript<GL64Ext>) };
-        let term_coefficients_gl64: Vec<GL64Ext> =
+        let basic_tr: &mut BasicTranscript<BB31Ext> =
+            unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript<BB31Ext>) };
+        let term_coefficients_gl64: Vec<BB31Ext> =
             unsafe { std::mem::transmute(term_coefficients) };
-        let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<GL64Ext>> =
+        let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<BB31Ext>> =
             unsafe { std::mem::transmute(all_witins_gpu) };
         let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec();
         let (proof_gpu, evals_gpu, challenges_gpu) = cuda_hal
@@ -247,13 +253,12 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ZerocheckLayerProver
 
         // convert back to E: ExtensionField
         let proof_gpu_e =
-            unsafe { std::mem::transmute::<IOPProof<GL64Ext>, IOPProof<E>>(proof_gpu) };
-        let evals_gpu_e = unsafe { std::mem::transmute::<Vec<GL64Ext>, Vec<E>>(evals_gpu) };
+            unsafe { std::mem::transmute::<IOPProof<BB31Ext>, IOPProof<E>>(proof_gpu) };
+        let evals_gpu_e = unsafe { std::mem::transmute::<Vec<BB31Ext>, Vec<E>>(evals_gpu) };
         let row_challenges_e =
-            unsafe { std::mem::transmute::<Vec<GL64Ext>, Vec<E>>(row_challenges) };
+            unsafe { std::mem::transmute::<Vec<BB31Ext>, Vec<E>>(row_challenges) };
 
         exit_span!(span);
-
         (
             LayerProof {
                 main: SumcheckLayerProof {
@@ -292,7 +297,7 @@ pub(crate) fn prove_rotation_gpu<E: ExtensionField, PCS: PolynomialCommitmentSch
 
     // rotated_mles is non-deterministic input, rotated from existing witness polynomial
     // we will reduce it to zero check, and finally reduce to committed polynomial opening
-    let span = entered_span!("rotate_witin_selector", profiling_4 = true);
+    let span = entered_span!("rotate_witin_selector", profiling_3 = true);
     let rotated_mles_gpu = build_rotation_mles_gpu(
         &cuda_hal,
         raw_rotation_exprs,
@@ -315,7 +320,7 @@ pub(crate) fn prove_rotation_gpu<E: ExtensionField, PCS: PolynomialCommitmentSch
     .collect_vec();
     exit_span!(span);
 
-    let span = entered_span!("rotation IOPProverState::prove", profiling_4 = true);
+    let span = entered_span!("rotation IOPProverState::prove", profiling_3 = true);
     // gpu mles
     let mle_gpu_ref: Vec<&MultilinearExtensionGpu<E>> = rotated_mles_gpu
         .iter()
@@ -344,10 +349,10 @@ pub(crate) fn prove_rotation_gpu<E: ExtensionField, PCS: PolynomialCommitmentSch
         .unwrap_or(0);
 
     // Convert types for GPU function call
-    let basic_tr: &mut BasicTranscript<GL64Ext> =
-        unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript<GL64Ext>) };
-    let term_coefficients_gl64: Vec<GL64Ext> = unsafe { std::mem::transmute(term_coefficients) };
-    let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<GL64Ext>> =
+    let basic_tr: &mut BasicTranscript<BB31Ext> =
+        unsafe { &mut *(transcript as *mut _ as *mut BasicTranscript<BB31Ext>) };
+    let term_coefficients_gl64: Vec<BB31Ext> = unsafe { std::mem::transmute(term_coefficients) };
+    let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<BB31Ext>> =
         unsafe { std::mem::transmute(mle_gpu_ref) };
     let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec();
     // gpu prover
@@ -367,14 +372,14 @@ pub(crate) fn prove_rotation_gpu<E: ExtensionField, PCS: PolynomialCommitmentSch
     let evals_gpu = evals_gpu.into_iter().flatten().collect_vec();
     let row_challenges = challenges_gpu.iter().map(|c| c.elements).collect_vec();
 
-    let proof_gpu_e = unsafe { std::mem::transmute::<IOPProof<GL64Ext>, IOPProof<E>>(proof_gpu) };
-    let mut evals_gpu_e = unsafe { std::mem::transmute::<Vec<GL64Ext>, Vec<E>>(evals_gpu) };
-    let row_challenges_e = unsafe { std::mem::transmute::<Vec<GL64Ext>, Vec<E>>(row_challenges) };
+    let proof_gpu_e = unsafe { std::mem::transmute::<IOPProof<BB31Ext>, IOPProof<E>>(proof_gpu) };
+    let mut evals_gpu_e = unsafe { std::mem::transmute::<Vec<BB31Ext>, Vec<E>>(evals_gpu) };
+    let row_challenges_e = unsafe { std::mem::transmute::<Vec<BB31Ext>, Vec<E>>(row_challenges) };
     // skip selector/eq as verifier can derive itself
     evals_gpu_e.truncate(raw_rotation_exprs.len() * 2);
     exit_span!(span);
 
-    let span = entered_span!("rotation derived left/right eval", profiling_4 = true);
+    let span = entered_span!("rotation derived left/right eval", profiling_3 = true);
     let bh = BooleanHypercube::new(rotation_cyclic_group_log2);
     let (left_point, right_point) = bh.get_rotation_points(&row_challenges_e);
     let evals = evals_gpu_e
diff --git a/gkr_iop/src/gkr/layer/gpu/utils.rs b/gkr_iop/src/gkr/layer/gpu/utils.rs
index 156b76e0f..726b1112c 100644
--- a/gkr_iop/src/gkr/layer/gpu/utils.rs
+++ b/gkr_iop/src/gkr/layer/gpu/utils.rs
@@ -60,15 +60,13 @@ pub fn extract_mle_relationships_from_monomial_terms<'a, E: ExtensionField>(
     (term_coefficients, mle_indices_per_term, mle_size_info)
 }
 
-pub fn build_eq_x_r_with_sel_gpu<'a, E: ExtensionField>(
-    hal: &'a CudaHalGL64,
+pub fn build_eq_x_r_with_sel_gpu<E: ExtensionField>(
+    hal: &CudaHalBB31,
     point: &Point<E>,
     num_instances: usize,
     selector: &SelectorType<E>,
-) -> MultilinearExtensionGpu<'a, E> {
-    if std::any::TypeId::of::<E::BaseField>()
-        != std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
-    {
-        panic!("GPU backend only supports Goldilocks base field");
+) -> MultilinearExtensionGpu<'static, E> {
+    if std::any::TypeId::of::<E::BaseField>() != std::any::TypeId::of::<BB31Base>() {
+        panic!("GPU backend only supports BabyBear base field");
     }
 
@@ -89,58 +87,75 @@ pub fn build_eq_x_r_with_sel_gpu<'a, E: ExtensionField>(
             GpuFieldType::Unreachable => panic!("Unreachable GpuFieldType"),
         };
         let indices_u32 = indices.iter().map(|x| *x as u32).collect_vec();
-        ordered_sparse32_selector_gpu(&hal.inner, &mut eq_buf.buf, &indices_u32, num_instances)
-            .unwrap();
+        ordered_sparse32_selector_gpu::<CudaHalBB31, BB31Ext, BB31Base>(
+            &hal.inner,
+            &mut eq_buf.buf,
+            &indices_u32,
+            num_instances,
+        )
+        .unwrap();
         eq_buf
     } else {
-        let point_gl64: &Point<GL64Ext> = unsafe { std::mem::transmute(point) };
+        let point_gl64: &Point<BB31Ext> = unsafe { std::mem::transmute(point) };
         let mut gpu_output = hal.alloc_ext_elems_on_device(eq_len).unwrap();
         let gpu_points = hal.alloc_ext_elems_from_host(point_gl64).unwrap();
-        build_mle_as_ceno(&hal.inner, &gpu_points, &mut gpu_output, num_instances).unwrap();
+        build_mle_as_ceno::<CudaHalBB31, BB31Ext, BB31Base>(
+            &hal.inner,
+            &gpu_points,
+            &mut gpu_output,
+            num_instances,
+        )
+        .unwrap();
         GpuPolynomialExt::new(gpu_output, point.len())
     };
     let mle_gl64 = MultilinearExtensionGpu::from_ceno_gpu_ext(eq_mle);
     unsafe {
-        std::mem::transmute::<MultilinearExtensionGpu<'a, GL64Ext>, MultilinearExtensionGpu<'a, E>>(
-            mle_gl64,
-        )
+        std::mem::transmute::<
+            MultilinearExtensionGpu<'static, BB31Ext>,
+            MultilinearExtensionGpu<'static, E>,
+        >(mle_gl64)
     }
 }
 
-pub fn build_eq_x_r_gpu<'a, E: ExtensionField>(
-    hal: &'a CudaHalGL64,
+pub fn build_eq_x_r_gpu<E: ExtensionField>(
+    hal: &CudaHalBB31,
     point: &Point<E>,
-) -> MultilinearExtensionGpu<'a, E> {
-    if std::any::TypeId::of::<E::BaseField>()
-        != std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
-    {
-        panic!("GPU backend only supports Goldilocks base field");
+) -> MultilinearExtensionGpu<'static, E> {
+    if std::any::TypeId::of::<E::BaseField>() != std::any::TypeId::of::<BB31Base>() {
+        panic!("GPU backend only supports BabyBear base field");
     }
 
     let eq_len = 1 << point.len();
     // type eq
-    let point_gl64: &Point<GL64Ext> = unsafe { std::mem::transmute(point) };
+    let point_gl64: &Point<BB31Ext> = unsafe { std::mem::transmute(point) };
     let eq_mle = {
         let mut gpu_output = hal.alloc_ext_elems_on_device(eq_len).unwrap();
         let gpu_points = hal.alloc_ext_elems_from_host(point_gl64).unwrap();
-        build_mle_as_ceno(&hal.inner, &gpu_points, &mut gpu_output, eq_len).unwrap();
+        build_mle_as_ceno::<CudaHalBB31, BB31Ext, BB31Base>(
+            &hal.inner,
+            &gpu_points,
+            &mut gpu_output,
+            eq_len,
+        )
+        .unwrap();
         GpuPolynomialExt::new(gpu_output, point.len())
     };
     let mle_gl64 = MultilinearExtensionGpu::from_ceno_gpu_ext(eq_mle);
     unsafe {
-        std::mem::transmute::<MultilinearExtensionGpu<'a, GL64Ext>, MultilinearExtensionGpu<'a, E>>(
-            mle_gl64,
-        )
+        std::mem::transmute::<
+            MultilinearExtensionGpu<'static, BB31Ext>,
+            MultilinearExtensionGpu<'static, E>,
+        >(mle_gl64)
     }
 }
 
-pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>(
-    cuda_hal: &'a CudaHalGL64,
+pub fn build_rotation_mles_gpu<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>(
+    cuda_hal: &CudaHalBB31,
     raw_rotation_exprs: &[(Expression<E>, Expression<E>)],
     wit: &LayerWitness<GpuBackend<E, PCS>>,
     bh: &BooleanHypercube,
     rotation_cyclic_group_log2: usize,
-) -> Vec<MultilinearExtensionGpu<'a, E>> {
+) -> Vec<MultilinearExtensionGpu<'static, E>> {
     raw_rotation_exprs
         .iter()
         .map(|rotation_expr| match rotation_expr {
@@ -158,10 +173,16 @@ pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentS
                     _ => panic!("unimplemented input mle"),
                 };
                 let mut output_buf = cuda_hal.alloc_elems_on_device(input_buf.len()).unwrap();
-                rotation_next_base_mle_gpu(
+
+                // SAFETY: GPU buffers actually have a 'static lifetime. We only read from input_buf
+                // during the GPU kernel execution, which completes synchronously before returning.
+                let input_buf_static: &BufferImpl<'static, BB31Base> =
+                    unsafe { std::mem::transmute(input_buf) };
+
+                rotation_next_base_mle_gpu::<CudaHalBB31, BB31Ext, BB31Base>(
                     &cuda_hal.inner,
                     &mut output_buf,
-                    input_buf,
+                    input_buf_static,
                     &rotation_index,
                     cyclic_group_size,
                 )
@@ -172,8 +193,8 @@ pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentS
                 ));
                 unsafe {
                     std::mem::transmute::<
-                        MultilinearExtensionGpu<GL64Ext>,
-                        MultilinearExtensionGpu<'_, E>,
+                        MultilinearExtensionGpu<'static, BB31Ext>,
+                        MultilinearExtensionGpu<'static, E>,
                     >(output_mle)
                 }
             }
@@ -182,32 +203,34 @@ pub fn build_rotation_mles_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentS
         .collect::<Vec<_>>()
 }
 
-pub fn build_rotation_selector_gpu<'a, E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>(
-    cuda_hal: &'a CudaHalGL64,
+pub fn build_rotation_selector_gpu<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>(
+    cuda_hal: &CudaHalBB31,
     wit: &LayerWitness<GpuBackend<E, PCS>>,
     rt: &Point<E>,
     bh: &BooleanHypercube,
     rotation_cyclic_subgroup_size: usize,
     rotation_cyclic_group_log2: usize,
-) -> MultilinearExtensionGpu<'a, E> {
+) -> MultilinearExtensionGpu<'static, E> {
     let total_len = wit[0].evaluations_len(); // Take first mle just to retrieve total length
     assert!(total_len.is_power_of_two());
     let mut output_buf = cuda_hal.alloc_ext_elems_on_device(total_len).unwrap();
+
     let eq = build_eq_x_r_gpu(cuda_hal, rt);
-    let eq_buf = match &eq.mle {
+    let eq_buf_owned = match eq.mle {
         GpuFieldType::Base(_) => panic!("should be ext field"),
-        GpuFieldType::Ext(mle) => mle.evaluations(),
+        GpuFieldType::Ext(mle) => mle.buf,
         GpuFieldType::Unreachable => panic!("Unreachable GpuFieldType"),
     };
+
     let rotation_index = bh
         .into_iter()
         .take(rotation_cyclic_subgroup_size)
         .map(|x| x as u32)
         .collect_vec();
-    rotation_selector_gpu(
+    rotation_selector_gpu::<CudaHalBB31, BB31Ext, BB31Base>(
         &cuda_hal.inner,
         &mut output_buf,
-        eq_buf,
+        &eq_buf_owned,
         &rotation_index,
         1 << rotation_cyclic_group_log2,
         rotation_cyclic_subgroup_size,
@@ -218,8 +241,9 @@ pub fn build_rotation_selector_gpu<'a, E: ExtensionField, PCS: PolynomialCommitm
         total_len.ilog2() as usize,
     ));
     unsafe {
-        std::mem::transmute::<MultilinearExtensionGpu<GL64Ext>, MultilinearExtensionGpu<'_, E>>(
-            output_mle,
-        )
+        std::mem::transmute::<
+            MultilinearExtensionGpu<'static, BB31Ext>,
+            MultilinearExtensionGpu<'static, E>,
+        >(output_mle)
     }
 }
diff --git a/gkr_iop/src/gpu/mod.rs b/gkr_iop/src/gpu/mod.rs
index 20de4e819..14a371a7b 100644
--- a/gkr_iop/src/gpu/mod.rs
+++ b/gkr_iop/src/gpu/mod.rs
@@ -4,7 +4,10 @@ use crate::{
 };
 use ff_ext::ExtensionField;
 use mpcs::{PolynomialCommitmentScheme, SecurityLevel};
-use multilinear_extensions::mle::{FieldType, MultilinearExtension, Point};
+use multilinear_extensions::{
+    macros::{entered_span, exit_span},
+    mle::{FieldType, MultilinearExtension, Point},
+};
 use p3::field::TwoAdicField;
 use std::{rc::Rc, sync::Arc};
 use witness::RowMajorMatrix;
@@ -17,37 +20,43 @@ use std::marker::PhantomData;
 pub mod gpu_prover {
     pub use ceno_gpu::{
         BasefoldCommitmentWithWitness as BasefoldCommitmentWithWitnessGpu, Buffer, CudaHal,
-        gl64::{
-            CudaHalGL64, GpuFieldType, GpuPolynomial, GpuPolynomialExt, buffer::BufferImpl,
-            build_mle_as_ceno, convert_ceno_to_gpu_basefold_commitment,
-            ordered_sparse32_selector_gpu, rotation_next_base_mle_gpu, rotation_selector_gpu,
+        bb31::{
+            CudaHalBB31, GpuDigestLayer, GpuFieldType, GpuMatrix, GpuPolynomial, GpuPolynomialExt,
+        },
+        common::{
+            basefold::utils::convert_ceno_to_gpu_basefold_commitment,
+            buffer::BufferImpl,
+            mle::{
+                build_mle_as_ceno, ordered_sparse32_selector_gpu, rotation_next_base_mle_gpu,
+                rotation_selector_gpu,
+            },
         },
     };
     use cudarc::driver::{CudaDevice, DriverError};
     use once_cell::sync::Lazy;
     use std::sync::{Arc, Mutex, MutexGuard};
 
-    pub type GL64Base = p3::goldilocks::Goldilocks;
-    pub type GL64Ext = ff_ext::GoldilocksExt2;
+    pub type BB31Base = p3::babybear::BabyBear;
+    pub type BB31Ext = ff_ext::BabyBearExt4;
 
     pub static CUDA_DEVICE: Lazy<Result<Arc<CudaDevice>, DriverError>> =
         Lazy::new(|| CudaDevice::new(0));
 
     #[allow(clippy::type_complexity)]
     pub static CUDA_HAL: Lazy<
-        Result<Arc<Mutex<CudaHalGL64>>, Box<dyn std::error::Error + Send + Sync>>,
+        Result<Arc<Mutex<CudaHalBB31>>, Box<dyn std::error::Error + Send + Sync>>,
     > = Lazy::new(|| {
         let device = CUDA_DEVICE
             .as_ref()
             .map_err(|e| format!("Device init failed: {:?}", e))?;
         device.bind_to_thread()?;
 
-        CudaHalGL64::new()
+        CudaHalBB31::new()
             .map(|hal| Arc::new(Mutex::new(hal)))
             .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
     });
 
-    pub fn get_cuda_hal() -> Result<MutexGuard<'static, CudaHalGL64>, String> {
+    pub fn get_cuda_hal() -> Result<MutexGuard<'static, CudaHalBB31>, String> {
         let device = CUDA_DEVICE
             .as_ref()
             .map_err(|e| format!("Device not available: {:?}", e))?;
@@ -146,7 +155,7 @@ impl<'a, E: ExtensionField> MultilinearPolynomial<E> for MultilinearExtensionGpu
 
 impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
     /// Get reference to internal GPU polynomial
-    pub fn inner(&self) -> &GpuFieldType {
+    pub fn inner(&self) -> &GpuFieldType<'_> {
         &self.mle
     }
 
@@ -180,12 +189,12 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
     }
 
     /// Create GPU version from CPU version of MultilinearExtension
-    pub fn from_ceno(cuda_hal: &CudaHalGL64, mle: &MultilinearExtension<'a, E>) -> Self {
+    pub fn from_ceno(cuda_hal: &CudaHalBB31, mle: &MultilinearExtension<'a, E>) -> Self {
         // check type of mle
         match mle.evaluations {
             FieldType::Base(_) => {
                 let mle_vec_ref = mle.get_base_field_vec();
-                let mle_vec_ref_gl64: &[GL64Base] = unsafe { std::mem::transmute(mle_vec_ref) };
+                let mle_vec_ref_gl64: &[BB31Base] = unsafe { std::mem::transmute(mle_vec_ref) };
                 let mle_gpu =
                     GpuPolynomial::from_ceno_vec(cuda_hal, mle_vec_ref_gl64, mle.num_vars())
                         .unwrap();
@@ -196,7 +205,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
             }
             FieldType::Ext(_) => {
                 let mle_vec_ref = mle.get_ext_field_vec();
-                let mle_vec_ref_gl64_ext: &[GL64Ext] = unsafe { std::mem::transmute(mle_vec_ref) };
+                let mle_vec_ref_gl64_ext: &[BB31Ext] = unsafe { std::mem::transmute(mle_vec_ref) };
                 let mle_gpu =
                     GpuPolynomialExt::from_ceno_vec(cuda_hal, mle_vec_ref_gl64_ext, mle.num_vars())
                         .unwrap();
@@ -231,7 +240,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
     }
 
     /// get inner poly reference with base field claim
-    pub fn as_ceno_gpu_base(&self) -> &GpuPolynomial {
+    pub fn as_ceno_gpu_base(&self) -> &GpuPolynomial<'_> {
         match &self.mle {
             GpuFieldType::Base(poly) => poly,
             GpuFieldType::Ext(_) => panic!("poly in ext field"),
@@ -240,7 +249,7 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
     }
 
     /// get inner poly reference with ext field claim
-    pub fn as_ceno_gpu_ext(&self) -> &GpuPolynomialExt {
+    pub fn as_ceno_gpu_ext(&self) -> &GpuPolynomialExt<'_> {
         match &self.mle {
             GpuFieldType::Base(_) => panic!("poly in base field"),
             GpuFieldType::Ext(poly) => poly,
@@ -286,8 +295,13 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ProverBackend for Gp
     type MultilinearPoly<'a> = MultilinearExtensionGpu<'a, E>;
     type Matrix = RowMajorMatrix<E::BaseField>;
     #[cfg(feature = "gpu")]
-    type PcsData =
-        BasefoldCommitmentWithWitnessGpu<E::BaseField, BufferImpl<'static, E::BaseField>>;
+    type PcsData = BasefoldCommitmentWithWitnessGpu<
+        E::BaseField,
+        BufferImpl<'static, E::BaseField>,
+        GpuDigestLayer,
+        GpuMatrix<'static>,
+        GpuPolynomial<'static>,
+    >;
     #[cfg(not(feature = "gpu"))]
     type PcsData = <PCS as PolynomialCommitmentScheme<E>>::CommitmentWithWitness;
 
@@ -320,15 +334,15 @@ where
 impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>
     ProtocolWitnessGeneratorProver<GpuBackend<E, PCS>> for GpuProver<GpuBackend<E, PCS>>
 {
+    #[tracing::instrument(skip_all, name = "layer_witness", fields(profiling_2), level = "trace")]
     fn layer_witness<'a>(
         layer: &Layer<E>,
         layer_wits: &[Arc<<GpuBackend<E, PCS> as ProverBackend>::MultilinearPoly<'a>>],
         pub_io_evals: &[Arc<<GpuBackend<E, PCS> as ProverBackend>::MultilinearPoly<'a>>],
         challenges: &[E],
     ) -> Vec<Arc<<GpuBackend<E, PCS> as ProverBackend>::MultilinearPoly<'a>>> {
-        if std::any::TypeId::of::<E::BaseField>()
-            != std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
-        {
-            panic!("GPU backend only supports Goldilocks base field");
+        let span = entered_span!("preprocess", profiling_2 = true);
+        if std::any::TypeId::of::<E::BaseField>() != std::any::TypeId::of::<BB31Base>() {
+            panic!("GPU backend only supports BabyBear base field");
         }
 
@@ -369,7 +383,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>
                     &pub_io_evals,
                     challenges,
                 );
-                let coeffs_gl64: Vec<GL64Ext> = unsafe { std::mem::transmute(coeffs) };
+                let coeffs_gl64: Vec<BB31Ext> = unsafe { std::mem::transmute(coeffs) };
                 (coeffs_gl64, indices, size_info)
             })
             .fold(
@@ -390,10 +404,12 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>
             .as_ref()
             .unwrap()
             .0;
+        exit_span!(span);
 
+        let span = entered_span!("witness_infer", profiling_2 = true);
         // process & transmute poly
         let all_witins_gpu = layer_wits.iter().map(|mle| mle.as_ref()).collect_vec();
-        let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<GL64Ext>> =
+        let all_witins_gpu_gl64: Vec<&MultilinearExtensionGpu<BB31Ext>> =
             unsafe { std::mem::transmute(all_witins_gpu) };
         let all_witins_gpu_type_gl64 = all_witins_gpu_gl64.iter().map(|mle| &mle.mle).collect_vec();
 
@@ -411,13 +427,14 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>>
         cuda_hal
             .witness_infer
             .wit_infer_by_monomial_expr(
-                &cuda_hal,
+                &*cuda_hal,
                 all_witins_gpu_type_gl64,
                 &term_coefficients,
                 &mle_indices_per_term,
                 &mut next_witness_buf,
             )
             .unwrap();
+        exit_span!(span);
 
         // recover it back and interleaving with default gpu
         let mut next_iter = next_witness_buf.into_iter();
diff --git a/utils/cuda_hal/src/lib.rs b/utils/cuda_hal/src/lib.rs
index 1ecfe9a9b..619c6fc80 100644
--- a/utils/cuda_hal/src/lib.rs
+++ b/utils/cuda_hal/src/lib.rs
@@ -12,16 +12,16 @@ compile_error!(
 );
 
 // Minimal stub exports to satisfy basic compilation when gpu feature is disabled
-pub mod gl64 {
-    pub struct CudaHalGL64;
+pub mod bb31 {
+    pub struct CudaHalBB31;
 
-    impl CudaHalGL64 {
+    impl CudaHalBB31 {
         pub fn new() -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
             Err("GPU placeholder: real implementation required".into())
         }
     }
 
-    pub fn convert_ceno_to_gpu_basefold_commitment<T>(_hal: &CudaHalGL64, _commitment: &T) -> T {
+    pub fn convert_ceno_to_gpu_basefold_commitment<T>(_hal: &CudaHalBB31, _commitment: &T) -> T {
         panic!("GPU placeholder: real implementation required")
     }