diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9fdd3d23d..1429113bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,6 +83,8 @@ if(FINUFFT_USE_CPU)
     add_subdirectory(src)
 endif()
 
+add_subdirectory(src/common)
+
 if(FINUFFT_USE_CUDA)
     include(cuda_setup)
     add_subdirectory(src/cuda)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 3eb4806a6..267d33e84 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -103,7 +103,7 @@ function(enable_asan target)
 endfunction()
 
 function(finufft_link_test target)
-    target_link_libraries(${target} PRIVATE finufft::finufft)
+    target_link_libraries(${target} PRIVATE finufft::finufft finufft_common)
     if(FINUFFT_USE_DUCC0)
         target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
     endif()
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
index ad70a2491..de214a1aa 100644
--- a/examples/cuda/CMakeLists.txt
+++ b/examples/cuda/CMakeLists.txt
@@ -5,6 +5,6 @@ foreach(srcfile ${example_src})
     get_filename_component(executable ${executable} NAME)
     add_executable(${executable} ${srcfile})
     target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-    target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart)
+    target_link_libraries(${executable} PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
     target_compile_features(${executable} PRIVATE cxx_std_17)
 endforeach()
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index 1d1c4a6d3..93646e07c 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -89,7 +89,7 @@ class WithCudaDevice {
 };
 
 // math helpers whose source is in src/utils.cpp
-long next235beven(long n, long b);
+FINUFFT_EXPORT long next235beven(long n, long b);
 
 /**
  * does a complex atomic add on a shared memory address
diff --git a/include/finufft.fh b/include/finufft.fh
index c19874bbd..7658ea2bc 100644
--- a/include/finufft.fh
+++ b/include/finufft.fh
@@ -20,6 +20,7 @@ c     DEPRECATED: spread_kerpad kept for ABI compatibility, ignored by library
       real*8 upsampfac
       integer spread_thread, maxbatchsize, spread_nthr_atomic
       integer spread_max_sp_size
+      integer spread_function
       integer fftw_lock_fun, fftw_unlock_fun, fftw_lock_data
 
       end type
diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 37e7ada5f..1b5f80b5e 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -21,9 +21,9 @@ namespace finufft {
 namespace spreadinterp {
 
 template<typename T>
-FINUFFT_EXPORT_TEST int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac,
-                                       int kerevalmeth, int debug, int showwarn,
-                                       int spreadinterponly, int dim);
+FINUFFT_EXPORT_TEST int setup_spreader(
+    finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, int debug,
+    int showwarn, int spreadinterponly, int dim, int kernel_type = 0);
 
 int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, const finufft_spread_opts &opts);
 template<typename T>
diff --git a/include/finufft_common/kernel.h b/include/finufft_common/kernel.h
index 67102f48a..f94652219 100644
--- a/include/finufft_common/kernel.h
+++ b/include/finufft_common/kernel.h
@@ -6,6 +6,8 @@
 #include <vector>
 
 #include <finufft_common/constants.h>
+#include <finufft_common/utils.h>
+#include <finufft_spread_opts.h>
 
 namespace finufft::kernel {
 
@@ -56,15 +58,34 @@ template<class T, class F> std::vector<T> fit_monomials(F &&f, int n, T a, T b)
   return c;
 }
 
-template<typename T> T evaluate_kernel(T x, T beta, T c) {
-  /* ES ("exp sqrt") kernel evaluation at single real argument:
-      phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)),    for |x| < nspread/2
-     related to an asymptotic approximation to the Kaiser--Bessel, itself an
-     approximation to prolate spheroidal wavefunction (PSWF) of order 0.
-     This is the "reference implementation", used by eg finufft/onedim_* 2/17/17.
-     Rescaled so max is 1, Barnett 7/21/24
+template<typename T> T evaluate_kernel(T x, T beta, T c, int kernel_type = 0) {
+  /* Kernel evaluation at single real argument.
+     kernel_type == 0 : ES ("exp sqrt") kernel (default)
+       phi_ES(x) = exp(beta*(sqrt(1 - c*x^2) - 1))
+     kernel_type == 1 : Kaiser--Bessel (KB) kernel
+       phi_KB(x) = I_0(beta*sqrt(1 - c*x^2)) / I_0(beta)
+     Note: `std::cyl_bessel_i` from <cmath> is used for I_0.
+     Rescaled so max is 1.
   */
+  if (kernel_type == 1) {
+    // Kaiser--Bessel (normalized by I0(beta)). Use std::cyl_bessel_i from <cmath>.
+    const T inner        = std::sqrt(T(1) - c * x * x);
+    const T arg          = beta * inner;
+    const double i0_arg  = ::finufft::common::cyl_bessel_i(0, static_cast<double>(arg));
+    const double i0_beta = ::finufft::common::cyl_bessel_i(0, static_cast<double>(beta));
+    return static_cast<T>(i0_arg / i0_beta);
+  }
+
+  // default to ES
   return std::exp(beta * (std::sqrt(T(1) - c * x * x) - T(1)));
 }
 
+FINUFFT_EXPORT int compute_kernel_ns(double upsampfac, double tol, int kernel_type,
+                                     const finufft_spread_opts &opts);
+
+FINUFFT_EXPORT void initialize_kernel_params(finufft_spread_opts &opts, double upsampfac,
+                                             double tol, int kernel_type);
+
+FINUFFT_EXPORT double sigma_max_tol(double upsampfac, int kernel_type, int max_ns);
+
 } // namespace finufft::kernel
diff --git a/include/finufft_common/utils.h b/include/finufft_common/utils.h
index e8f8682db..008c1e1da 100644
--- a/include/finufft_common/utils.h
+++ b/include/finufft_common/utils.h
@@ -13,6 +13,9 @@ namespace common {
 FINUFFT_EXPORT_TEST void gaussquad(int n, double *xgl, double *wgl);
 std::tuple<double, double> leg_eval(int n, double x);
 
+// Series implementation of the modified Bessel function of the first kind I_nu(x)
+double cyl_bessel_i(double nu, double x) noexcept;
+
 // helper to generate the integer sequence in range [Start, End]
 template<int Offset, typename Seq> struct offset_seq;
 
diff --git a/include/finufft_mod.f90 b/include/finufft_mod.f90
index 59c5e567c..5be5fae96 100644
--- a/include/finufft_mod.f90
+++ b/include/finufft_mod.f90
@@ -21,6 +21,7 @@ module finufft_mod
    real(kind=C_DOUBLE) :: upsampfac
    integer(kind=C_INT) :: spread_thread, maxbatchsize
    integer(kind=C_INT) :: spread_nthr_atomic, spread_max_sp_size
+   integer(kind=C_INT) :: spread_function
    integer(kind=C_SIZE_T) :: fftw_lock_fun, fftw_unlock_fun, fftw_lock_data
    !  really, last should be type(C_PTR) :: etc, but fails to print nicely
 
diff --git a/include/finufft_opts.h b/include/finufft_opts.h
index 24cdcd79b..aab78839d 100644
--- a/include/finufft_opts.h
+++ b/include/finufft_opts.h
@@ -32,6 +32,8 @@ typedef struct finufft_opts { // defaults see finufft_core.cpp:finufft_default_o
   int spread_nthr_atomic; // if >=0, threads above which spreader OMP critical goes
                           // atomic
   int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size
+  int spread_function;    // (dev only) 0:DEFAULT, (do not change), there is no guarantee
+                          // what non-zero values do and behaviour can change anytime
                           // sphinx tag (don't remove): @opts_end
 
   // User can provide their own FFTW planner lock functions for thread safety
diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h
index f9f128a4d..8785c3782 100644
--- a/include/finufft_spread_opts.h
+++ b/include/finufft_spread_opts.h
@@ -26,6 +26,8 @@ typedef struct finufft_spread_opts {
   double ES_beta;
   double ES_halfwidth;
   double ES_c;
+  // Kernel selector: 0 = ES (default), 1 = Kaiser--Bessel (KB)
+  int kernel_type; // default 0
 } finufft_spread_opts;
 
 #endif // FINUFFT_SPREAD_OPTS_H
diff --git a/makefile b/makefile
index b4eb6b779..88bd2f282 100644
--- a/makefile
+++ b/makefile
@@ -152,7 +152,7 @@ STATICLIB = lib-static/$(LIBNAME).a
 ABSDYNLIB = $(FINUFFT)$(DYNLIB)
 
 # spreader objs
-SOBJS = src/finufft_utils.o src/utils.o src/spreadinterp.o
+SOBJS = src/finufft_utils.o src/spreadinterp.o src/common/utils.o src/common/kernel.o
 
 # all lib dual-precision objs (note DUCC_OBJS empty if unused)
 OBJS = $(SOBJS) src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS)
@@ -283,10 +283,10 @@ test/%: test/%.cpp $(DYNLIB)
 test/%f: test/%.cpp $(DYNLIB)
 	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@
 # low-level tests that are cleaner if depend on only specific objects...
-test/testutils: test/testutils.cpp src/finufft_utils.o src/utils.o
-	$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutils
-test/testutilsf: test/testutils.cpp src/finufft_utils.o src/utils.o
-	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutilsf
+test/testutils: test/testutils.cpp src/finufft_utils.o src/common/utils.o
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutils
+test/testutilsf: test/testutils.cpp src/finufft_utils.o src/common/utils.o
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutilsf
 
 # make sure all double-prec test executables ready for testing
 TESTS := $(basename $(wildcard test/*.cpp))
diff --git a/matlab/finufft.cpp b/matlab/finufft.cpp
index ba5381f57..46583b9c4 100644
--- a/matlab/finufft.cpp
+++ b/matlab/finufft.cpp
@@ -1162,6 +1162,9 @@ typedef std::complex<float> fcomplex;
      else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) {
        oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
      }
+     else if (strcmp(fname[ifield],"spread_function") == 0) {
+       oc->spread_function = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
+     }
      else if (strcmp(fname[ifield],"spreadinterponly") == 0) {
        oc->spreadinterponly = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
      }
@@ -1271,7 +1274,7 @@ mxWrapCopyZDef_single     (mxWrapCopy_single_dcomplex, dcomplex,
 mxWrapReturnZDef_single   (mxWrapReturn_single_dcomplex, dcomplex,
                     real_dcomplex, imag_dcomplex)
 
-/* ---- finufft.mw: 170 ----
+/* ---- finufft.mw: 173 ----
  * finufft_mex_setup();
  */
 static const char* stubids1_ = "finufft_mex_setup()";
@@ -1289,7 +1292,7 @@ void mexStub1(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 171 ----
+/* ---- finufft.mw: 174 ----
  * finufft_opts* o = new();
  */
 static const char* stubids2_ = "c o finufft_opts* = new()";
@@ -1310,7 +1313,7 @@ void mexStub2(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 173 ----
+/* ---- finufft.mw: 176 ----
  * finufft_plan* p = new();
  */
 static const char* stubids3_ = "c o finufft_plan* = new()";
@@ -1331,7 +1334,7 @@ void mexStub3(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 174 ----
+/* ---- finufft.mw: 177 ----
  * finufft_default_opts(finufft_opts* o);
  */
 static const char* stubids4_ = "finufft_default_opts(c i finufft_opts*)";
@@ -1355,7 +1358,7 @@ void mexStub4(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 176 ----
+/* ---- finufft.mw: 179 ----
  * finufftf_plan* p = new();
  */
 static const char* stubids5_ = "c o finufftf_plan* = new()";
@@ -1376,7 +1379,7 @@ void mexStub5(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 177 ----
+/* ---- finufft.mw: 180 ----
  * finufftf_default_opts(finufft_opts* o);
  */
 static const char* stubids6_ = "finufftf_default_opts(c i finufft_opts*)";
@@ -1400,7 +1403,7 @@ void mexStub6(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 188 ----
+/* ---- finufft.mw: 191 ----
  * copy_finufft_opts(mxArray opts, finufft_opts* o);
  */
 static const char* stubids7_ = "copy_finufft_opts(c i mxArray, c i finufft_opts*)";
@@ -1427,7 +1430,7 @@ void mexStub7(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 191 ----
+/* ---- finufft.mw: 194 ----
  * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, double tol, finufft_plan* plan, finufft_opts* o);
  */
 static const char* stubids8_ = "c o int = finufft_makeplan(c i int, c i int, c i int64_t[x], c i int, c i int, c i double, c i finufft_plan*, c i finufft_opts*)";
@@ -1520,7 +1523,7 @@ void mexStub8(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 194 ----
+/* ---- finufft.mw: 197 ----
  * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, float tol, finufftf_plan* plan, finufft_opts* o);
  */
 static const char* stubids9_ = "c o int = finufftf_makeplan(c i int, c i int, c i int64_t[x], c i int, c i int, c i float, c i finufftf_plan*, c i finufft_opts*)";
@@ -1613,7 +1616,7 @@ void mexStub9(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 196 ----
+/* ---- finufft.mw: 199 ----
  * delete(finufft_opts* o);
  */
 static const char* stubids10_ = "delete(c i finufft_opts*)";
@@ -1637,7 +1640,7 @@ void mexStub10(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 226 ----
+/* ---- finufft.mw: 229 ----
  * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj, double[] zj, int64_t nk, double[] s, double[] t, double[] u);
  */
 static const char* stubids11_ = "c o int = finufft_setpts(c i finufft_plan, c i int64_t, c i double[], c i double[], c i double[], c i int64_t, c i double[], c i double[], c i double[])";
@@ -1767,7 +1770,7 @@ void mexStub11(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 228 ----
+/* ---- finufft.mw: 231 ----
  * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, float[] zj, int64_t nk, float[] s, float[] t, float[] u);
  */
 static const char* stubids12_ = "c o int = finufftf_setpts(c i finufftf_plan, c i int64_t, c i float[], c i float[], c i float[], c i int64_t, c i float[], c i float[], c i float[])";
@@ -1897,7 +1900,7 @@ void mexStub12(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 260 ----
+/* ---- finufft.mw: 263 ----
  * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result);
  */
 static const char* stubids13_ = "c o int = finufft_execute(c i finufft_plan, c i dcomplex[], c o dcomplex[x])";
@@ -1953,7 +1956,7 @@ void mexStub13(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 262 ----
+/* ---- finufft.mw: 265 ----
  * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result);
  */
 static const char* stubids14_ = "c o int = finufftf_execute(c i finufftf_plan, c i fcomplex[], c o fcomplex[x])";
@@ -2009,9 +2012,9 @@ void mexStub14(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 268 ----
+/* ---- finufft.mw: 271 ----
  * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in);
- * Also at finufft.mw: 325
+ * Also at finufft.mw: 328
  */
 static const char* stubids15_ = "c o int = finufft_execute(c i finufft_plan, c o dcomplex[xx], c i dcomplex[])";
 
@@ -2068,9 +2071,9 @@ void mexStub15(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 270 ----
+/* ---- finufft.mw: 273 ----
  * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in);
- * Also at finufft.mw: 327
+ * Also at finufft.mw: 330
  */
 static const char* stubids16_ = "c o int = finufftf_execute(c i finufftf_plan, c o fcomplex[xx], c i fcomplex[])";
 
@@ -2127,7 +2130,7 @@ void mexStub16(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 274 ----
+/* ---- finufft.mw: 277 ----
  * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, n_trans] result);
  */
 static const char* stubids17_ = "c o int = finufft_execute(c i finufft_plan, c i dcomplex[], c o dcomplex[xx])";
@@ -2185,7 +2188,7 @@ void mexStub17(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 276 ----
+/* ---- finufft.mw: 279 ----
  * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, n_trans] result);
  */
 static const char* stubids18_ = "c o int = finufftf_execute(c i finufftf_plan, c i fcomplex[], c o fcomplex[xx])";
@@ -2243,7 +2246,7 @@ void mexStub18(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 311 ----
+/* ---- finufft.mw: 314 ----
  * int ier = finufft_execute_adjoint(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in);
  */
 static const char* stubids19_ = "c o int = finufft_execute_adjoint(c i finufft_plan, c o dcomplex[xx], c i dcomplex[])";
@@ -2301,7 +2304,7 @@ void mexStub19(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 313 ----
+/* ---- finufft.mw: 316 ----
  * int ier = finufftf_execute_adjoint(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in);
  */
 static const char* stubids20_ = "c o int = finufftf_execute_adjoint(c i finufftf_plan, c o fcomplex[xx], c i fcomplex[])";
@@ -2359,7 +2362,7 @@ void mexStub20(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 317 ----
+/* ---- finufft.mw: 320 ----
  * int ier = finufft_execute_adjoint(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result);
  */
 static const char* stubids21_ = "c o int = finufft_execute_adjoint(c i finufft_plan, c i dcomplex[], c o dcomplex[x])";
@@ -2415,7 +2418,7 @@ void mexStub21(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 319 ----
+/* ---- finufft.mw: 322 ----
  * int ier = finufftf_execute_adjoint(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result);
  */
 static const char* stubids22_ = "c o int = finufftf_execute_adjoint(c i finufftf_plan, c i fcomplex[], c o fcomplex[x])";
@@ -2471,7 +2474,7 @@ void mexStub22(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 339 ----
+/* ---- finufft.mw: 342 ----
  * finufft_destroy(finufft_plan plan);
  */
 static const char* stubids25_ = "finufft_destroy(c i finufft_plan)";
@@ -2499,7 +2502,7 @@ void mexStub25(int nlhs, mxArray* plhs[],
         mexErrMsgTxt(mw_err_txt_);
 }
 
-/* ---- finufft.mw: 341 ----
+/* ---- finufft.mw: 344 ----
  * finufftf_destroy(finufftf_plan plan);
  */
 static const char* stubids26_ = "finufftf_destroy(c i finufftf_plan)";
@@ -2605,30 +2608,30 @@ void mexFunction(int nlhs, mxArray* plhs[],
     } else if (strcmp(id, "*profile report*") == 0) {
         if (!mexprofrecord_)
             mexPrintf("Profiler inactive\n");
-        mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[1]);
-        mexPrintf("%d calls to finufft.mw:171\n", mexprofrecord_[2]);
-        mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[3]);
-        mexPrintf("%d calls to finufft.mw:174\n", mexprofrecord_[4]);
-        mexPrintf("%d calls to finufft.mw:176\n", mexprofrecord_[5]);
-        mexPrintf("%d calls to finufft.mw:177\n", mexprofrecord_[6]);
-        mexPrintf("%d calls to finufft.mw:188\n", mexprofrecord_[7]);
-        mexPrintf("%d calls to finufft.mw:191\n", mexprofrecord_[8]);
-        mexPrintf("%d calls to finufft.mw:194\n", mexprofrecord_[9]);
-        mexPrintf("%d calls to finufft.mw:196\n", mexprofrecord_[10]);
-        mexPrintf("%d calls to finufft.mw:226\n", mexprofrecord_[11]);
-        mexPrintf("%d calls to finufft.mw:228\n", mexprofrecord_[12]);
-        mexPrintf("%d calls to finufft.mw:260\n", mexprofrecord_[13]);
-        mexPrintf("%d calls to finufft.mw:262\n", mexprofrecord_[14]);
-        mexPrintf("%d calls to finufft.mw:268 (finufft.mw:325)\n", mexprofrecord_[15]);
-        mexPrintf("%d calls to finufft.mw:270 (finufft.mw:327)\n", mexprofrecord_[16]);
-        mexPrintf("%d calls to finufft.mw:274\n", mexprofrecord_[17]);
-        mexPrintf("%d calls to finufft.mw:276\n", mexprofrecord_[18]);
-        mexPrintf("%d calls to finufft.mw:311\n", mexprofrecord_[19]);
-        mexPrintf("%d calls to finufft.mw:313\n", mexprofrecord_[20]);
-        mexPrintf("%d calls to finufft.mw:317\n", mexprofrecord_[21]);
-        mexPrintf("%d calls to finufft.mw:319\n", mexprofrecord_[22]);
-        mexPrintf("%d calls to finufft.mw:339\n", mexprofrecord_[25]);
-        mexPrintf("%d calls to finufft.mw:341\n", mexprofrecord_[26]);
+        mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[1]);
+        mexPrintf("%d calls to finufft.mw:174\n", mexprofrecord_[2]);
+        mexPrintf("%d calls to finufft.mw:176\n", mexprofrecord_[3]);
+        mexPrintf("%d calls to finufft.mw:177\n", mexprofrecord_[4]);
+        mexPrintf("%d calls to finufft.mw:179\n", mexprofrecord_[5]);
+        mexPrintf("%d calls to finufft.mw:180\n", mexprofrecord_[6]);
+        mexPrintf("%d calls to finufft.mw:191\n", mexprofrecord_[7]);
+        mexPrintf("%d calls to finufft.mw:194\n", mexprofrecord_[8]);
+        mexPrintf("%d calls to finufft.mw:197\n", mexprofrecord_[9]);
+        mexPrintf("%d calls to finufft.mw:199\n", mexprofrecord_[10]);
+        mexPrintf("%d calls to finufft.mw:229\n", mexprofrecord_[11]);
+        mexPrintf("%d calls to finufft.mw:231\n", mexprofrecord_[12]);
+        mexPrintf("%d calls to finufft.mw:263\n", mexprofrecord_[13]);
+        mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[14]);
+        mexPrintf("%d calls to finufft.mw:271 (finufft.mw:328)\n", mexprofrecord_[15]);
+        mexPrintf("%d calls to finufft.mw:273 (finufft.mw:330)\n", mexprofrecord_[16]);
+        mexPrintf("%d calls to finufft.mw:277\n", mexprofrecord_[17]);
+        mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[18]);
+        mexPrintf("%d calls to finufft.mw:314\n", mexprofrecord_[19]);
+        mexPrintf("%d calls to finufft.mw:316\n", mexprofrecord_[20]);
+        mexPrintf("%d calls to finufft.mw:320\n", mexprofrecord_[21]);
+        mexPrintf("%d calls to finufft.mw:322\n", mexprofrecord_[22]);
+        mexPrintf("%d calls to finufft.mw:342\n", mexprofrecord_[25]);
+        mexPrintf("%d calls to finufft.mw:344\n", mexprofrecord_[26]);
     } else if (strcmp(id, "*profile log*") == 0) {
         FILE* logfp;
         if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0)
@@ -2638,30 +2641,30 @@ void mexFunction(int nlhs, mxArray* plhs[],
             mexErrMsgTxt("Cannot open log for output");
         if (!mexprofrecord_)
             fprintf(logfp, "Profiler inactive\n");
-        fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[1]);
-        fprintf(logfp, "%d calls to finufft.mw:171\n", mexprofrecord_[2]);
-        fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[3]);
-        fprintf(logfp, "%d calls to finufft.mw:174\n", mexprofrecord_[4]);
-        fprintf(logfp, "%d calls to finufft.mw:176\n", mexprofrecord_[5]);
-        fprintf(logfp, "%d calls to finufft.mw:177\n", mexprofrecord_[6]);
-        fprintf(logfp, "%d calls to finufft.mw:188\n", mexprofrecord_[7]);
-        fprintf(logfp, "%d calls to finufft.mw:191\n", mexprofrecord_[8]);
-        fprintf(logfp, "%d calls to finufft.mw:194\n", mexprofrecord_[9]);
-        fprintf(logfp, "%d calls to finufft.mw:196\n", mexprofrecord_[10]);
-        fprintf(logfp, "%d calls to finufft.mw:226\n", mexprofrecord_[11]);
-        fprintf(logfp, "%d calls to finufft.mw:228\n", mexprofrecord_[12]);
-        fprintf(logfp, "%d calls to finufft.mw:260\n", mexprofrecord_[13]);
-        fprintf(logfp, "%d calls to finufft.mw:262\n", mexprofrecord_[14]);
-        fprintf(logfp, "%d calls to finufft.mw:268 (finufft.mw:325)\n", mexprofrecord_[15]);
-        fprintf(logfp, "%d calls to finufft.mw:270 (finufft.mw:327)\n", mexprofrecord_[16]);
-        fprintf(logfp, "%d calls to finufft.mw:274\n", mexprofrecord_[17]);
-        fprintf(logfp, "%d calls to finufft.mw:276\n", mexprofrecord_[18]);
-        fprintf(logfp, "%d calls to finufft.mw:311\n", mexprofrecord_[19]);
-        fprintf(logfp, "%d calls to finufft.mw:313\n", mexprofrecord_[20]);
-        fprintf(logfp, "%d calls to finufft.mw:317\n", mexprofrecord_[21]);
-        fprintf(logfp, "%d calls to finufft.mw:319\n", mexprofrecord_[22]);
-        fprintf(logfp, "%d calls to finufft.mw:339\n", mexprofrecord_[25]);
-        fprintf(logfp, "%d calls to finufft.mw:341\n", mexprofrecord_[26]);
+        fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[1]);
+        fprintf(logfp, "%d calls to finufft.mw:174\n", mexprofrecord_[2]);
+        fprintf(logfp, "%d calls to finufft.mw:176\n", mexprofrecord_[3]);
+        fprintf(logfp, "%d calls to finufft.mw:177\n", mexprofrecord_[4]);
+        fprintf(logfp, "%d calls to finufft.mw:179\n", mexprofrecord_[5]);
+        fprintf(logfp, "%d calls to finufft.mw:180\n", mexprofrecord_[6]);
+        fprintf(logfp, "%d calls to finufft.mw:191\n", mexprofrecord_[7]);
+        fprintf(logfp, "%d calls to finufft.mw:194\n", mexprofrecord_[8]);
+        fprintf(logfp, "%d calls to finufft.mw:197\n", mexprofrecord_[9]);
+        fprintf(logfp, "%d calls to finufft.mw:199\n", mexprofrecord_[10]);
+        fprintf(logfp, "%d calls to finufft.mw:229\n", mexprofrecord_[11]);
+        fprintf(logfp, "%d calls to finufft.mw:231\n", mexprofrecord_[12]);
+        fprintf(logfp, "%d calls to finufft.mw:263\n", mexprofrecord_[13]);
+        fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[14]);
+        fprintf(logfp, "%d calls to finufft.mw:271 (finufft.mw:328)\n", mexprofrecord_[15]);
+        fprintf(logfp, "%d calls to finufft.mw:273 (finufft.mw:330)\n", mexprofrecord_[16]);
+        fprintf(logfp, "%d calls to finufft.mw:277\n", mexprofrecord_[17]);
+        fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[18]);
+        fprintf(logfp, "%d calls to finufft.mw:314\n", mexprofrecord_[19]);
+        fprintf(logfp, "%d calls to finufft.mw:316\n", mexprofrecord_[20]);
+        fprintf(logfp, "%d calls to finufft.mw:320\n", mexprofrecord_[21]);
+        fprintf(logfp, "%d calls to finufft.mw:322\n", mexprofrecord_[22]);
+        fprintf(logfp, "%d calls to finufft.mw:342\n", mexprofrecord_[25]);
+        fprintf(logfp, "%d calls to finufft.mw:344\n", mexprofrecord_[26]);
         fclose(logfp);
     } else
         mexErrMsgTxt("Unknown identifier");
diff --git a/matlab/finufft.mw b/matlab/finufft.mw
index 4f82493c6..3979419cc 100644
--- a/matlab/finufft.mw
+++ b/matlab/finufft.mw
@@ -103,6 +103,9 @@ $     }
 $     else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) {
 $       oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
 $     }
+$     else if (strcmp(fname[ifield],"spread_function") == 0) {
+$       oc->spread_function = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
+$     }
 $     else if (strcmp(fname[ifield],"spreadinterponly") == 0) {
 $       oc->spreadinterponly = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
 $     }
diff --git a/matlab/finufft1d1.m b/matlab/finufft1d1.m
index 0c133a22b..d67455eaf 100644
--- a/matlab/finufft1d1.m
+++ b/matlab/finufft1d1.m
@@ -31,6 +31,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
 %     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
 %   Outputs:
diff --git a/matlab/finufft1d2.m b/matlab/finufft1d2.m
index dd452e9a2..8f7584439 100644
--- a/matlab/finufft1d2.m
+++ b/matlab/finufft1d2.m
@@ -30,6 +30,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
 %     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
 %  Outputs:
diff --git a/matlab/finufft1d3.m b/matlab/finufft1d3.m
index cff3a6298..fbfda8fae 100644
--- a/matlab/finufft1d3.m
+++ b/matlab/finufft1d3.m
@@ -30,6 +30,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %   Outputs:
 %     f     length-nk complex vector of values at targets, or, if ntrans>1,
 %           a matrix of size (nk,ntrans)
diff --git a/matlab/finufft2d1.m b/matlab/finufft2d1.m
index 1cd940e04..c89d50cc9 100644
--- a/matlab/finufft2d1.m
+++ b/matlab/finufft2d1.m
@@ -35,6 +35,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
 %     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
 %   Outputs:
diff --git a/matlab/finufft2d2.m b/matlab/finufft2d2.m
index b17baba50..a227fe88d 100644
--- a/matlab/finufft2d2.m
+++ b/matlab/finufft2d2.m
@@ -32,6 +32,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
 %     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
 %  Outputs:
diff --git a/matlab/finufft2d3.m b/matlab/finufft2d3.m
index 62aff50df..8c7ec2046 100644
--- a/matlab/finufft2d3.m
+++ b/matlab/finufft2d3.m
@@ -31,6 +31,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %   Outputs:
 %     f     length-nk complex vector of values at targets, or, if ntrans>1,
 %           a matrix of size (nk,ntrans)
diff --git a/matlab/finufft3d1.m b/matlab/finufft3d1.m
index aab2d8693..19e4c145d 100644
--- a/matlab/finufft3d1.m
+++ b/matlab/finufft3d1.m
@@ -37,6 +37,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
 %     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
 %   Outputs:
diff --git a/matlab/finufft3d2.m b/matlab/finufft3d2.m
index 6bf0631b3..f65bee063 100644
--- a/matlab/finufft3d2.m
+++ b/matlab/finufft3d2.m
@@ -34,6 +34,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
 %     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
 %  Outputs:
diff --git a/matlab/finufft3d3.m b/matlab/finufft3d3.m
index edb5cdd5e..9cf7fda79 100644
--- a/matlab/finufft3d3.m
+++ b/matlab/finufft3d3.m
@@ -32,6 +32,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %   Outputs:
 %     f     length-nk complex vector of values at targets, or, if ntrans>1,
 %           a matrix of size (nk,ntrans)
diff --git a/matlab/finufft_plan.m b/matlab/finufft_plan.m
index 3179b9fc9..ed78d57de 100644
--- a/matlab/finufft_plan.m
+++ b/matlab/finufft_plan.m
@@ -56,6 +56,7 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
 %     opts.floatprec: library precision to use, 'double' (default) or 'single'.
 %     for type 1 and 2 only, the following opts fields are also relevant:
 %     opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
diff --git a/matlab/opts.docbit b/matlab/opts.docbit
index 1a290afd7..c7fc1be2f 100644
--- a/matlab/opts.docbit
+++ b/matlab/opts.docbit
@@ -12,3 +12,4 @@
 %     opts.spread_thread:   for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc
 %     opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
 %     opts.nthreads:   number of threads, or 0: use all available (default)
+%     opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only]
diff --git a/matlab/test/fig_accuracy.m b/matlab/test/fig_accuracy.m
index 9b84571b2..543535429 100644
--- a/matlab/test/fig_accuracy.m
+++ b/matlab/test/fig_accuracy.m
@@ -6,11 +6,15 @@
 %M=1e2; N=1e5; % confusion about N vs M controlling err prefac (it's N)
 isign   = +1;     % sign of imaginary unit in exponential
 o.debug = 0;      % choose 1 for timing breakdown text output
+o.spread_function = 0;
 
-% use one of these two...
+% use one of these...
 %tols = 10.^(-1:-0.02:-15); o.upsampfac = 2.0;
 tols = 10.^(-1:-0.02:-10); o.upsampfac=1.25;    % for lowupsampfac
-%tols = 10.^(-1:-0.02:-14); o.upsampfac=1.99;    % for lowupsampfac
+%tols = 10.^(-1:-0.02:-11); o.upsampfac=1.3;    % for lowupsampfac
+
+%tols = 10.^(-1:-0.02:-12); o.upsampfac=1.5;    % intermediate
+%tols = 10.^(-1:-0.02:-14); o.upsampfac=1.99;    % v close to 2
 
 % other expts...
 %tols = 1e-6;
@@ -30,4 +34,4 @@
 figure; loglog(tols,errs,'+'); hold on; plot(tols,tols,'-');
 axis tight; xlabel('tol'); ylabel('err');
 %title(sprintf('1d1: (maxerr)/||c||_1, M=%d, N=%d\n',M,N));
-title(sprintf('1d1: ||\tilde f - f||_2/||f||_2, M=%d, N=%d\n',M,N));
+title(sprintf('1d1 \\sigma=%g sf=%d M=%d N=%d: rel 2-norm err in f',o.upsampfac,o.spread_function,M,N));
diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt
index 3b0dec66d..a61667eff 100644
--- a/perftest/cuda/CMakeLists.txt
+++ b/perftest/cuda/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_executable(cuperftest cuperftest.cu)
 target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-target_link_libraries(cuperftest cufinufft CUDA::cufft CUDA::cudart)
+target_link_libraries(cuperftest PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
 target_compile_features(cuperftest PRIVATE cxx_std_17)
 set_target_properties(
     cuperftest
diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py
index 593fc5b16..e28f02f9f 100644
--- a/python/finufft/finufft/_finufft.py
+++ b/python/finufft/finufft/_finufft.py
@@ -86,6 +86,7 @@ class FinufftOpts(ctypes.Structure):
                       ('maxbatchsize', c_int),
                       ('spread_nthr_atomic', c_int),
                       ('spread_max_sp_size', c_int),
+                      ('spread_function', c_int),
                       ('fftw_lock_fun', c_void_p),
                       ('fftw_unlock_fun', c_void_p),
                       ('fftw_lock_data', c_void_p)]
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e326773c6..d257b0db4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -23,7 +23,6 @@ set(FINUFFT_SOURCES
     finufft_core.cpp
     c_interface.cpp
     finufft_utils.cpp
-    utils.cpp
 )
 
 if(FINUFFT_BUILD_FORTRAN)
@@ -48,7 +47,7 @@ if(FINUFFT_USE_DUCC0)
     target_compile_definitions(finufft PRIVATE FINUFFT_USE_DUCC0)
 endif()
 
-target_link_libraries(finufft PRIVATE $<BUILD_INTERFACE:finufft_fftlibs xsimd>)
+target_link_libraries(finufft PRIVATE $<BUILD_INTERFACE:finufft_fftlibs xsimd finufft_common>)
 if(FINUFFT_USE_OPENMP)
     target_link_libraries(finufft PRIVATE OpenMP::OpenMP_CXX)
     if(NOT FINUFFT_STATIC_LINKING)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
new file mode 100644
index 000000000..2658a66ef
--- /dev/null
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.24)
+
+# Build a small static helper library for CPU-only kernel parameter helpers
+# Source is in ../kernel.cpp as requested
+set(FINUFFT_COMMON_SOURCES kernel.cpp utils.cpp)
+
+add_library(finufft_common STATIC ${FINUFFT_COMMON_SOURCES})
+
+# The public include directory is the top-level include/
+target_include_directories(
+    finufft_common
+    PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>
+)
+set_target_properties(finufft_common PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+# Visibility and compile options consistent with finufft
+if(FINUFFT_SHARED_LINKING)
+    target_compile_definitions(finufft_common PRIVATE FINUFFT_DLL)
+endif()
+
+target_compile_features(finufft_common PRIVATE cxx_std_17)
+
+# If you need -lm etc.
+find_library(MATH_LIBRARY m)
+if(MATH_LIBRARY)
+    target_link_libraries(finufft_common PRIVATE ${MATH_LIBRARY})
+endif()
+
+set_target_properties(finufft_common PROPERTIES CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN YES)
diff --git a/src/common/kernel.cpp b/src/common/kernel.cpp
new file mode 100644
index 000000000..a7d1cf30c
--- /dev/null
+++ b/src/common/kernel.cpp
@@ -0,0 +1,79 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <finufft_common/common.h>
+#include <finufft_common/kernel.h>
+#include <finufft_spread_opts.h>
+
+namespace finufft::kernel {
+
+int compute_kernel_ns(double upsampfac, double tol, int kernel_type,
+                      const finufft_spread_opts &opts) {
+  // Note: opts is unused here but kept for API consistency if needed later
+  int ns;
+  if (upsampfac == 2.0) {
+    ns = (int)std::ceil(-std::log10(tol / 10.0));
+  } else {
+    ns = (int)std::ceil(
+        -std::log(tol) / (finufft::common::PI * std::sqrt(1.0 - 1.0 / upsampfac)));
+  }
+  ns = std::max(2, ns);
+  return ns;
+}
+
+void initialize_kernel_params(finufft_spread_opts &opts, double upsampfac, double tol,
+                              int kernel_type) {
+  int ns;
+  // Respect any pre-set opts.nspread (e.g., clipped by caller). If it's <=0 compute it.
+  if (opts.nspread > 0) {
+    ns = opts.nspread;
+  } else {
+    ns = compute_kernel_ns(upsampfac, tol, kernel_type, opts);
+  }
+
+  opts.kernel_type  = kernel_type;
+  opts.nspread      = ns; // ensure opts is populated with the (possibly clipped) ns
+  opts.ES_halfwidth = (double)ns / 2.0;
+  opts.ES_c         = 4.0 / (double)(ns * ns);
+
+  if (kernel_type == 1) {
+    // Kaiser-Bessel (KB)
+    // Formula from Beatty et al. 2005
+    double tmp   = (double)ns * (double)ns / (upsampfac * upsampfac);
+    double term2 = (upsampfac - 0.5) * (upsampfac - 0.5);
+    opts.ES_beta = common::PI * std::sqrt(tmp * term2 - 0.8);
+  } else {
+    // Exponential of Semicircle (ES)
+    double betaoverns = 2.30;
+    if (ns == 2)
+      betaoverns = 2.20;
+    else if (ns == 3)
+      betaoverns = 2.26;
+    else if (ns == 4)
+      betaoverns = 2.38;
+
+    if (upsampfac != 2.0) {
+      double gamma = 0.97;
+      betaoverns   = gamma * common::PI * (1.0 - 1.0 / (2.0 * upsampfac));
+    }
+    opts.ES_beta = betaoverns * (double)ns;
+  }
+
+  if (opts.debug) {
+    const char *kname = (kernel_type == 1) ? "KB" : "ES";
+    printf("setup_spreader: using spread kernel type %d (%s)\n", kernel_type, kname);
+    printf("setup_spreader eps=%.3g sigma=%.3g (%s): chose ns=%d beta=%.3g\n", tol,
+           upsampfac, kname, ns, opts.ES_beta);
+  }
+}
+
+double sigma_max_tol(double upsampfac, int kernel_type, int max_ns) {
+  if (upsampfac == 2.0) {
+    return 10.0 * std::pow(10.0, -(double)max_ns);
+  } else {
+    return std::exp(
+        -(double)max_ns * finufft::common::PI * std::sqrt(1.0 - 1.0 / upsampfac));
+  }
+}
+
+} // namespace finufft::kernel
diff --git a/src/utils.cpp b/src/common/utils.cpp
similarity index 73%
rename from src/utils.cpp
rename to src/common/utils.cpp
index ce8308fac..75b64ff55 100644
--- a/src/utils.cpp
+++ b/src/common/utils.cpp
@@ -1,5 +1,19 @@
+
 #include <cmath>
 #include <finufft_common/common.h>
+#include <limits>
+
+// Prefer the standard library's special-math `cyl_bessel_i` when available.
+#if defined(__has_include)
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+// Feature-test macro for special math functions (if available in the standard
+// library implementation). Fall back to our series implementation otherwise.
+#if defined(__cpp_lib_math_special_functions)
+#define FINUFFT_HAVE_STD_CYL_BESSEL_I 1
+#endif
 
 #ifdef __CUDACC__
 #include <cufinufft/types.h>
@@ -24,7 +38,7 @@ void gaussquad(int n, double *xgl, double *wgl) {
   for (int i = 0; i < n / 2; i++) { // Loop through nodes
     convcount = 0;
     x         = std::cos((2 * i + 1) * PI / (2 * n)); // Initial guess: Chebyshev node
-    while (true) {                               // Newton iteration
+    while (true) {                                    // Newton iteration
       auto [p, dp] = leg_eval(n, x);
       dx           = -p / dp;
       x += dx; // Newton step
@@ -71,6 +85,34 @@ std::tuple<double, double> leg_eval(int n, double x) {
   return {p2, n * (x * p2 - p1) / (x * x - 1)};
 }
 
+double cyl_bessel_i(double nu, double x) noexcept {
+#if defined(FINUFFT_HAVE_STD_CYL_BESSEL_I)
+  return std::cyl_bessel_i(nu, x);
+#else
+  if (x == 0.0) {
+    if (nu == 0.0) return 1.0;
+    return 0.0;
+  }
+
+  const double halfx = x / 2.0;
+  double term        = std::pow(halfx, nu) / std::tgamma(nu + 1.0); // k = 0
+  double sum         = term;
+
+  static constexpr auto eps      = std::numeric_limits<double>::epsilon() * 10.0;
+  static constexpr auto max_iter = 100000;
+
+  for (int k = 1; k < max_iter; ++k) {
+    term *= (halfx * halfx) / (static_cast<double>(k) * (nu + static_cast<double>(k)));
+    sum += term;
+
+    if (std::abs(term) < eps * std::abs(sum)) {
+      break;
+    }
+  }
+  return sum;
+#endif
+}
+
 } // namespace common
 } // namespace finufft
 
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index 2f1b2aeaf..92825eeb5 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -1,6 +1,6 @@
 include(staticAnalysis)
 include(setupIWYU)
-set(PRECISION_INDEPENDENT_SRC precision_independent.cu ../utils.cpp)
+set(PRECISION_INDEPENDENT_SRC precision_independent.cu)
 
 set(PRECISION_DEPENDENT_SRC
     spreadinterp.cpp
@@ -81,7 +81,7 @@ if(FINUFFT_SHARED_LINKING)
     endif()
 endif()
 
-target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft)
+target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft $<BUILD_INTERFACE:finufft_common>)
 # Expose only when not doing fully static linking
 if(NOT FINUFFT_STATIC_LINKING)
     target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)
diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
index 437ec39bb..6e3f866ae 100644
--- a/src/finufft_core.cpp
+++ b/src/finufft_core.cpp
@@ -107,7 +107,8 @@ static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps,
 {
   // this calls spreadinterp.cpp...
   int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth,
-                           opts.spread_debug, opts.showwarn, opts.spreadinterponly, dim);
+                           opts.spread_debug, opts.showwarn, opts.spreadinterponly, dim,
+                           opts.spread_function);
   // override various spread opts from their defaults...
   spopts.debug    = opts.spread_debug;
   spopts.sort     = opts.spread_sort; // could make dim or CPU choices here?
@@ -531,8 +532,9 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
   horner_coeffs.fill(TF(0));
 
   // Get the kernel parameters once
-  const TF beta    = TF(this->spopts.ES_beta);
-  const TF c_param = TF(this->spopts.ES_c);
+  const TF beta         = TF(this->spopts.ES_beta);
+  const TF c_param      = TF(this->spopts.ES_c);
+  const int kernel_type = this->spopts.kernel_type;
 
   nc = MIN_NC;
 
@@ -550,9 +552,9 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
     // original: 0.5 * (x - nspread + 2*j + 1)
     const TF shift = TF(2 * j + 1 - nspread);
 
-    const auto kernel = [shift, beta, c_param](TF x) -> TF {
+    const auto kernel = [shift, beta, c_param, kernel_type](TF x) -> TF {
       const TF t = TF(0.5) * (x + shift);
-      return evaluate_kernel(t, beta, c_param);
+      return evaluate_kernel(t, beta, c_param, kernel_type);
     };
 
     const auto coeffs = fit_monomials(kernel, static_cast<int>(n_coeffs), a, b);
@@ -567,8 +569,8 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
     // coeffs[0] is highest degree.
     int used = 0;
     for (size_t k = 0; k < coeffs.size(); ++k) {
-      if (std::abs(coeffs[k]) >= tol * 0.50) { // divide tol by 5 otherwise it fails in
-                                               // some cases
+      if (std::abs(coeffs[k]) >= tol * 0.5) { // divide tol by 2 otherwise it fails in
+                                              // some cases
         used = static_cast<int>(coeffs.size() - k);
         break;
       }
@@ -729,6 +731,7 @@ void finufft_default_opts_t(finufft_opts *o)
   o->maxbatchsize       = 0;
   o->spread_nthr_atomic = -1;
   o->spread_max_sp_size = 0;
+  o->spread_function    = 0;
   o->fftw_lock_fun      = nullptr;
   o->fftw_unlock_fun    = nullptr;
   o->fftw_lock_data     = nullptr;
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 88827c7a3..68101de8b 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -60,7 +60,7 @@ struct [[maybe_unused]] select_odd {
 
 // this finds the largest SIMD instruction set that can handle N elements
 // void otherwise -> compile error
-template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper() {
+template<class T, uint8_t N, uint8_t K = N> constexpr auto BestSIMDHelper() {
   if constexpr (N % K == 0) {
     // returns void in the worst case
     return xsimd::make_sized_batch<T, K>{};
@@ -78,7 +78,7 @@ template<class T, uint8_t ns> constexpr auto get_padding() {
 }
 
 template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
-  if constexpr (ns < finufft::common::MIN_NSPREAD) {
+  if constexpr (ns < MIN_NSPREAD) {
     return 0;
   } else {
     if (runtime_ns == ns) {
@@ -90,7 +90,7 @@ template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_
 }
 
 template<class T> uint8_t get_padding(uint8_t ns) {
-  return get_padding_helper<T, 2 * ::finufft::common::MAX_NSPREAD>(ns);
+  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
 }
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
@@ -1999,7 +1999,7 @@ template int spreadinterpSorted<double>(
 
 template<typename T>
 int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth,
-                   int debug, int showwarn, int spreadinterponly, int dim)
+                   int debug, int showwarn, int spreadinterponly, int dim, int kernel_type)
 /* Initializes spreader kernel parameters given desired NUFFT tolerance eps,
    upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and debug flags.
    Horner polynomial evaluation is always used; the kerevalmeth argument is
@@ -2051,17 +2051,15 @@ int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerev
   }
   if (!spreadinterponly && (upsampfac < 1.25 || upsampfac > 2.0)) {
     if (showwarn)
-      fprintf(
-          stderr,
-          "%s warning: upsampfac=%.3g outside [1.25, 2.0] are unlikely to provide benefit and might break the library;\n",
-          __func__, upsampfac);
+      fprintf(stderr,
+              "%s warning: upsampfac=%.3g outside [1.25, 2.0] are unlikely to provide "
+              "benefit and might break the library;\n",
+              __func__, upsampfac);
   }
-  if (upsampfac == 2.0) // standard sigma (see SISC paper)
-    ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10
-  else // custom sigma
-    ns = std::ceil(-log(eps) / (T(PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula,
-  // gam=1
-  ns = max(2, ns); // (we don't have ns=1 version yet)
+  // Compute ns (kernel width) using central helper; caller handles clipping.
+  ns = compute_kernel_ns(upsampfac, (double)eps, kernel_type, opts);
+  // ns == 2 breaks for float with upsampfact = 2.00
+  if (std::is_same_v<T, float> && upsampfac == 2.00) ns = std::max(ns, 3);
   if (ns > MAX_NSPREAD) {
     // clip to fit allocated arrays, Horner rules
     if (showwarn)
@@ -2073,34 +2071,19 @@ int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerev
     ier = FINUFFT_WARN_EPS_TOO_SMALL;
   }
   opts.nspread = ns;
-  // setup for reference kernel eval (via formula): select beta width param...
-  // Horner kernels still need reference evals for FTs in onedim_*_kernel
-  opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines)
-  opts.ES_c         = 4.0 / (double)(ns * ns);
-  double betaoverns = 2.30; // gives decent betas for default sigma=2.0
-  if (ns == 2) betaoverns = 2.20; // some small-width tweaks...
-  if (ns == 3) betaoverns = 2.26;
-  if (ns == 4) betaoverns = 2.38;
-  if (upsampfac != 2.0) {
-    // again, override beta for custom sigma
-    T gamma    = 0.97; // must match devel/gen_all_horner_C_code.m !
-    betaoverns = gamma * T(PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based
-    // on cutoff
-  }
-  opts.ES_beta = betaoverns * ns; // set the kernel beta parameter
-  if (debug)
-    printf("%s eps=%.3g sigma=%.3g (Horner): chose ns=%d beta=%.3g\n", __func__,
-           (double)eps, upsampfac, ns, opts.ES_beta);
-
+  // default kernel selector: 0 = ES (exp-sqrt), 1 = KB (Kaiser--Bessel)
+  opts.kernel_type = kernel_type;
+  // initialize remaining ES/KB parameters (beta, c, halfwidth) based on opts.nspread
+  initialize_kernel_params(opts, upsampfac, (double)eps, kernel_type);
   return ier;
 }
 
 template FINUFFT_EXPORT_TEST int setup_spreader<float>(
     finufft_spread_opts &opts, float eps, double upsampfac, int kerevalmeth, int debug,
-    int showwarn, int spreadinterponly, int dim);
+    int showwarn, int spreadinterponly, int dim, int kernel_type);
 template FINUFFT_EXPORT_TEST int setup_spreader<double>(
     finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth, int debug,
-    int showwarn, int spreadinterponly, int dim);
+    int showwarn, int spreadinterponly, int dim, int kernel_type);
 
 template<typename T>
 T evaluate_kernel_runtime(T x, int ns, int nc, const T *horner_coeffs_ptr,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 83873f447..d677b4efa 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -10,6 +10,7 @@ set(TESTS
     finufft3dmany_test
     spreadinterp1d_test
     adjointness
+    accuracy_test
 )
 
 foreach(TEST ${TESTS})
@@ -96,6 +97,8 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX)
         WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
     )
 
+    add_test(NAME run_accuracy_test_${PREC} COMMAND accuracy_test${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+
     add_test(NAME run_adjointness_${PREC} COMMAND adjointness${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 endfunction()
 
diff --git a/test/accuracy_test.cpp b/test/accuracy_test.cpp
new file mode 100644
index 000000000..cbd68acc3
--- /dev/null
+++ b/test/accuracy_test.cpp
@@ -0,0 +1,223 @@
+// test/accuracy_test.cpp
+// C++ analog of python/finufft/test/fig_accuracy.py
+// Sweeps tolerances from 1e-1 down to 1e-<max_digits> in 0.02-decade steps,
+// computes relative L2 error for FINUFFT1D1 and requires the average error
+// across each decade [1e-d,1e-(d+1)] to be <= 1e-d. Exits nonzero on failure
+// so CTest/CI can detect it.
+
+#include <cmath>
+#include <cstdlib>
+#include <finufft/test_defs.h>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+// test utilities: direct DFT and norm helpers
+#include "utils/dirft1d.hpp"
+#include "utils/norms.hpp"
+#include <finufft_common/kernel.h>
+
+int main(int argc, char *argv[]) {
+  // Defaults
+  BIGINT M         = 2000; // # sources
+  BIGINT N         = 100;  // # modes
+  int isign        = +1;
+  double upsampfac = 2.0;
+  const auto seed  = std::random_device()();
+  int hold_inputs  = 1; // default: hold inputs (reuse across tolerances)
+  int kernel_type  = 0; // 0 => ES (default), 1 => KB
+  int max_digits   = 0; // <=0 means use machine precision (15 double / 7 float)
+  int debug_level  = 0; // optional: enable debug output (sets both opts.debug and
+                        // opts.spread_debug)
+  int showwarn = 0;     // whether to print warnings (default 0 => suppress)
+  int verbose  = 0;     // if 1 print per-tolerance FAILED lines and decade summaries
+
+  double w = 0.0;
+  // If user asked for help, print usage and exit
+  for (int ai = 1; ai < argc; ++ai) {
+    if (std::string(argv[ai]) == "-h" || std::string(argv[ai]) == "--help") {
+      std::cout << "Usage: " << argv[0]
+                << " [M] [N] [isign] [upsampfac] [hold_inputs] [kernel_type] "
+                   "[max_digits] [debug] [showwarn] [verbose]\n";
+      std::cout << "  M             : number of sources (default 2000)\n";
+      std::cout << "  N             : number of modes (default 100)\n";
+      std::cout << "  isign         : sign of transform (default +1)\n";
+      std::cout << "  upsampfac     : upsampling factor (default 2.0)\n";
+      std::cout
+          << "  hold_inputs   : if nonzero, reuse inputs across tolerances (default 1)\n";
+      std::cout << "  kernel_type   : spread kernel selection (0:ES default, 1:KB)\n";
+      std::cout << "  max_digits    : max digits to test (<=0 uses machine precision)\n";
+      std::cout
+          << "  debug         : optional debug level (0=no debug, 1=some, 2=more)\n";
+      std::cout
+          << "  showwarn      : whether to print warnings (0=silent default, 1=show)\n";
+      std::cout << "  verbose       : if 1 print FAILED accuracies and decade summaries "
+                   "(default 0)\n";
+      std::cout << "Example: " << argv[0] << " 10000 100 1 2.0 1 0 15 1 0 0\n";
+      return 0;
+    }
+  }
+  if (argc > 1) {
+    sscanf(argv[1], "%lf", &w);
+    M = (BIGINT)w;
+  }
+  if (argc > 2) {
+    sscanf(argv[2], "%lf", &w);
+    N = (BIGINT)w;
+  }
+  if (argc > 3) sscanf(argv[3], "%d", &isign);
+  if (argc > 4) sscanf(argv[4], "%lf", &upsampfac);
+  // Note: seed is internal (default 42) and not a command-line argument
+  if (argc > 5) sscanf(argv[5], "%d", &hold_inputs);
+  if (argc > 6) sscanf(argv[6], "%d", &kernel_type);
+  if (argc > 7) sscanf(argv[7], "%d", &max_digits);
+  if (argc > 8) sscanf(argv[8], "%d", &debug_level);
+  if (argc > 9) sscanf(argv[9], "%d", &showwarn);
+  if (argc > 10) sscanf(argv[10], "%d", &verbose);
+
+  if (max_digits <= 0) {
+    max_digits     = std::numeric_limits<FLT>::digits10;
+    double min_tol = finufft::kernel::sigma_max_tol(upsampfac, kernel_type,
+                                                    finufft::common::MAX_NSPREAD);
+    // Cap max_digits based on achievable tolerance for the chosen upsampling
+    // factor and kernel.  Use kernel::sigma_max_tol with the library's
+    // MAX_NSPREAD to compute the smallest attainable sigma, then convert to
+    // decimal digits and clamp. This prevents attempting to test digits
+    // finer than the kernel/spreader can reasonably achieve for the chosen
+    // `upsampfac`.
+    if (min_tol < 0.0)
+      throw std::runtime_error("accuracy_test: could not compute min_tol");
+    int max_digits_sigma = (int)std::floor(-std::log10(min_tol));
+    if (max_digits_sigma < 1) max_digits_sigma = 1;
+    if (max_digits > max_digits_sigma) {
+      max_digits = max_digits_sigma;
+    }
+  }
+
+  // Build tolerance grid: exps from -1 down to -max_digits in 0.02 steps.
+  // Use an integer-step loop to ensure the last exponent equals -max_digits
+  // (avoids floating-point drift that could omit the final decade).
+  std::vector<double> exps;
+  int nsteps = (int)std::round((max_digits - 1.0) / 0.02);
+  for (int i = 0; i <= nsteps; ++i) exps.push_back(-1.0 - 0.02 * (double)i);
+  const size_t NT = exps.size();
+  std::vector<double> tols(NT);
+  for (size_t t = 0; t < NT; ++t) tols[t] = pow(10.0, exps[t]);
+
+  // Setup opts (will be passed to guru makeplan). We will use the plan
+  // guru interface so we can override the plan's spread kernel selection
+  // via the plan method `set_spread_kernel_type` below.
+  finufft_opts opts{};
+  FINUFFT_DEFAULT_OPTS(&opts);
+  opts.upsampfac       = upsampfac;
+  opts.spread_function = kernel_type;
+  // set debug levels from command-line if requested
+  opts.debug        = debug_level;
+  opts.spread_debug = debug_level;
+  // set whether to show warnings (user-controllable)
+  opts.showwarn = showwarn;
+
+  // For reproducibility use srand when not holding inputs
+  srand(seed);
+
+  std::vector<FLT> x(M);
+  std::vector<CPX> c(M);
+  std::vector<CPX> F(N);
+  std::vector<CPX> fe(N);
+
+  for (BIGINT j = 0; j < M; ++j) {
+    x[j] = PI * randm11();
+  }
+  if (hold_inputs) {
+    for (BIGINT j = 0; j < M; ++j) c[j] = crandm11();
+  }
+
+  int ier = 0;
+
+  // Pass/fail tracking: we update these as each tolerance is tested.
+  int npass = 0;
+  int nfail = 0;
+  std::vector<int> decade_fail(max_digits + 1, 0);
+  std::vector<int> decade_total(max_digits + 1, 0);
+  std::cout << std::scientific << std::setprecision(6);
+
+  // Tunable slack multiplier: allow `slack * tol` as acceptable threshold.
+  // `base_slack` can be tuned to be more/less permissive. When a tolerance
+  // is close to machine precision (digit near `max_digits`) we increase the
+  // slack to account for limits of floating-point resolution.
+  const double base_slack = 3.0; // it fails in CI otherwsie
+  for (size_t t = 0; t < NT; ++t) {
+    double tol = tols[t];
+    if (!hold_inputs) {
+      for (BIGINT j = 0; j < M; ++j) {
+        x[j] = PI * randm11();
+        c[j] = crandm11();
+      }
+    }
+
+    ier = FINUFFT1D1(M, x.data(), c.data(), isign, (FLT)tol, N, F.data(), &opts);
+    if (ier > 1) {
+      std::cerr << "accuracy_test: FINUFFT1D1 returned ier=" << ier << "\n";
+      return ier;
+    }
+
+    // Compute exact result using direct DFT test utility and compute
+    // relative L2 error using the test norms utilities.
+    dirft1d1<BIGINT>(M, x, c, isign, N, fe);
+    double rel_err = relerrtwonorm<BIGINT>(N, fe, F);
+
+    // Compute integer digit from the precomputed exponents for exactness.
+    int d = (int)std::round(-exps[t]);
+    if (d < 1) d = 1;
+    if (d > max_digits) d = max_digits;
+
+    // If this tolerance is exactly a power-of-ten (1e-d), print the
+    // achieved accuracy for that power-of-ten tolerance.
+    if (std::fabs(-exps[t] - d) < 1e-12) {
+      std::cout << "tol 1e-" << d << " achieved=" << rel_err << "\n";
+    }
+
+    // Compute the required threshold using a tunable slack multiplier.
+    double slack = base_slack;
+#ifdef SINGLE
+    if (d == 6) slack *= 5.0;
+    if (d == 7) slack *= 50.0;
+#else
+    if (d == 14) slack *= 1.10;
+    if (d == 15) slack *= 13.0;
+#endif
+
+    const double req = tol * slack; // final acceptance threshold
+    const bool pass  = (rel_err <= req);
+    if (pass) {
+      ++npass;
+    } else {
+      ++nfail;
+      ++decade_fail[d];
+      // Print failures immediately only in verbose mode (suppress PASSED lines to reduce
+      // noise).
+      if (verbose) {
+        std::cout << "tol=" << tol << " req=" << req << " rel_err=" << rel_err
+                  << " -> FAILED (interval=1e-" << d << ",1e-" << (d + 1) << ")\n";
+      }
+    }
+    ++decade_total[d];
+
+    // If the next sample is in a different interval (or we're at the last sample),
+    // print the decade summary now so progress appears as we go.
+    int next_d = -1;
+    if (t + 1 < NT) next_d = static_cast<int>(std::round(-exps[t + 1]));
+    if (next_d < 1) next_d = 1;
+    if (next_d > max_digits) next_d = max_digits;
+    if (next_d != d) {
+      // Only print intermediate decade summaries in verbose mode; otherwise
+      // the program will only emit the final SUMMARY to reduce noise.
+      std::cout << "-- [1e-" << (d) << ", 1e-" << (d + 1)
+                << "] summary: total=" << decade_total[d] << " failed=" << decade_fail[d]
+                << "\n";
+    }
+  }
+
+  std::cout << "\nSUMMARY: " << npass << " passed, " << nfail << " failed\n";
+  return nfail != 0;
+}
diff --git a/test/check_finufft.sh b/test/check_finufft.sh
index 213d1c784..2ed15b531 100755
--- a/test/check_finufft.sh
+++ b/test/check_finufft.sh
@@ -111,6 +111,12 @@ T=adjointness$PRECSUF
 E=${PIPESTATUS[0]}
 if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi
 
+((N++))
+T=accuracy_test$PRECSUF
+./$T$FEX 2>$DIR/$T.err.out | tee $DIR/$T.out
+E=${PIPESTATUS[0]}
+if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi
+
 # END TESTS ---------------------------------------------------------
 
 
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
index 1ee87f185..8c26967fc 100644
--- a/test/cuda/CMakeLists.txt
+++ b/test/cuda/CMakeLists.txt
@@ -5,9 +5,11 @@ foreach(srcfile ${test_src})
     get_filename_component(executable ${executable} NAME)
     add_executable(${executable} ${srcfile})
     target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-    target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart)
+    # Link cufinufft and CUDA runtime/cufft and also link finufft_common
+    # privately so only test executables get it (keep finufft_common PRIVATE in cufinufft).
+    target_link_libraries(${executable} PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
     if(MATH_LIBRARY)
-        target_link_libraries(${executable} ${MATH_LIBRARY})
+        target_link_libraries(${executable} PRIVATE ${MATH_LIBRARY})
     endif()
     target_compile_features(${executable} PUBLIC cxx_std_17)
     set_target_properties(