Fix kernel for ns=2 sigma=2 in float32

DiamonDinoia · DiamonDinoia · commit 6b06e918da32 · 2025-12-10T22:35:55.000+01:00
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
@@ -103,7 +103,7 @@ function(enable_asan target)
 endfunction()
 
 function(finufft_link_test target)
-    target_link_libraries(${target} PRIVATE finufft::finufft finufft::common)
+    target_link_libraries(${target} PRIVATE finufft::finufft finufft_common)
     if(FINUFFT_USE_DUCC0)
         target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
     endif()
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
@@ -5,6 +5,6 @@ foreach(srcfile ${example_src})
     get_filename_component(executable ${executable} NAME)
     add_executable(${executable} ${srcfile})
     target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-    target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart)
+    target_link_libraries(${executable} PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
     target_compile_features(${executable} PRIVATE cxx_std_17)
 endforeach()
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
@@ -89,7 +89,7 @@ class WithCudaDevice {
 };
 
 // math helpers whose source is in src/utils.cpp
-long next235beven(long n, long b);
+FINUFFT_EXPORT long next235beven(long n, long b);
 
 /**
  * does a complex atomic add on a shared memory address
diff --git a/makefile b/makefile
@@ -152,7 +152,7 @@ STATICLIB = lib-static/$(LIBNAME).a
 ABSDYNLIB = $(FINUFFT)$(DYNLIB)
 
 # spreader objs
-SOBJS = src/finufft_utils.o src/utils.o src/spreadinterp.o
+SOBJS = src/finufft_utils.o src/spreadinterp.o src/common/utils.o src/common/kernel.o
 
 # all lib dual-precision objs (note DUCC_OBJS empty if unused)
 OBJS = $(SOBJS) src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS)
@@ -283,10 +283,10 @@ test/%: test/%.cpp $(DYNLIB)
 test/%f: test/%.cpp $(DYNLIB)
 	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@
 # low-level tests that are cleaner if depend on only specific objects...
-test/testutils: test/testutils.cpp src/finufft_utils.o src/utils.o
-	$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutils
-test/testutilsf: test/testutils.cpp src/finufft_utils.o src/utils.o
-	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutilsf
+test/testutils: test/testutils.cpp src/finufft_utils.o src/common/utils.o
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutils
+test/testutilsf: test/testutils.cpp src/finufft_utils.o src/common/utils.o
+	$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutilsf
 
 # make sure all double-prec test executables ready for testing
 TESTS := $(basename $(wildcard test/*.cpp))
diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_executable(cuperftest cuperftest.cu)
 target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
-target_link_libraries(cuperftest cufinufft CUDA::cufft CUDA::cudart)
+target_link_libraries(cuperftest PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
 target_compile_features(cuperftest PRIVATE cxx_std_17)
 set_target_properties(
     cuperftest
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -47,8 +47,7 @@ if(FINUFFT_USE_DUCC0)
     target_compile_definitions(finufft PRIVATE FINUFFT_USE_DUCC0)
 endif()
 
-target_link_libraries(finufft PRIVATE $<BUILD_INTERFACE:finufft_fftlibs xsimd>)
-target_link_libraries(finufft PRIVATE finufft_common)
+target_link_libraries(finufft PRIVATE $<BUILD_INTERFACE:finufft_fftlibs xsimd finufft_common>)
 if(FINUFFT_USE_OPENMP)
     target_link_libraries(finufft PRIVATE OpenMP::OpenMP_CXX)
     if(NOT FINUFFT_STATIC_LINKING)
@@ -100,5 +99,4 @@ endif()
 
 set(_targets ${INSTALL_TARGETS})
 list(APPEND _targets finufft)
-list(APPEND _targets finufft_common)
 set(INSTALL_TARGETS "${_targets}" PARENT_SCOPE)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
@@ -5,13 +5,13 @@ cmake_minimum_required(VERSION 3.24)
 set(FINUFFT_COMMON_SOURCES kernel.cpp utils.cpp)
 
 add_library(finufft_common STATIC ${FINUFFT_COMMON_SOURCES})
-add_library(finufft::common ALIAS finufft_common)
 
 # The public include directory is the top-level include/
 target_include_directories(
     finufft_common
     PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>
 )
+set_target_properties(finufft_common PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 # Visibility and compile options consistent with finufft
 if(FINUFFT_SHARED_LINKING)
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
@@ -23,9 +23,17 @@ namespace finufft {
 namespace common {
 
 void gaussquad(int n, double *xgl, double *wgl) {
+  // n-node Gauss-Legendre quadrature, adapted from a code by Jason Kaye (2022-2023),
+  // from the utils file of https://github.com/flatironinstitute/cppdlr version 1.2,
+  // which is Apache-2 licensed. It uses Newton iteration from Chebyshev points.
+  // Double-precision only.
+  // Adapted by Barnett 6/8/25 to write nodes (xgl) and weights (wgl) into arrays
+  // that the user must pre-allocate to length at least n.
+
   double x = 0, dx = 0;
   int convcount = 0;
 
+  // Get Gauss-Legendre nodes
   xgl[n / 2] = 0;                   // If odd number of nodes, middle node is 0
   for (int i = 0; i < n / 2; i++) { // Loop through nodes
     convcount = 0;
@@ -39,27 +47,35 @@ void gaussquad(int n, double *xgl, double *wgl) {
       }
       if (convcount == 3) {
         break;
-      }
+      } // If convergence tol hit 3 times, stop
     }
     xgl[i]         = -x;
     xgl[n - i - 1] = x; // Symmetric nodes
   }
 
+  // Get Gauss-Legendre weights from formula
+  // w_i = -2 / ((n+1)*P_n'(x_i)*P_{n+1}(x_i)) (Atkinson '89, pg. 276)
   for (int i = 0; i < n / 2 + 1; i++) {
     auto [junk1, dp] = leg_eval(n, xgl[i]);
-    auto [p, junk2]  = leg_eval(n + 1, xgl[i]);
-    wgl[i]           = -2 / ((n + 1) * dp * p);
-    wgl[n - i - 1]   = wgl[i];
+    auto [p, junk2]  = leg_eval(n + 1, xgl[i]); // This is a bit inefficient, but who
+                                                // cares...
+    wgl[i]         = -2 / ((n + 1) * dp * p);
+    wgl[n - i - 1] = wgl[i];
   }
 }
 
 std::tuple<double, double> leg_eval(int n, double x) {
+  // return Legendre polynomial P_n(x) and its derivative P'_n(x).
+  // Uses Legendre three-term recurrence.
+  // Used by gaussquad above, with which it shares the same authorship and source.
+
   if (n == 0) {
     return {1.0, 0.0};
   }
   if (n == 1) {
     return {x, 1.0};
   }
+  // Three-term recurrence and formula for derivative
   double p0 = 0.0, p1 = 1.0, p2 = x;
   for (int i = 1; i < n; i++) {
     p0 = p1;
@@ -103,7 +119,13 @@ double cyl_bessel_i(double nu, double x) noexcept {
 namespace cufinufft {
 namespace utils {
 
-long next235beven(long n, long b) {
+long next235beven(long n, long b)
+// Finds an even integer not less than n, with prime factors no larger than 5
+// (ie, "smooth"), that is also a multiple of b (b is a number whose only prime
+// factors are 2, 3, 5). Adapted from fortran in hellskitchen. Barnett 2/9/17;
+// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
+// Added condition about b, Melody Shih 05/31/20.
+{
   if (n <= 2) return 2;
   if (n % 2 == 1) n += 1;                // even
   long nplus  = n - 2;                   // to cancel out the +=2 at start of loop
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
@@ -81,8 +81,7 @@ if(FINUFFT_SHARED_LINKING)
     endif()
 endif()
 
-target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft)
-target_link_libraries(cufinufft PRIVATE finufft_common)
+target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft $<BUILD_INTERFACE:finufft_common>)
 # Expose only when not doing fully static linking
 if(NOT FINUFFT_STATIC_LINKING)
     target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)
diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp
@@ -569,8 +569,8 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
     // coeffs[0] is highest degree.
     int used = 0;
     for (size_t k = 0; k < coeffs.size(); ++k) {
-      if (std::abs(coeffs[k]) >= tol * 0.50) { // divide tol by 5 otherwise it fails in
-                                               // some cases
+      if (std::abs(coeffs[k]) >= tol * 0.5) { // divide tol by 2 otherwise it fails in
+                                              // some cases
         used = static_cast<int>(coeffs.size() - k);
         break;
       }
@@ -633,7 +633,7 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::initSpreadAndFFT() {
         printf(" spread_thread=%d\n", opts.spread_thread);
     }
 
-  } else {               // ..... usual NUFFT: eval Fourier series, alloc workspace .....
+  } else { // ..... usual NUFFT: eval Fourier series, alloc workspace .....
 
     if (opts.showwarn) { // user warn round-off error (due to prob condition #)...
       for (int idim = 0; idim < dim; ++idim)
@@ -647,8 +647,8 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::initSpreadAndFFT() {
     // determine fine grid sizes, sanity check, then alloc...
     for (int idim = 0; idim < dim; ++idim) {
       int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]);
-      if (nfier) return nfier;                  // nf too big; we're done
-      phiHat[idim].resize(nfdim[idim] / 2 + 1); // alloc fseries
+      if (nfier) return nfier;                    // nf too big; we're done
+      phiHat[idim].resize(nfdim[idim] / 2 + 1);   // alloc fseries
     }
 
     if (opts.debug) { // "long long" here is to avoid warnings with printf...
@@ -681,7 +681,7 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::initSpreadAndFFT() {
     }
 
     timer.restart(); // plan the FFTW (to act in-place on the workspace fwBatch)
-    int nthr_fft  = opts.nthreads;
+    int nthr_fft = opts.nthreads;
     const auto ns = gridsize_for_fft(*this);
     std::vector<TC, xsimd::aligned_allocator<TC, 64>> fwBatch(nf() * batchSize);
     fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
@@ -853,8 +853,8 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
   // spreader/Horner internals now using the provided upsampfac.
   if (opts.upsampfac != 0.0) {
     upsamp_locked = true; // user explicitly set upsampfac, don't auto-update
-    ier           = setup_spreader_for_nufft(spopts, tol, opts, dim);
-    if (ier > 1)          // proceed if success or warning
+    ier = setup_spreader_for_nufft(spopts, tol, opts, dim);
+    if (ier > 1) // proceed if success or warning
       throw int(ier);
     precompute_horner_coeffs();
 
@@ -927,12 +927,12 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, const TF *xj, const TF *yj, const TF *
     return FINUFFT_ERR_NUM_NU_PTS_INVALID;
   }
 
-  if (type != 3) { // ------------------ TYPE 1,2 SETPTS -------------------
-                   // (all we can do is check and maybe bin-sort the NU pts)
+  if (type != 3) {          // ------------------ TYPE 1,2 SETPTS -------------------
+                            // (all we can do is check and maybe bin-sort the NU pts)
     // If upsampfac is not locked by user (auto mode), choose or update it now
     // based on the actual density nj/N(). Re-plan if density changed significantly.
     if (!upsamp_locked) {
-      double density   = double(nj) / double(N());
+      double density = double(nj) / double(N());
       double upsampfac = bestUpsamplingFactor<TF>(opts.nthreads, density, dim, type, tol);
       // Re-plan if this is the first call (upsampfac==0) or if upsampfac changed
       if (upsampfac != opts.upsampfac) {
@@ -1095,9 +1095,8 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, const TF *xj, const TF *yj, const TF *
     t2opts.debug        = std::max(0, opts.debug - 1);    // don't print as much detail
     t2opts.spread_debug = std::max(0, opts.spread_debug - 1);
     t2opts.showwarn     = 0;                              // so don't see warnings 2x
-    if (!upsamp_locked)
-      t2opts.upsampfac = 0.0; // if the upsampfac was auto, let inner
-                              // t2 pick it again (from density=nj/Nf)
+    if (!upsamp_locked) t2opts.upsampfac = 0.0; // if the upsampfac was auto, let inner
+                                                // t2 pick it again (from density=nj/Nf)
     // (...could vary other t2opts here?)
     // MR: temporary hack, until we have figured out the C++ interface.
     FINUFFT_PLAN_T<TF> *tmpplan;
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
diff --git a/test/accuracy_test.cpp b/test/accuracy_test.cpp
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt