
Commit 53d2599

Merge remote-tracking branch 'flatiron/master' into add-kernel-selection-opts
2 parents dbca829 + 170a264

File tree

5 files changed (+117 -84 lines)


.github/workflows/valgrind.yml

Lines changed: 4 additions & 4 deletions
@@ -29,15 +29,15 @@ jobs:
           cmake -S . -B ./build -G Ninja \
             -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo \
             -DFINUFFT_BUILD_TESTS=ON \
-            -DFINUFFT_USE_SANITIZERS=OFF
+            -DFINUFFT_USE_SANITIZERS=OFF \
+            -DMEMORYCHECK_COMMAND=$(which valgrind) \
+            -DMEMORYCHECK_COMMAND_OPTIONS="--leak-check=full --show-leak-kinds=definite,possible --errors-for-leak-kinds=definite --error-exitcode=1 --track-origins=yes --undef-value-errors=yes" \
+            -DMEMORYCHECK_TYPE=Valgrind
 
       - name: Build
         run: cmake --build ./build --config RelWithDebInfo
 
       - name: Memcheck (CTest)
         working-directory: ./build
-        env:
-          CTEST_MEMORYCHECK_COMMAND: valgrind
-          CTEST_MEMORYCHECK_COMMAND_OPTIONS: "--leak-check=full --show-leak-kinds=definite,possible --errors-for-leak-kinds=definite"
         run: |
           ctest -T memcheck --output-on-failure -j

.pre-commit-config.yaml

Lines changed: 9 additions & 4 deletions
@@ -1,9 +1,14 @@
 repos:
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: 'v21.1.7'
+  - repo: local
     hooks:
-      - id: clang-format
-        types_or: [c++, c, cuda]
+      - id: git-clang-format-staged
+        name: git clang-format (staged only)
+        entry: git-clang-format --staged
+        language: python
+        additional_dependencies:
+          - clang-format
+        pass_filenames: false
+        types_or: [ c++, c, cuda ]
         files: \.(c|cc|cpp|h|hpp|cu|cuh)$
         exclude: '(^|/)(matlab/.*)$'
   - repo: https://github.com/pre-commit/pre-commit-hooks

include/finufft/finufft_utils.hpp

Lines changed: 60 additions & 0 deletions
@@ -5,7 +5,67 @@
 
 #include "finufft_core.h"
 #include <cmath>
+#include <cstddef>
 #include <finufft_common/common.h>
+#if __has_include(<xsimd/xsimd.hpp>)
+#include <array>
+#include <finufft/xsimd.hpp>
+#include <type_traits>
+
+namespace finufft::utils {
+
+template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
+  // finds the smallest simd width that can handle N elements
+  // simd size is batch size the SIMD width in xsimd terminology
+  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
+    return min_simd_width<T, N * 2>();
+  } else {
+    return N;
+  }
+};
+
+template<class T, uint8_t N> constexpr std::size_t find_optimal_simd_width() {
+  // finds the smallest simd width that minimizes the number of iterations
+  // NOTE: might be suboptimal for some cases 2^N+1 for example
+  // in the future we might want to implement a more sophisticated algorithm
+
+  uint8_t optimal_simd_width = min_simd_width<T>();
+  uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width;
+  for (uint8_t simd_width = optimal_simd_width;
+       simd_width <= xsimd::batch<T, xsimd::best_arch>::size; simd_width *= 2) {
+    uint8_t iterations = (N + simd_width - 1) / simd_width;
+    if (iterations < min_iterations) {
+      min_iterations = iterations;
+      optimal_simd_width = simd_width;
+    }
+  }
+  return static_cast<std::size_t>(optimal_simd_width);
+}
+
+template<class T, uint8_t N> constexpr std::size_t GetPaddedSIMDWidth() {
+  // helper function to get the SIMD width with padding for the given number of elements
+  // that minimizes the number of iterations
+  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
+}
+template<class T, uint8_t ns>
+constexpr std::size_t get_simd_width_helper(uint8_t runtime_ns) {
+  if constexpr (ns < finufft::common::MIN_NSPREAD) {
+    return static_cast<std::size_t>(0);
+  } else {
+    if (runtime_ns == ns) {
+      return GetPaddedSIMDWidth<T, ns>();
+    } else {
+      return get_simd_width_helper<T, ns - 1>(runtime_ns);
+    }
+  }
+}
+template<class T> constexpr std::size_t GetPaddedSIMDWidth(int runtime_ns) {
+  return get_simd_width_helper<T, 2 * ::finufft::common::MAX_NSPREAD>(runtime_ns);
+}
+
+} // namespace finufft::utils
+#endif // __has_include(xsimd)
 
 namespace finufft::utils {
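
The width selection moved into this header comes down to two pieces of integer arithmetic: a ceil-division that counts how many batches of a candidate width cover n elements, and a power-of-two round-up for the padded length. The standalone sketch below illustrates only that arithmetic; it does not use xsimd, and the candidate widths {4, 8, 16} are an assumption standing in for whatever xsimd::make_sized_batch supports on the target architecture.

// Sketch only (not FINUFFT code): pick, from a fixed set of candidate SIMD
// widths, the smallest one that minimizes the number of batches needed to
// cover n elements, then round n up to a multiple of that width.
#include <cstdint>
#include <cstdio>

constexpr std::uint8_t kCandidateWidths[] = {4, 8, 16}; // assumed available widths

constexpr std::uint8_t optimal_width(std::uint8_t n) {
  std::uint8_t best_w     = kCandidateWidths[0];
  std::uint8_t best_iters = (n + best_w - 1) / best_w; // ceil(n / w)
  for (std::uint8_t w : kCandidateWidths) {
    const std::uint8_t iters = (n + w - 1) / w;
    if (iters < best_iters) { // strict '<' keeps the smallest width on ties
      best_iters = iters;
      best_w     = w;
    }
  }
  return best_w;
}

constexpr std::uint8_t padded_length(std::uint8_t n) {
  const std::uint8_t w = optimal_width(n);
  // round up to a multiple of w; the bit trick requires w to be a power of 2
  return (n + w - 1) & static_cast<std::uint8_t>(-w);
}

static_assert(optimal_width(7) == 8, "one 8-wide batch beats two 4-wide ones");
static_assert(optimal_width(4) == 4, "ties resolve to the smallest width");
static_assert(padded_length(7) == 8, "7 elements pad up to one 8-wide batch");

int main() {
  for (int n : {4, 7, 9, 12})
    std::printf("n=%2d  width=%2d  padded=%2d\n", n,
                optimal_width(static_cast<std::uint8_t>(n)),
                padded_length(static_cast<std::uint8_t>(n)));
}

For n = 7 this is the case the header's comment alludes to: two 4-wide iterations lose to a single 8-wide one, so the width is padded up rather than down.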

src/finufft_core.cpp

Lines changed: 42 additions & 27 deletions
@@ -520,31 +520,33 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
   // Solve for piecewise Horner coeffs for the function kernel.h:evaluate_kernel()
   // Marco Barbone, Fall 2025.
   const auto nspread = spopts.nspread;
-  // "max_degree" really is "number of coeffs"
-  const auto max_degree = std::max(nspread + 3, MIN_NC);
+  // "number of coeffs"
+  const auto n_coeffs = std::max(nspread + 3, MIN_NC);
 
   // get the xsimd padding
-  static constexpr auto simd_size = xsimd::batch<TF>::size;
-  const auto padded_ns = (nspread + simd_size - 1) & -simd_size;
+  // (must match that used in spreadinterp.cpp), if we change horner simd_width there
+  // we must also change it here
+  const auto simd_size = GetPaddedSIMDWidth<TF>(2 * nspread);
+  const auto padded_ns = (nspread + simd_size - 1) & -simd_size;
 
   horner_coeffs.fill(TF(0));
 
-  // Precompute kernel parameters once
+  // Get the kernel parameters once
   const TF beta = TF(this->spopts.ES_beta);
   const TF c_param = TF(this->spopts.ES_c);
   const int kernel_type = this->spopts.kernel_type;
 
   nc = MIN_NC;
 
-  // Temporary storage: same transposed layout as horner_coeffs:
-  // [k * padded_ns + j], with k in [0, max_degree), j in [0, padded_ns)
-  std::vector<TF> tmp_coeffs(static_cast<size_t>(max_degree) * padded_ns, TF(0));
-
   static constexpr TF a = TF(-1.0);
   static constexpr TF b = TF(1.0);
 
   // First pass: fit at max_degree, cache coeffs, and determine nc.
-  // horner layout uses "smallest cN first": coeffs[0] is lowest degree term.
+  // Note: `fit_monomials()` returns coefficients in descending-degree order
+  // (highest-degree first): coeffs[0] is the highest-degree term. We store
+  // them so that `horner_coeffs[k * padded_ns + j]` holds the k'th Horner
+  // coefficient (k=0 -> highest-degree). `horner_coeffs` was filled with
+  // zeros above, so panels that need fewer coefficients leave the rest as 0.
   for (int j = 0; j < nspread; ++j) {
     // Map x ∈ [-1, 1] to the physical interval for panel j.
     // original: 0.5 * (x - nspread + 2*j + 1)
@@ -555,37 +557,50 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
       return evaluate_kernel(t, beta, c_param, kernel_type);
     };
 
-    const auto coeffs = fit_monomials(kernel, static_cast<int>(max_degree), a, b);
+    const auto coeffs = fit_monomials(kernel, static_cast<int>(n_coeffs), a, b);
 
-    // Cache coefficients in tmp, preserving order:
-    // coeffs[k] corresponds to horner_coeffs[k] (smallest degree first).
+    // Cache coefficients directly into final table (transposed/padded):
+    // coeffs[k] is highest->lowest, store at row k for panel j.
     for (size_t k = 0; k < coeffs.size(); ++k) {
-      tmp_coeffs[k * padded_ns + j] = coeffs[k];
+      horner_coeffs[k * padded_ns + j] = coeffs[k];
     }
 
-    // Determine highest index with |c_k| >= tol, so we keep a contiguous prefix
-    // [0..current_nc-1].
-    int current_nc = 0;
-    for (int k = static_cast<int>(coeffs.size()) - 1; k >= 0; --k) {
-      if (std::abs(coeffs[static_cast<size_t>(k)]) >= tol) {
-        current_nc = k + 1; // number of coeffs we actually care about for this panel
+    // Determine effective number of coeffs by skipping leading zeros.
+    // coeffs[0] is highest degree.
+    int used = 0;
+    for (size_t k = 0; k < coeffs.size(); ++k) {
+      if (std::abs(coeffs[k]) >= tol * 0.50) { // divide tol by 5 otherwise it fails in
+                                               // some cases
+        used = static_cast<int>(coeffs.size() - k);
         break;
       }
     }
-    if (current_nc > nc) nc = current_nc;
+    if (used > nc) nc = used;
   }
 
-  // Second pass: copy the first nc coeffs into horner_coeffs (no refit, no reordering).
-  for (int k = 0; k < nc; ++k) {
-    const auto row_offset = static_cast<size_t>(k) * padded_ns;
-    for (size_t j = 0; j < padded_ns; ++j) {
-      horner_coeffs[row_offset + j] = tmp_coeffs[row_offset + j];
+  // If the max required degree (nc) is less than max_degree, we must shift
+  // the coefficients "left" (to lower row indices) so that the significant
+  // coefficients end at row nc-1.
+  if (nc < static_cast<int>(n_coeffs)) {
+    const size_t shift = n_coeffs - nc;
+    for (size_t k = 0; k < static_cast<size_t>(nc); ++k) {
+      const size_t src_row = k + shift;
+      const size_t dst_row = k;
+      for (size_t j = 0; j < padded_ns; ++j) {
+        horner_coeffs[dst_row * padded_ns + j] = horner_coeffs[src_row * padded_ns + j];
+      }
+    }
+    // Zero out the now-unused tail rows for cleanliness
+    for (size_t k = nc; k < static_cast<size_t>(n_coeffs); ++k) {
+      for (size_t j = 0; j < padded_ns; ++j) {
+        horner_coeffs[k * padded_ns + j] = TF(0);
+      }
     }
   }
 
   if (opts.debug > 2) {
     // Print transposed layout: all "index 0" coeffs for intervals, then "index 1", ...
-    // Note: k is the coefficient index in Horner order, with smallest degree first.
+    // Note: k is the coefficient index in Horner order, with highest degree first.
     for (size_t k = 0; k < static_cast<size_t>(nc); ++k) {
       printf("[%s] idx=%lu: ", __func__, k);
       for (size_t j = 0; j < padded_ns; ++j) // use padded_ns to show padding as well
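
The reordering above matters because Horner's rule consumes coefficients highest-degree first, and the final shift keeps the significant rows of the table contiguous. The sketch below shows the same two ideas on a plain std::vector with made-up coefficients; it is not the fit_monomials output and not the plan's transposed [k * padded_ns + j] storage.

// Sketch only: Horner evaluation with coefficients stored highest-degree
// first, plus the "drop negligible leading coefficients" step.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// p(x) = ((c[0]*x + c[1])*x + c[2])*x + ...   (c[0] is the highest-degree term)
double horner(const std::vector<double>& c, double x) {
  double acc = 0.0;
  for (double ck : c) acc = acc * x + ck;
  return acc;
}

int main() {
  // made-up fit: the leading coefficient is negligible relative to tol
  const std::vector<double> coeffs = {1e-15, 2.0, -3.0, 0.5}; // ~ 2x^2 - 3x + 0.5
  const double tol = 1e-12;

  // count significant coefficients by skipping tiny leading ones
  std::size_t lead = 0;
  while (lead < coeffs.size() && std::abs(coeffs[lead]) < tol) ++lead;

  // "shift left": keep only the significant suffix, order preserved
  const std::vector<double> trimmed(coeffs.begin() + lead, coeffs.end());

  const double x = 0.3;
  std::printf("all %zu coeffs  : %.15f\n", coeffs.size(), horner(coeffs, x));
  std::printf("kept %zu coeffs : %.15f\n", trimmed.size(), horner(trimmed, x));
}

In the plan itself the same trim is applied to the transposed table: the significant rows are copied down by `shift` and the tail rows re-zeroed, so the evaluator always runs exactly nc Horner steps per panel.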

src/spreadinterp.cpp

Lines changed: 2 additions & 49 deletions
@@ -69,59 +69,16 @@ template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper
   }
 }
 
-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
-  // finds the smallest simd width that can handle N elements
-  // simd size is batch size the SIMD width in xsimd terminology
-  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
-    return min_simd_width<T, N * 2>();
-  } else {
-    return N;
-  }
-};
-
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
-  // finds the smallest simd width that minimizes the number of iterations
-  // NOTE: might be suboptimal for some cases 2^N+1 for example
-  // in the future we might want to implement a more sophisticated algorithm
-  uint8_t optimal_simd_width = min_simd_width<T>();
-  uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width;
-  for (uint8_t simd_width = optimal_simd_width;
-       simd_width <= xsimd::batch<T, xsimd::best_arch>::size; simd_width *= 2) {
-    uint8_t iterations = (N + simd_width - 1) / simd_width;
-    if (iterations < min_iterations) {
-      min_iterations = iterations;
-      optimal_simd_width = simd_width;
-    }
-  }
-  return optimal_simd_width;
-}
-
-template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
-  // helper function to get the SIMD width with padding for the given number of elements
-  // that minimizes the number of iterations
-  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
-}
-
 template<class T, uint8_t N>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, N>()>::type;
 
 template<class T, uint8_t ns> constexpr auto get_padding() {
-  // helper function to get the padding for the given number of elements
-  // ns is known at compile time, rounds ns to the next multiple of the SIMD width
-  // then subtracts ns to get the padding using a bitwise and trick
-  // WARING: this trick works only for power of 2s
-  // SOURCE: Agner Fog's VCL manual
   constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
   return ((ns + width - 1) & (-width)) - ns;
 }
 
 template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
-  // helper function to get the padding for the given number of elements where ns is
-  // known at runtime, it uses recursion to find the padding
-  // this allows to avoid having a function with a large number of switch cases
-  // as GetPaddedSIMDWidth requires a compile time value
-  // it cannot be a lambda function because of the template recursion
-  if constexpr (ns < 2) {
+  if constexpr (ns < finufft::common::MIN_NSPREAD) {
     return 0;
   } else {
     if (runtime_ns == ns) {
@@ -133,12 +90,8 @@ template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_
 }
 
 template<class T> uint8_t get_padding(uint8_t ns) {
-  // return the padding as a function of the number of elements
-  // 2 * MAX_NSPREAD is the maximum number of elements that we can have
-  // that's why is hardcoded here
-  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
+  return get_padding_helper<T, 2 * ::finufft::common::MAX_NSPREAD>(ns);
 }
-
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
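
get_padding_helper exists because get_padding<T, ns>() needs ns as a template (compile-time) argument while the spreader only learns ns at runtime; the if constexpr recursion walks the template parameter down until it matches the runtime value, bounded below by MIN_NSPREAD. A minimal sketch of that dispatch pattern follows, with a toy compile-time function; compute<N>(), kMinN, kMaxN and compute_at_runtime are illustrative stand-ins, not FINUFFT symbols.

// Sketch only: runtime -> compile-time dispatch via template recursion.
#include <cstdint>
#include <cstdio>

constexpr std::uint8_t kMinN = 2;  // stand-in for MIN_NSPREAD
constexpr std::uint8_t kMaxN = 16; // stand-in for 2 * MAX_NSPREAD

// toy compile-time-only computation; the real code returns a padding amount
template <std::uint8_t N> constexpr int compute() { return N * N; }

template <std::uint8_t N> constexpr int dispatch(std::uint8_t runtime_n) {
  if constexpr (N < kMinN) {
    return 0; // nothing matched: runtime_n was out of range
  } else {
    return runtime_n == N ? compute<N>() : dispatch<N - 1>(runtime_n);
  }
}

// the only entry point callers need: a plain runtime function
int compute_at_runtime(std::uint8_t n) { return dispatch<kMaxN>(n); }

int main() {
  for (int n = 2; n <= 6; ++n)
    std::printf("n=%d -> %d\n", n, compute_at_runtime(static_cast<std::uint8_t>(n)));
}

This keeps a single templated implementation instead of a hand-written switch over every allowed ns, which is the trade-off the deleted comment described.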
