
Commit 53d2599

Merge remote-tracking branch 'flatiron/master' into add-kernel-selection-opts
2 parents dbca829 + 170a264

File tree

5 files changed (+117 -84 lines)


.github/workflows/valgrind.yml

Lines changed: 4 additions & 4 deletions
@@ -29,15 +29,15 @@ jobs:
           cmake -S . -B ./build -G Ninja \
             -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo \
             -DFINUFFT_BUILD_TESTS=ON \
-            -DFINUFFT_USE_SANITIZERS=OFF
+            -DFINUFFT_USE_SANITIZERS=OFF \
+            -DMEMORYCHECK_COMMAND=$(which valgrind) \
+            -DMEMORYCHECK_COMMAND_OPTIONS="--leak-check=full --show-leak-kinds=definite,possible --errors-for-leak-kinds=definite --error-exitcode=1 --track-origins=yes --undef-value-errors=yes" \
+            -DMEMORYCHECK_TYPE=Valgrind
 
       - name: Build
         run: cmake --build ./build --config RelWithDebInfo
 
       - name: Memcheck (CTest)
         working-directory: ./build
-        env:
-          CTEST_MEMORYCHECK_COMMAND: valgrind
-          CTEST_MEMORYCHECK_COMMAND_OPTIONS: "--leak-check=full --show-leak-kinds=definite,possible --errors-for-leak-kinds=definite"
         run: |
           ctest -T memcheck --output-on-failure -j

.pre-commit-config.yaml

Lines changed: 9 additions & 4 deletions
@@ -1,9 +1,14 @@
 repos:
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: 'v21.1.7'
+  - repo: local
     hooks:
-      - id: clang-format
-        types_or: [c++, c, cuda]
+      - id: git-clang-format-staged
+        name: git clang-format (staged only)
+        entry: git-clang-format --staged
+        language: python
+        additional_dependencies:
+          - clang-format
+        pass_filenames: false
+        types_or: [ c++, c, cuda ]
         files: \.(c|cc|cpp|h|hpp|cu|cuh)$
         exclude: '(^|/)(matlab/.*)$'
   - repo: https://github.com/pre-commit/pre-commit-hooks

include/finufft/finufft_utils.hpp

Lines changed: 60 additions & 0 deletions
@@ -5,7 +5,67 @@
 
 #include "finufft_core.h"
 #include <cmath>
+#include <cstddef>
 #include <finufft_common/common.h>
+#if __has_include(<xsimd/xsimd.hpp>)
+#include <array>
+#include <finufft/xsimd.hpp>
+#include <type_traits>
+
+namespace finufft::utils {
+
+template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
+  // finds the smallest simd width that can handle N elements
+  // simd size is batch size the SIMD width in xsimd terminology
+  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
+    return min_simd_width<T, N * 2>();
+  } else {
+    return N;
+  }
+};
+
+template<class T, uint8_t N> constexpr std::size_t find_optimal_simd_width() {
+  // finds the smallest simd width that minimizes the number of iterations
+  // NOTE: might be suboptimal for some cases 2^N+1 for example
+  // in the future we might want to implement a more sophisticated algorithm
+
+  uint8_t optimal_simd_width = min_simd_width<T>();
+  uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width;
+  for (uint8_t simd_width = optimal_simd_width;
+       simd_width <= xsimd::batch<T, xsimd::best_arch>::size; simd_width *= 2) {
+    uint8_t iterations = (N + simd_width - 1) / simd_width;
+    if (iterations < min_iterations) {
+      min_iterations = iterations;
+      optimal_simd_width = simd_width;
+    }
+  }
+  return static_cast<std::size_t>(optimal_simd_width);
+}
+
+template<class T, uint8_t N> constexpr std::size_t GetPaddedSIMDWidth() {
+  // helper function to get the SIMD width with padding for the given number of elements
+  // that minimizes the number of iterations
+  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
+}
+template<class T, uint8_t ns>
+constexpr std::size_t get_simd_width_helper(uint8_t runtime_ns) {
+  if constexpr (ns < finufft::common::MIN_NSPREAD) {
+    return static_cast<std::size_t>(0);
+  } else {
+    if (runtime_ns == ns) {
+      return GetPaddedSIMDWidth<T, ns>();
+    } else {
+      return get_simd_width_helper<T, ns - 1>(runtime_ns);
+    }
+  }
+}
+template<class T> constexpr std::size_t GetPaddedSIMDWidth(int runtime_ns) {
+  return get_simd_width_helper<T, 2 * ::finufft::common::MAX_NSPREAD>(runtime_ns);
+}
+
+} // namespace finufft::utils
+#endif // __has_include(xsimd)
 
 namespace finufft::utils {
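
The width selection moved into this header comes down to two pieces of integer arithmetic: a ceil-division that counts how many batches of a candidate width cover n elements, and a power-of-two round-up for the padded length. The standalone sketch below illustrates only that arithmetic; it does not use xsimd, and the candidate widths {4, 8, 16} are an assumption standing in for whatever xsimd::make_sized_batch supports on the target architecture.

// Sketch only (not FINUFFT code): pick, from a fixed set of candidate SIMD
// widths, the smallest one that minimizes the number of batches needed to
// cover n elements, then round n up to a multiple of that width.
#include <cstdint>
#include <cstdio>

constexpr std::uint8_t kCandidateWidths[] = {4, 8, 16}; // assumed available widths

constexpr std::uint8_t optimal_width(std::uint8_t n) {
  std::uint8_t best_w     = kCandidateWidths[0];
  std::uint8_t best_iters = (n + best_w - 1) / best_w; // ceil(n / w)
  for (std::uint8_t w : kCandidateWidths) {
    const std::uint8_t iters = (n + w - 1) / w;
    if (iters < best_iters) { // strict '<' keeps the smallest width on ties
      best_iters = iters;
      best_w     = w;
    }
  }
  return best_w;
}

constexpr std::uint8_t padded_length(std::uint8_t n) {
  const std::uint8_t w = optimal_width(n);
  // round up to a multiple of w; the bit trick requires w to be a power of 2
  return (n + w - 1) & static_cast<std::uint8_t>(-w);
}

static_assert(optimal_width(7) == 8, "one 8-wide batch beats two 4-wide ones");
static_assert(optimal_width(4) == 4, "ties resolve to the smallest width");
static_assert(padded_length(7) == 8, "7 elements pad up to one 8-wide batch");

int main() {
  for (int n : {4, 7, 9, 12})
    std::printf("n=%2d  width=%2d  padded=%2d\n", n,
                optimal_width(static_cast<std::uint8_t>(n)),
                padded_length(static_cast<std::uint8_t>(n)));
}

For n = 7 this is the case the header's comment alludes to: two 4-wide iterations lose to a single 8-wide one, so the width is padded up rather than down.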

src/finufft_core.cpp

Lines changed: 42 additions & 27 deletions
@@ -520,31 +520,33 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
   // Solve for piecewise Horner coeffs for the function kernel.h:evaluate_kernel()
   // Marco Barbone, Fall 2025.
   const auto nspread = spopts.nspread;
-  // "max_degree" really is "number of coeffs"
-  const auto max_degree = std::max(nspread + 3, MIN_NC);
+  // "number of coeffs"
+  const auto n_coeffs = std::max(nspread + 3, MIN_NC);
 
   // get the xsimd padding
-  static constexpr auto simd_size = xsimd::batch<TF>::size;
-  const auto padded_ns = (nspread + simd_size - 1) & -simd_size;
+  // (must match that used in spreadinterp.cpp), if we change horner simd_width there
+  // we must also change it here
+  const auto simd_size = GetPaddedSIMDWidth<TF>(2 * nspread);
+  const auto padded_ns = (nspread + simd_size - 1) & -simd_size;
 
   horner_coeffs.fill(TF(0));
 
-  // Precompute kernel parameters once
+  // Get the kernel parameters once
   const TF beta = TF(this->spopts.ES_beta);
   const TF c_param = TF(this->spopts.ES_c);
   const int kernel_type = this->spopts.kernel_type;
 
   nc = MIN_NC;
 
-  // Temporary storage: same transposed layout as horner_coeffs:
-  // [k * padded_ns + j], with k in [0, max_degree), j in [0, padded_ns)
-  std::vector<TF> tmp_coeffs(static_cast<size_t>(max_degree) * padded_ns, TF(0));
-
   static constexpr TF a = TF(-1.0);
   static constexpr TF b = TF(1.0);
 
   // First pass: fit at max_degree, cache coeffs, and determine nc.
-  // horner layout uses "smallest cN first": coeffs[0] is lowest degree term.
+  // Note: `fit_monomials()` returns coefficients in descending-degree order
+  // (highest-degree first): coeffs[0] is the highest-degree term. We store
+  // them so that `horner_coeffs[k * padded_ns + j]` holds the k'th Horner
+  // coefficient (k=0 -> highest-degree). `horner_coeffs` was filled with
+  // zeros above, so panels that need fewer coefficients leave the rest as 0.
   for (int j = 0; j < nspread; ++j) {
     // Map x ∈ [-1, 1] to the physical interval for panel j.
     // original: 0.5 * (x - nspread + 2*j + 1)
@@ -555,37 +557,50 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
       return evaluate_kernel(t, beta, c_param, kernel_type);
     };
 
-    const auto coeffs = fit_monomials(kernel, static_cast<int>(max_degree), a, b);
+    const auto coeffs = fit_monomials(kernel, static_cast<int>(n_coeffs), a, b);
 
-    // Cache coefficients in tmp, preserving order:
-    // coeffs[k] corresponds to horner_coeffs[k] (smallest degree first).
+    // Cache coefficients directly into final table (transposed/padded):
+    // coeffs[k] is highest->lowest, store at row k for panel j.
     for (size_t k = 0; k < coeffs.size(); ++k) {
-      tmp_coeffs[k * padded_ns + j] = coeffs[k];
+      horner_coeffs[k * padded_ns + j] = coeffs[k];
     }
 
-    // Determine highest index with |c_k| >= tol, so we keep a contiguous prefix
-    // [0..current_nc-1].
-    int current_nc = 0;
-    for (int k = static_cast<int>(coeffs.size()) - 1; k >= 0; --k) {
-      if (std::abs(coeffs[static_cast<size_t>(k)]) >= tol) {
-        current_nc = k + 1; // number of coeffs we actually care about for this panel
+    // Determine effective number of coeffs by skipping leading zeros.
+    // coeffs[0] is highest degree.
+    int used = 0;
+    for (size_t k = 0; k < coeffs.size(); ++k) {
+      if (std::abs(coeffs[k]) >= tol * 0.50) { // divide tol by 5 otherwise it fails in
+                                               // some cases
+        used = static_cast<int>(coeffs.size() - k);
         break;
       }
     }
-    if (current_nc > nc) nc = current_nc;
+    if (used > nc) nc = used;
   }
 
-  // Second pass: copy the first nc coeffs into horner_coeffs (no refit, no reordering).
-  for (int k = 0; k < nc; ++k) {
-    const auto row_offset = static_cast<size_t>(k) * padded_ns;
-    for (size_t j = 0; j < padded_ns; ++j) {
-      horner_coeffs[row_offset + j] = tmp_coeffs[row_offset + j];
+  // If the max required degree (nc) is less than max_degree, we must shift
+  // the coefficients "left" (to lower row indices) so that the significant
+  // coefficients end at row nc-1.
+  if (nc < static_cast<int>(n_coeffs)) {
+    const size_t shift = n_coeffs - nc;
+    for (size_t k = 0; k < static_cast<size_t>(nc); ++k) {
+      const size_t src_row = k + shift;
+      const size_t dst_row = k;
+      for (size_t j = 0; j < padded_ns; ++j) {
+        horner_coeffs[dst_row * padded_ns + j] = horner_coeffs[src_row * padded_ns + j];
+      }
+    }
+    // Zero out the now-unused tail rows for cleanliness
+    for (size_t k = nc; k < static_cast<size_t>(n_coeffs); ++k) {
+      for (size_t j = 0; j < padded_ns; ++j) {
+        horner_coeffs[k * padded_ns + j] = TF(0);
+      }
     }
   }
 
   if (opts.debug > 2) {
     // Print transposed layout: all "index 0" coeffs for intervals, then "index 1", ...
-    // Note: k is the coefficient index in Horner order, with smallest degree first.
+    // Note: k is the coefficient index in Horner order, with highest degree first.
     for (size_t k = 0; k < static_cast<size_t>(nc); ++k) {
       printf("[%s] idx=%lu: ", __func__, k);
       for (size_t j = 0; j < padded_ns; ++j) // use padded_ns to show padding as well
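
The reordering above matters because Horner's rule consumes coefficients highest-degree first, and the final shift keeps the significant rows of the table contiguous. The sketch below shows the same two ideas on a plain std::vector with made-up coefficients; it is not the fit_monomials output and not the plan's transposed [k * padded_ns + j] storage.

// Sketch only: Horner evaluation with coefficients stored highest-degree
// first, plus the "drop negligible leading coefficients" step.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// p(x) = ((c[0]*x + c[1])*x + c[2])*x + ...   (c[0] is the highest-degree term)
double horner(const std::vector<double>& c, double x) {
  double acc = 0.0;
  for (double ck : c) acc = acc * x + ck;
  return acc;
}

int main() {
  // made-up fit: the leading coefficient is negligible relative to tol
  const std::vector<double> coeffs = {1e-15, 2.0, -3.0, 0.5}; // ~ 2x^2 - 3x + 0.5
  const double tol = 1e-12;

  // count significant coefficients by skipping tiny leading ones
  std::size_t lead = 0;
  while (lead < coeffs.size() && std::abs(coeffs[lead]) < tol) ++lead;

  // "shift left": keep only the significant suffix, order preserved
  const std::vector<double> trimmed(coeffs.begin() + lead, coeffs.end());

  const double x = 0.3;
  std::printf("all %zu coeffs  : %.15f\n", coeffs.size(), horner(coeffs, x));
  std::printf("kept %zu coeffs : %.15f\n", trimmed.size(), horner(trimmed, x));
}

In the plan itself the same trim is applied to the transposed table: the significant rows are copied down by `shift` and the tail rows re-zeroed, so the evaluator always runs exactly nc Horner steps per panel.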

src/spreadinterp.cpp

Lines changed: 2 additions & 49 deletions
@@ -69,59 +69,16 @@ template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper
   }
 }
 
-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
-  // finds the smallest simd width that can handle N elements
-  // simd size is batch size the SIMD width in xsimd terminology
-  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
-    return min_simd_width<T, N * 2>();
-  } else {
-    return N;
-  }
-};
-
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
-  // finds the smallest simd width that minimizes the number of iterations
-  // NOTE: might be suboptimal for some cases 2^N+1 for example
-  // in the future we might want to implement a more sophisticated algorithm
-  uint8_t optimal_simd_width = min_simd_width<T>();
-  uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width;
-  for (uint8_t simd_width = optimal_simd_width;
-       simd_width <= xsimd::batch<T, xsimd::best_arch>::size; simd_width *= 2) {
-    uint8_t iterations = (N + simd_width - 1) / simd_width;
-    if (iterations < min_iterations) {
-      min_iterations = iterations;
-      optimal_simd_width = simd_width;
-    }
-  }
-  return optimal_simd_width;
-}
-
-template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
-  // helper function to get the SIMD width with padding for the given number of elements
-  // that minimizes the number of iterations
-  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
-}
-
 template<class T, uint8_t N>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, N>()>::type;
 
 template<class T, uint8_t ns> constexpr auto get_padding() {
-  // helper function to get the padding for the given number of elements
-  // ns is known at compile time, rounds ns to the next multiple of the SIMD width
-  // then subtracts ns to get the padding using a bitwise and trick
-  // WARING: this trick works only for power of 2s
-  // SOURCE: Agner Fog's VCL manual
   constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
   return ((ns + width - 1) & (-width)) - ns;
 }
 
 template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
-  // helper function to get the padding for the given number of elements where ns is
-  // known at runtime, it uses recursion to find the padding
-  // this allows to avoid having a function with a large number of switch cases
-  // as GetPaddedSIMDWidth requires a compile time value
-  // it cannot be a lambda function because of the template recursion
-  if constexpr (ns < 2) {
+  if constexpr (ns < finufft::common::MIN_NSPREAD) {
     return 0;
   } else {
     if (runtime_ns == ns) {
@@ -133,12 +90,8 @@ template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_
 }
 
 template<class T> uint8_t get_padding(uint8_t ns) {
-  // return the padding as a function of the number of elements
-  // 2 * MAX_NSPREAD is the maximum number of elements that we can have
-  // that's why is hardcoded here
-  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
+  return get_padding_helper<T, 2 * ::finufft::common::MAX_NSPREAD>(ns);
 }
-
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
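
get_padding_helper exists because get_padding<T, ns>() needs ns as a template (compile-time) argument while the spreader only learns ns at runtime; the if constexpr recursion walks the template parameter down until it matches the runtime value, bounded below by MIN_NSPREAD. A minimal sketch of that dispatch pattern follows, with a toy compile-time function; compute<N>(), kMinN, kMaxN and compute_at_runtime are illustrative stand-ins, not FINUFFT symbols.

// Sketch only: runtime -> compile-time dispatch via template recursion.
#include <cstdint>
#include <cstdio>

constexpr std::uint8_t kMinN = 2;  // stand-in for MIN_NSPREAD
constexpr std::uint8_t kMaxN = 16; // stand-in for 2 * MAX_NSPREAD

// toy compile-time-only computation; the real code returns a padding amount
template <std::uint8_t N> constexpr int compute() { return N * N; }

template <std::uint8_t N> constexpr int dispatch(std::uint8_t runtime_n) {
  if constexpr (N < kMinN) {
    return 0; // nothing matched: runtime_n was out of range
  } else {
    return runtime_n == N ? compute<N>() : dispatch<N - 1>(runtime_n);
  }
}

// the only entry point callers need: a plain runtime function
int compute_at_runtime(std::uint8_t n) { return dispatch<kMaxN>(n); }

int main() {
  for (int n = 2; n <= 6; ++n)
    std::printf("n=%d -> %d\n", n, compute_at_runtime(static_cast<std::uint8_t>(n)));
}

This keeps a single templated implementation instead of a hand-written switch over every allowed ns, which is the trade-off the deleted comment described.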
