Skip to content

Commit 6b06e91

Browse files
committed
Fix kernel for ns=2 sigma=2 in float32
1 parent 53d2599 commit 6b06e91

File tree

14 files changed

+136
-120
lines changed

14 files changed

+136
-120
lines changed

cmake/utils.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ function(enable_asan target)
103103
endfunction()
104104

105105
function(finufft_link_test target)
106-
target_link_libraries(${target} PRIVATE finufft::finufft finufft::common)
106+
target_link_libraries(${target} PRIVATE finufft::finufft finufft_common)
107107
if(FINUFFT_USE_DUCC0)
108108
target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0)
109109
endif()

examples/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ foreach(srcfile ${example_src})
55
get_filename_component(executable ${executable} NAME)
66
add_executable(${executable} ${srcfile})
77
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
8-
target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart)
8+
target_link_libraries(${executable} PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
99
target_compile_features(${executable} PRIVATE cxx_std_17)
1010
endforeach()

include/cufinufft/utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class WithCudaDevice {
8989
};
9090

9191
// math helpers whose source is in src/utils.cpp
92-
long next235beven(long n, long b);
92+
FINUFFT_EXPORT long next235beven(long n, long b);
9393

9494
/**
9595
* does a complex atomic add on a shared memory address

makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ STATICLIB = lib-static/$(LIBNAME).a
152152
ABSDYNLIB = $(FINUFFT)$(DYNLIB)
153153

154154
# spreader objs
155-
SOBJS = src/finufft_utils.o src/utils.o src/spreadinterp.o
155+
SOBJS = src/finufft_utils.o src/spreadinterp.o src/common/utils.o src/common/kernel.o
156156

157157
# all lib dual-precision objs (note DUCC_OBJS empty if unused)
158158
OBJS = $(SOBJS) src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS)
@@ -283,10 +283,10 @@ test/%: test/%.cpp $(DYNLIB)
283283
test/%f: test/%.cpp $(DYNLIB)
284284
$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@
285285
# low-level tests that are cleaner if depend on only specific objects...
286-
test/testutils: test/testutils.cpp src/finufft_utils.o src/utils.o
287-
$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutils
288-
test/testutilsf: test/testutils.cpp src/finufft_utils.o src/utils.o
289-
$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutilsf
286+
test/testutils: test/testutils.cpp src/finufft_utils.o src/common/utils.o
287+
$(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutils
288+
test/testutilsf: test/testutils.cpp src/finufft_utils.o src/common/utils.o
289+
$(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutilsf
290290

291291
# make sure all double-prec test executables ready for testing
292292
TESTS := $(basename $(wildcard test/*.cpp))

perftest/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
add_executable(cuperftest cuperftest.cu)
22
target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
3-
target_link_libraries(cuperftest cufinufft CUDA::cufft CUDA::cudart)
3+
target_link_libraries(cuperftest PRIVATE cufinufft CUDA::cufft CUDA::cudart $<BUILD_INTERFACE:finufft_common>)
44
target_compile_features(cuperftest PRIVATE cxx_std_17)
55
set_target_properties(
66
cuperftest

src/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ if(FINUFFT_USE_DUCC0)
4747
target_compile_definitions(finufft PRIVATE FINUFFT_USE_DUCC0)
4848
endif()
4949

50-
target_link_libraries(finufft PRIVATE $<BUILD_INTERFACE:finufft_fftlibs xsimd>)
51-
target_link_libraries(finufft PRIVATE finufft_common)
50+
target_link_libraries(finufft PRIVATE $<BUILD_INTERFACE:finufft_fftlibs xsimd finufft_common>)
5251
if(FINUFFT_USE_OPENMP)
5352
target_link_libraries(finufft PRIVATE OpenMP::OpenMP_CXX)
5453
if(NOT FINUFFT_STATIC_LINKING)
@@ -100,5 +99,4 @@ endif()
10099

101100
set(_targets ${INSTALL_TARGETS})
102101
list(APPEND _targets finufft)
103-
list(APPEND _targets finufft_common)
104102
set(INSTALL_TARGETS "${_targets}" PARENT_SCOPE)

src/common/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ cmake_minimum_required(VERSION 3.24)
55
set(FINUFFT_COMMON_SOURCES kernel.cpp utils.cpp)
66

77
add_library(finufft_common STATIC ${FINUFFT_COMMON_SOURCES})
8-
add_library(finufft::common ALIAS finufft_common)
98

109
# The public include directory is the top-level include/
1110
target_include_directories(
1211
finufft_common
1312
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>
1413
)
14+
set_target_properties(finufft_common PROPERTIES POSITION_INDEPENDENT_CODE ON)
1515

1616
# Visibility and compile options consistent with finufft
1717
if(FINUFFT_SHARED_LINKING)

src/common/utils.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,17 @@ namespace finufft {
2323
namespace common {
2424

2525
void gaussquad(int n, double *xgl, double *wgl) {
26+
// n-node Gauss-Legendre quadrature, adapted from a code by Jason Kaye (2022-2023),
27+
// from the utils file of https://github.com/flatironinstitute/cppdlr version 1.2,
28+
// which is Apache-2 licensed. It uses Newton iteration from Chebyshev points.
29+
// Double-precision only.
30+
// Adapted by Barnett 6/8/25 to write nodes (xgl) and weights (wgl) into arrays
31+
// that the user must pre-allocate to length at least n.
32+
2633
double x = 0, dx = 0;
2734
int convcount = 0;
2835

36+
// Get Gauss-Legendre nodes
2937
xgl[n / 2] = 0; // If odd number of nodes, middle node is 0
3038
for (int i = 0; i < n / 2; i++) { // Loop through nodes
3139
convcount = 0;
@@ -39,27 +47,35 @@ void gaussquad(int n, double *xgl, double *wgl) {
3947
}
4048
if (convcount == 3) {
4149
break;
42-
}
50+
} // If convergence tol hit 3 times, stop
4351
}
4452
xgl[i] = -x;
4553
xgl[n - i - 1] = x; // Symmetric nodes
4654
}
4755

56+
// Get Gauss-Legendre weights from formula
57+
// w_i = -2 / ((n+1)*P_n'(x_i)*P_{n+1}(x_i)) (Atkinson '89, pg. 276)
4858
for (int i = 0; i < n / 2 + 1; i++) {
4959
auto [junk1, dp] = leg_eval(n, xgl[i]);
50-
auto [p, junk2] = leg_eval(n + 1, xgl[i]);
51-
wgl[i] = -2 / ((n + 1) * dp * p);
52-
wgl[n - i - 1] = wgl[i];
60+
auto [p, junk2] = leg_eval(n + 1, xgl[i]); // This is a bit inefficient, but who
61+
// cares...
62+
wgl[i] = -2 / ((n + 1) * dp * p);
63+
wgl[n - i - 1] = wgl[i];
5364
}
5465
}
5566

5667
std::tuple<double, double> leg_eval(int n, double x) {
68+
// return Legendre polynomial P_n(x) and its derivative P'_n(x).
69+
// Uses Legendre three-term recurrence.
70+
// Used by gaussquad above, with which it shares the same authorship and source.
71+
5772
if (n == 0) {
5873
return {1.0, 0.0};
5974
}
6075
if (n == 1) {
6176
return {x, 1.0};
6277
}
78+
// Three-term recurrence and formula for derivative
6379
double p0 = 0.0, p1 = 1.0, p2 = x;
6480
for (int i = 1; i < n; i++) {
6581
p0 = p1;
@@ -103,7 +119,13 @@ double cyl_bessel_i(double nu, double x) noexcept {
103119
namespace cufinufft {
104120
namespace utils {
105121

106-
long next235beven(long n, long b) {
122+
long next235beven(long n, long b)
123+
// finds even integer not less than n, with prime factors no larger than 5
124+
// (ie, "smooth") and is a multiple of b (b is a number that the only prime
125+
// factors are 2,3,5). Adapted from fortran in hellskitchen. Barnett 2/9/17
126+
// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
127+
// added condition about b, Melody Shih 05/31/20
128+
{
107129
if (n <= 2) return 2;
108130
if (n % 2 == 1) n += 1; // even
109131
long nplus = n - 2; // to cancel out the +=2 at start of loop

src/cuda/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@ if(FINUFFT_SHARED_LINKING)
8181
endif()
8282
endif()
8383

84-
target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft)
85-
target_link_libraries(cufinufft PRIVATE finufft_common)
84+
target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft $<BUILD_INTERFACE:finufft_common>)
8685
# Expose only when not doing fully static linking
8786
if(NOT FINUFFT_STATIC_LINKING)
8887
target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft)

src/finufft_core.cpp

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -569,8 +569,8 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
569569
// coeffs[0] is highest degree.
570570
int used = 0;
571571
for (size_t k = 0; k < coeffs.size(); ++k) {
572-
if (std::abs(coeffs[k]) >= tol * 0.50) { // divide tol by 5 otherwise it fails in
573-
// some cases
572+
if (std::abs(coeffs[k]) >= tol * 0.5) { // divide tol by 2 otherwise it fails in
573+
// some cases
574574
used = static_cast<int>(coeffs.size() - k);
575575
break;
576576
}
@@ -633,7 +633,7 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::initSpreadAndFFT() {
633633
printf(" spread_thread=%d\n", opts.spread_thread);
634634
}
635635

636-
} else { // ..... usual NUFFT: eval Fourier series, alloc workspace .....
636+
} else { // ..... usual NUFFT: eval Fourier series, alloc workspace .....
637637

638638
if (opts.showwarn) { // user warn round-off error (due to prob condition #)...
639639
for (int idim = 0; idim < dim; ++idim)
@@ -647,8 +647,8 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::initSpreadAndFFT() {
647647
// determine fine grid sizes, sanity check, then alloc...
648648
for (int idim = 0; idim < dim; ++idim) {
649649
int nfier = set_nf_type12(mstu[idim], opts, spopts, &nfdim[idim]);
650-
if (nfier) return nfier; // nf too big; we're done
651-
phiHat[idim].resize(nfdim[idim] / 2 + 1); // alloc fseries
650+
if (nfier) return nfier; // nf too big; we're done
651+
phiHat[idim].resize(nfdim[idim] / 2 + 1); // alloc fseries
652652
}
653653

654654
if (opts.debug) { // "long long" here is to avoid warnings with printf...
@@ -681,7 +681,7 @@ template<typename TF> int FINUFFT_PLAN_T<TF>::initSpreadAndFFT() {
681681
}
682682

683683
timer.restart(); // plan the FFTW (to act in-place on the workspace fwBatch)
684-
int nthr_fft = opts.nthreads;
684+
int nthr_fft = opts.nthreads;
685685
const auto ns = gridsize_for_fft(*this);
686686
std::vector<TC, xsimd::aligned_allocator<TC, 64>> fwBatch(nf() * batchSize);
687687
fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft);
@@ -853,8 +853,8 @@ FINUFFT_PLAN_T<TF>::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i
853853
// spreader/Horner internals now using the provided upsampfac.
854854
if (opts.upsampfac != 0.0) {
855855
upsamp_locked = true; // user explicitly set upsampfac, don't auto-update
856-
ier = setup_spreader_for_nufft(spopts, tol, opts, dim);
857-
if (ier > 1) // proceed if success or warning
856+
ier = setup_spreader_for_nufft(spopts, tol, opts, dim);
857+
if (ier > 1) // proceed if success or warning
858858
throw int(ier);
859859
precompute_horner_coeffs();
860860

@@ -927,12 +927,12 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, const TF *xj, const TF *yj, const TF *
927927
return FINUFFT_ERR_NUM_NU_PTS_INVALID;
928928
}
929929

930-
if (type != 3) { // ------------------ TYPE 1,2 SETPTS -------------------
931-
// (all we can do is check and maybe bin-sort the NU pts)
930+
if (type != 3) { // ------------------ TYPE 1,2 SETPTS -------------------
931+
// (all we can do is check and maybe bin-sort the NU pts)
932932
// If upsampfac is not locked by user (auto mode), choose or update it now
933933
// based on the actual density nj/N(). Re-plan if density changed significantly.
934934
if (!upsamp_locked) {
935-
double density = double(nj) / double(N());
935+
double density = double(nj) / double(N());
936936
double upsampfac = bestUpsamplingFactor<TF>(opts.nthreads, density, dim, type, tol);
937937
// Re-plan if this is the first call (upsampfac==0) or if upsampfac changed
938938
if (upsampfac != opts.upsampfac) {
@@ -1095,9 +1095,8 @@ int FINUFFT_PLAN_T<TF>::setpts(BIGINT nj, const TF *xj, const TF *yj, const TF *
10951095
t2opts.debug = std::max(0, opts.debug - 1); // don't print as much detail
10961096
t2opts.spread_debug = std::max(0, opts.spread_debug - 1);
10971097
t2opts.showwarn = 0; // so don't see warnings 2x
1098-
if (!upsamp_locked)
1099-
t2opts.upsampfac = 0.0; // if the upsampfac was auto, let inner
1100-
// t2 pick it again (from density=nj/Nf)
1098+
if (!upsamp_locked) t2opts.upsampfac = 0.0; // if the upsampfac was auto, let inner
1099+
// t2 pick it again (from density=nj/Nf)
11011100
// (...could vary other t2opts here?)
11021101
// MR: temporary hack, until we have figured out the C++ interface.
11031102
FINUFFT_PLAN_T<TF> *tmpplan;

0 commit comments

Comments
 (0)