diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fdd3d23d..1429113bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,8 @@ if(FINUFFT_USE_CPU) add_subdirectory(src) endif() +add_subdirectory(src/common) + if(FINUFFT_USE_CUDA) include(cuda_setup) add_subdirectory(src/cuda) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 3eb4806a6..267d33e84 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -103,7 +103,7 @@ function(enable_asan target) endfunction() function(finufft_link_test target) - target_link_libraries(${target} PRIVATE finufft::finufft) + target_link_libraries(${target} PRIVATE finufft::finufft finufft_common) if(FINUFFT_USE_DUCC0) target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) endif() diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index ad70a2491..de214a1aa 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -5,6 +5,6 @@ foreach(srcfile ${example_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart) + target_link_libraries(${executable} PRIVATE cufinufft CUDA::cufft CUDA::cudart $) target_compile_features(${executable} PRIVATE cxx_std_17) endforeach() diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 1d1c4a6d3..93646e07c 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -89,7 +89,7 @@ class WithCudaDevice { }; // math helpers whose source is in src/utils.cpp -long next235beven(long n, long b); +FINUFFT_EXPORT long next235beven(long n, long b); /** * does a complex atomic add on a shared memory address diff --git a/include/finufft.fh b/include/finufft.fh index c19874bbd..7658ea2bc 100644 --- a/include/finufft.fh +++ b/include/finufft.fh @@ -20,6 +20,7 @@ c DEPRECATED: spread_kerpad kept for ABI compatibility, ignored by library real*8 upsampfac integer spread_thread, maxbatchsize, spread_nthr_atomic integer spread_max_sp_size + integer spread_function integer fftw_lock_fun, fftw_unlock_fun, fftw_lock_data end type diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 37e7ada5f..1b5f80b5e 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -21,9 +21,9 @@ namespace finufft { namespace spreadinterp { template -FINUFFT_EXPORT_TEST int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, - int kerevalmeth, int debug, int showwarn, - int spreadinterponly, int dim); +FINUFFT_EXPORT_TEST int setup_spreader( + finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, int debug, + int showwarn, int spreadinterponly, int dim, int kernel_type = 0); int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, const finufft_spread_opts &opts); template diff --git a/include/finufft_common/kernel.h b/include/finufft_common/kernel.h index 67102f48a..f94652219 100644 --- a/include/finufft_common/kernel.h +++ b/include/finufft_common/kernel.h @@ -6,6 +6,8 @@ #include #include +#include +#include namespace finufft::kernel { @@ -56,15 +58,34 @@ template std::vector fit_monomials(F &&f, int n, T a, T b) return c; } -template T evaluate_kernel(T x, T beta, T c) { - /* ES ("exp sqrt") kernel evaluation at single real argument: - phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)), for |x| < nspread/2 - related to an asymptotic approximation to the Kaiser--Bessel, itself an - approximation to prolate spheroidal wavefunction (PSWF) of order 0. - This is the "reference implementation", used by eg finufft/onedim_* 2/17/17. - Rescaled so max is 1, Barnett 7/21/24 +template T evaluate_kernel(T x, T beta, T c, int kernel_type = 0) { + /* Kernel evaluation at single real argument. + kernel_type == 0 : ES ("exp sqrt") kernel (default) + phi_ES(x) = exp(beta*(sqrt(1 - c*x^2) - 1)) + kernel_type == 1 : Kaiser--Bessel (KB) kernel + phi_KB(x) = I_0(beta*sqrt(1 - c*x^2)) / I_0(beta) + Note: `std::cyl_bessel_i` from is used for I_0. + Rescaled so max is 1. */ + if (kernel_type == 1) { + // Kaiser--Bessel (normalized by I0(beta)). Use std::cyl_bessel_i from . + const T inner = std::sqrt(T(1) - c * x * x); + const T arg = beta * inner; + const double i0_arg = ::finufft::common::cyl_bessel_i(0, static_cast(arg)); + const double i0_beta = ::finufft::common::cyl_bessel_i(0, static_cast(beta)); + return static_cast(i0_arg / i0_beta); + } + + // default to ES return std::exp(beta * (std::sqrt(T(1) - c * x * x) - T(1))); } +FINUFFT_EXPORT int compute_kernel_ns(double upsampfac, double tol, int kernel_type, + const finufft_spread_opts &opts); + +FINUFFT_EXPORT void initialize_kernel_params(finufft_spread_opts &opts, double upsampfac, + double tol, int kernel_type); + +FINUFFT_EXPORT double sigma_max_tol(double upsampfac, int kernel_type, int max_ns); + } // namespace finufft::kernel diff --git a/include/finufft_common/utils.h b/include/finufft_common/utils.h index e8f8682db..008c1e1da 100644 --- a/include/finufft_common/utils.h +++ b/include/finufft_common/utils.h @@ -13,6 +13,9 @@ namespace common { FINUFFT_EXPORT_TEST void gaussquad(int n, double *xgl, double *wgl); std::tuple leg_eval(int n, double x); +// Series implementation of the modified Bessel function of the first kind I_nu(x) +double cyl_bessel_i(double nu, double x) noexcept; + // helper to generate the integer sequence in range [Start, End] template struct offset_seq; diff --git a/include/finufft_mod.f90 b/include/finufft_mod.f90 index 59c5e567c..5be5fae96 100644 --- a/include/finufft_mod.f90 +++ b/include/finufft_mod.f90 @@ -21,6 +21,7 @@ module finufft_mod real(kind=C_DOUBLE) :: upsampfac integer(kind=C_INT) :: spread_thread, maxbatchsize integer(kind=C_INT) :: spread_nthr_atomic, spread_max_sp_size + integer(kind=C_INT) :: spread_function integer(kind=C_SIZE_T) :: fftw_lock_fun, fftw_unlock_fun, fftw_lock_data ! really, last should be type(C_PTR) :: etc, but fails to print nicely diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 24cdcd79b..aab78839d 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -32,6 +32,8 @@ typedef struct finufft_opts { // defaults see finufft_core.cpp:finufft_default_o int spread_nthr_atomic; // if >=0, threads above which spreader OMP critical goes // atomic int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size + int spread_function; // (dev only) 0:DEFAULT, (do not change), there is no guarantee + // what non-zero values do and behaviour can change anytime // sphinx tag (don't remove): @opts_end // User can provide their own FFTW planner lock functions for thread safety diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h index f9f128a4d..8785c3782 100644 --- a/include/finufft_spread_opts.h +++ b/include/finufft_spread_opts.h @@ -26,6 +26,8 @@ typedef struct finufft_spread_opts { double ES_beta; double ES_halfwidth; double ES_c; + // Kernel selector: 0 = ES (default), 1 = Kaiser--Bessel (KB) + int kernel_type; // default 0 } finufft_spread_opts; #endif // FINUFFT_SPREAD_OPTS_H diff --git a/makefile b/makefile index b4eb6b779..88bd2f282 100644 --- a/makefile +++ b/makefile @@ -152,7 +152,7 @@ STATICLIB = lib-static/$(LIBNAME).a ABSDYNLIB = $(FINUFFT)$(DYNLIB) # spreader objs -SOBJS = src/finufft_utils.o src/utils.o src/spreadinterp.o +SOBJS = src/finufft_utils.o src/spreadinterp.o src/common/utils.o src/common/kernel.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) OBJS = $(SOBJS) src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS) @@ -283,10 +283,10 @@ test/%: test/%.cpp $(DYNLIB) test/%f: test/%.cpp $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ # low-level tests that are cleaner if depend on only specific objects... -test/testutils: test/testutils.cpp src/finufft_utils.o src/utils.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutils -test/testutilsf: test/testutils.cpp src/finufft_utils.o src/utils.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/utils.o $(LIBS) -o test/testutilsf +test/testutils: test/testutils.cpp src/finufft_utils.o src/common/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutils +test/testutilsf: test/testutils.cpp src/finufft_utils.o src/common/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/finufft_utils.o src/common/utils.o $(LIBS) -o test/testutilsf # make sure all double-prec test executables ready for testing TESTS := $(basename $(wildcard test/*.cpp)) diff --git a/matlab/finufft.cpp b/matlab/finufft.cpp index ba5381f57..46583b9c4 100644 --- a/matlab/finufft.cpp +++ b/matlab/finufft.cpp @@ -1162,6 +1162,9 @@ typedef std::complex fcomplex; else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) { oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); } + else if (strcmp(fname[ifield],"spread_function") == 0) { + oc->spread_function = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); + } else if (strcmp(fname[ifield],"spreadinterponly") == 0) { oc->spreadinterponly = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); } @@ -1271,7 +1274,7 @@ mxWrapCopyZDef_single (mxWrapCopy_single_dcomplex, dcomplex, mxWrapReturnZDef_single (mxWrapReturn_single_dcomplex, dcomplex, real_dcomplex, imag_dcomplex) -/* ---- finufft.mw: 170 ---- +/* ---- finufft.mw: 173 ---- * finufft_mex_setup(); */ static const char* stubids1_ = "finufft_mex_setup()"; @@ -1289,7 +1292,7 @@ void mexStub1(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 171 ---- +/* ---- finufft.mw: 174 ---- * finufft_opts* o = new(); */ static const char* stubids2_ = "c o finufft_opts* = new()"; @@ -1310,7 +1313,7 @@ void mexStub2(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 173 ---- +/* ---- finufft.mw: 176 ---- * finufft_plan* p = new(); */ static const char* stubids3_ = "c o finufft_plan* = new()"; @@ -1331,7 +1334,7 @@ void mexStub3(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 174 ---- +/* ---- finufft.mw: 177 ---- * finufft_default_opts(finufft_opts* o); */ static const char* stubids4_ = "finufft_default_opts(c i finufft_opts*)"; @@ -1355,7 +1358,7 @@ void mexStub4(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 176 ---- +/* ---- finufft.mw: 179 ---- * finufftf_plan* p = new(); */ static const char* stubids5_ = "c o finufftf_plan* = new()"; @@ -1376,7 +1379,7 @@ void mexStub5(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 177 ---- +/* ---- finufft.mw: 180 ---- * finufftf_default_opts(finufft_opts* o); */ static const char* stubids6_ = "finufftf_default_opts(c i finufft_opts*)"; @@ -1400,7 +1403,7 @@ void mexStub6(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 188 ---- +/* ---- finufft.mw: 191 ---- * copy_finufft_opts(mxArray opts, finufft_opts* o); */ static const char* stubids7_ = "copy_finufft_opts(c i mxArray, c i finufft_opts*)"; @@ -1427,7 +1430,7 @@ void mexStub7(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 191 ---- +/* ---- finufft.mw: 194 ---- * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, double tol, finufft_plan* plan, finufft_opts* o); */ static const char* stubids8_ = "c o int = finufft_makeplan(c i int, c i int, c i int64_t[x], c i int, c i int, c i double, c i finufft_plan*, c i finufft_opts*)"; @@ -1520,7 +1523,7 @@ void mexStub8(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 194 ---- +/* ---- finufft.mw: 197 ---- * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, float tol, finufftf_plan* plan, finufft_opts* o); */ static const char* stubids9_ = "c o int = finufftf_makeplan(c i int, c i int, c i int64_t[x], c i int, c i int, c i float, c i finufftf_plan*, c i finufft_opts*)"; @@ -1613,7 +1616,7 @@ void mexStub9(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 196 ---- +/* ---- finufft.mw: 199 ---- * delete(finufft_opts* o); */ static const char* stubids10_ = "delete(c i finufft_opts*)"; @@ -1637,7 +1640,7 @@ void mexStub10(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 226 ---- +/* ---- finufft.mw: 229 ---- * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj, double[] zj, int64_t nk, double[] s, double[] t, double[] u); */ static const char* stubids11_ = "c o int = finufft_setpts(c i finufft_plan, c i int64_t, c i double[], c i double[], c i double[], c i int64_t, c i double[], c i double[], c i double[])"; @@ -1767,7 +1770,7 @@ void mexStub11(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 228 ---- +/* ---- finufft.mw: 231 ---- * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, float[] zj, int64_t nk, float[] s, float[] t, float[] u); */ static const char* stubids12_ = "c o int = finufftf_setpts(c i finufftf_plan, c i int64_t, c i float[], c i float[], c i float[], c i int64_t, c i float[], c i float[], c i float[])"; @@ -1897,7 +1900,7 @@ void mexStub12(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 260 ---- +/* ---- finufft.mw: 263 ---- * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result); */ static const char* stubids13_ = "c o int = finufft_execute(c i finufft_plan, c i dcomplex[], c o dcomplex[x])"; @@ -1953,7 +1956,7 @@ void mexStub13(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 262 ---- +/* ---- finufft.mw: 265 ---- * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result); */ static const char* stubids14_ = "c o int = finufftf_execute(c i finufftf_plan, c i fcomplex[], c o fcomplex[x])"; @@ -2009,9 +2012,9 @@ void mexStub14(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 268 ---- +/* ---- finufft.mw: 271 ---- * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in); - * Also at finufft.mw: 325 + * Also at finufft.mw: 328 */ static const char* stubids15_ = "c o int = finufft_execute(c i finufft_plan, c o dcomplex[xx], c i dcomplex[])"; @@ -2068,9 +2071,9 @@ void mexStub15(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 270 ---- +/* ---- finufft.mw: 273 ---- * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in); - * Also at finufft.mw: 327 + * Also at finufft.mw: 330 */ static const char* stubids16_ = "c o int = finufftf_execute(c i finufftf_plan, c o fcomplex[xx], c i fcomplex[])"; @@ -2127,7 +2130,7 @@ void mexStub16(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 274 ---- +/* ---- finufft.mw: 277 ---- * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, n_trans] result); */ static const char* stubids17_ = "c o int = finufft_execute(c i finufft_plan, c i dcomplex[], c o dcomplex[xx])"; @@ -2185,7 +2188,7 @@ void mexStub17(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 276 ---- +/* ---- finufft.mw: 279 ---- * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, n_trans] result); */ static const char* stubids18_ = "c o int = finufftf_execute(c i finufftf_plan, c i fcomplex[], c o fcomplex[xx])"; @@ -2243,7 +2246,7 @@ void mexStub18(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 311 ---- +/* ---- finufft.mw: 314 ---- * int ier = finufft_execute_adjoint(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in); */ static const char* stubids19_ = "c o int = finufft_execute_adjoint(c i finufft_plan, c o dcomplex[xx], c i dcomplex[])"; @@ -2301,7 +2304,7 @@ void mexStub19(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 313 ---- +/* ---- finufft.mw: 316 ---- * int ier = finufftf_execute_adjoint(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in); */ static const char* stubids20_ = "c o int = finufftf_execute_adjoint(c i finufftf_plan, c o fcomplex[xx], c i fcomplex[])"; @@ -2359,7 +2362,7 @@ void mexStub20(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 317 ---- +/* ---- finufft.mw: 320 ---- * int ier = finufft_execute_adjoint(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result); */ static const char* stubids21_ = "c o int = finufft_execute_adjoint(c i finufft_plan, c i dcomplex[], c o dcomplex[x])"; @@ -2415,7 +2418,7 @@ void mexStub21(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 319 ---- +/* ---- finufft.mw: 322 ---- * int ier = finufftf_execute_adjoint(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result); */ static const char* stubids22_ = "c o int = finufftf_execute_adjoint(c i finufftf_plan, c i fcomplex[], c o fcomplex[x])"; @@ -2471,7 +2474,7 @@ void mexStub22(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 339 ---- +/* ---- finufft.mw: 342 ---- * finufft_destroy(finufft_plan plan); */ static const char* stubids25_ = "finufft_destroy(c i finufft_plan)"; @@ -2499,7 +2502,7 @@ void mexStub25(int nlhs, mxArray* plhs[], mexErrMsgTxt(mw_err_txt_); } -/* ---- finufft.mw: 341 ---- +/* ---- finufft.mw: 344 ---- * finufftf_destroy(finufftf_plan plan); */ static const char* stubids26_ = "finufftf_destroy(c i finufftf_plan)"; @@ -2605,30 +2608,30 @@ void mexFunction(int nlhs, mxArray* plhs[], } else if (strcmp(id, "*profile report*") == 0) { if (!mexprofrecord_) mexPrintf("Profiler inactive\n"); - mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[1]); - mexPrintf("%d calls to finufft.mw:171\n", mexprofrecord_[2]); - mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[3]); - mexPrintf("%d calls to finufft.mw:174\n", mexprofrecord_[4]); - mexPrintf("%d calls to finufft.mw:176\n", mexprofrecord_[5]); - mexPrintf("%d calls to finufft.mw:177\n", mexprofrecord_[6]); - mexPrintf("%d calls to finufft.mw:188\n", mexprofrecord_[7]); - mexPrintf("%d calls to finufft.mw:191\n", mexprofrecord_[8]); - mexPrintf("%d calls to finufft.mw:194\n", mexprofrecord_[9]); - mexPrintf("%d calls to finufft.mw:196\n", mexprofrecord_[10]); - mexPrintf("%d calls to finufft.mw:226\n", mexprofrecord_[11]); - mexPrintf("%d calls to finufft.mw:228\n", mexprofrecord_[12]); - mexPrintf("%d calls to finufft.mw:260\n", mexprofrecord_[13]); - mexPrintf("%d calls to finufft.mw:262\n", mexprofrecord_[14]); - mexPrintf("%d calls to finufft.mw:268 (finufft.mw:325)\n", mexprofrecord_[15]); - mexPrintf("%d calls to finufft.mw:270 (finufft.mw:327)\n", mexprofrecord_[16]); - mexPrintf("%d calls to finufft.mw:274\n", mexprofrecord_[17]); - mexPrintf("%d calls to finufft.mw:276\n", mexprofrecord_[18]); - mexPrintf("%d calls to finufft.mw:311\n", mexprofrecord_[19]); - mexPrintf("%d calls to finufft.mw:313\n", mexprofrecord_[20]); - mexPrintf("%d calls to finufft.mw:317\n", mexprofrecord_[21]); - mexPrintf("%d calls to finufft.mw:319\n", mexprofrecord_[22]); - mexPrintf("%d calls to finufft.mw:339\n", mexprofrecord_[25]); - mexPrintf("%d calls to finufft.mw:341\n", mexprofrecord_[26]); + mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[1]); + mexPrintf("%d calls to finufft.mw:174\n", mexprofrecord_[2]); + mexPrintf("%d calls to finufft.mw:176\n", mexprofrecord_[3]); + mexPrintf("%d calls to finufft.mw:177\n", mexprofrecord_[4]); + mexPrintf("%d calls to finufft.mw:179\n", mexprofrecord_[5]); + mexPrintf("%d calls to finufft.mw:180\n", mexprofrecord_[6]); + mexPrintf("%d calls to finufft.mw:191\n", mexprofrecord_[7]); + mexPrintf("%d calls to finufft.mw:194\n", mexprofrecord_[8]); + mexPrintf("%d calls to finufft.mw:197\n", mexprofrecord_[9]); + mexPrintf("%d calls to finufft.mw:199\n", mexprofrecord_[10]); + mexPrintf("%d calls to finufft.mw:229\n", mexprofrecord_[11]); + mexPrintf("%d calls to finufft.mw:231\n", mexprofrecord_[12]); + mexPrintf("%d calls to finufft.mw:263\n", mexprofrecord_[13]); + mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[14]); + mexPrintf("%d calls to finufft.mw:271 (finufft.mw:328)\n", mexprofrecord_[15]); + mexPrintf("%d calls to finufft.mw:273 (finufft.mw:330)\n", mexprofrecord_[16]); + mexPrintf("%d calls to finufft.mw:277\n", mexprofrecord_[17]); + mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[18]); + mexPrintf("%d calls to finufft.mw:314\n", mexprofrecord_[19]); + mexPrintf("%d calls to finufft.mw:316\n", mexprofrecord_[20]); + mexPrintf("%d calls to finufft.mw:320\n", mexprofrecord_[21]); + mexPrintf("%d calls to finufft.mw:322\n", mexprofrecord_[22]); + mexPrintf("%d calls to finufft.mw:342\n", mexprofrecord_[25]); + mexPrintf("%d calls to finufft.mw:344\n", mexprofrecord_[26]); } else if (strcmp(id, "*profile log*") == 0) { FILE* logfp; if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0) @@ -2638,30 +2641,30 @@ void mexFunction(int nlhs, mxArray* plhs[], mexErrMsgTxt("Cannot open log for output"); if (!mexprofrecord_) fprintf(logfp, "Profiler inactive\n"); - fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[1]); - fprintf(logfp, "%d calls to finufft.mw:171\n", mexprofrecord_[2]); - fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[3]); - fprintf(logfp, "%d calls to finufft.mw:174\n", mexprofrecord_[4]); - fprintf(logfp, "%d calls to finufft.mw:176\n", mexprofrecord_[5]); - fprintf(logfp, "%d calls to finufft.mw:177\n", mexprofrecord_[6]); - fprintf(logfp, "%d calls to finufft.mw:188\n", mexprofrecord_[7]); - fprintf(logfp, "%d calls to finufft.mw:191\n", mexprofrecord_[8]); - fprintf(logfp, "%d calls to finufft.mw:194\n", mexprofrecord_[9]); - fprintf(logfp, "%d calls to finufft.mw:196\n", mexprofrecord_[10]); - fprintf(logfp, "%d calls to finufft.mw:226\n", mexprofrecord_[11]); - fprintf(logfp, "%d calls to finufft.mw:228\n", mexprofrecord_[12]); - fprintf(logfp, "%d calls to finufft.mw:260\n", mexprofrecord_[13]); - fprintf(logfp, "%d calls to finufft.mw:262\n", mexprofrecord_[14]); - fprintf(logfp, "%d calls to finufft.mw:268 (finufft.mw:325)\n", mexprofrecord_[15]); - fprintf(logfp, "%d calls to finufft.mw:270 (finufft.mw:327)\n", mexprofrecord_[16]); - fprintf(logfp, "%d calls to finufft.mw:274\n", mexprofrecord_[17]); - fprintf(logfp, "%d calls to finufft.mw:276\n", mexprofrecord_[18]); - fprintf(logfp, "%d calls to finufft.mw:311\n", mexprofrecord_[19]); - fprintf(logfp, "%d calls to finufft.mw:313\n", mexprofrecord_[20]); - fprintf(logfp, "%d calls to finufft.mw:317\n", mexprofrecord_[21]); - fprintf(logfp, "%d calls to finufft.mw:319\n", mexprofrecord_[22]); - fprintf(logfp, "%d calls to finufft.mw:339\n", mexprofrecord_[25]); - fprintf(logfp, "%d calls to finufft.mw:341\n", mexprofrecord_[26]); + fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[1]); + fprintf(logfp, "%d calls to finufft.mw:174\n", mexprofrecord_[2]); + fprintf(logfp, "%d calls to finufft.mw:176\n", mexprofrecord_[3]); + fprintf(logfp, "%d calls to finufft.mw:177\n", mexprofrecord_[4]); + fprintf(logfp, "%d calls to finufft.mw:179\n", mexprofrecord_[5]); + fprintf(logfp, "%d calls to finufft.mw:180\n", mexprofrecord_[6]); + fprintf(logfp, "%d calls to finufft.mw:191\n", mexprofrecord_[7]); + fprintf(logfp, "%d calls to finufft.mw:194\n", mexprofrecord_[8]); + fprintf(logfp, "%d calls to finufft.mw:197\n", mexprofrecord_[9]); + fprintf(logfp, "%d calls to finufft.mw:199\n", mexprofrecord_[10]); + fprintf(logfp, "%d calls to finufft.mw:229\n", mexprofrecord_[11]); + fprintf(logfp, "%d calls to finufft.mw:231\n", mexprofrecord_[12]); + fprintf(logfp, "%d calls to finufft.mw:263\n", mexprofrecord_[13]); + fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[14]); + fprintf(logfp, "%d calls to finufft.mw:271 (finufft.mw:328)\n", mexprofrecord_[15]); + fprintf(logfp, "%d calls to finufft.mw:273 (finufft.mw:330)\n", mexprofrecord_[16]); + fprintf(logfp, "%d calls to finufft.mw:277\n", mexprofrecord_[17]); + fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[18]); + fprintf(logfp, "%d calls to finufft.mw:314\n", mexprofrecord_[19]); + fprintf(logfp, "%d calls to finufft.mw:316\n", mexprofrecord_[20]); + fprintf(logfp, "%d calls to finufft.mw:320\n", mexprofrecord_[21]); + fprintf(logfp, "%d calls to finufft.mw:322\n", mexprofrecord_[22]); + fprintf(logfp, "%d calls to finufft.mw:342\n", mexprofrecord_[25]); + fprintf(logfp, "%d calls to finufft.mw:344\n", mexprofrecord_[26]); fclose(logfp); } else mexErrMsgTxt("Unknown identifier"); diff --git a/matlab/finufft.mw b/matlab/finufft.mw index 4f82493c6..3979419cc 100644 --- a/matlab/finufft.mw +++ b/matlab/finufft.mw @@ -103,6 +103,9 @@ $ } $ else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) { $ oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } +$ else if (strcmp(fname[ifield],"spread_function") == 0) { +$ oc->spread_function = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); +$ } $ else if (strcmp(fname[ifield],"spreadinterponly") == 0) { $ oc->spreadinterponly = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); $ } diff --git a/matlab/finufft1d1.m b/matlab/finufft1d1.m index 0c133a22b..d67455eaf 100644 --- a/matlab/finufft1d1.m +++ b/matlab/finufft1d1.m @@ -31,6 +31,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) % opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp) % Outputs: diff --git a/matlab/finufft1d2.m b/matlab/finufft1d2.m index dd452e9a2..8f7584439 100644 --- a/matlab/finufft1d2.m +++ b/matlab/finufft1d2.m @@ -30,6 +30,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) % opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp) % Outputs: diff --git a/matlab/finufft1d3.m b/matlab/finufft1d3.m index cff3a6298..fbfda8fae 100644 --- a/matlab/finufft1d3.m +++ b/matlab/finufft1d3.m @@ -30,6 +30,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % Outputs: % f length-nk complex vector of values at targets, or, if ntrans>1, % a matrix of size (nk,ntrans) diff --git a/matlab/finufft2d1.m b/matlab/finufft2d1.m index 1cd940e04..c89d50cc9 100644 --- a/matlab/finufft2d1.m +++ b/matlab/finufft2d1.m @@ -35,6 +35,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) % opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp) % Outputs: diff --git a/matlab/finufft2d2.m b/matlab/finufft2d2.m index b17baba50..a227fe88d 100644 --- a/matlab/finufft2d2.m +++ b/matlab/finufft2d2.m @@ -32,6 +32,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) % opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp) % Outputs: diff --git a/matlab/finufft2d3.m b/matlab/finufft2d3.m index 62aff50df..8c7ec2046 100644 --- a/matlab/finufft2d3.m +++ b/matlab/finufft2d3.m @@ -31,6 +31,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % Outputs: % f length-nk complex vector of values at targets, or, if ntrans>1, % a matrix of size (nk,ntrans) diff --git a/matlab/finufft3d1.m b/matlab/finufft3d1.m index aab2d8693..19e4c145d 100644 --- a/matlab/finufft3d1.m +++ b/matlab/finufft3d1.m @@ -37,6 +37,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) % opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp) % Outputs: diff --git a/matlab/finufft3d2.m b/matlab/finufft3d2.m index 6bf0631b3..f65bee063 100644 --- a/matlab/finufft3d2.m +++ b/matlab/finufft3d2.m @@ -34,6 +34,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) % opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp) % Outputs: diff --git a/matlab/finufft3d3.m b/matlab/finufft3d3.m index edb5cdd5e..9cf7fda79 100644 --- a/matlab/finufft3d3.m +++ b/matlab/finufft3d3.m @@ -32,6 +32,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % Outputs: % f length-nk complex vector of values at targets, or, if ntrans>1, % a matrix of size (nk,ntrans) diff --git a/matlab/finufft_plan.m b/matlab/finufft_plan.m index 3179b9fc9..ed78d57de 100644 --- a/matlab/finufft_plan.m +++ b/matlab/finufft_plan.m @@ -56,6 +56,7 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] % opts.floatprec: library precision to use, 'double' (default) or 'single'. % for type 1 and 2 only, the following opts fields are also relevant: % opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering) diff --git a/matlab/opts.docbit b/matlab/opts.docbit index 1a290afd7..c7fc1be2f 100644 --- a/matlab/opts.docbit +++ b/matlab/opts.docbit @@ -12,3 +12,4 @@ % opts.spread_thread: for ntrans>1 only. 0:auto, 1:seq multi, 2:par, etc % opts.maxbatchsize: for ntrans>1 only. max blocking size, or 0 for auto. % opts.nthreads: number of threads, or 0: use all available (default) +% opts.spread_function: 0 (default), >0 (nonstandard kernels) [experts only] diff --git a/matlab/test/fig_accuracy.m b/matlab/test/fig_accuracy.m index 9b84571b2..543535429 100644 --- a/matlab/test/fig_accuracy.m +++ b/matlab/test/fig_accuracy.m @@ -6,11 +6,15 @@ %M=1e2; N=1e5; % confusion about N vs M controlling err prefac (it's N) isign = +1; % sign of imaginary unit in exponential o.debug = 0; % choose 1 for timing breakdown text output +o.spread_function = 0; -% use one of these two... +% use one of these... %tols = 10.^(-1:-0.02:-15); o.upsampfac = 2.0; tols = 10.^(-1:-0.02:-10); o.upsampfac=1.25; % for lowupsampfac -%tols = 10.^(-1:-0.02:-14); o.upsampfac=1.99; % for lowupsampfac +%tols = 10.^(-1:-0.02:-11); o.upsampfac=1.3; % for lowupsampfac + +%tols = 10.^(-1:-0.02:-12); o.upsampfac=1.5; % intermediate +%tols = 10.^(-1:-0.02:-14); o.upsampfac=1.99; % v close to 2 % other expts... %tols = 1e-6; @@ -30,4 +34,4 @@ figure; loglog(tols,errs,'+'); hold on; plot(tols,tols,'-'); axis tight; xlabel('tol'); ylabel('err'); %title(sprintf('1d1: (maxerr)/||c||_1, M=%d, N=%d\n',M,N)); -title(sprintf('1d1: ||\tilde f - f||_2/||f||_2, M=%d, N=%d\n',M,N)); +title(sprintf('1d1 \\sigma=%g sf=%d M=%d N=%d: rel 2-norm err in f',o.upsampfac,o.spread_function,M,N)); diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 3b0dec66d..a61667eff 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,6 +1,6 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -target_link_libraries(cuperftest cufinufft CUDA::cufft CUDA::cudart) +target_link_libraries(cuperftest PRIVATE cufinufft CUDA::cufft CUDA::cudart $) target_compile_features(cuperftest PRIVATE cxx_std_17) set_target_properties( cuperftest diff --git a/python/finufft/finufft/_finufft.py b/python/finufft/finufft/_finufft.py index 593fc5b16..e28f02f9f 100644 --- a/python/finufft/finufft/_finufft.py +++ b/python/finufft/finufft/_finufft.py @@ -86,6 +86,7 @@ class FinufftOpts(ctypes.Structure): ('maxbatchsize', c_int), ('spread_nthr_atomic', c_int), ('spread_max_sp_size', c_int), + ('spread_function', c_int), ('fftw_lock_fun', c_void_p), ('fftw_unlock_fun', c_void_p), ('fftw_lock_data', c_void_p)] diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e326773c6..d257b0db4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,7 +23,6 @@ set(FINUFFT_SOURCES finufft_core.cpp c_interface.cpp finufft_utils.cpp - utils.cpp ) if(FINUFFT_BUILD_FORTRAN) @@ -48,7 +47,7 @@ if(FINUFFT_USE_DUCC0) target_compile_definitions(finufft PRIVATE FINUFFT_USE_DUCC0) endif() -target_link_libraries(finufft PRIVATE $) +target_link_libraries(finufft PRIVATE $) if(FINUFFT_USE_OPENMP) target_link_libraries(finufft PRIVATE OpenMP::OpenMP_CXX) if(NOT FINUFFT_STATIC_LINKING) diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt new file mode 100644 index 000000000..2658a66ef --- /dev/null +++ b/src/common/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.24) + +# Build a small static helper library for CPU-only kernel parameter helpers +# Source is in ../kernel.cpp as requested +set(FINUFFT_COMMON_SOURCES kernel.cpp utils.cpp) + +add_library(finufft_common STATIC ${FINUFFT_COMMON_SOURCES}) + +# The public include directory is the top-level include/ +target_include_directories( + finufft_common + PUBLIC $ $ +) +set_target_properties(finufft_common PROPERTIES POSITION_INDEPENDENT_CODE ON) + +# Visibility and compile options consistent with finufft +if(FINUFFT_SHARED_LINKING) + target_compile_definitions(finufft_common PRIVATE FINUFFT_DLL) +endif() + +target_compile_features(finufft_common PRIVATE cxx_std_17) + +# If you need -lm etc. +find_library(MATH_LIBRARY m) +if(MATH_LIBRARY) + target_link_libraries(finufft_common PRIVATE ${MATH_LIBRARY}) +endif() + +set_target_properties(finufft_common PROPERTIES CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN YES) diff --git a/src/common/kernel.cpp b/src/common/kernel.cpp new file mode 100644 index 000000000..a7d1cf30c --- /dev/null +++ b/src/common/kernel.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include + +namespace finufft::kernel { + +int compute_kernel_ns(double upsampfac, double tol, int kernel_type, + const finufft_spread_opts &opts) { + // Note: opts is unused here but kept for API consistency if needed later + int ns; + if (upsampfac == 2.0) { + ns = (int)std::ceil(-std::log10(tol / 10.0)); + } else { + ns = (int)std::ceil( + -std::log(tol) / (finufft::common::PI * std::sqrt(1.0 - 1.0 / upsampfac))); + } + ns = std::max(2, ns); + return ns; +} + +void initialize_kernel_params(finufft_spread_opts &opts, double upsampfac, double tol, + int kernel_type) { + int ns; + // Respect any pre-set opts.nspread (e.g., clipped by caller). If it's <=0 compute it. + if (opts.nspread > 0) { + ns = opts.nspread; + } else { + ns = compute_kernel_ns(upsampfac, tol, kernel_type, opts); + } + + opts.kernel_type = kernel_type; + opts.nspread = ns; // ensure opts is populated with the (possibly clipped) ns + opts.ES_halfwidth = (double)ns / 2.0; + opts.ES_c = 4.0 / (double)(ns * ns); + + if (kernel_type == 1) { + // Kaiser-Bessel (KB) + // Formula from Beatty et al. 2005 + double tmp = (double)ns * (double)ns / (upsampfac * upsampfac); + double term2 = (upsampfac - 0.5) * (upsampfac - 0.5); + opts.ES_beta = common::PI * std::sqrt(tmp * term2 - 0.8); + } else { + // Exponential of Semicircle (ES) + double betaoverns = 2.30; + if (ns == 2) + betaoverns = 2.20; + else if (ns == 3) + betaoverns = 2.26; + else if (ns == 4) + betaoverns = 2.38; + + if (upsampfac != 2.0) { + double gamma = 0.97; + betaoverns = gamma * common::PI * (1.0 - 1.0 / (2.0 * upsampfac)); + } + opts.ES_beta = betaoverns * (double)ns; + } + + if (opts.debug) { + const char *kname = (kernel_type == 1) ? "KB" : "ES"; + printf("setup_spreader: using spread kernel type %d (%s)\n", kernel_type, kname); + printf("setup_spreader eps=%.3g sigma=%.3g (%s): chose ns=%d beta=%.3g\n", tol, + upsampfac, kname, ns, opts.ES_beta); + } +} + +double sigma_max_tol(double upsampfac, int kernel_type, int max_ns) { + if (upsampfac == 2.0) { + return 10.0 * std::pow(10.0, -(double)max_ns); + } else { + return std::exp( + -(double)max_ns * finufft::common::PI * std::sqrt(1.0 - 1.0 / upsampfac)); + } +} + +} // namespace finufft::kernel diff --git a/src/utils.cpp b/src/common/utils.cpp similarity index 73% rename from src/utils.cpp rename to src/common/utils.cpp index ce8308fac..75b64ff55 100644 --- a/src/utils.cpp +++ b/src/common/utils.cpp @@ -1,5 +1,19 @@ + #include #include +#include + +// Prefer the standard library's special-math `cyl_bessel_i` when available. +#if defined(__has_include) +#if __has_include() +#include +#endif +#endif +// Feature-test macro for special math functions (if available in the standard +// library implementation). Fall back to our series implementation otherwise. +#if defined(__cpp_lib_math_special_functions) +#define FINUFFT_HAVE_STD_CYL_BESSEL_I 1 +#endif #ifdef __CUDACC__ #include @@ -24,7 +38,7 @@ void gaussquad(int n, double *xgl, double *wgl) { for (int i = 0; i < n / 2; i++) { // Loop through nodes convcount = 0; x = std::cos((2 * i + 1) * PI / (2 * n)); // Initial guess: Chebyshev node - while (true) { // Newton iteration + while (true) { // Newton iteration auto [p, dp] = leg_eval(n, x); dx = -p / dp; x += dx; // Newton step @@ -71,6 +85,34 @@ std::tuple leg_eval(int n, double x) { return {p2, n * (x * p2 - p1) / (x * x - 1)}; } +double cyl_bessel_i(double nu, double x) noexcept { +#if defined(FINUFFT_HAVE_STD_CYL_BESSEL_I) + return std::cyl_bessel_i(nu, x); +#else + if (x == 0.0) { + if (nu == 0.0) return 1.0; + return 0.0; + } + + const double halfx = x / 2.0; + double term = std::pow(halfx, nu) / std::tgamma(nu + 1.0); // k = 0 + double sum = term; + + static constexpr auto eps = std::numeric_limits::epsilon() * 10.0; + static constexpr auto max_iter = 100000; + + for (int k = 1; k < max_iter; ++k) { + term *= (halfx * halfx) / (static_cast(k) * (nu + static_cast(k))); + sum += term; + + if (std::abs(term) < eps * std::abs(sum)) { + break; + } + } + return sum; +#endif +} + } // namespace common } // namespace finufft diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 2f1b2aeaf..92825eeb5 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,6 +1,6 @@ include(staticAnalysis) include(setupIWYU) -set(PRECISION_INDEPENDENT_SRC precision_independent.cu ../utils.cpp) +set(PRECISION_INDEPENDENT_SRC precision_independent.cu) set(PRECISION_DEPENDENT_SRC spreadinterp.cpp @@ -81,7 +81,7 @@ if(FINUFFT_SHARED_LINKING) endif() endif() -target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft) +target_link_libraries(cufinufft PRIVATE CUDA::cudart CUDA::cufft $) # Expose only when not doing fully static linking if(NOT FINUFFT_STATIC_LINKING) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft) diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 437ec39bb..6e3f866ae 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -107,7 +107,8 @@ static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, { // this calls spreadinterp.cpp... int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, - opts.spread_debug, opts.showwarn, opts.spreadinterponly, dim); + opts.spread_debug, opts.showwarn, opts.spreadinterponly, dim, + opts.spread_function); // override various spread opts from their defaults... spopts.debug = opts.spread_debug; spopts.sort = opts.spread_sort; // could make dim or CPU choices here? @@ -531,8 +532,9 @@ template void FINUFFT_PLAN_T::precompute_horner_coeffs() { horner_coeffs.fill(TF(0)); // Get the kernel parameters once - const TF beta = TF(this->spopts.ES_beta); - const TF c_param = TF(this->spopts.ES_c); + const TF beta = TF(this->spopts.ES_beta); + const TF c_param = TF(this->spopts.ES_c); + const int kernel_type = this->spopts.kernel_type; nc = MIN_NC; @@ -550,9 +552,9 @@ template void FINUFFT_PLAN_T::precompute_horner_coeffs() { // original: 0.5 * (x - nspread + 2*j + 1) const TF shift = TF(2 * j + 1 - nspread); - const auto kernel = [shift, beta, c_param](TF x) -> TF { + const auto kernel = [shift, beta, c_param, kernel_type](TF x) -> TF { const TF t = TF(0.5) * (x + shift); - return evaluate_kernel(t, beta, c_param); + return evaluate_kernel(t, beta, c_param, kernel_type); }; const auto coeffs = fit_monomials(kernel, static_cast(n_coeffs), a, b); @@ -567,8 +569,8 @@ template void FINUFFT_PLAN_T::precompute_horner_coeffs() { // coeffs[0] is highest degree. int used = 0; for (size_t k = 0; k < coeffs.size(); ++k) { - if (std::abs(coeffs[k]) >= tol * 0.50) { // divide tol by 5 otherwise it fails in - // some cases + if (std::abs(coeffs[k]) >= tol * 0.5) { // divide tol by 2 otherwise it fails in + // some cases used = static_cast(coeffs.size() - k); break; } @@ -729,6 +731,7 @@ void finufft_default_opts_t(finufft_opts *o) o->maxbatchsize = 0; o->spread_nthr_atomic = -1; o->spread_max_sp_size = 0; + o->spread_function = 0; o->fftw_lock_fun = nullptr; o->fftw_unlock_fun = nullptr; o->fftw_lock_data = nullptr; diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 88827c7a3..68101de8b 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -60,7 +60,7 @@ struct [[maybe_unused]] select_odd { // this finds the largest SIMD instruction set that can handle N elements // void otherwise -> compile error -template static constexpr auto BestSIMDHelper() { +template constexpr auto BestSIMDHelper() { if constexpr (N % K == 0) { // returns void in the worst case return xsimd::make_sized_batch{}; @@ -78,7 +78,7 @@ template constexpr auto get_padding() { } template constexpr auto get_padding_helper(uint8_t runtime_ns) { - if constexpr (ns < finufft::common::MIN_NSPREAD) { + if constexpr (ns < MIN_NSPREAD) { return 0; } else { if (runtime_ns == ns) { @@ -90,7 +90,7 @@ template constexpr auto get_padding_helper(uint8_t runtime_ } template uint8_t get_padding(uint8_t ns) { - return get_padding_helper(ns); + return get_padding_helper(ns); } template using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; @@ -1999,7 +1999,7 @@ template int spreadinterpSorted( template int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, - int debug, int showwarn, int spreadinterponly, int dim) + int debug, int showwarn, int spreadinterponly, int dim, int kernel_type) /* Initializes spreader kernel parameters given desired NUFFT tolerance eps, upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and debug flags. Horner polynomial evaluation is always used; the kerevalmeth argument is @@ -2051,17 +2051,15 @@ int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerev } if (!spreadinterponly && (upsampfac < 1.25 || upsampfac > 2.0)) { if (showwarn) - fprintf( - stderr, - "%s warning: upsampfac=%.3g outside [1.25, 2.0] are unlikely to provide benefit and might break the library;\n", - __func__, upsampfac); + fprintf(stderr, + "%s warning: upsampfac=%.3g outside [1.25, 2.0] are unlikely to provide " + "benefit and might break the library;\n", + __func__, upsampfac); } - if (upsampfac == 2.0) // standard sigma (see SISC paper) - ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10 - else // custom sigma - ns = std::ceil(-log(eps) / (T(PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, - // gam=1 - ns = max(2, ns); // (we don't have ns=1 version yet) + // Compute ns (kernel width) using central helper; caller handles clipping. + ns = compute_kernel_ns(upsampfac, (double)eps, kernel_type, opts); + // ns == 2 breaks for float with upsampfact = 2.00 + if (std::is_same_v && upsampfac == 2.00) ns = std::max(ns, 3); if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules if (showwarn) @@ -2073,34 +2071,19 @@ int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerev ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; - // setup for reference kernel eval (via formula): select beta width param... - // Horner kernels still need reference evals for FTs in onedim_*_kernel - opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) - opts.ES_c = 4.0 / (double)(ns * ns); - double betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns == 2) betaoverns = 2.20; // some small-width tweaks... - if (ns == 3) betaoverns = 2.26; - if (ns == 4) betaoverns = 2.38; - if (upsampfac != 2.0) { - // again, override beta for custom sigma - T gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! - betaoverns = gamma * T(PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based - // on cutoff - } - opts.ES_beta = betaoverns * ns; // set the kernel beta parameter - if (debug) - printf("%s eps=%.3g sigma=%.3g (Horner): chose ns=%d beta=%.3g\n", __func__, - (double)eps, upsampfac, ns, opts.ES_beta); - + // default kernel selector: 0 = ES (exp-sqrt), 1 = KB (Kaiser--Bessel) + opts.kernel_type = kernel_type; + // initialize remaining ES/KB parameters (beta, c, halfwidth) based on opts.nspread + initialize_kernel_params(opts, upsampfac, (double)eps, kernel_type); return ier; } template FINUFFT_EXPORT_TEST int setup_spreader( finufft_spread_opts &opts, float eps, double upsampfac, int kerevalmeth, int debug, - int showwarn, int spreadinterponly, int dim); + int showwarn, int spreadinterponly, int dim, int kernel_type); template FINUFFT_EXPORT_TEST int setup_spreader( finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth, int debug, - int showwarn, int spreadinterponly, int dim); + int showwarn, int spreadinterponly, int dim, int kernel_type); template T evaluate_kernel_runtime(T x, int ns, int nc, const T *horner_coeffs_ptr, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 83873f447..d677b4efa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,6 +10,7 @@ set(TESTS finufft3dmany_test spreadinterp1d_test adjointness + accuracy_test ) foreach(TEST ${TESTS}) @@ -96,6 +97,8 @@ function(add_tests_with_prec PREC REQ_TOL CHECK_TOL SUFFIX) WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) + add_test(NAME run_accuracy_test_${PREC} COMMAND accuracy_test${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test(NAME run_adjointness_${PREC} COMMAND adjointness${SUFFIX} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) endfunction() diff --git a/test/accuracy_test.cpp b/test/accuracy_test.cpp new file mode 100644 index 000000000..cbd68acc3 --- /dev/null +++ b/test/accuracy_test.cpp @@ -0,0 +1,223 @@ +// test/accuracy_test.cpp +// C++ analog of python/finufft/test/fig_accuracy.py +// Sweeps tolerances from 1e-1 down to 1e- in 0.02-decade steps, +// computes relative L2 error for FINUFFT1D1 and requires the average error +// across each decade [1e-d,1e-(d+1)] to be <= 1e-d. Exits nonzero on failure +// so CTest/CI can detect it. + +#include +#include +#include +#include +#include +#include +#include +// test utilities: direct DFT and norm helpers +#include "utils/dirft1d.hpp" +#include "utils/norms.hpp" +#include + +int main(int argc, char *argv[]) { + // Defaults + BIGINT M = 2000; // # sources + BIGINT N = 100; // # modes + int isign = +1; + double upsampfac = 2.0; + const auto seed = std::random_device()(); + int hold_inputs = 1; // default: hold inputs (reuse across tolerances) + int kernel_type = 0; // 0 => ES (default), 1 => KB + int max_digits = 0; // <=0 means use machine precision (15 double / 7 float) + int debug_level = 0; // optional: enable debug output (sets both opts.debug and + // opts.spread_debug) + int showwarn = 0; // whether to print warnings (default 0 => suppress) + int verbose = 0; // if 1 print per-tolerance FAILED lines and decade summaries + + double w = 0.0; + // If user asked for help, print usage and exit + for (int ai = 1; ai < argc; ++ai) { + if (std::string(argv[ai]) == "-h" || std::string(argv[ai]) == "--help") { + std::cout << "Usage: " << argv[0] + << " [M] [N] [isign] [upsampfac] [hold_inputs] [kernel_type] " + "[max_digits] [debug] [showwarn] [verbose]\n"; + std::cout << " M : number of sources (default 2000)\n"; + std::cout << " N : number of modes (default 100)\n"; + std::cout << " isign : sign of transform (default +1)\n"; + std::cout << " upsampfac : upsampling factor (default 2.0)\n"; + std::cout + << " hold_inputs : if nonzero, reuse inputs across tolerances (default 1)\n"; + std::cout << " kernel_type : spread kernel selection (0:ES default, 1:KB)\n"; + std::cout << " max_digits : max digits to test (<=0 uses machine precision)\n"; + std::cout + << " debug : optional debug level (0=no debug, 1=some, 2=more)\n"; + std::cout + << " showwarn : whether to print warnings (0=silent default, 1=show)\n"; + std::cout << " verbose : if 1 print FAILED accuracies and decade summaries " + "(default 0)\n"; + std::cout << "Example: " << argv[0] << " 10000 100 1 2.0 1 0 15 1 0 0\n"; + return 0; + } + } + if (argc > 1) { + sscanf(argv[1], "%lf", &w); + M = (BIGINT)w; + } + if (argc > 2) { + sscanf(argv[2], "%lf", &w); + N = (BIGINT)w; + } + if (argc > 3) sscanf(argv[3], "%d", &isign); + if (argc > 4) sscanf(argv[4], "%lf", &upsampfac); + // Note: seed is internal (default 42) and not a command-line argument + if (argc > 5) sscanf(argv[5], "%d", &hold_inputs); + if (argc > 6) sscanf(argv[6], "%d", &kernel_type); + if (argc > 7) sscanf(argv[7], "%d", &max_digits); + if (argc > 8) sscanf(argv[8], "%d", &debug_level); + if (argc > 9) sscanf(argv[9], "%d", &showwarn); + if (argc > 10) sscanf(argv[10], "%d", &verbose); + + if (max_digits <= 0) { + max_digits = std::numeric_limits::digits10; + double min_tol = finufft::kernel::sigma_max_tol(upsampfac, kernel_type, + finufft::common::MAX_NSPREAD); + // Cap max_digits based on achievable tolerance for the chosen upsampling + // factor and kernel. Use kernel::sigma_max_tol with the library's + // MAX_NSPREAD to compute the smallest attainable sigma, then convert to + // decimal digits and clamp. This prevents attempting to test digits + // finer than the kernel/spreader can reasonably achieve for the chosen + // `upsampfac`. + if (min_tol < 0.0) + throw std::runtime_error("accuracy_test: could not compute min_tol"); + int max_digits_sigma = (int)std::floor(-std::log10(min_tol)); + if (max_digits_sigma < 1) max_digits_sigma = 1; + if (max_digits > max_digits_sigma) { + max_digits = max_digits_sigma; + } + } + + // Build tolerance grid: exps from -1 down to -max_digits in 0.02 steps. + // Use an integer-step loop to ensure the last exponent equals -max_digits + // (avoids floating-point drift that could omit the final decade). + std::vector exps; + int nsteps = (int)std::round((max_digits - 1.0) / 0.02); + for (int i = 0; i <= nsteps; ++i) exps.push_back(-1.0 - 0.02 * (double)i); + const size_t NT = exps.size(); + std::vector tols(NT); + for (size_t t = 0; t < NT; ++t) tols[t] = pow(10.0, exps[t]); + + // Setup opts (will be passed to guru makeplan). We will use the plan + // guru interface so we can override the plan's spread kernel selection + // via the plan method `set_spread_kernel_type` below. + finufft_opts opts{}; + FINUFFT_DEFAULT_OPTS(&opts); + opts.upsampfac = upsampfac; + opts.spread_function = kernel_type; + // set debug levels from command-line if requested + opts.debug = debug_level; + opts.spread_debug = debug_level; + // set whether to show warnings (user-controllable) + opts.showwarn = showwarn; + + // For reproducibility use srand when not holding inputs + srand(seed); + + std::vector x(M); + std::vector c(M); + std::vector F(N); + std::vector fe(N); + + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11(); + } + if (hold_inputs) { + for (BIGINT j = 0; j < M; ++j) c[j] = crandm11(); + } + + int ier = 0; + + // Pass/fail tracking: we update these as each tolerance is tested. + int npass = 0; + int nfail = 0; + std::vector decade_fail(max_digits + 1, 0); + std::vector decade_total(max_digits + 1, 0); + std::cout << std::scientific << std::setprecision(6); + + // Tunable slack multiplier: allow `slack * tol` as acceptable threshold. + // `base_slack` can be tuned to be more/less permissive. When a tolerance + // is close to machine precision (digit near `max_digits`) we increase the + // slack to account for limits of floating-point resolution. + const double base_slack = 3.0; // it fails in CI otherwsie + for (size_t t = 0; t < NT; ++t) { + double tol = tols[t]; + if (!hold_inputs) { + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11(); + c[j] = crandm11(); + } + } + + ier = FINUFFT1D1(M, x.data(), c.data(), isign, (FLT)tol, N, F.data(), &opts); + if (ier > 1) { + std::cerr << "accuracy_test: FINUFFT1D1 returned ier=" << ier << "\n"; + return ier; + } + + // Compute exact result using direct DFT test utility and compute + // relative L2 error using the test norms utilities. + dirft1d1(M, x, c, isign, N, fe); + double rel_err = relerrtwonorm(N, fe, F); + + // Compute integer digit from the precomputed exponents for exactness. + int d = (int)std::round(-exps[t]); + if (d < 1) d = 1; + if (d > max_digits) d = max_digits; + + // If this tolerance is exactly a power-of-ten (1e-d), print the + // achieved accuracy for that power-of-ten tolerance. + if (std::fabs(-exps[t] - d) < 1e-12) { + std::cout << "tol 1e-" << d << " achieved=" << rel_err << "\n"; + } + + // Compute the required threshold using a tunable slack multiplier. + double slack = base_slack; +#ifdef SINGLE + if (d == 6) slack *= 5.0; + if (d == 7) slack *= 50.0; +#else + if (d == 14) slack *= 1.10; + if (d == 15) slack *= 13.0; +#endif + + const double req = tol * slack; // final acceptance threshold + const bool pass = (rel_err <= req); + if (pass) { + ++npass; + } else { + ++nfail; + ++decade_fail[d]; + // Print failures immediately only in verbose mode (suppress PASSED lines to reduce + // noise). + if (verbose) { + std::cout << "tol=" << tol << " req=" << req << " rel_err=" << rel_err + << " -> FAILED (interval=1e-" << d << ",1e-" << (d + 1) << ")\n"; + } + } + ++decade_total[d]; + + // If the next sample is in a different interval (or we're at the last sample), + // print the decade summary now so progress appears as we go. + int next_d = -1; + if (t + 1 < NT) next_d = static_cast(std::round(-exps[t + 1])); + if (next_d < 1) next_d = 1; + if (next_d > max_digits) next_d = max_digits; + if (next_d != d) { + // Only print intermediate decade summaries in verbose mode; otherwise + // the program will only emit the final SUMMARY to reduce noise. + std::cout << "-- [1e-" << (d) << ", 1e-" << (d + 1) + << "] summary: total=" << decade_total[d] << " failed=" << decade_fail[d] + << "\n"; + } + } + + std::cout << "\nSUMMARY: " << npass << " passed, " << nfail << " failed\n"; + return nfail != 0; +} diff --git a/test/check_finufft.sh b/test/check_finufft.sh index 213d1c784..2ed15b531 100755 --- a/test/check_finufft.sh +++ b/test/check_finufft.sh @@ -111,6 +111,12 @@ T=adjointness$PRECSUF E=${PIPESTATUS[0]} if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi +((N++)) +T=accuracy_test$PRECSUF +./$T$FEX 2>$DIR/$T.err.out | tee $DIR/$T.out +E=${PIPESTATUS[0]} +if [[ $E -eq 0 ]]; then echo passed; elif [[ $E -eq $SIGSEGV ]]; then echo crashed; ((CRASHES++)); else echo failed; ((FAILS++)); fi + # END TESTS --------------------------------------------------------- diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 1ee87f185..8c26967fc 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,9 +5,11 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_link_libraries(${executable} cufinufft CUDA::cufft CUDA::cudart) + # Link cufinufft and CUDA runtime/cufft and also link finufft_common + # privately so only test executables get it (keep finufft_common PRIVATE in cufinufft). + target_link_libraries(${executable} PRIVATE cufinufft CUDA::cufft CUDA::cudart $) if(MATH_LIBRARY) - target_link_libraries(${executable} ${MATH_LIBRARY}) + target_link_libraries(${executable} PRIVATE ${MATH_LIBRARY}) endif() target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties(