@@ -520,31 +520,33 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
520520 // Solve for piecewise Horner coeffs for the function kernel.h:evaluate_kernel()
521521 // Marco Barbone, Fall 2025.
522522 const auto nspread = spopts.nspread ;
523- // "max_degree" really is " number of coeffs"
524- const auto max_degree = std::max (nspread + 3 , MIN_NC);
523+ // "number of coeffs"
524+ const auto n_coeffs = std::max (nspread + 3 , MIN_NC);
525525
526526 // get the xsimd padding
527- static constexpr auto simd_size = xsimd::batch<TF>::size;
528- const auto padded_ns = (nspread + simd_size - 1 ) & -simd_size;
527+ // (must match that used in spreadinterp.cpp), if we change horner simd_width there
528+ // we must also change it here
529+ const auto simd_size = GetPaddedSIMDWidth<TF>(2 * nspread);
530+ const auto padded_ns = (nspread + simd_size - 1 ) & -simd_size;
529531
530532 horner_coeffs.fill (TF (0 ));
531533
532- // Precompute kernel parameters once
534+ // Get the kernel parameters once
533535 const TF beta = TF (this ->spopts .ES_beta );
534536 const TF c_param = TF (this ->spopts .ES_c );
535537 const int kernel_type = this ->spopts .kernel_type ;
536538
537539 nc = MIN_NC;
538540
539- // Temporary storage: same transposed layout as horner_coeffs:
540- // [k * padded_ns + j], with k in [0, max_degree), j in [0, padded_ns)
541- std::vector<TF> tmp_coeffs (static_cast <size_t >(max_degree) * padded_ns, TF (0 ));
542-
543541 static constexpr TF a = TF (-1.0 );
544542 static constexpr TF b = TF (1.0 );
545543
546544 // First pass: fit with n_coeffs coefficients, cache coeffs, and determine nc.
547- // horner layout uses "smallest cN first": coeffs[0] is lowest degree term.
545+ // Note: `fit_monomials()` returns coefficients in descending-degree order
546+ // (highest-degree first): coeffs[0] is the highest-degree term. We store
547+ // them so that `horner_coeffs[k * padded_ns + j]` holds the k'th Horner
548+ // coefficient (k=0 -> highest-degree). `horner_coeffs` was filled with
549+ // zeros above, so panels that need fewer coefficients leave the rest as 0.
548550 for (int j = 0 ; j < nspread; ++j) {
549551 // Map x ∈ [-1, 1] to the physical interval for panel j.
550552 // original: 0.5 * (x - nspread + 2*j + 1)
@@ -555,37 +557,50 @@ template<typename TF> void FINUFFT_PLAN_T<TF>::precompute_horner_coeffs() {
555557 return evaluate_kernel (t, beta, c_param, kernel_type);
556558 };
557559
558- const auto coeffs = fit_monomials (kernel, static_cast <int >(max_degree ), a, b);
560+ const auto coeffs = fit_monomials (kernel, static_cast <int >(n_coeffs ), a, b);
559561
560- // Cache coefficients in tmp, preserving order :
561- // coeffs[k] corresponds to horner_coeffs[k] (smallest degree first) .
562+ // Cache coefficients directly into final table (transposed/padded) :
563+ // coeffs[k] is highest->lowest, store at row k for panel j .
562564 for (size_t k = 0 ; k < coeffs.size (); ++k) {
563- tmp_coeffs [k * padded_ns + j] = coeffs[k];
565+ horner_coeffs [k * padded_ns + j] = coeffs[k];
564566 }
565567
566- // Determine highest index with |c_k| >= tol, so we keep a contiguous prefix
567- // [0..current_nc-1].
568- int current_nc = 0 ;
569- for (int k = static_cast <int >(coeffs.size ()) - 1 ; k >= 0 ; --k) {
570- if (std::abs (coeffs[static_cast <size_t >(k)]) >= tol) {
571- current_nc = k + 1 ; // number of coeffs we actually care about for this panel
568+ // Determine effective number of coeffs by skipping leading coefficients
569+ // whose magnitude is below the threshold. coeffs[0] is highest degree.
570+ int used = 0 ;
571+ for (size_t k = 0 ; k < coeffs.size (); ++k) {
572+ if (std::abs (coeffs[k]) >= tol * 0.50 ) { // threshold is tol/2; using tol itself fails in
573+ // some cases. NOTE(review): this comment used to say "divide tol by 5" -- confirm 0.50 is intended
574+ used = static_cast <int >(coeffs.size () - k);
572575 break ;
573576 }
574577 }
575- if (current_nc > nc) nc = current_nc ;
578+ if (used > nc) nc = used ;
576579 }
577580
578- // Second pass: copy the first nc coeffs into horner_coeffs (no refit, no reordering).
579- for (int k = 0 ; k < nc; ++k) {
580- const auto row_offset = static_cast <size_t >(k) * padded_ns;
581- for (size_t j = 0 ; j < padded_ns; ++j) {
582- horner_coeffs[row_offset + j] = tmp_coeffs[row_offset + j];
581+ // If the number of coefficients actually needed (nc) is less than n_coeffs,
582+ // we must shift the coefficients "left" (to lower row indices) so that the
583+ // significant coefficients occupy rows 0..nc-1.
584+ if (nc < static_cast <int >(n_coeffs)) {
585+ const size_t shift = n_coeffs - nc;
586+ for (size_t k = 0 ; k < static_cast <size_t >(nc); ++k) {
587+ const size_t src_row = k + shift;
588+ const size_t dst_row = k;
589+ for (size_t j = 0 ; j < padded_ns; ++j) {
590+ horner_coeffs[dst_row * padded_ns + j] = horner_coeffs[src_row * padded_ns + j];
591+ }
592+ }
593+ // Zero out the now-unused tail rows for cleanliness
594+ for (size_t k = nc; k < static_cast <size_t >(n_coeffs); ++k) {
595+ for (size_t j = 0 ; j < padded_ns; ++j) {
596+ horner_coeffs[k * padded_ns + j] = TF (0 );
597+ }
583598 }
584599 }
585600
586601 if (opts.debug > 2 ) {
587602 // Print transposed layout: all "index 0" coeffs for intervals, then "index 1", ...
588- // Note: k is the coefficient index in Horner order, with smallest degree first.
603+ // Note: k is the coefficient index in Horner order, with highest degree first.
589604 for (size_t k = 0 ; k < static_cast <size_t >(nc); ++k) {
590605 printf (" [%s] idx=%lu: " , __func__, k);
591606 for (size_t j = 0 ; j < padded_ns; ++j) // use padded_ns to show padding as well
0 commit comments