From 674538068ba21d79055b79a7aadfbdbf7662abdc Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 10:35:33 -0400
Subject: [PATCH 01/11] Implement pow1p

Co-authored-by: fancidev <fancidev@gmail.com>
---
 .../boost/math/special_functions/pow1p.hpp    | 213 ++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 include/boost/math/special_functions/pow1p.hpp
diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
new file mode 100644
index 0000000000..6179f2146f
--- /dev/null
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -0,0 +1,213 @@
+//  (C) Copyright Matt Borland 2024.
+//  (C) Copyrigh Fancidev 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_MATH_SF_POW1P_HPP
+#define BOOST_MATH_SF_POW1P_HPP
+
+#include <boost/math/tools/config.hpp>
+#include <boost/math/tools/numeric_limits.hpp>
+#include <boost/math/tools/promotion.hpp>
+#include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/policies/error_handling.hpp>
+
+// For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter
+// NVRTC does not support the forward declarations header
+#ifndef BOOST_MATH_ENABLE_CUDA
+#  include <boost/math/special_functions/next.hpp>
+#  ifndef BOOST_MATH_HAS_NVRTC
+#    include <boost/math/special_functions/math_fwd.hpp>
+#  endif // BOOST_MATH_HAS_NVRTC
+#endif // BOOST_MATH_ENABLE_CUDA
+
+namespace boost {
+namespace math {
+
+namespace detail {
+
+template <typename T, typename Policy>
+BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
+{
+    BOOST_MATH_STD_USING
+    constexpr auto function = "boost::math::pow1p<%1%>(%1%, %1%)";
+
+    // The special values follow the spec of the pow() function defined in
+    // IEEE-754, Section 9.2.1.  The order of the `if` statements matters.
+    if (y == T(0)) 
+    { 
+        // pow(x, +/-0)
+        return T(1);
+    }
+    
+    if (x == T(-1))
+    { 
+        // pow(+/-0, y)
+        if (boost::math::isfinite(y) && y < 0) 
+        {
+            return boost::math::policies::raise_domain_error<T>(function, "Division by 0", x, pol);
+        }
+
+        // Gets correct special handling
+        return pow(T(0), y);
+    }
+
+    if (x == T(-2) && boost::math::isinf(y)) 
+    { 
+        // pow(-1, +/-inf)
+        return T(1);
+    }
+
+    if (x == T(0)) { // pow(+1, y)
+        return T(1);
+    }
+
+    if (boost::math::isinf(y))
+    {
+        if (boost::math::signbit(y))
+        {
+            // Pow(y, -inf)
+            return (x < 0 && x > -2) ? boost::math::numeric_limits<T>::infinity() : T(0);
+        }
+        else
+        {
+            // pow(x, +inf)
+            return (x < 0 && x > -2) ? T(0) : boost::math::numeric_limits<T>::infinity();
+        }
+    }
+
+    if (boost::math::isinf(x)) 
+    { 
+        // pow(+/-inf, y)
+        return pow(x, y);
+    }
+
+    // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled, and
+    // and y = {+/-0, +/-inf} have been handled.  Next, we handle `nan` and
+    // y = {+/-1}.
+    if (boost::math::isnan(x)) 
+    {
+        return x;
+    }
+    else if (boost::math::isnan(y))
+    {
+        return y;
+    }
+
+    if (y == T(1)) 
+    {
+        return T(1) + x;
+    }
+    if (y == T(-1)) 
+    {
+        return T(1) / (T(1) + x); // guaranteed no overflow
+    }
+
+    // Handle (1+x) < 0
+    if (x < -1) 
+    {
+        if (fmod(y, T(1)) != T(0))
+        { 
+            // y is not an integer so we have an error
+            return boost::math::policies::raise_evaluation_error<T>(function, "Y is not an integer", y, pol);
+        }
+
+        // TODO(fancidev): maybe use (1+x)^y == [(1+x)^2]^(y/2)
+        return pow(1+x, y);
+    }
+
+    // Now x, y are finite and not equal to 0 or +/-1, and x > -1.
+    // To compute (1+x)^y, write (1+x) == (s+t) where s is equal to (1+x)
+    // rounded toward 1, and t is the (exact) rounding error.
+    T s, t;
+    if (x < 0) 
+    {
+        s = T(1) + x;
+        t = x - (s - T(1));
+        if (t > 0) 
+        {
+            #ifdef BOOST_MATH_ENABLE_CUDA
+            s = ::nextafter(s, T(1));
+            #else
+            s = boost::math::nextafter(s, T(1));
+            #endif
+
+            t = x - (s - T(1));
+        }
+    } 
+    else if (x < 1) 
+    {
+        s = T(1) + x;
+        t = x - (s - T(1));
+        if (t < 0) 
+        {
+            #ifdef BOOST_MATH_ENABLE_CUDA
+            s = ::nextafter(s, T(0));
+            #else
+            s = boost::math::nextafter(s, T(0));
+            #endif
+
+            t = x - (s - T(1));
+        }
+    } 
+    else
+    {
+        s = x + T(1);
+        t = T(1) - (s - x);
+        if (t < 0) 
+        {
+            #ifdef BOOST_MATH_ENABLE_CUDA
+            s = ::nextafter(s, T(0));
+            #else
+            s = boost::math::nextafter(s, T(0));
+            #endif
+
+            t = T(1) - (s - x);
+        }
+    }
+
+    // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0.
+    // Then (1+x)^y == (s+t)^y == (s^y)*((1+u)^y), where u := t / s.
+    // It can be shown that either both terms <= 1 or both >= 1, so
+    // if the first term over/underflows, then the result over/underflows.
+    T u = t / s;
+    T term1 = pow(s, y);
+    if (term1 == T(0) || boost::math::isinf(term1))
+    {
+        return term1;
+    }
+
+    // (1+u)^y == exp(y*log(1+u)).  Since u is close to machine epsilon,
+    // log(1+u) ~= u.  Let y*u == z+w, where z is the rounded result and
+    // w is the rounding error.  This improves accuracy when y is large.
+    // Then exp(y*u) == exp(z)*exp(w).
+    T z = y * u;
+    T w = fma(y, u, -z);
+    T term2 = exp(z) * exp(w);
+    
+    return term1 * term2;
+}
+
+} // Namespace detail
+
+template <typename T1, typename T2, typename Policy>
+BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+pow1p(const T1 x, const T2 y, const Policy& pol)
+{
+    using result_type = tools::promote_args_t<T1, T2>;
+    return detail::pow1p_imp(static_cast<result_type>(x), static_cast<result_type>(y), pol);
+}
+
+template <typename T1, typename T2>
+BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+pow1p(const T1 x, const T2 y)
+{
+    using result_type = tools::promote_args_t<T1, T2>;
+    return detail::pow1p_imp(static_cast<result_type>(x), static_cast<result_type>(y), policies::policy<>());
+}
+
+} // Namespace math
+} // Namespace boost
+
+#endif // BOOST_MATH_SF_POW1P_HPP

From 65c41898cb182bc6bbc328b75a2eb6880b92c7cf Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 11:03:36 -0400
Subject: [PATCH 02/11] Add detection of FMA for real_concept like types

---
 .../boost/math/special_functions/pow1p.hpp    | 37 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
index 6179f2146f..40b78278b9 100644
--- a/include/boost/math/special_functions/pow1p.hpp
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -10,6 +10,7 @@
 #include <boost/math/tools/config.hpp>
 #include <boost/math/tools/numeric_limits.hpp>
 #include <boost/math/tools/promotion.hpp>
+#include <boost/math/tools/is_detected.hpp>
 #include <boost/math/special_functions/fpclassify.hpp>
 #include <boost/math/policies/error_handling.hpp>
 
@@ -18,6 +19,7 @@
 #ifndef BOOST_MATH_ENABLE_CUDA
 #  include <boost/math/special_functions/next.hpp>
 #  ifndef BOOST_MATH_HAS_NVRTC
+#    include <utility>
 #    include <boost/math/special_functions/math_fwd.hpp>
 #  endif // BOOST_MATH_HAS_NVRTC
 #endif // BOOST_MATH_ENABLE_CUDA
@@ -27,6 +29,29 @@ namespace math {
 
 namespace detail {
 
+#ifndef BOOST_MATH_ENABLE_CUDA
+
+template <typename T>
+using fma_t = decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()));
+
+template <typename T>
+constexpr bool is_fma_detected_v = boost::math::tools::is_detected<fma_t, T>::value;
+
+template <typename T, boost::math::enable_if_t<is_fma_detected_v<T>, bool> = true>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    using std::fma;
+    return fma(x, y, z);
+}
+
+template <typename T, boost::math::enable_if_t<!is_fma_detected_v<T>, bool> = true>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return x * y + z;
+}
+
+#endif // BOOST_MATH_ENABLE_CUDA
+
 template <typename T, typename Policy>
 BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
 {
@@ -59,7 +84,9 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         return T(1);
     }
 
-    if (x == T(0)) { // pow(+1, y)
+    if (x == T(0))
+    { 
+        // pow(+1, y)
         return T(1);
     }
 
@@ -183,9 +210,15 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
     // w is the rounding error.  This improves accuracy when y is large.
     // Then exp(y*u) == exp(z)*exp(w).
     T z = y * u;
+
+    #ifdef BOOST_MATH_ENABLE_CUDA
     T w = fma(y, u, -z);
+    #else
+    T w = local_fma(y, u, -z);
+    #endif
+
     T term2 = exp(z) * exp(w);
-    
+
     return term1 * term2;
 }
 

From 988ea84a0fe99e2d40f43b6d38c1754bfa291903 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 12:13:09 -0400
Subject: [PATCH 03/11] Simplifications

---
 include/boost/math/special_functions/pow1p.hpp | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
index 40b78278b9..07b36cf03e 100644
--- a/include/boost/math/special_functions/pow1p.hpp
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -11,6 +11,7 @@
 #include <boost/math/tools/numeric_limits.hpp>
 #include <boost/math/tools/promotion.hpp>
 #include <boost/math/tools/is_detected.hpp>
+#include <boost/math/tools/precision.hpp>
 #include <boost/math/special_functions/fpclassify.hpp>
 #include <boost/math/policies/error_handling.hpp>
 
@@ -60,7 +61,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
 
     // The special values follow the spec of the pow() function defined in
     // IEEE-754, Section 9.2.1.  The order of the `if` statements matters.
-    if (y == T(0)) 
+    if (abs(y) == T(0)) 
     { 
         // pow(x, +/-0)
         return T(1);
@@ -92,15 +93,15 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
 
     if (boost::math::isinf(y))
     {
-        if (boost::math::signbit(y))
+        if (boost::math::signbit(y) == 1)
         {
-            // Pow(y, -inf)
-            return (x < 0 && x > -2) ? boost::math::numeric_limits<T>::infinity() : T(0);
+            // pow(x, -inf)
+            return T(0);
         }
         else
         {
             // pow(x, +inf)
-            return (x < 0 && x > -2) ? T(0) : boost::math::numeric_limits<T>::infinity();
+            return y;
         }
     }
 
@@ -134,12 +135,6 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
     // Handle (1+x) < 0
     if (x < -1) 
     {
-        if (fmod(y, T(1)) != T(0))
-        { 
-            // y is not an integer so we have an error
-            return boost::math::policies::raise_evaluation_error<T>(function, "Y is not an integer", y, pol);
-        }
-
         // TODO(fancidev): maybe use (1+x)^y == [(1+x)^2]^(y/2)
         return pow(1+x, y);
     }

From f0b351878e223a067d37396491e1bd8afcda4cd2 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 12:13:35 -0400
Subject: [PATCH 04/11] Add normal testset

---
 test/Jamfile.v2     |   1 +
 test/test_pow1p.cpp | 106 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 test/test_pow1p.cpp

diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 85ddf38f9b..18018d63af 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -128,6 +128,7 @@ test-suite special_fun :
 
    [ run hypot_test.cpp test_instances//test_instances pch_light ../../test/build//boost_unit_test_framework ]
    [ run pow_test.cpp ../../test/build//boost_unit_test_framework ]
+   [ run test_pow1p.cpp ]
    [ run logaddexp_test.cpp ../../test/build//boost_unit_test_framework ]
    [ run logsumexp_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_hdr_initializer_list cxx11_variadic_templates ] ]
    [ run ccmath_sqrt_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : <define>BOOST_MATH_TEST_FLOAT128 <linkflags>"-Bstatic -lquadmath -Bdynamic" ] ]
diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp
new file mode 100644
index 0000000000..9ef5295652
--- /dev/null
+++ b/test/test_pow1p.cpp
@@ -0,0 +1,106 @@
+//  (C) Copyright Matt Borland 2024.
+//  (C) Copyrigh Fancidev 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/concepts/real_concept.hpp>
+#include <exception>
+#include <random>
+#include "math_unit_test.hpp"
+
+template <typename T>
+void test()
+{
+    using std::pow;
+
+    // First we hit all the special cases
+    // pow(x, +/-0)
+    CHECK_EQUAL(boost::math::pow1p(T(1), T(0)), T(1));
+    
+    // pow(0, y)
+    CHECK_THROW(boost::math::pow1p(T(-1), T(-1)), std::domain_error);
+    CHECK_EQUAL(boost::math::pow1p(T(-1), T(1)), T(0));
+
+    // pow(-1, inf)
+    CHECK_EQUAL(boost::math::pow1p(T(-2), boost::math::numeric_limits<T>::infinity()), T(1));
+    
+    // pow(1, y)
+    CHECK_EQUAL(boost::math::pow1p(T(0), T(2)), T(1));
+
+    // pow(x, +/-inf)
+    BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits<T>::has_infinity)
+    {
+        CHECK_EQUAL(boost::math::pow1p(T(5), -boost::math::numeric_limits<T>::infinity()), T(0));
+        CHECK_EQUAL(boost::math::pow1p(T(5), boost::math::numeric_limits<T>::infinity()), boost::math::numeric_limits<T>::infinity());
+    
+
+    // pow(+/-inf, y)
+        CHECK_EQUAL(boost::math::pow1p(boost::math::numeric_limits<T>::infinity(), T(2)), boost::math::numeric_limits<T>::infinity());
+        CHECK_EQUAL(boost::math::pow1p(-boost::math::numeric_limits<T>::infinity(), T(2)), boost::math::numeric_limits<T>::infinity());
+    }
+
+    // NANs for x and y
+    BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits<T>::has_quiet_NaN)
+    {
+        CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(boost::math::numeric_limits<T>::quiet_NaN(), T(1))), true);
+        CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(T(1), boost::math::numeric_limits<T>::quiet_NaN())), true);
+    }
+
+    // pow(x, +/-1)
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(1)), pow(T(3), T(1)), 10);
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(-1)), pow(T(3), T(-1)), 10);
+    
+    // (1+x) < 0
+    CHECK_ULP_CLOSE(boost::math::pow1p(T(-3), T(2)), pow(T(-2), T(2)), 10);
+
+    // x < 0
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<double> dist (-1, 0);
+    std::uniform_real_distribution<double> dist_y (0, 10);
+    constexpr int N = 1024;
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+
+    // 0 < x < 1
+    std::uniform_real_distribution<double> dist_x_1(0, 1);
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist_x_1(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+
+    // Else
+    std::uniform_real_distribution<double> dist_other_x(1, 1000);
+    for (int i = 0; i < N; ++i)
+    {
+        const auto x = static_cast<T>(dist_other_x(rng));
+        const auto y = static_cast<T>(dist_y(rng));
+
+        CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100);
+    }
+}
+
+int main()
+{
+    test<float>();
+    test<double>();
+
+    #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS
+    test<long double>();
+    #endif
+
+    #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS
+    test<boost::math::concepts::real_concept>();
+    #endif
+
+    return boost::math::test::report_errors();
+}

From 80b94cc9fd479683ed9b5a1f0664b6916378acdd Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 13:13:42 -0400
Subject: [PATCH 05/11] Add to special functions index

---
 include/boost/math/special_functions.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/boost/math/special_functions.hpp b/include/boost/math/special_functions.hpp
index d3aa95c5c1..3275ab3d2b 100644
--- a/include/boost/math/special_functions.hpp
+++ b/include/boost/math/special_functions.hpp
@@ -79,6 +79,7 @@
 #include <boost/math/special_functions/lambert_w.hpp>
 #include <boost/math/special_functions/gegenbauer.hpp>
 #include <boost/math/special_functions/jacobi.hpp>
+#include <boost/math/special_functions/pow1p.hpp>
 #ifndef BOOST_MATH_NO_EXCEPTIONS
 #include <boost/math/special_functions/legendre_stieltjes.hpp>
 #endif

From 7d4794abb3dda1cd49f287d87bd3feb8dcd89cd0 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 13:30:19 -0400
Subject: [PATCH 06/11] Add pow1p NVCC testing

---
 test/cuda_jamfile         |   2 +
 test/test_pow1p_double.cu | 111 ++++++++++++++++++++++++++++++++++++++
 test/test_pow1p_float.cu  | 111 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 test/test_pow1p_double.cu
 create mode 100644 test/test_pow1p_float.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 64fac76fba..2829aa19ee 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -171,3 +171,5 @@ run test_trigamma_float.cu ;
 
 run test_trunc_double.cu ;
 run test_trunc_float.cu ;
+run test_pow1p_double.cu ;
+run test_pow1p_float.cu ;
diff --git a/test/test_pow1p_double.cu b/test/test_pow1p_double.cu
new file mode 100644
index 0000000000..c7746bd3e7
--- /dev/null
+++ b/test/test_pow1p_double.cu
@@ -0,0 +1,111 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef double float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<float_type> x_vals(-1, 1);
+    std::uniform_real_distribution<float_type> y_vals(-10, 10);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = x_vals(rng);
+        input_vector2[i] = y_vals(rng);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+       results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i]));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_pow1p_float.cu b/test/test_pow1p_float.cu
new file mode 100644
index 0000000000..0ecdf0606c
--- /dev/null
+++ b/test/test_pow1p_float.cu
@@ -0,0 +1,111 @@
+
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <boost/math/special_functions.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+typedef float float_type;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    using std::cos;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used, and compute its size
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector A
+    cuda_managed_ptr<float_type> input_vector1(numElements);
+
+    // Allocate the managed input vector B
+    cuda_managed_ptr<float_type> input_vector2(numElements);
+
+    // Allocate the managed output vector C
+    cuda_managed_ptr<float_type> output_vector(numElements);
+
+    // Initialize the input vectors
+    std::mt19937_64 rng;
+    std::uniform_real_distribution<float_type> x_vals(-1, 1);
+    std::uniform_real_distribution<float_type> y_vals(-10, 10);
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = x_vals(rng);
+        input_vector2[i] = y_vals(rng);
+    }
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 1024;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify that the result vector is correct
+    std::vector<float_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+       results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i]));
+    }
+
+    double t = w.elapsed();
+    // check the results
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}

From f6fd761e2197a064149b669c772955783182d81e Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 13:39:14 -0400
Subject: [PATCH 07/11] Add NVRTC testing and fix missing header

---
 .../boost/math/special_functions/pow1p.hpp    |   1 +
 test/nvrtc_jamfile                            |   1 +
 test/test_pow1p_nvrtc_double.cpp              | 191 ++++++++++++++++++
 test/test_pow1p_nvrtc_float.cpp               | 191 ++++++++++++++++++
 4 files changed, 384 insertions(+)
 create mode 100644 test/test_pow1p_nvrtc_double.cpp
 create mode 100644 test/test_pow1p_nvrtc_float.cpp

diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
index 07b36cf03e..2666808dbd 100644
--- a/include/boost/math/special_functions/pow1p.hpp
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -13,6 +13,7 @@
 #include <boost/math/tools/is_detected.hpp>
 #include <boost/math/tools/precision.hpp>
 #include <boost/math/special_functions/fpclassify.hpp>
+#include <boost/math/special_functions/sign.hpp>
 #include <boost/math/policies/error_handling.hpp>
 
 // For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter
diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile
index de235822ef..d636fc67be 100644
--- a/test/nvrtc_jamfile
+++ b/test/nvrtc_jamfile
@@ -168,3 +168,4 @@ run test_trigamma_nvrtc_double.cpp ;
 run test_trigamma_nvrtc_float.cpp ;
 
 run test_trunc_nvrtc_double.cpp ;
+run test_pow1p_nvrtc_double.cpp ;
diff --git a/test/test_pow1p_nvrtc_double.cpp b/test/test_pow1p_nvrtc_double.cpp
new file mode 100644
index 0000000000..d9f97d94c0
--- /dev/null
+++ b/test/test_pow1p_nvrtc_double.cpp
@@ -0,0 +1,191 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/pow1p.hpp>
+extern "C" __global__ 
+void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main() 
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_trunc_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS) 
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> x_vals(-1, 1);
+        std::uniform_real_distribution<float_type> y_vals(-10, 10);
+        for (int i = 0; i < numElements; ++i) 
+        {
+            h_in1[i] = static_cast<float_type>(x_vals(rng));
+            h_in2[i] = static_cast<float_type>(y_vals(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i) 
+        {
+            const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
+                       
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                            << "\nParallel: " << h_out[i]
+                            << "\n  Serial: " << res
+                            << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/test/test_pow1p_nvrtc_float.cpp b/test/test_pow1p_nvrtc_float.cpp
new file mode 100644
index 0000000000..d9f97d94c0
--- /dev/null
+++ b/test/test_pow1p_nvrtc_float.cpp
@@ -0,0 +1,191 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
+#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
+
+// Must be included first
+#include <nvrtc.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <exception>
+#include <boost/math/special_functions/pow1p.hpp>
+#include <boost/math/special_functions/relative_difference.hpp>
+
+typedef double float_type;
+
+const char* cuda_kernel = R"(
+typedef double float_type;
+#include <cuda/std/type_traits>
+#include <boost/math/special_functions/pow1p.hpp>
+extern "C" __global__ 
+void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < numElements)
+    {
+        out[i] = boost::math::pow1p(in1[i], in2[i]);
+    }
+}
+)";
+
+void checkCUDAError(cudaError_t result, const char* msg)
+{
+    if (result != cudaSuccess)
+    {
+        std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkCUError(CUresult result, const char* msg)
+{
+    if (result != CUDA_SUCCESS)
+    {
+        const char* errorStr;
+        cuGetErrorString(result, &errorStr);
+        std::cerr << msg << ": " << errorStr << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+void checkNVRTCError(nvrtcResult result, const char* msg)
+{
+    if (result != NVRTC_SUCCESS)
+    {
+        std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+int main() 
+{
+    try
+    {
+        // Initialize CUDA driver API
+        checkCUError(cuInit(0), "Failed to initialize CUDA");
+
+        // Create CUDA context
+        CUcontext context;
+        CUdevice device;
+        checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
+        checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
+
+        nvrtcProgram prog;
+        nvrtcResult res;
+
+        res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
+        checkNVRTCError(res, "Failed to create NVRTC program");
+
+        nvrtcAddNameExpression(prog, "test_trunc_kernel");
+
+        #ifdef BOOST_MATH_NVRTC_CI_RUN
+        const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #else
+        const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
+        #endif
+
+        // Compile the program
+        res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
+        if (res != NVRTC_SUCCESS) 
+        {
+            size_t log_size;
+            nvrtcGetProgramLogSize(prog, &log_size);
+            char* log = new char[log_size];
+            nvrtcGetProgramLog(prog, log);
+            std::cerr << "Compilation failed:\n" << log << std::endl;
+            delete[] log;
+            exit(EXIT_FAILURE);
+        }
+
+        // Get PTX from the program
+        size_t ptx_size;
+        nvrtcGetPTXSize(prog, &ptx_size);
+        char* ptx = new char[ptx_size];
+        nvrtcGetPTX(prog, ptx);
+
+        // Load PTX into CUDA module
+        CUmodule module;
+        CUfunction kernel;
+        checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
+        checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
+
+        int numElements = 5000;
+        float_type *h_in1, *h_in2, *h_out;
+        float_type *d_in1, *d_in2, *d_out;
+
+        // Allocate memory on the host
+        h_in1 = new float_type[numElements];
+        h_in2 = new float_type[numElements];
+        h_out = new float_type[numElements];
+
+        // Initialize input arrays
+        std::mt19937_64 rng(42);
+        std::uniform_real_distribution<float_type> x_vals(-1, 1);
+        std::uniform_real_distribution<float_type> y_vals(-10, 10);
+        for (int i = 0; i < numElements; ++i) 
+        {
+            h_in1[i] = static_cast<float_type>(x_vals(rng));
+            h_in2[i] = static_cast<float_type>(y_vals(rng));
+        }
+
+        checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
+        checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
+        checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
+
+        checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
+        checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
+
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+        void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
+        checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
+
+        checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
+
+        // Verify Result
+        for (int i = 0; i < numElements; ++i) 
+        {
+            const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
+                       
+            if (std::isfinite(res))
+            {
+                if (boost::math::epsilon_difference(res, h_out[i]) > 300)
+                {
+                    std::cout << "error at line: " << i
+                            << "\nParallel: " << h_out[i]
+                            << "\n  Serial: " << res
+                            << "\n    Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
+                }
+            }
+        }
+
+        cudaFree(d_in1);
+        cudaFree(d_in2);
+        cudaFree(d_out);
+        delete[] h_in1;
+        delete[] h_in2;
+        delete[] h_out;
+
+        nvrtcDestroyProgram(&prog);
+        delete[] ptx;
+
+        cuCtxDestroy(context);
+
+        std::cout << "Kernel executed successfully." << std::endl;
+        return 0;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Stopped with exception: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}

From 6066680f59612c7f85220fa676b9518be000f71a Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 13:44:47 -0400
Subject: [PATCH 08/11] Add testing and fixes for SYCL

---
 include/boost/math/special_functions/pow1p.hpp | 14 +++++++-------
 test/sycl_jamfile                              |  1 +
 test/test_pow1p.cpp                            |  2 ++
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
index 2666808dbd..22781e6023 100644
--- a/include/boost/math/special_functions/pow1p.hpp
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -18,7 +18,7 @@
 
 // For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter
 // NVRTC does not support the forward declarations header
-#ifndef BOOST_MATH_ENABLE_CUDA
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
 #  include <boost/math/special_functions/next.hpp>
 #  ifndef BOOST_MATH_HAS_NVRTC
 #    include <utility>
@@ -31,7 +31,7 @@ namespace math {
 
 namespace detail {
 
-#ifndef BOOST_MATH_ENABLE_CUDA
+#ifndef BOOST_MATH_HAS_GPU_SUPPORT
 
 template <typename T>
 using fma_t = decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()));
@@ -52,7 +52,7 @@ BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y,
     return x * y + z;
 }
 
-#endif // BOOST_MATH_ENABLE_CUDA
+#endif // BOOST_MATH_HAS_GPU_SUPPORT
 
 template <typename T, typename Policy>
 BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
@@ -150,7 +150,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         t = x - (s - T(1));
         if (t > 0) 
         {
-            #ifdef BOOST_MATH_ENABLE_CUDA
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
             s = ::nextafter(s, T(1));
             #else
             s = boost::math::nextafter(s, T(1));
@@ -165,7 +165,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         t = x - (s - T(1));
         if (t < 0) 
         {
-            #ifdef BOOST_MATH_ENABLE_CUDA
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
             s = ::nextafter(s, T(0));
             #else
             s = boost::math::nextafter(s, T(0));
@@ -180,7 +180,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         t = T(1) - (s - x);
         if (t < 0) 
         {
-            #ifdef BOOST_MATH_ENABLE_CUDA
+            #ifdef BOOST_MATH_HAS_GPU_SUPPORT
             s = ::nextafter(s, T(0));
             #else
             s = boost::math::nextafter(s, T(0));
@@ -207,7 +207,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
     // Then exp(y*u) == exp(z)*exp(w).
     T z = y * u;
 
-    #ifdef BOOST_MATH_ENABLE_CUDA
+    #ifdef BOOST_MATH_HAS_GPU_SUPPORT
     T w = fma(y, u, -z);
     #else
     T w = local_fma(y, u, -z);
diff --git a/test/sycl_jamfile b/test/sycl_jamfile
index 97c48474ca..3dd7d054af 100644
--- a/test/sycl_jamfile
+++ b/test/sycl_jamfile
@@ -38,3 +38,4 @@ run test_digamma_simple.cpp ;
 run test_trigamma.cpp ;
 run test_erf.cpp ;
 run test_gamma.cpp ;
+run test_pow1p.cpp ;
diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp
index 9ef5295652..67d7ee1acd 100644
--- a/test/test_pow1p.cpp
+++ b/test/test_pow1p.cpp
@@ -20,7 +20,9 @@ void test()
     CHECK_EQUAL(boost::math::pow1p(T(1), T(0)), T(1));
     
     // pow(0, y)
+    #ifndef BOOST_MATH_NO_EXCEPTIONS
     CHECK_THROW(boost::math::pow1p(T(-1), T(-1)), std::domain_error);
+    #endif
     CHECK_EQUAL(boost::math::pow1p(T(-1), T(1)), T(0));
 
     // pow(-1, inf)

From 866d9d01652138daf6f2a6b7b1e2f65f36ed89e2 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 19 Aug 2024 13:54:56 -0400
Subject: [PATCH 09/11] Add pow1p to the docs

---
 doc/math.qbk     |  1 +
 doc/sf/pow1p.qbk | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 doc/sf/pow1p.qbk

diff --git a/doc/math.qbk b/doc/math.qbk
index d6b90efb0b..67375da114 100644
--- a/doc/math.qbk
+++ b/doc/math.qbk
@@ -313,6 +313,7 @@ and use the function's name as the link text.]
 [def __sqrt1pm1 [link math_toolkit.powers.sqrt1pm1 sqrt1pm1]]
 [def __rsqrt [link math_toolkit.powers.rsqrt rsqrt]]
 [def __powm1 [link math_toolkit.powers.powm1 powm1]]
+[def __pow1p [link math_toolkit.powers.pow1p pow1p]]
 [def __hypot [link math_toolkit.powers.hypot hypot]]
 [def __pow [link math_toolkit.powers.ct_pow pow]]
 
diff --git a/doc/sf/pow1p.qbk b/doc/sf/pow1p.qbk
new file mode 100644
index 0000000000..a2201d77ae
--- /dev/null
+++ b/doc/sf/pow1p.qbk
@@ -0,0 +1,34 @@
+[/
+  Copyright Matt Borland 2024
+  Distributed under the Boost Software License, Version 1.0.
+  (See accompanying file LICENSE_1_0.txt or copy at
+  http://www.boost.org/LICENSE_1_0.txt).
+]
+
+[section:pow1p pow1p]
+
+[h4 Synopsis]
+
+    #include <boost/math/special_functions/pow1p.hpp>
+
+    namespace boost {
+    namespace math {
+
+    template <typename T1, typename T2, typename Policy>
+    BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+    pow1p(const T1 x, const T2 y, const Policy& pol)
+
+    template <typename T1, typename T2>
+    BOOST_MATH_GPU_ENABLED tools::promote_args_t<T1, T2>
+    pow1p(const T1 x, const T2 y)
+
+    }} // namespaces
+
+
+The function `pow1p` computes (1 + x)[super y ] where x and y are real numbers. 
+This function is particularly useful for scenarios where adding 1 to x before raising it to the power of y is required. 
+It provides a more numerically stable and efficient way to perform this computation, especially for small values of x.
+When x is very small, directly computing (1 + x)[super y ] using standard arithmetic operations can lead to a loss of precision due to floating-point arithmetic limitations. 
+The pow1p function helps mitigate this issue by internally handling such cases more accurately.
+
+[endsect]

From 1a450eb109bcfec1db4af51704ad96855005ac76 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 26 Aug 2024 09:56:52 -0400
Subject: [PATCH 10/11] Simplify FMA, fix concept fails, and sync from upstream

---
 .../boost/math/special_functions/pow1p.hpp    | 81 ++++++++++++-------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp
index 22781e6023..1b718f99bf 100644
--- a/include/boost/math/special_functions/pow1p.hpp
+++ b/include/boost/math/special_functions/pow1p.hpp
@@ -17,13 +17,10 @@
 #include <boost/math/policies/error_handling.hpp>
 
 // For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter
-// NVRTC does not support the forward declarations header
 #ifndef BOOST_MATH_HAS_GPU_SUPPORT
 #  include <boost/math/special_functions/next.hpp>
-#  ifndef BOOST_MATH_HAS_NVRTC
-#    include <utility>
-#    include <boost/math/special_functions/math_fwd.hpp>
-#  endif // BOOST_MATH_HAS_NVRTC
+#  include <boost/math/special_functions/math_fwd.hpp>
+#  include <utility>
 #endif // BOOST_MATH_ENABLE_CUDA
 
 namespace boost {
@@ -52,6 +49,20 @@ BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y,
     return x * y + z;
 }
 
+#else
+
+template <typename T>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z)
+{
+    return ::fma(x, y, z);
+}
+
+template <>
+BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED float local_fma(const float x, const float y, const float z)
+{
+    return ::fmaf(x, y, z);
+}
+
 #endif // BOOST_MATH_HAS_GPU_SUPPORT
 
 template <typename T, typename Policy>
@@ -71,7 +82,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
     if (x == T(-1))
     { 
         // pow(+/-0, y)
-        if (boost::math::isfinite(y) && y < 0) 
+        if ((boost::math::isfinite)(y) && y < 0) 
         {
             return boost::math::policies::raise_domain_error<T>(function, "Division by 0", x, pol);
         }
@@ -80,7 +91,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         return pow(T(0), y);
     }
 
-    if (x == T(-2) && boost::math::isinf(y)) 
+    if (x == T(-2) && (boost::math::isinf)(y)) 
     { 
         // pow(-1, +/-inf)
         return T(1);
@@ -92,9 +103,9 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         return T(1);
     }
 
-    if (boost::math::isinf(y))
+    if ((boost::math::isinf)(y))
     {
-        if (boost::math::signbit(y) == 1)
+        if ((boost::math::signbit)(y) == 1)
         {
             // pow(x, -inf)
             return T(0);
@@ -106,7 +117,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
         }
     }
 
-    if (boost::math::isinf(x)) 
+    if ((boost::math::isinf)(x)) 
     { 
         // pow(+/-inf, y)
         return pow(x, y);
@@ -115,11 +126,11 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
     // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled, and
     // and y = {+/-0, +/-inf} have been handled.  Next, we handle `nan` and
     // y = {+/-1}.
-    if (boost::math::isnan(x)) 
+    if ((boost::math::isnan)(x)) 
     {
         return x;
     }
-    else if (boost::math::isnan(y))
+    else if ((boost::math::isnan)(y))
     {
         return y;
     }
@@ -191,30 +202,46 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol)
     }
 
     // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0.
-    // Then (1+x)^y == (s+t)^y == (s^y)*((1+u)^y), where u := t / s.
-    // It can be shown that either both terms <= 1 or both >= 1, so
+    // Write (1+x)^y == (s+t)^y == (s^y)*((1+t/s)^y) == term1*term2.
+    // It can be shown that either both terms <= 1 or both >= 1; so
     // if the first term over/underflows, then the result over/underflows.
-    T u = t / s;
+    // And of course, term2 == 1 if t == 0.
     T term1 = pow(s, y);
-    if (term1 == T(0) || boost::math::isinf(term1))
+    if (t == T(0) || term1 == T(0) || (boost::math::isinf)(term1))
     {
         return term1;
     }
 
-    // (1+u)^y == exp(y*log(1+u)).  Since u is close to machine epsilon,
-    // log(1+u) ~= u.  Let y*u == z+w, where z is the rounded result and
-    // w is the rounding error.  This improves accuracy when y is large.
-    // Then exp(y*u) == exp(z)*exp(w).
-    T z = y * u;
+    // (1+t/s)^y == exp(y*log(1+t/s)).  The relative error of the result is
+    // equal to the absolute error of the exponent (plus the relative error
+    // of 'exp').  Therefore, when the exponent is small, it is accurately
+    // evaluated to machine epsilon using T arithmetic.  In addition, since
+    // t/s <= epsilon, log(1+t/s) is well approximated by t/s to first order.
+    T u = t / s;
+    T w = y * u;
+    if (abs(w) <= T(0.5)) 
+    {
+        T term2 = exp(w);
+        return term1 * term2;
+    }
+
+    // Now y*log(1+t/s) is large, and its relative error is "magnified" by
+    // the exponent.  To reduce the error, we use double-T arithmetic, and
+    // expand log(1+t/s) to second order.
+
+    // (u + uu) ~= t/s.
+    T r1 = local_fma(-u, s, t);
+    T uu = r1 / s;
 
-    #ifdef BOOST_MATH_HAS_GPU_SUPPORT
-    T w = fma(y, u, -z);
-    #else
-    T w = local_fma(y, u, -z);
-    #endif
+    // (u + vv) ~= log(1+(u+uu)) ~= log(1+t/s).
+    T vv = local_fma(T(-0.5)*u, u, uu);
 
-    T term2 = exp(z) * exp(w);
+    // (w + ww) ~= y*(u+vv) ~= y*log(1+t/s).
+    T r2 = local_fma(y, u, -w);
+    T ww = local_fma(y, vv, r2);
 
+    // TODO: maybe ww is small enough such that exp(ww) ~= 1+ww.
+    T term2 = exp(w) * exp(ww);
     return term1 * term2;
 }
 

From f7150ee9765eec76cedc9e8d867458743a1bf715 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Mon, 26 Aug 2024 09:57:02 -0400
Subject: [PATCH 11/11] Add <stdfloat> tests

---
 test/test_pow1p.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp
index 67d7ee1acd..9b5ba7c8e7 100644
--- a/test/test_pow1p.cpp
+++ b/test/test_pow1p.cpp
@@ -8,6 +8,11 @@
 #include <boost/math/concepts/real_concept.hpp>
 #include <exception>
 #include <random>
+
+#if __has_include(<stdfloat>) && !defined(BOOST_MATH_HAS_GPU_SUPPORT)
+#  include <stdfloat>
+#endif
+
 #include "math_unit_test.hpp"
 
 template <typename T>
@@ -93,8 +98,17 @@ void test()
 
 int main()
 {
+    #ifdef __STDCPP_FLOAT32_T__
+    test<std::float32_t>();
+    #else
     test<float>();
+    #endif
+
+    #ifdef __STDCPP_FLOAT64_T__
+    test<std::float64_t>();
+    #else
     test<double>();
+    #endif
 
     #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS
     test<long double>();