From 674538068ba21d79055b79a7aadfbdbf7662abdc Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 10:35:33 -0400 Subject: [PATCH 01/11] Implement pow1p Co-authored-by: fancidev --- .../boost/math/special_functions/pow1p.hpp | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 include/boost/math/special_functions/pow1p.hpp diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp new file mode 100644 index 0000000000..6179f2146f --- /dev/null +++ b/include/boost/math/special_functions/pow1p.hpp @@ -0,0 +1,213 @@ +// (C) Copyright Matt Borland 2024. +// (C) Copyrigh Fancidev 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_MATH_SF_POW1P_HPP +#define BOOST_MATH_SF_POW1P_HPP + +#include +#include +#include +#include +#include + +// For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter +// NVRTC does not support the forward declarations header +#ifndef BOOST_MATH_ENABLE_CUDA +# include +# ifndef BOOST_MATH_HAS_NVRTC +# include +# endif // BOOST_MATH_HAS_NVRTC +#endif // BOOST_MATH_ENABLE_CUDA + +namespace boost { +namespace math { + +namespace detail { + +template +BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) +{ + BOOST_MATH_STD_USING + constexpr auto function = "boost::math::pow1p<%1%>(%1%, %1%)"; + + // The special values follow the spec of the pow() function defined in + // IEEE-754, Section 9.2.1. The order of the `if` statements matters. + if (y == T(0)) + { + // pow(x, +/-0) + return T(1); + } + + if (x == T(-1)) + { + // pow(+/-0, y) + if (boost::math::isfinite(y) && y < 0) + { + return boost::math::policies::raise_domain_error(function, "Division by 0", x, pol); + } + + // Gets correct special handling + return pow(T(0), y); + } + + if (x == T(-2) && boost::math::isinf(y)) + { + // pow(-1, +/-inf) + return T(1); + } + + if (x == T(0)) { // pow(+1, y) + return T(1); + } + + if (boost::math::isinf(y)) + { + if (boost::math::signbit(y)) + { + // Pow(y, -inf) + return (x < 0 && x > -2) ? boost::math::numeric_limits::infinity() : T(0); + } + else + { + // pow(x, +inf) + return (x < 0 && x > -2) ? T(0) : boost::math::numeric_limits::infinity(); + } + } + + if (boost::math::isinf(x)) + { + // pow(+/-inf, y) + return pow(x, y); + } + + // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled, and + // and y = {+/-0, +/-inf} have been handled. Next, we handle `nan` and + // y = {+/-1}. + if (boost::math::isnan(x)) + { + return x; + } + else if (boost::math::isnan(y)) + { + return y; + } + + if (y == T(1)) + { + return T(1) + x; + } + if (y == T(-1)) + { + return T(1) / (T(1) + x); // guaranteed no overflow + } + + // Handle (1+x) < 0 + if (x < -1) + { + if (fmod(y, T(1)) != T(0)) + { + // y is not an integer so we have an error + return boost::math::policies::raise_evaluation_error(function, "Y is not an integer", y, pol); + } + + // TODO(fancidev): maybe use (1+x)^y == [(1+x)^2]^(y/2) + return pow(1+x, y); + } + + // Now x, y are finite and not equal to 0 or +/-1, and x > -1. + // To compute (1+x)^y, write (1+x) == (s+t) where s is equal to (1+x) + // rounded toward 1, and t is the (exact) rounding error. + T s, t; + if (x < 0) + { + s = T(1) + x; + t = x - (s - T(1)); + if (t > 0) + { + #ifdef BOOST_MATH_ENABLE_CUDA + s = ::nextafter(s, T(1)); + #else + s = boost::math::nextafter(s, T(1)); + #endif + + t = x - (s - T(1)); + } + } + else if (x < 1) + { + s = T(1) + x; + t = x - (s - T(1)); + if (t < 0) + { + #ifdef BOOST_MATH_ENABLE_CUDA + s = ::nextafter(s, T(0)); + #else + s = boost::math::nextafter(s, T(0)); + #endif + + t = x - (s - T(1)); + } + } + else + { + s = x + T(1); + t = T(1) - (s - x); + if (t < 0) + { + #ifdef BOOST_MATH_ENABLE_CUDA + s = ::nextafter(s, T(0)); + #else + s = boost::math::nextafter(s, T(0)); + #endif + + t = T(1) - (s - x); + } + } + + // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0. + // Then (1+x)^y == (s+t)^y == (s^y)*((1+u)^y), where u := t / s. + // It can be shown that either both terms <= 1 or both >= 1, so + // if the first term over/underflows, then the result over/underflows. + T u = t / s; + T term1 = pow(s, y); + if (term1 == T(0) || boost::math::isinf(term1)) + { + return term1; + } + + // (1+u)^y == exp(y*log(1+u)). Since u is close to machine epsilon, + // log(1+u) ~= u. Let y*u == z+w, where z is the rounded result and + // w is the rounding error. This improves accuracy when y is large. + // Then exp(y*u) == exp(z)*exp(w). + T z = y * u; + T w = fma(y, u, -z); + T term2 = exp(z) * exp(w); + + return term1 * term2; +} + +} // Namespace detail + +template +BOOST_MATH_GPU_ENABLED tools::promote_args_t +pow1p(const T1 x, const T2 y, const Policy& pol) +{ + using result_type = tools::promote_args_t; + return detail::pow1p_imp(static_cast(x), static_cast(y), pol); +} + +template +BOOST_MATH_GPU_ENABLED tools::promote_args_t +pow1p(const T1 x, const T2 y) +{ + using result_type = tools::promote_args_t; + return detail::pow1p_imp(static_cast(x), static_cast(y), policies::policy<>()); +} + +} // Namespace math +} // Namespace boost + +#endif // BOOST_MATH_SF_POW1P_HPP From 65c41898cb182bc6bbc328b75a2eb6880b92c7cf Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 11:03:36 -0400 Subject: [PATCH 02/11] Add detection of FMA for real_concept like types --- .../boost/math/special_functions/pow1p.hpp | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp index 6179f2146f..40b78278b9 100644 --- a/include/boost/math/special_functions/pow1p.hpp +++ b/include/boost/math/special_functions/pow1p.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -18,6 +19,7 @@ #ifndef BOOST_MATH_ENABLE_CUDA # include # ifndef BOOST_MATH_HAS_NVRTC +# include # include # endif // BOOST_MATH_HAS_NVRTC #endif // BOOST_MATH_ENABLE_CUDA @@ -27,6 +29,29 @@ namespace math { namespace detail { +#ifndef BOOST_MATH_ENABLE_CUDA + +template +using fma_t = decltype(fma(std::declval(), std::declval(), std::declval())); + +template +constexpr bool is_fma_detected_v = boost::math::tools::is_detected::value; + +template , bool> = true> +BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z) +{ + using std::fma; + return fma(x, y, z); +} + +template , bool> = true> +BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z) +{ + return x * y + z; +} + +#endif // BOOST_MATH_ENABLE_CUDA + template BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) { @@ -59,7 +84,9 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) return T(1); } - if (x == T(0)) { // pow(+1, y) + if (x == T(0)) + { + // pow(+1, y) return T(1); } @@ -183,9 +210,15 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) // w is the rounding error. This improves accuracy when y is large. // Then exp(y*u) == exp(z)*exp(w). T z = y * u; + + #ifdef BOOST_MATH_ENABLE_CUDA T w = fma(y, u, -z); + #else + T w = local_fma(y, u, -z); + #endif + T term2 = exp(z) * exp(w); - + return term1 * term2; } From 988ea84a0fe99e2d40f43b6d38c1754bfa291903 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 12:13:09 -0400 Subject: [PATCH 03/11] Simplifications --- include/boost/math/special_functions/pow1p.hpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp index 40b78278b9..07b36cf03e 100644 --- a/include/boost/math/special_functions/pow1p.hpp +++ b/include/boost/math/special_functions/pow1p.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -60,7 +61,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) // The special values follow the spec of the pow() function defined in // IEEE-754, Section 9.2.1. The order of the `if` statements matters. - if (y == T(0)) + if (abs(y) == T(0)) { // pow(x, +/-0) return T(1); @@ -92,15 +93,15 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) if (boost::math::isinf(y)) { - if (boost::math::signbit(y)) + if (boost::math::signbit(y) == 1) { - // Pow(y, -inf) - return (x < 0 && x > -2) ? boost::math::numeric_limits::infinity() : T(0); + // pow(x, -inf) + return T(0); } else { // pow(x, +inf) - return (x < 0 && x > -2) ? T(0) : boost::math::numeric_limits::infinity(); + return y; } } @@ -134,12 +135,6 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) // Handle (1+x) < 0 if (x < -1) { - if (fmod(y, T(1)) != T(0)) - { - // y is not an integer so we have an error - return boost::math::policies::raise_evaluation_error(function, "Y is not an integer", y, pol); - } - // TODO(fancidev): maybe use (1+x)^y == [(1+x)^2]^(y/2) return pow(1+x, y); } From f0b351878e223a067d37396491e1bd8afcda4cd2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 12:13:35 -0400 Subject: [PATCH 04/11] Add normal testset --- test/Jamfile.v2 | 1 + test/test_pow1p.cpp | 106 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 test/test_pow1p.cpp diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 85ddf38f9b..18018d63af 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -128,6 +128,7 @@ test-suite special_fun : [ run hypot_test.cpp test_instances//test_instances pch_light ../../test/build//boost_unit_test_framework ] [ run pow_test.cpp ../../test/build//boost_unit_test_framework ] + [ run test_pow1p.cpp ] [ run logaddexp_test.cpp ../../test/build//boost_unit_test_framework ] [ run logsumexp_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx11_hdr_initializer_list cxx11_variadic_templates ] ] [ run ccmath_sqrt_test.cpp ../../test/build//boost_unit_test_framework : : : [ requires cxx17_if_constexpr ] [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : BOOST_MATH_TEST_FLOAT128 "-Bstatic -lquadmath -Bdynamic" ] ] diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp new file mode 100644 index 0000000000..9ef5295652 --- /dev/null +++ b/test/test_pow1p.cpp @@ -0,0 +1,106 @@ +// (C) Copyright Matt Borland 2024. +// (C) Copyrigh Fancidev 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include "math_unit_test.hpp" + +template +void test() +{ + using std::pow; + + // First we hit all the special cases + // pow(x, +/-0) + CHECK_EQUAL(boost::math::pow1p(T(1), T(0)), T(1)); + + // pow(0, y) + CHECK_THROW(boost::math::pow1p(T(-1), T(-1)), std::domain_error); + CHECK_EQUAL(boost::math::pow1p(T(-1), T(1)), T(0)); + + // pow(-1, inf) + CHECK_EQUAL(boost::math::pow1p(T(-2), boost::math::numeric_limits::infinity()), T(1)); + + // pow(1, y) + CHECK_EQUAL(boost::math::pow1p(T(0), T(2)), T(1)); + + // pow(x, +/-inf) + BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits::has_infinity) + { + CHECK_EQUAL(boost::math::pow1p(T(5), -boost::math::numeric_limits::infinity()), T(0)); + CHECK_EQUAL(boost::math::pow1p(T(5), boost::math::numeric_limits::infinity()), boost::math::numeric_limits::infinity()); + + + // pow(+/-inf, y) + CHECK_EQUAL(boost::math::pow1p(boost::math::numeric_limits::infinity(), T(2)), boost::math::numeric_limits::infinity()); + CHECK_EQUAL(boost::math::pow1p(-boost::math::numeric_limits::infinity(), T(2)), boost::math::numeric_limits::infinity()); + } + + // NANs for x and y + BOOST_MATH_IF_CONSTEXPR (boost::math::numeric_limits::has_quiet_NaN) + { + CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(boost::math::numeric_limits::quiet_NaN(), T(1))), true); + CHECK_EQUAL(boost::math::isnan(boost::math::pow1p(T(1), boost::math::numeric_limits::quiet_NaN())), true); + } + + // pow(x, +/-1) + CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(1)), pow(T(3), T(1)), 10); + CHECK_ULP_CLOSE(boost::math::pow1p(T(2), T(-1)), pow(T(3), T(-1)), 10); + + // (1+x) < 0 + CHECK_ULP_CLOSE(boost::math::pow1p(T(-3), T(2)), pow(T(-2), T(2)), 10); + + // x < 0 + std::mt19937_64 rng; + std::uniform_real_distribution dist (-1, 0); + std::uniform_real_distribution dist_y (0, 10); + constexpr int N = 1024; + for (int i = 0; i < N; ++i) + { + const auto x = static_cast(dist(rng)); + const auto y = static_cast(dist_y(rng)); + + CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100); + } + + // 0 < x < 1 + std::uniform_real_distribution dist_x_1(0, 1); + for (int i = 0; i < N; ++i) + { + const auto x = static_cast(dist_x_1(rng)); + const auto y = static_cast(dist_y(rng)); + + CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100); + } + + // Else + std::uniform_real_distribution dist_other_x(1, 1000); + for (int i = 0; i < N; ++i) + { + const auto x = static_cast(dist_other_x(rng)); + const auto y = static_cast(dist_y(rng)); + + CHECK_ULP_CLOSE(boost::math::pow1p(x, y), pow(x + 1, y), 100); + } +} + +int main() +{ + test(); + test(); + + #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS + test(); + #endif + + #ifndef BOOST_MATH_NO_REAL_CONCEPT_TESTS + test(); + #endif + + return boost::math::test::report_errors(); +} From 80b94cc9fd479683ed9b5a1f0664b6916378acdd Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 13:13:42 -0400 Subject: [PATCH 05/11] Add to special functions index --- include/boost/math/special_functions.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/boost/math/special_functions.hpp b/include/boost/math/special_functions.hpp index d3aa95c5c1..3275ab3d2b 100644 --- a/include/boost/math/special_functions.hpp +++ b/include/boost/math/special_functions.hpp @@ -79,6 +79,7 @@ #include #include #include +#include #ifndef BOOST_MATH_NO_EXCEPTIONS #include #endif From 7d4794abb3dda1cd49f287d87bd3feb8dcd89cd0 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 13:30:19 -0400 Subject: [PATCH 06/11] Add pow1p NVCC testing --- test/cuda_jamfile | 2 + test/test_pow1p_double.cu | 111 ++++++++++++++++++++++++++++++++++++++ test/test_pow1p_float.cu | 111 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 test/test_pow1p_double.cu create mode 100644 test/test_pow1p_float.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 64fac76fba..2829aa19ee 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -171,3 +171,5 @@ run test_trigamma_float.cu ; run test_trunc_double.cu ; run test_trunc_float.cu ; +run test_pow1p_double.cu ; +run test_pow1p_float.cu ; diff --git a/test/test_pow1p_double.cu b/test/test_pow1p_double.cu new file mode 100644 index 0000000000..c7746bd3e7 --- /dev/null +++ b/test/test_pow1p_double.cu @@ -0,0 +1,111 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef double float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::pow1p(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 rng; + std::uniform_real_distribution x_vals(-1, 1); + std::uniform_real_distribution y_vals(-10, 10); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = x_vals(rng); + input_vector2[i] = y_vals(rng); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + { + results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i])); + } + + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_pow1p_float.cu b/test/test_pow1p_float.cu new file mode 100644 index 0000000000..0ecdf0606c --- /dev/null +++ b/test/test_pow1p_float.cu @@ -0,0 +1,111 @@ + +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +typedef float float_type; + +/** + * CUDA Kernel Device code + * + */ +__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + using std::cos; + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::math::pow1p(in1[i], in2[i]); + } +} + +/** + * Host main routine + */ +int main(void) +{ + // Error code to check return values for CUDA calls + cudaError_t err = cudaSuccess; + + // Print the vector length to be used, and compute its size + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + // Allocate the managed input vector A + cuda_managed_ptr input_vector1(numElements); + + // Allocate the managed input vector B + cuda_managed_ptr input_vector2(numElements); + + // Allocate the managed output vector C + cuda_managed_ptr output_vector(numElements); + + // Initialize the input vectors + std::mt19937_64 rng; + std::uniform_real_distribution x_vals(-1, 1); + std::uniform_real_distribution y_vals(-10, 10); + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = x_vals(rng); + input_vector2[i] = y_vals(rng); + } + + // Launch the Vector Add CUDA Kernel + int threadsPerBlock = 1024; + int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + + if (err != cudaSuccess) + { + std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + // Verify that the result vector is correct + std::vector results; + results.reserve(numElements); + w.reset(); + for(int i = 0; i < numElements; ++i) + { + results.push_back(boost::math::pow1p(input_vector1[i], input_vector2[i])); + } + + double t = w.elapsed(); + // check the results + for(int i = 0; i < numElements; ++i) + { + if (boost::math::epsilon_difference(output_vector[i], results[i]) > 10) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From f6fd761e2197a064149b669c772955783182d81e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 13:39:14 -0400 Subject: [PATCH 07/11] Add NVRTC testing and fix missing header --- .../boost/math/special_functions/pow1p.hpp | 1 + test/nvrtc_jamfile | 1 + test/test_pow1p_nvrtc_double.cpp | 191 ++++++++++++++++++ test/test_pow1p_nvrtc_float.cpp | 191 ++++++++++++++++++ 4 files changed, 384 insertions(+) create mode 100644 test/test_pow1p_nvrtc_double.cpp create mode 100644 test/test_pow1p_nvrtc_float.cpp diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp index 07b36cf03e..2666808dbd 100644 --- a/include/boost/math/special_functions/pow1p.hpp +++ b/include/boost/math/special_functions/pow1p.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include // For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter diff --git a/test/nvrtc_jamfile b/test/nvrtc_jamfile index de235822ef..d636fc67be 100644 --- a/test/nvrtc_jamfile +++ b/test/nvrtc_jamfile @@ -168,3 +168,4 @@ run test_trigamma_nvrtc_double.cpp ; run test_trigamma_nvrtc_float.cpp ; run test_trunc_nvrtc_double.cpp ; +run test_pow1p_nvrtc_double.cpp ; diff --git a/test/test_pow1p_nvrtc_double.cpp b/test/test_pow1p_nvrtc_double.cpp new file mode 100644 index 0000000000..d9f97d94c0 --- /dev/null +++ b/test/test_pow1p_nvrtc_double.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::pow1p(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_trunc_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution x_vals(-1, 1); + std::uniform_real_distribution y_vals(-10, 10); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(x_vals(rng)); + h_in2[i] = static_cast(y_vals(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::pow1p(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/test/test_pow1p_nvrtc_float.cpp b/test/test_pow1p_nvrtc_float.cpp new file mode 100644 index 0000000000..d9f97d94c0 --- /dev/null +++ b/test/test_pow1p_nvrtc_float.cpp @@ -0,0 +1,191 @@ +// Copyright John Maddock 2016. +// Copyright Matt Borland 2024. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error +#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false + +// Must be included first +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +typedef double float_type; + +const char* cuda_kernel = R"( +typedef double float_type; +#include +#include +extern "C" __global__ +void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) + { + out[i] = boost::math::pow1p(in1[i], in2[i]); + } +} +)"; + +void checkCUDAError(cudaError_t result, const char* msg) +{ + if (result != cudaSuccess) + { + std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkCUError(CUresult result, const char* msg) +{ + if (result != CUDA_SUCCESS) + { + const char* errorStr; + cuGetErrorString(result, &errorStr); + std::cerr << msg << ": " << errorStr << std::endl; + exit(EXIT_FAILURE); + } +} + +void checkNVRTCError(nvrtcResult result, const char* msg) +{ + if (result != NVRTC_SUCCESS) + { + std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl; + exit(EXIT_FAILURE); + } +} + +int main() +{ + try + { + // Initialize CUDA driver API + checkCUError(cuInit(0), "Failed to initialize CUDA"); + + // Create CUDA context + CUcontext context; + CUdevice device; + checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device"); + checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context"); + + nvrtcProgram prog; + nvrtcResult res; + + res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr); + checkNVRTCError(res, "Failed to create NVRTC program"); + + nvrtcAddNameExpression(prog, "test_trunc_kernel"); + + #ifdef BOOST_MATH_NVRTC_CI_RUN + const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #else + const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"}; + #endif + + // Compile the program + res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts); + if (res != NVRTC_SUCCESS) + { + size_t log_size; + nvrtcGetProgramLogSize(prog, &log_size); + char* log = new char[log_size]; + nvrtcGetProgramLog(prog, log); + std::cerr << "Compilation failed:\n" << log << std::endl; + delete[] log; + exit(EXIT_FAILURE); + } + + // Get PTX from the program + size_t ptx_size; + nvrtcGetPTXSize(prog, &ptx_size); + char* ptx = new char[ptx_size]; + nvrtcGetPTX(prog, ptx); + + // Load PTX into CUDA module + CUmodule module; + CUfunction kernel; + checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module"); + checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function"); + + int numElements = 5000; + float_type *h_in1, *h_in2, *h_out; + float_type *d_in1, *d_in2, *d_out; + + // Allocate memory on the host + h_in1 = new float_type[numElements]; + h_in2 = new float_type[numElements]; + h_out = new float_type[numElements]; + + // Initialize input arrays + std::mt19937_64 rng(42); + std::uniform_real_distribution x_vals(-1, 1); + std::uniform_real_distribution y_vals(-10, 10); + for (int i = 0; i < numElements; ++i) + { + h_in1[i] = static_cast(x_vals(rng)); + h_in2[i] = static_cast(y_vals(rng)); + } + + checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1"); + checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2"); + checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out"); + + checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1"); + checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2"); + + int blockSize = 256; + int numBlocks = (numElements + blockSize - 1) / blockSize; + void* args[] = { &d_in1, &d_in2, &d_out, &numElements }; + checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed"); + + checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out"); + + // Verify Result + for (int i = 0; i < numElements; ++i) + { + const auto res = boost::math::pow1p(h_in1[i], h_in2[i]); + + if (std::isfinite(res)) + { + if (boost::math::epsilon_difference(res, h_out[i]) > 300) + { + std::cout << "error at line: " << i + << "\nParallel: " << h_out[i] + << "\n Serial: " << res + << "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl; + } + } + } + + cudaFree(d_in1); + cudaFree(d_in2); + cudaFree(d_out); + delete[] h_in1; + delete[] h_in2; + delete[] h_out; + + nvrtcDestroyProgram(&prog); + delete[] ptx; + + cuCtxDestroy(context); + + std::cout << "Kernel executed successfully." << std::endl; + return 0; + } + catch(const std::exception& e) + { + std::cerr << "Stopped with exception: " << e.what() << std::endl; + return EXIT_FAILURE; + } +} From 6066680f59612c7f85220fa676b9518be000f71a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 13:44:47 -0400 Subject: [PATCH 08/11] Add testing and fixes for SYCL --- include/boost/math/special_functions/pow1p.hpp | 14 +++++++------- test/sycl_jamfile | 1 + test/test_pow1p.cpp | 2 ++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp index 2666808dbd..22781e6023 100644 --- a/include/boost/math/special_functions/pow1p.hpp +++ b/include/boost/math/special_functions/pow1p.hpp @@ -18,7 +18,7 @@ // For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter // NVRTC does not support the forward declarations header -#ifndef BOOST_MATH_ENABLE_CUDA +#ifndef BOOST_MATH_HAS_GPU_SUPPORT # include # ifndef BOOST_MATH_HAS_NVRTC # include @@ -31,7 +31,7 @@ namespace math { namespace detail { -#ifndef BOOST_MATH_ENABLE_CUDA +#ifndef BOOST_MATH_HAS_GPU_SUPPORT template using fma_t = decltype(fma(std::declval(), std::declval(), std::declval())); @@ -52,7 +52,7 @@ BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, return x * y + z; } -#endif // BOOST_MATH_ENABLE_CUDA +#endif // BOOST_MATH_HAS_GPU_SUPPORT template BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) @@ -150,7 +150,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) t = x - (s - T(1)); if (t > 0) { - #ifdef BOOST_MATH_ENABLE_CUDA + #ifdef BOOST_MATH_HAS_GPU_SUPPORT s = ::nextafter(s, T(1)); #else s = boost::math::nextafter(s, T(1)); @@ -165,7 +165,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) t = x - (s - T(1)); if (t < 0) { - #ifdef BOOST_MATH_ENABLE_CUDA + #ifdef BOOST_MATH_HAS_GPU_SUPPORT s = ::nextafter(s, T(0)); #else s = boost::math::nextafter(s, T(0)); @@ -180,7 +180,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) t = T(1) - (s - x); if (t < 0) { - #ifdef BOOST_MATH_ENABLE_CUDA + #ifdef BOOST_MATH_HAS_GPU_SUPPORT s = ::nextafter(s, T(0)); #else s = boost::math::nextafter(s, T(0)); @@ -207,7 +207,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) // Then exp(y*u) == exp(z)*exp(w). T z = y * u; - #ifdef BOOST_MATH_ENABLE_CUDA + #ifdef BOOST_MATH_HAS_GPU_SUPPORT T w = fma(y, u, -z); #else T w = local_fma(y, u, -z); diff --git a/test/sycl_jamfile b/test/sycl_jamfile index 97c48474ca..3dd7d054af 100644 --- a/test/sycl_jamfile +++ b/test/sycl_jamfile @@ -38,3 +38,4 @@ run test_digamma_simple.cpp ; run test_trigamma.cpp ; run test_erf.cpp ; run test_gamma.cpp ; +run test_pow1p.cpp ; diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp index 9ef5295652..67d7ee1acd 100644 --- a/test/test_pow1p.cpp +++ b/test/test_pow1p.cpp @@ -20,7 +20,9 @@ void test() CHECK_EQUAL(boost::math::pow1p(T(1), T(0)), T(1)); // pow(0, y) + #ifndef BOOST_MATH_NO_EXCEPTIONS CHECK_THROW(boost::math::pow1p(T(-1), T(-1)), std::domain_error); + #endif CHECK_EQUAL(boost::math::pow1p(T(-1), T(1)), T(0)); // pow(-1, inf) From 866d9d01652138daf6f2a6b7b1e2f65f36ed89e2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 19 Aug 2024 13:54:56 -0400 Subject: [PATCH 09/11] Add pow1p to the docs --- doc/math.qbk | 1 + doc/sf/pow1p.qbk | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 doc/sf/pow1p.qbk diff --git a/doc/math.qbk b/doc/math.qbk index d6b90efb0b..67375da114 100644 --- a/doc/math.qbk +++ b/doc/math.qbk @@ -313,6 +313,7 @@ and use the function's name as the link text.] [def __sqrt1pm1 [link math_toolkit.powers.sqrt1pm1 sqrt1pm1]] [def __rsqrt [link math_toolkit.powers.rsqrt rsqrt]] [def __powm1 [link math_toolkit.powers.powm1 powm1]] +[def __pow1p [link math_toolkit.powers.pow1p pow1p]] [def __hypot [link math_toolkit.powers.hypot hypot]] [def __pow [link math_toolkit.powers.ct_pow pow]] diff --git a/doc/sf/pow1p.qbk b/doc/sf/pow1p.qbk new file mode 100644 index 0000000000..a2201d77ae --- /dev/null +++ b/doc/sf/pow1p.qbk @@ -0,0 +1,34 @@ +[/ + Copyright Matt Borland 2024 + Distributed under the Boost Software License, Version 1.0. + (See accompanying file LICENSE_1_0.txt or copy at + http://www.boost.org/LICENSE_1_0.txt). +] + +[section:pow1p pow1p] + +[h4 Synopsis] + + #include + + namespace boost { + namespace math { + + template + BOOST_MATH_GPU_ENABLED tools::promote_args_t + pow1p(const T1 x, const T2 y, const Policy& pol) + + template + BOOST_MATH_GPU_ENABLED tools::promote_args_t + pow1p(const T1 x, const T2 y) + + }} // namespaces + + +The function `pow1p` computes (1 + x)[super y ] where x and y are real numbers. +This function is particularly useful for scenarios where adding 1 to x before raising it to the power of y is required. +It provides a more numerically stable and efficient way to perform this computation, especially for small values of x. +When x is very small, directly computing (1 + x)[super y ] using standard arithmetic operations can lead to a loss of precision due to floating-point arithmetic limitations. +The pow1p function helps mitigate this issue by internally handling such cases more accurately. + +[endsect] From 1a450eb109bcfec1db4af51704ad96855005ac76 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 09:56:52 -0400 Subject: [PATCH 10/11] Simplify FMA, fix concept fails, and sync from upstream --- .../boost/math/special_functions/pow1p.hpp | 81 ++++++++++++------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/include/boost/math/special_functions/pow1p.hpp b/include/boost/math/special_functions/pow1p.hpp index 22781e6023..1b718f99bf 100644 --- a/include/boost/math/special_functions/pow1p.hpp +++ b/include/boost/math/special_functions/pow1p.hpp @@ -17,13 +17,10 @@ #include // For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter -// NVRTC does not support the forward declarations header #ifndef BOOST_MATH_HAS_GPU_SUPPORT # include -# ifndef BOOST_MATH_HAS_NVRTC -# include -# include -# endif // BOOST_MATH_HAS_NVRTC +# include +# include #endif // BOOST_MATH_ENABLE_CUDA namespace boost { @@ -52,6 +49,20 @@ BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, return x * y + z; } +#else + +template +BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED T local_fma(const T x, const T y, const T z) +{ + return ::fma(x, y, z); +} + +template <> +BOOST_MATH_FORCEINLINE BOOST_MATH_GPU_ENABLED float local_fma(const float x, const float y, const float z) +{ + return ::fmaf(x, y, z); +} + #endif // BOOST_MATH_HAS_GPU_SUPPORT template @@ -71,7 +82,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) if (x == T(-1)) { // pow(+/-0, y) - if (boost::math::isfinite(y) && y < 0) + if ((boost::math::isfinite)(y) && y < 0) { return boost::math::policies::raise_domain_error(function, "Division by 0", x, pol); } @@ -80,7 +91,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) return pow(T(0), y); } - if (x == T(-2) && boost::math::isinf(y)) + if (x == T(-2) && (boost::math::isinf)(y)) { // pow(-1, +/-inf) return T(1); @@ -92,9 +103,9 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) return T(1); } - if (boost::math::isinf(y)) + if ((boost::math::isinf)(y)) { - if (boost::math::signbit(y) == 1) + if ((boost::math::signbit)(y) == 1) { // pow(x, -inf) return T(0); @@ -106,7 +117,7 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) } } - if (boost::math::isinf(x)) + if ((boost::math::isinf)(x)) { // pow(+/-inf, y) return pow(x, y); @@ -115,11 +126,11 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) // Up to this point, (1+x) = {+/-0, +/-1, +/-inf} have been handled, and // and y = {+/-0, +/-inf} have been handled. Next, we handle `nan` and // y = {+/-1}. - if (boost::math::isnan(x)) + if ((boost::math::isnan)(x)) { return x; } - else if (boost::math::isnan(y)) + else if ((boost::math::isnan)(y)) { return y; } @@ -191,30 +202,46 @@ BOOST_MATH_GPU_ENABLED T pow1p_imp(const T x, const T y, const Policy& pol) } // Because x > -1 and s is rounded toward 1, s is guaranteed to be > 0. - // Then (1+x)^y == (s+t)^y == (s^y)*((1+u)^y), where u := t / s. - // It can be shown that either both terms <= 1 or both >= 1, so + // Write (1+x)^y == (s+t)^y == (s^y)*((1+t/s)^y) == term1*term2. + // It can be shown that either both terms <= 1 or both >= 1; so // if the first term over/underflows, then the result over/underflows. - T u = t / s; + // And of course, term2 == 1 if t == 0. T term1 = pow(s, y); - if (term1 == T(0) || boost::math::isinf(term1)) + if (t == T(0) || term1 == T(0) || (boost::math::isinf)(term1)) { return term1; } - // (1+u)^y == exp(y*log(1+u)). Since u is close to machine epsilon, - // log(1+u) ~= u. Let y*u == z+w, where z is the rounded result and - // w is the rounding error. This improves accuracy when y is large. - // Then exp(y*u) == exp(z)*exp(w). - T z = y * u; + // (1+t/s)^y == exp(y*log(1+t/s)). The relative error of the result is + // equal to the absolute error of the exponent (plus the relative error + // of 'exp'). Therefore, when the exponent is small, it is accurately + // evaluated to machine epsilon using T arithmetic. In addition, since + // t/s <= epsilon, log(1+t/s) is well approximated by t/s to first order. + T u = t / s; + T w = y * u; + if (abs(w) <= T(0.5)) + { + T term2 = exp(w); + return term1 * term2; + } + + // Now y*log(1+t/s) is large, and its relative error is "magnified" by + // the exponent. To reduce the error, we use double-T arithmetic, and + // expand log(1+t/s) to second order. + + // (u + uu) ~= t/s. + T r1 = local_fma(-u, s, t); + T uu = r1 / s; - #ifdef BOOST_MATH_HAS_GPU_SUPPORT - T w = fma(y, u, -z); - #else - T w = local_fma(y, u, -z); - #endif + // (u + vv) ~= log(1+(u+uu)) ~= log(1+t/s). + T vv = local_fma(T(-0.5)*u, u, uu); - T term2 = exp(z) * exp(w); + // (w + ww) ~= y*(u+vv) ~= y*log(1+t/s). + T r2 = local_fma(y, u, -w); + T ww = local_fma(y, vv, r2); + // TODO: maybe ww is small enough such that exp(ww) ~= 1+ww. + T term2 = exp(w) * exp(ww); return term1 * term2; } From f7150ee9765eec76cedc9e8d867458743a1bf715 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Mon, 26 Aug 2024 09:57:02 -0400 Subject: [PATCH 11/11] Add tests --- test/test_pow1p.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/test_pow1p.cpp b/test/test_pow1p.cpp index 67d7ee1acd..9b5ba7c8e7 100644 --- a/test/test_pow1p.cpp +++ b/test/test_pow1p.cpp @@ -8,6 +8,11 @@ #include #include #include + +#if __has_include() && !defined(BOOST_MATH_HAS_GPU_SUPPORT) +# include +#endif + #include "math_unit_test.hpp" template @@ -93,8 +98,17 @@ void test() int main() { + #ifdef __STDCPP_FLOAT32_T__ + test(); + #else test(); + #endif + + #ifdef __STDCPP_FLOAT64_T__ + test(); + #else test(); + #endif #ifndef BOOST_MATH_NO_LONG_DOUBLE_MATH_FUNCTIONS test();