From 8cf4a1ff6589b1541df775bb306fbbd643ea25ac Mon Sep 17 00:00:00 2001
From: Fikret Ardal
Date: Tue, 1 Jul 2025 19:26:32 +0300
Subject: [PATCH] added reduce_mul implementation for common arch and
 specialized for x86_64 architecture

---
 .../arch/common/xsimd_common_arithmetic.hpp     | 14 +++++++
 .../arch/common/xsimd_common_details.hpp        |  2 +
 .../xsimd/arch/common/xsimd_common_math.hpp     | 34 +++++++++++++++++
 include/xsimd/arch/xsimd_avx.hpp                |  9 +++++
 include/xsimd/arch/xsimd_avx512f.hpp            | 33 +++++++++++++++++
 include/xsimd/arch/xsimd_common_fwd.hpp         |  2 +
 include/xsimd/arch/xsimd_sse2.hpp               | 37 +++++++++++++++++++
 include/xsimd/arch/xsimd_sse3.hpp               |  9 +++++
 include/xsimd/types/xsimd_api.hpp               | 14 +++++++
 test/test_batch.cpp                             |  8 ++++
 test/test_batch_complex.cpp                     |  7 ++++
 11 files changed, 169 insertions(+)

diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index dba8c38ca..745b2e075 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -141,6 +141,20 @@ namespace xsimd
             return res;
         }
 
+        // hmul
+        template <class A, class T /*, class = typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
+        XSIMD_INLINE T hmul(batch<T, A> const& self, requires_arch<common>) noexcept
+        {
+            alignas(A::alignment()) T buffer[batch<T, A>::size];
+            self.store_aligned(buffer);
+            T res = 1;
+            for (T val : buffer)
+            {
+                res *= val;
+            }
+            return res;
+        }
+
         // incr
         template <class A, class T>
         XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
index 03088eec4..515587e77 100644
--- a/include/xsimd/arch/common/xsimd_common_details.hpp
+++ b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -74,6 +74,8 @@ namespace xsimd
     XSIMD_INLINE batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
     template <class T, class A>
     XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
+    template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
     template <class T, class A>
     XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
     template <class T, class A>
diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index 9bc460922..1624b77fd 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -2114,6 +2114,24 @@ namespace xsimd
                 }
             };
 
+            template <class T, T N>
+            struct reverse
+            {
+                static constexpr T get(T i, T)
+                {
+                    return i >= N ? i : N - i - 1;
+                }
+            };
+
+            template <class A, class T, unsigned Lvl>
+            XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(Lvl == 1) { return self.get(0); }
+                using index_type = as_unsigned_integer_t<T>;
+                using batch_type = batch<std::complex<T>, A>;
+                batch_type rev = swizzle(self, make_batch_constant<index_type, reverse<index_type, Lvl>, A>());
+                return reduce_mul(self * rev, std::integral_constant<unsigned, Lvl / 2>());
+            }
             template <class Op, class A, class T>
             XSIMD_INLINE T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
             {
@@ -2129,6 +2147,13 @@ namespace xsimd
             }
         }
 
+        // reduce_mul
+        template <class A, class T>
+        XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
+        {
+            return detail::reduce_mul(self, std::integral_constant<unsigned, batch<std::complex<T>, A>::size>());
+        }
+
         // reduce_max
         template <class A, class T>
         XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -2147,6 +2172,15 @@ namespace xsimd
                                   self, std::integral_constant<unsigned, batch<T, A>::size>());
         }
 
+        // reduce_mul
+        template <class A, class T>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
+        {
+            return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
+                                  { return mul(x, y); },
+                                  self, std::integral_constant<unsigned, batch<T, A>::size>());
+        }
+
         // remainder
         template <class A>
         XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 883f055be..eb36d7b5e 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1076,6 +1076,15 @@ namespace xsimd
             __m128i low = _mm256_castsi256_si128(acc);
             return reduce_min(batch<T, sse4_2>(low));
         }
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            typename batch<T, sse4_2>::register_type low, high;
+            detail::split_avx(self, low, high);
+            batch<T, sse4_2> blow(low), bhigh(high);
+            return reduce_mul(blow * bhigh);
+        }
 
         // rsqrt
         template <class A>
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 26947dffc..cd75494d3 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -17,6 +17,11 @@
 #include <type_traits>
 
 #include "../types/xsimd_avx512f_register.hpp"
+#include "xsimd/config/xsimd_inline.hpp"
+#include "xsimd/types/xsimd_register.hpp"
+#include "xsimd/xsimd.hpp"
+
+#include <complex>
 
 namespace xsimd
 {
@@ -1545,6 +1550,34 @@ namespace xsimd
             __m256i low = _mm512_castsi512_si256(acc);
             return reduce_min(batch<T, avx2>(low));
         }
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
+            {
+                return _mm512_reduce_mul_ps(self);
+            }
+            else XSIMD_IF_CONSTEXPR(std::is_same<T, double>::value)
+            {
+                return _mm512_reduce_mul_pd(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_reduce_mul_epi32(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_reduce_mul_epi64(self);
+            }
+            else
+            {
+                typename batch<T, avx2>::register_type low, high;
+                detail::split_avx512(self, low, high);
+                batch<T, avx2> blow(low), bhigh(high);
+                return reduce_mul(blow * bhigh);
+            }
+        }
 
         // rsqrt
         template <class A>
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index 26a9bdbdb..dcfd47698 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -37,6 +37,8 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T hmul(batch<T, A> const& self, requires_arch<common>) noexcept;
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 94d7af5d4..d83162177 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1252,6 +1252,43 @@ namespace xsimd
        {
            return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
        }
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_shuffle_ps(tmp1, tmp1, 0x1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                tmp1 = tmp1 * self;
+                return _mm_cvtsi128_si64(tmp1);
+            }
+            else
+            {
+                return hmul(self, common {});
+            }
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_pd(self, _mm_unpackhi_pd(self, self)));
+        }
 
        // reduce_max
        template <class A, class T, class = typename std::enable_if<(sizeof(T) <= 2), void>::type>
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
index 9dbc4b343..a88d693cd 100644
--- a/include/xsimd/arch/xsimd_sse3.hpp
+++ b/include/xsimd/arch/xsimd_sse3.hpp
@@ -51,6 +51,15 @@ namespace xsimd
            return _mm_cvtss_f32(tmp1);
        }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
    }
 }
 
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 54ac836d2..4dbba6d93 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -1864,6 +1864,14 @@ namespace xsimd
        return kernel::reduce_min(x, A {});
    }
 
+    /**
+     * @ingroup batch_reducers
+     *
+     * Product of all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_mul(x, A {});
+    }
+
    /**
     * @ingroup batch_math
     *
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
index 05c13b4b8..403cd22b4 100644
--- a/test/test_batch.cpp
+++ b/test/test_batch.cpp
@@ -800,6 +800,14 @@ struct batch_test
            INFO("reduce_min");
            CHECK_SCALAR_EQ(res, expected);
        }
+
+        // reduce_mul
+        {
+            value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type{1}, std::multiplies<value_type>());
+            value_type res = reduce_mul(batch_lhs());
+            INFO("reduce_mul");
+            CHECK_SCALAR_EQ(res, expected);
+        }
    }
 
    template <class T = value_type>
diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp
index e06b31807..a51bb7c82 100644
--- a/test/test_batch_complex.cpp
+++ b/test/test_batch_complex.cpp
@@ -572,6 +572,13 @@ struct batch_complex_test
            value_type res = reduce_add(batch_lhs());
            CHECK_SCALAR_EQ(res, expected);
        }
+
+        // reduce_mul
+        {
+            value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type{1}, std::multiplies<value_type>());
+            value_type res = reduce_mul(batch_lhs());
+            CHECK_SCALAR_EQ(res, expected);
+        }
    }
 
    void test_fused_operations() const
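
Reviewer note (not part of the patch): a minimal standalone sketch of how the new reducer is used through the public entry point added in xsimd_api.hpp. Everything except reduce_mul itself is existing xsimd API; batch width and alignment come from whatever default architecture the build selects.

#include <cstddef>
#include <iostream>

#include "xsimd/xsimd.hpp"

int main()
{
    // Default-arch batch, e.g. 4 float lanes on SSE, 8 on AVX.
    using batch_type = xsimd::batch<float>;

    // Fill a register-wide buffer with 1, 2, 3, ...
    alignas(xsimd::default_arch::alignment()) float data[batch_type::size];
    for (std::size_t i = 0; i < batch_type::size; ++i)
        data[i] = static_cast<float>(i + 1);

    // reduce_mul multiplies all lanes together: 1 * 2 * ... * size.
    batch_type b = batch_type::load_aligned(data);
    std::cout << xsimd::reduce_mul(b) << '\n'; // 24 for a 4-lane batch, 40320 for 8 lanes
    return 0;
}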