From 8cf4a1ff6589b1541df775bb306fbbd643ea25ac Mon Sep 17 00:00:00 2001
From: Fikret Ardal
Date: Tue, 1 Jul 2025 19:26:32 +0300
Subject: [PATCH] added reduce_mul implementation for common arch and
 specialized for x86_64 architecture

---
 .../arch/common/xsimd_common_arithmetic.hpp     | 14 +++++++
 .../arch/common/xsimd_common_details.hpp        |  2 +
 .../xsimd/arch/common/xsimd_common_math.hpp     | 34 +++++++++++++++++
 include/xsimd/arch/xsimd_avx.hpp                |  9 +++++
 include/xsimd/arch/xsimd_avx512f.hpp            | 33 +++++++++++++++++
 include/xsimd/arch/xsimd_common_fwd.hpp         |  2 +
 include/xsimd/arch/xsimd_sse2.hpp               | 37 +++++++++++++++++++
 include/xsimd/arch/xsimd_sse3.hpp               |  9 +++++
 include/xsimd/types/xsimd_api.hpp               | 14 +++++++
 test/test_batch.cpp                             |  8 ++++
 test/test_batch_complex.cpp                     |  7 ++++
 11 files changed, 169 insertions(+)

diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index dba8c38ca..745b2e075 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -141,6 +141,20 @@ namespace xsimd
             return res;
         }
 
+        // hmul
+        template <class A, class T /*, class = typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
+        XSIMD_INLINE T hmul(batch<T, A> const& self, requires_arch<common>) noexcept
+        {
+            alignas(A::alignment()) T buffer[batch<T, A>::size];
+            self.store_aligned(buffer);
+            T res = 1;
+            for (T val : buffer)
+            {
+                res *= val;
+            }
+            return res;
+        }
+
         // incr
         template <class A, class T>
         XSIMD_INLINE batch<T, A> incr(batch<T, A> const& self, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
index 03088eec4..515587e77 100644
--- a/include/xsimd/arch/common/xsimd_common_details.hpp
+++ b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -74,6 +74,8 @@ namespace xsimd
     XSIMD_INLINE batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
     template <class T, class A>
     XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
+    template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const&) noexcept;
     template <class T, class A>
     XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
     template <class T, class A>
diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index 9bc460922..1624b77fd 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -2114,6 +2114,24 @@ namespace xsimd
                 }
             };
 
+            template <class T, T N>
+            struct reverse
+            {
+                static constexpr T get(T i, T)
+                {
+                    return i >= N ? i : N - i - 1;
+                }
+            };
+
+            template <class A, class T, unsigned Lvl>
+            XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(Lvl == 1) { return self.get(0); }
+                using index_type = as_unsigned_integer_t<T>;
+                using batch_type = batch<std::complex<T>, A>;
+                batch_type rev = swizzle(self, make_batch_constant<index_type, reverse<index_type, Lvl>, A>());
+                return reduce_mul(self * rev, std::integral_constant<unsigned, Lvl / 2>());
+            }
             template <class Op, class A, class T>
             XSIMD_INLINE T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
             {
@@ -2129,6 +2147,13 @@ namespace xsimd
             }
         }
 
+        // reduce_mul
+        template <class A, class T>
+        XSIMD_INLINE std::complex<T> reduce_mul(batch<std::complex<T>, A> const& self, requires_arch<common>) noexcept
+        {
+            return detail::reduce_mul(self, std::integral_constant<unsigned, batch<std::complex<T>, A>::size>());
+        }
+
         // reduce_max
         template <class A, class T>
         XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -2147,6 +2172,15 @@ namespace xsimd
                                   self, std::integral_constant<unsigned, batch<T, A>::size>());
         }
 
+        // reduce_mul
+        template <class A, class T>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<common>) noexcept
+        {
+            return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
+                                  { return mul(x, y); },
+                                  self, std::integral_constant<unsigned, batch<T, A>::size>());
+        }
+
         // remainder
         template <class A>
         XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 883f055be..eb36d7b5e 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1076,6 +1076,15 @@ namespace xsimd
             __m128i low = _mm256_castsi256_si128(acc);
             return reduce_min(batch<T, sse4_2>(low));
         }
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            typename batch<T, sse4_2>::register_type low, high;
+            detail::split_avx(self, low, high);
+            batch<T, sse4_2> blow(low), bhigh(high);
+            return reduce_mul(blow * bhigh);
+        }
 
         // rsqrt
         template <class A>
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 26947dffc..cd75494d3 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -17,6 +17,11 @@
 #include <type_traits>
 
 #include "../types/xsimd_avx512f_register.hpp"
+#include "xsimd/config/xsimd_inline.hpp"
+#include "xsimd/types/xsimd_register.hpp"
+#include "xsimd/xsimd.hpp"
+
+#include <complex>
 
 namespace xsimd
 {
@@ -1545,6 +1550,34 @@ namespace xsimd
             __m256i low = _mm512_castsi512_si256(acc);
             return reduce_min(batch<T, avx2>(low));
         }
+        // reduce_mul
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
+            {
+                return _mm512_reduce_mul_ps(self);
+            }
+            else XSIMD_IF_CONSTEXPR(std::is_same<T, double>::value)
+            {
+                return _mm512_reduce_mul_pd(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm512_reduce_mul_epi32(self);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm512_reduce_mul_epi64(self);
+            }
+            else
+            {
+                typename batch<T, avx2>::register_type low, high;
+                detail::split_avx512(self, low, high);
+                batch<T, avx2> blow(low), bhigh(high);
+                return reduce_mul(blow * bhigh);
+            }
+        }
 
         // rsqrt
         template <class A>
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index 26a9bdbdb..dcfd47698 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -37,6 +37,8 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept;
+        template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
+        XSIMD_INLINE T hmul(batch<T, A> const& self, requires_arch<common>) noexcept;
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index 94d7af5d4..d83162177 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1252,6 +1252,43 @@ namespace xsimd
        {
            return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
        }
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_shuffle_ps(tmp1, tmp1, 0x1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE T reduce_mul(batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                batch<T, A> tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3));
+                tmp1 = tmp1 * self;
+                batch<T, A> tmp2 = _mm_unpackhi_epi32(tmp1, tmp1);
+                tmp2 = tmp2 * tmp1;
+                return _mm_cvtsi128_si32(tmp2);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, A> tmp1 = _mm_unpackhi_epi64(self, self);
+                tmp1 = tmp1 * self;
+                return _mm_cvtsi128_si64(tmp1);
+            }
+            else
+            {
+                return hmul(self, common {});
+            }
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_mul(batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            return _mm_cvtsd_f64(_mm_mul_pd(self, _mm_unpackhi_pd(self, self)));
+        }
 
        // reduce_max
        template <class A, class T, class = typename std::enable_if<(sizeof(T) <= 2), void>::type>
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
index 9dbc4b343..a88d693cd 100644
--- a/include/xsimd/arch/xsimd_sse3.hpp
+++ b/include/xsimd/arch/xsimd_sse3.hpp
@@ -51,6 +51,15 @@ namespace xsimd
            return _mm_cvtss_f32(tmp1);
        }
 
+        // reduce_mul
+        template <class A>
+        XSIMD_INLINE float reduce_mul(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self));
+            __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
+            return _mm_cvtss_f32(tmp2);
+        }
+
    }
 }
 
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 54ac836d2..4dbba6d93 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -1864,6 +1864,14 @@ namespace xsimd
        return kernel::reduce_min(x, A {});
    }
 
+    /**
+     * @ingroup batch_reducers
+     *
+     * Product of all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    XSIMD_INLINE T reduce_mul(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_mul(x, A {});
+    }
+
    /**
     * @ingroup batch_math
     *
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
index 05c13b4b8..403cd22b4 100644
--- a/test/test_batch.cpp
+++ b/test/test_batch.cpp
@@ -800,6 +800,14 @@ struct batch_test
            INFO("reduce_min");
            CHECK_SCALAR_EQ(res, expected);
        }
+
+        // reduce_mul
+        {
+            value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type{1}, std::multiplies<value_type>());
+            value_type res = reduce_mul(batch_lhs());
+            INFO("reduce_mul");
+            CHECK_SCALAR_EQ(res, expected);
+        }
    }
 
    template <class T = value_type>
diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp
index e06b31807..a51bb7c82 100644
--- a/test/test_batch_complex.cpp
+++ b/test/test_batch_complex.cpp
@@ -572,6 +572,13 @@ struct batch_complex_test
            value_type res = reduce_add(batch_lhs());
            CHECK_SCALAR_EQ(res, expected);
        }
+
+        // reduce_mul
+        {
+            value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type{1}, std::multiplies<value_type>());
+            value_type res = reduce_mul(batch_lhs());
+            CHECK_SCALAR_EQ(res, expected);
+        }
    }
 
    void test_fused_operations() const
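
Reviewer note (not part of the patch): a minimal standalone sketch of how the new reducer is used through the public entry point added in xsimd_api.hpp. Everything except reduce_mul itself is existing xsimd API; batch width and alignment come from whatever default architecture the build selects.

#include <cstddef>
#include <iostream>

#include "xsimd/xsimd.hpp"

int main()
{
    // Default-arch batch, e.g. 4 float lanes on SSE, 8 on AVX.
    using batch_type = xsimd::batch<float>;

    // Fill a register-wide buffer with 1, 2, 3, ...
    alignas(xsimd::default_arch::alignment()) float data[batch_type::size];
    for (std::size_t i = 0; i < batch_type::size; ++i)
        data[i] = static_cast<float>(i + 1);

    // reduce_mul multiplies all lanes together: 1 * 2 * ... * size.
    batch_type b = batch_type::load_aligned(data);
    std::cout << xsimd::reduce_mul(b) << '\n'; // 24 for a 4-lane batch, 40320 for 8 lanes
    return 0;
}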