Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/RAJA/policy/cuda/multi_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,8 @@ struct MultiReduceGridAtomicHostInit_TallyData
//! get value for bin, assumes synchronization occurred elsewhere
T get(int bin) const
{
::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
reducer(m_identity);
::RAJA::HighAccuracyReduce<T, typename Combiner::operator_type> reducer(
m_identity);
for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
{
int tally_offset =
Expand Down
4 changes: 2 additions & 2 deletions include/RAJA/policy/cuda/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1126,8 +1126,8 @@ class Reduce
if (n != end)
{
tally_or_val_ptr.list->synchronize_resources();
::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
reducer(std::move(val.value));
::RAJA::HighAccuracyReduce<T, typename Combiner::operator_type> reducer(
std::move(val.value));
for (; n != end; ++n)
{
T(&values)[tally_slots] = *n;
Expand Down
4 changes: 2 additions & 2 deletions include/RAJA/policy/hip/multi_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,8 @@ struct MultiReduceGridAtomicHostInit_TallyData
//! get value for bin, assumes synchronization occurred elsewhere
T get(int bin) const
{
::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
reducer(m_identity);
::RAJA::HighAccuracyReduce<T, typename Combiner::operator_type> reducer(
m_identity);
for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
{
int tally_offset =
Expand Down
4 changes: 2 additions & 2 deletions include/RAJA/policy/hip/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1118,8 +1118,8 @@ class Reduce
if (n != end)
{
tally_or_val_ptr.list->synchronize_resources();
::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
reducer(std::move(val.value));
::RAJA::HighAccuracyReduce<T, typename Combiner::operator_type> reducer(
std::move(val.value));
for (; n != end; ++n)
{
T(&values)[tally_slots] = *n;
Expand Down
2 changes: 1 addition & 1 deletion include/RAJA/policy/openmp/multi_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ struct MultiReduceDataOMP<

T get(size_t bin) const
{
::RAJA::detail::HighAccuracyReduce<T, typename MultiReduceOp::operator_type>
::RAJA::HighAccuracyReduce<T, typename MultiReduceOp::operator_type>
reducer(m_identity);
for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx)
{
Expand Down
241 changes: 177 additions & 64 deletions include/RAJA/util/reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@
namespace RAJA
{

namespace detail
{

/*!
\brief Reduce class that does a reduction with a left fold.
*/
Expand Down Expand Up @@ -91,6 +88,14 @@ struct LeftFoldReduce
m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val));
}

/*!
    \brief operator spelling of combine(): forwards val to combine()

    \param val value to combine; taken by value and moved into combine()
*/
RAJA_HOST_DEVICE RAJA_INLINE void operator+=(T val)
{
  this->combine(std::move(val));
}

private:
BinaryOp m_op;
T m_accumulated_value;
Expand Down Expand Up @@ -214,6 +219,14 @@ struct BinaryTreeReduce
++m_count;
}

/*!
    \brief operator spelling of combine(): forwards val to combine()

    \param val value to combine; taken by value and moved into combine()
*/
RAJA_HOST_DEVICE RAJA_INLINE void operator+=(T val)
{
  this->combine(std::move(val));
}

private:
BinaryOp m_op;

Expand Down Expand Up @@ -241,76 +254,88 @@ struct BinaryTreeReduce
}
};

template<typename T, typename BinaryOp>
using HighAccuracyReduce =
std::conditional_t<RAJA::operators::is_fp_associative<T>::value,
BinaryTreeReduce<T, BinaryOp>,
LeftFoldReduce<T, BinaryOp>>;

/*!
\brief Combine into a single value using a left fold with the given
operation using O(N) operations and O(1) memory
\brief Reduce class that does a summation using the Kahan (compensated)
summation algorithm to limit floating point round-off error.

\note KahanSum does not take a binary operation as the only valid operation
is plus.
*/
template<typename Iter, typename T, typename BinaryOp>
RAJA_HOST_DEVICE RAJA_INLINE T
left_fold_reduce(Iter begin, Iter end, T init, BinaryOp op)
template<typename T>
struct KahanSum
{
LeftFoldReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
static_assert(std::is_floating_point_v<T>, "T must be a floating point type");

for (; begin != end; ++begin)
{
RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit KahanSum(
T init = T()) noexcept
: m_accumulated_value(std::move(init)),
m_accumulated_carry(T())
{}

reducer.combine(*begin);
}
// KahanSum is neither copyable nor movable: all four copy/move operations
// are deleted.
KahanSum(KahanSum const&) = delete;
KahanSum& operator=(KahanSum const&) = delete;
KahanSum(KahanSum&&) = delete;
KahanSum& operator=(KahanSum&&) = delete;

return reducer.get_and_clear();
}
~KahanSum() = default;

/*!
\brief reduce using a binary tree with the given operation
and using O(N) operations and O(lg(n)) memory

This is more accurate than sequentially adding into a single value for
floating point types.
*/
template<typename Iter, typename T, typename BinaryOp>
RAJA_HOST_DEVICE RAJA_INLINE T
binary_tree_reduce(Iter begin, Iter end, T init, BinaryOp op)
{
using std::distance;
using SizeType = std::make_unsigned_t<decltype(distance(begin, end))>;
BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init),
std::move(op));
/*!
    \brief restore the reducer to its empty state: both the running sum and
           the compensation term are reset to T()
*/
RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
{
  const T zero = T();
  m_accumulated_value = zero;
  m_accumulated_carry = zero;
}

for (; begin != end; ++begin)
/*!
\brief return the combined value and clear the reducer
*/
RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
{
T accumulated_value = std::move(m_accumulated_value);

reducer.combine(*begin);
}
clear();

return reducer.get_and_clear();
}
return accumulated_value;
}

/*!
\brief reducer that uses a high accuracy implementation when round-off error
is a concern, or a faster algorithm when it is not a concern
*/
template<typename Iter, typename T, typename BinaryOp>
RAJA_HOST_DEVICE RAJA_INLINE T
high_accuracy_reduce(Iter begin, Iter end, T init, BinaryOp op)
{
HighAccuracyReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
/*!
    \brief return the combined value without modifying the reducer

    Const-qualified so the current sum can be read through a const
    reference to the reducer.

    \return copy of the running sum
*/
RAJA_HOST_DEVICE RAJA_INLINE T get() const { return m_accumulated_value; }

for (; begin != end; ++begin)
/*!
\brief combine a value into the reducer using one step of Kahan
(compensated) summation

\param val value to add to the running sum
*/
RAJA_HOST_DEVICE RAJA_INLINE void combine(T val)
{
// volatile used to prevent compiler optimizations that assume
// floating-point operations are associative
// y: incoming value corrected by the error carried over from earlier steps
T y = val - m_accumulated_carry;
// t: new running sum; low-order bits of y may be lost in this addition
volatile T t = m_accumulated_value + y;
// z: the part of y that was actually absorbed into t
volatile T z = t - m_accumulated_value;
// (z - y) is the rounding error of this step; it is subtracted from the
// next incoming value
m_accumulated_carry = z - y;
m_accumulated_value = t;
}

reducer.combine(*begin);
/*!
    \brief operator spelling of combine(): forwards val to combine()

    \param val value to add; taken by value and moved into combine()
*/
RAJA_HOST_DEVICE RAJA_INLINE void operator+=(T val)
{
  this->combine(std::move(val));
}

return reducer.get_and_clear();
}
private:
T m_accumulated_value;
T m_accumulated_carry;
};

} // namespace detail
template<typename T, typename BinaryOp>
using HighAccuracyReduce =
std::conditional_t<RAJA::operators::is_fp_associative<T>::value,
BinaryTreeReduce<T, BinaryOp>,
LeftFoldReduce<T, BinaryOp>>;

/*!
\brief Accumulate given range to a single value
Expand All @@ -322,17 +347,26 @@ template<typename Container,
typename BinaryOp = operators::plus<T>>
RAJA_HOST_DEVICE RAJA_INLINE
concepts::enable_if_t<T, type_traits::is_range<Container>>
accumulate(Container&& c,
T init = BinaryOp::identity(),
BinaryOp op = BinaryOp {})
left_fold_reduce(Container&& c,
T init = BinaryOp::identity(),
BinaryOp op = BinaryOp {})
{
using std::begin;
using std::end;
static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
"BinaryOp must model BinaryFunction");

return detail::left_fold_reduce(begin(c), end(c), std::move(init),
std::move(op));
auto begin_it = begin(c);
auto end_it = end(c);

LeftFoldReduce<T, BinaryOp> reducer(std::move(init), std::move(op));

for (; begin_it != end_it; ++begin_it)
{
reducer.combine(*begin_it);
}

return reducer.get_and_clear();
}

/*!
Expand All @@ -350,12 +384,50 @@ RAJA_HOST_DEVICE RAJA_INLINE
BinaryOp op = BinaryOp {})
{
using std::begin;
using std::distance;
using std::end;
static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
"BinaryOp must model BinaryFunction");

return detail::binary_tree_reduce(begin(c), end(c), std::move(init),
std::move(op));
auto begin_it = begin(c);
auto end_it = end(c);
using SizeType = std::make_unsigned_t<decltype(distance(begin_it, end_it))>;

BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init),
std::move(op));

for (; begin_it != end_it; ++begin_it)
{
reducer.combine(*begin_it);
}

return reducer.get_and_clear();
}

/*!
\brief Sum given range of floating point values to a single value
using Kahan (compensated) summation in O(N) operations and O(1) extra memory
see https://en.wikipedia.org/wiki/Kahan_summation_algorithm
*/
template<typename Container, typename T = detail::ContainerVal<Container>>
RAJA_HOST_DEVICE RAJA_INLINE concepts::
    enable_if_t<T, type_traits::is_range<Container>, std::is_floating_point<T>>
    kahan_sum(Container&& c, T init = T())
{
  using std::begin;
  using std::end;

  auto cursor = begin(c);
  auto last   = end(c);

  // Compensated accumulator seeded with the caller's initial value.
  KahanSum<T> summer(std::move(init));

  // Feed every element of the range, in order, into the accumulator.
  while (cursor != last)
  {
    summer.combine(*cursor);
    ++cursor;
  }

  return summer.get_and_clear();
}

/*!
Expand All @@ -378,8 +450,49 @@ RAJA_HOST_DEVICE RAJA_INLINE
static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
"BinaryOp must model BinaryFunction");

return detail::high_accuracy_reduce(begin(c), end(c), std::move(init),
std::move(op));
auto begin_it = begin(c);
auto end_it = end(c);

HighAccuracyReduce<T, BinaryOp> reducer(std::move(init), std::move(op));

for (; begin_it != end_it; ++begin_it)
{
reducer.combine(*begin_it);
}

return reducer.get_and_clear();
}

/*!
\brief Accumulate given range to a single value
using a left fold algorithm in O(N) operations and O(1) extra memory
see https://en.cppreference.com/w/cpp/algorithm/accumulate
*/
template<typename Container,
         typename T        = detail::ContainerVal<Container>,
         typename BinaryOp = operators::plus<T>>
RAJA_HOST_DEVICE RAJA_INLINE
    concepts::enable_if_t<T, type_traits::is_range<Container>>
    accumulate(Container&& c,
               T init      = BinaryOp::identity(),
               BinaryOp op = BinaryOp {})
{
  using std::begin;
  using std::end;
  static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                "BinaryOp must model BinaryFunction");

  auto cursor = begin(c);
  auto last   = end(c);

  // Sequential fold state: starts at init and applies op once per element.
  LeftFoldReduce<T, BinaryOp> folder(std::move(init), std::move(op));

  // Visit the range front to back, folding each element into the state.
  while (cursor != last)
  {
    folder.combine(*cursor);
    ++cursor;
  }

  return folder.get_and_clear();
}

} // namespace RAJA
Expand Down
2 changes: 1 addition & 1 deletion test/unit/algorithm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ if(RAJA_ENABLE_HIP)
endif()


set( UTIL_REDUCES BinaryTree Accumulate )
set( UTIL_REDUCES BinaryTree Accumulate Kahan )

RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Sequential Default "${UTIL_REDUCES}" )

Expand Down
Loading