diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp index d094a729c1..bdc7405779 100644 --- a/include/RAJA/pattern/params/reducer.hpp +++ b/include/RAJA/pattern/params/reducer.hpp @@ -95,7 +95,7 @@ namespace detail #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) // Device related attributes. value_type * devicetarget = nullptr; - RAJA::detail::SoAPtr device_mem; + RAJA::detail::SoAPtr device_mem; unsigned int * device_count = nullptr; #endif diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index c49ec46049..024dffe9e4 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -118,7 +118,8 @@ namespace detail struct cudaInfo { cuda_dim_t gridDim{0, 0, 0}; cuda_dim_t blockDim{0, 0, 0}; - ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)}; + ::RAJA::resources::Cuda cuda_res{::RAJA::resources::Cuda::CudaFromStream(0,0)}; + ::RAJA::resources::Resource res{::RAJA::resources::Cuda::CudaFromStream(0,0)}; bool setup_reducers = false; #if defined(RAJA_ENABLE_OPENMP) cudaInfo* thread_states = nullptr; @@ -251,20 +252,26 @@ cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; } //! get resource for current launch RAJA_INLINE -::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; } +::RAJA::resources::Cuda& currentCudaResource() { return detail::tl_status.cuda_res; } + +//! get resource for current launch +RAJA_INLINE +::RAJA::resources::Resource& currentResource() { return detail::tl_status.res; } //! create copy of loop_body that is setup for device execution -template +template RAJA_INLINE typename std::remove_reference::type make_launch_body( cuda_dim_t gridDim, cuda_dim_t blockDim, size_t RAJA_UNUSED_ARG(dynamic_smem), - ::RAJA::resources::Cuda res, + Res res, LOOP_BODY&& loop_body) { detail::SetterResetter setup_reducers_srer( detail::tl_status.setup_reducers, true); - detail::SetterResetter<::RAJA::resources::Cuda> res_srer( + detail::SetterResetter<::RAJA::resources::Cuda> cuda_res_srer( + detail::tl_status.cuda_res, res); + detail::SetterResetter<::RAJA::resources::Resource> res_srer( detail::tl_status.res, res); detail::tl_status.gridDim = gridDim; diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index ce278c4ee2..5534b56498 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -187,13 +187,14 @@ __launch_bounds__(BlockSize, 1) __global__ //////////////////////////////////////////////////////////////////////// // -template +template RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, + resources::EventProxy, + std::is_base_of, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> -forall_impl(resources::Cuda cuda_res, +forall_impl(Res cuda_res, cuda_exec_explicit, Iterable&& iter, LoopBody&& loop_body, @@ -250,16 +251,17 @@ forall_impl(resources::Cuda cuda_res, RAJA_FT_END; } - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } -template +template RAJA_INLINE concepts::enable_if_t< - resources::EventProxy, + resources::EventProxy, + std::is_base_of, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > -forall_impl(resources::Cuda cuda_res, +forall_impl(Res cuda_res, cuda_exec_explicit, Iterable&& iter, LoopBody&& loop_body, @@ -325,7 +327,7 @@ forall_impl(resources::Cuda cuda_res, RAJA_FT_END; } - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } @@ -348,13 +350,17 @@ forall_impl(resources::Cuda cuda_res, * ****************************************************************************** */ -template -RAJA_INLINE resources::EventProxy -forall_impl(resources::Cuda r, +RAJA_INLINE +concepts::enable_if_t< + resources::EventProxy, + std::is_base_of > +forall_impl(Res r, ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) @@ -369,7 +375,7 @@ forall_impl(resources::Cuda r, } // iterate over segments of index set if (!Async) RAJA::cuda::synchronize(r); - return resources::EventProxy(r); + return resources::EventProxy(r); } } // namespace cuda diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 6d142ca19b..e449a7b5ca 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -15,10 +15,10 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_cuda_policy > - init(Reducer& red, const RAJA::cuda::detail::cudaInfo & cs) + init(Reducer& red, RAJA::cuda::detail::cudaInfo& cs) { cudaMalloc( (void**)(&(red.devicetarget)), sizeof(T)); - red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); + red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z, cs.res); red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 57ff050d2c..9f921e280c 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -711,12 +711,13 @@ class PinnedTally //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; + ::RAJA::resources::Resource res; T value; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { ResourceNode* next; - ::RAJA::resources::Cuda res; + ::RAJA::resources::Cuda cuda_res; Node* node_list; }; @@ -741,7 +742,7 @@ class PinnedTally return ret; } - ::RAJA::resources::Cuda& operator*() { return m_rn->res; } + ::RAJA::resources::Cuda& operator*() { return m_rn->cuda_res; } bool operator==(const ResourceIterator& rhs) const { @@ -823,27 +824,23 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - T* new_value(::RAJA::resources::Cuda res) + Node* new_value(::RAJA::resources::Cuda& cuda_res, ::RAJA::resources::Resource& res) { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); #endif ResourceNode* rn = resource_list; while (rn) { - if (rn->res.get_stream() == res.get_stream()) break; + if (rn->cuda_res.get_stream() == cuda_res.get_stream()) break; rn = rn->next; } if (!rn) { - rn = (ResourceNode*)malloc(sizeof(ResourceNode)); - rn->next = resource_list; - rn->res = res; - rn->node_list = nullptr; + rn = new ResourceNode{resource_list, cuda_res, nullptr}; resource_list = rn; } - Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); - n->next = rn->node_list; - rn->node_list = n; - return &n->value; + Node* n_mem = res.template allocate(1, ::RAJA::resources::MemoryAccess::Pinned); + rn->node_list = new(n_mem) Node{rn->node_list, res, T{}}; + return rn->node_list; } //! synchronize all resources used @@ -863,10 +860,12 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - cuda::pinned_mempool_type::getInstance().free(n); + auto res{std::move(n->res)}; + n->~Node(); + res.deallocate(n, ::RAJA::resources::MemoryAccess::Pinned); } resource_list = rn->next; - free(rn); + delete rn; } } @@ -896,7 +895,7 @@ struct Reduce_Data { mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; Reduce_Data() : Reduce_Data(T(), T()){}; @@ -952,15 +951,15 @@ struct Reduce_Data { //! check and setup for device // allocate device pointers and get a new result buffer from the pinned tally - bool setupForDevice() + bool setupForDevice(::RAJA::resources::Resource& res) { bool act = !device.allocated() && setupReducers(); if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device.allocate(numBlocks, res); + device_count = static_cast( + res.calloc(sizeof(unsigned int), ::RAJA::resources::MemoryAccess::Device)); own_device_ptr = true; } return act; @@ -968,12 +967,12 @@ struct Reduce_Data { //! if own resources teardown device setup // free device pointers - bool teardownForDevice() + bool teardownForDevice(::RAJA::resources::Resource& res) { bool act = own_device_ptr; if (act) { - device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + device.deallocate(res); + res.deallocate(device_count, ::RAJA::resources::MemoryAccess::Device); device_count = nullptr; own_device_ptr = false; } @@ -1042,13 +1041,13 @@ struct ReduceAtomic_Data { //! check and setup for device // allocate device pointers and get a new result buffer from the pinned tally - bool setupForDevice() + bool setupForDevice(::RAJA::resources::Resource& res) { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device = res.template allocate(1, ::RAJA::resources::MemoryAccess::Device); + device_count = static_cast( + res.calloc(sizeof(unsigned int), ::RAJA::resources::MemoryAccess::Device)); own_device_ptr = true; } return act; @@ -1056,13 +1055,13 @@ struct ReduceAtomic_Data { //! if own resources teardown device setup // free device pointers - bool teardownForDevice() + bool teardownForDevice(::RAJA::resources::Resource& res) { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + res.deallocate(device, ::RAJA::resources::MemoryAccess::Device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + res.deallocate(device_count, ::RAJA::resources::MemoryAccess::Device); device_count = nullptr; own_device_ptr = false; } @@ -1081,7 +1080,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_node_ptr{new PinnedTally}, val(init_val, identity_) { } @@ -1093,7 +1092,7 @@ class Reduce } //! copy and on host attempt to setup for device - // init val_ptr to avoid uninitialized read caused by host copy of + // init node_ptr to avoid uninitialized read caused by host copy of // reducer in host device lambda not being used on device. RAJA_HOST_DEVICE Reduce(const Reduce& other) @@ -1102,15 +1101,15 @@ class Reduce #else : parent{&other}, #endif - tally_or_val_ptr{other.tally_or_val_ptr}, + tally_or_node_ptr{other.tally_or_node_ptr}, val(other.val) { #if !defined(RAJA_DEVICE_CODE) if (parent) { - if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + if (val.setupForDevice(currentResource())) { + tally_or_node_ptr.node_ptr = + tally_or_node_ptr.list->new_value(currentCudaResource(), currentResource()); + val.init_grid_val(&tally_or_node_ptr.node_ptr->value); parent = nullptr; } } @@ -1124,23 +1123,23 @@ class Reduce { #if !defined(RAJA_DEVICE_CODE) if (parent == this) { - delete tally_or_val_ptr.list; - tally_or_val_ptr.list = nullptr; + delete tally_or_node_ptr.list; + tally_or_node_ptr.list = nullptr; } else if (parent) { if (val.value != val.identity) { #if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(tally_or_val_ptr.list->m_mutex); + lock_guard lock(tally_or_node_ptr.list->m_mutex); #endif parent->combine(val.value); } } else { - if (val.teardownForDevice()) { - tally_or_val_ptr.val_ptr = nullptr; + if (val.teardownForDevice(tally_or_node_ptr.node_ptr->res)) { + tally_or_node_ptr.node_ptr = nullptr; } } #else if (!parent->parent) { - val.grid_reduce(tally_or_val_ptr.val_ptr); + val.grid_reduce(&tally_or_node_ptr.node_ptr->value); } else { parent->combine(val.value); } @@ -1150,14 +1149,14 @@ class Reduce //! map result value back to host if not done already; return aggregate value operator T() { - auto n = tally_or_val_ptr.list->begin(); - auto end = tally_or_val_ptr.list->end(); + auto n = tally_or_node_ptr.list->begin(); + auto end = tally_or_node_ptr.list->end(); if (n != end) { - tally_or_val_ptr.list->synchronize_resources(); + tally_or_node_ptr.list->synchronize_resources(); for (; n != end; ++n) { Combiner{}(val.value, *n); } - tally_or_val_ptr.list->free_list(); + tally_or_node_ptr.list->free_list(); } return val.value; } @@ -1179,15 +1178,15 @@ class Reduce const Reduce* parent; //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after + // only use list before setup for device and only use node_ptr after union tally_u { PinnedTally* list; - T* val_ptr; + typename PinnedTally::Node* node_ptr; constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + constexpr tally_u(typename PinnedTally::Node* n_ptr) : node_ptr(n_ptr){}; }; - tally_u tally_or_val_ptr; + tally_u tally_or_node_ptr; //! cuda reduction data storage class and folding algorithm using reduce_data_type = typename std::conditional< diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index dc1cec63cd..18e6f8cfaf 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -42,11 +42,13 @@ namespace scan \brief explicit inclusive inplace scan given range, function, and initial value */ -template +template RAJA_INLINE -resources::EventProxy +concepts::enable_if_t< + resources::EventProxy, + std::is_base_of > inclusive_inplace( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, InputIter begin, InputIter end, @@ -66,9 +68,8 @@ inclusive_inplace( len, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, @@ -78,27 +79,30 @@ inclusive_inplace( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief explicit exclusive inplace scan given range, function, and initial value */ -template RAJA_INLINE -resources::EventProxy +concepts::enable_if_t< + resources::EventProxy, + std::is_base_of > exclusive_inplace( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, InputIter begin, InputIter end, @@ -120,9 +124,8 @@ exclusive_inplace( len, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, @@ -133,27 +136,30 @@ exclusive_inplace( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief explicit inclusive scan given input range, output, function, and initial value */ -template RAJA_INLINE -resources::EventProxy +concepts::enable_if_t< + resources::EventProxy, + std::is_base_of > inclusive( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, InputIter begin, InputIter end, @@ -174,9 +180,8 @@ inclusive( len, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, @@ -186,18 +191,19 @@ inclusive( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief explicit exclusive scan given input range, output, function, and initial value */ -template RAJA_INLINE -resources::EventProxy +concepts::enable_if_t< + resources::EventProxy, + std::is_base_of > exclusive( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, InputIter begin, InputIter end, @@ -230,9 +238,8 @@ exclusive( len, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, @@ -243,11 +250,11 @@ exclusive( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } } // namespace scan diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index 9ada9a252b..cf9e4b48f3 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,8 +44,9 @@ namespace sort /*! \brief static assert unimplemented stable sort */ -template -concepts::enable_if_t, +template +concepts::enable_if_t, + std::is_base_of, concepts::negate>, std::is_pointer, @@ -53,7 +54,7 @@ concepts::enable_if_t, camp::is_same>>, camp::is_same>>>>>> stable( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, Iter, Iter, @@ -69,18 +70,19 @@ stable( camp::is_same>>::value, "stable_sort is only implemented for RAJA::operators::less or RAJA::operators::greater"); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief stable sort given range in ascending order */ -template -concepts::enable_if_t, +template +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer> stable( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, Iter begin, Iter end, @@ -95,7 +97,7 @@ stable( int end_bit=sizeof(R)*CHAR_BIT; // Allocate temporary storage for the output array - R* d_out = cuda::device_mempool_type::getInstance().malloc(len); + R* d_out = cuda_res.template allocate(len, ::RAJA::resources::MemoryAccess::Device); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the begin buffer @@ -112,9 +114,8 @@ stable( end_bit, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage, @@ -125,7 +126,7 @@ stable( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); if (d_keys.Current() == d_out) { @@ -133,22 +134,23 @@ stable( cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_out); + cuda_res.deallocate(d_out, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief stable sort given range in descending order */ -template -concepts::enable_if_t, +template +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer> stable( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, Iter begin, Iter end, @@ -163,7 +165,7 @@ stable( int end_bit=sizeof(R)*CHAR_BIT; // Allocate temporary storage for the output array - R* d_out = cuda::device_mempool_type::getInstance().malloc(len); + R* d_out = cuda_res.template allocate(len, ::RAJA::resources::MemoryAccess::Device); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the begin buffer @@ -180,9 +182,8 @@ stable( end_bit, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, @@ -193,7 +194,7 @@ stable( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); if (d_keys.Current() == d_out) { @@ -201,19 +202,20 @@ stable( cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_out); + cuda_res.deallocate(d_out, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief static assert unimplemented sort */ -template -concepts::enable_if_t, +template +concepts::enable_if_t, + std::is_base_of, concepts::negate>, std::is_pointer, @@ -221,7 +223,7 @@ concepts::enable_if_t, camp::is_same>>, camp::is_same>>>>>> unstable( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, Iter, Iter, @@ -237,18 +239,19 @@ unstable( camp::is_same>>::value, "sort is only implemented for RAJA::operators::less or RAJA::operators::greater"); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief sort given range in ascending order */ -template -concepts::enable_if_t, +template +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer> unstable( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit p, Iter begin, Iter end, @@ -260,12 +263,13 @@ unstable( /*! \brief sort given range in descending order */ -template -concepts::enable_if_t, +template +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer> unstable( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit p, Iter begin, Iter end, @@ -278,9 +282,10 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template -concepts::enable_if_t, +concepts::enable_if_t, + std::is_base_of, concepts::negate>, std::is_pointer, @@ -289,7 +294,7 @@ concepts::enable_if_t, camp::is_same>>, camp::is_same>>>>>> stable_pairs( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, KeyIter, KeyIter, @@ -308,20 +313,21 @@ stable_pairs( camp::is_same>>::value, "stable_sort_pairs is only implemented for RAJA::operators::less or RAJA::operators::greater"); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief stable sort given range of pairs in ascending order of keys */ -template -concepts::enable_if_t, +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer, std::is_pointer> stable_pairs( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, @@ -338,8 +344,8 @@ stable_pairs( int end_bit=sizeof(K)*CHAR_BIT; // Allocate temporary storage for the output arrays - K* d_keys_out = cuda::device_mempool_type::getInstance().malloc(len); - V* d_vals_out = cuda::device_mempool_type::getInstance().malloc(len); + K* d_keys_out = cuda_res.template allocate(len, ::RAJA::resources::MemoryAccess::Device); + V* d_vals_out = cuda_res.template allocate(len, ::RAJA::resources::MemoryAccess::Device); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the keys_begin and vals_begin buffers @@ -358,9 +364,8 @@ stable_pairs( end_bit, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage, @@ -372,7 +377,7 @@ stable_pairs( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); if (d_keys.Current() == d_keys_out) { @@ -385,25 +390,26 @@ stable_pairs( cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_keys_out); - cuda::device_mempool_type::getInstance().free(d_vals_out); + cuda_res.deallocate(d_keys_out, ::RAJA::resources::MemoryAccess::Device); + cuda_res.deallocate(d_vals_out, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief stable sort given range of pairs in descending order of keys */ -template -concepts::enable_if_t, +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer, std::is_pointer> stable_pairs( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, @@ -420,8 +426,8 @@ stable_pairs( int end_bit=sizeof(K)*CHAR_BIT; // Allocate temporary storage for the output arrays - K* d_keys_out = cuda::device_mempool_type::getInstance().malloc(len); - V* d_vals_out = cuda::device_mempool_type::getInstance().malloc(len); + K* d_keys_out = cuda_res.template allocate(len, ::RAJA::resources::MemoryAccess::Device); + V* d_vals_out = cuda_res.template allocate(len, ::RAJA::resources::MemoryAccess::Device); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the keys_begin and vals_begin buffers @@ -440,9 +446,8 @@ stable_pairs( end_bit, stream)); // Allocate temporary storage - d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( - temp_storage_bytes); + d_temp_storage = cuda_res.template allocate( + temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device); // Run cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, @@ -454,7 +459,7 @@ stable_pairs( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device); if (d_keys.Current() == d_keys_out) { @@ -467,21 +472,22 @@ stable_pairs( cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_keys_out); - cuda::device_mempool_type::getInstance().free(d_vals_out); + cuda_res.deallocate(d_keys_out, ::RAJA::resources::MemoryAccess::Device); + cuda_res.deallocate(d_vals_out, ::RAJA::resources::MemoryAccess::Device); cuda::launch(cuda_res, Async); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief static assert unimplemented sort pairs */ -template -concepts::enable_if_t, +concepts::enable_if_t, + std::is_base_of, concepts::negate>, std::is_pointer, @@ -490,7 +496,7 @@ concepts::enable_if_t, camp::is_same>>, camp::is_same>>>>>> unstable_pairs( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit, KeyIter, KeyIter, @@ -509,20 +515,21 @@ unstable_pairs( camp::is_same>>::value, "sort_pairs is only implemented for RAJA::operators::less or RAJA::operators::greater"); - return resources::EventProxy(cuda_res); + return resources::EventProxy(cuda_res); } /*! \brief stable sort given range of pairs in ascending order of keys */ -template -concepts::enable_if_t, +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer, std::is_pointer> unstable_pairs( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, @@ -535,14 +542,15 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template -concepts::enable_if_t, +concepts::enable_if_t, + std::is_base_of, type_traits::is_arithmetic>, std::is_pointer, std::is_pointer> unstable_pairs( - resources::Cuda cuda_res, + Res cuda_res, cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index a7c81413fb..5b63144651 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -36,29 +36,30 @@ namespace detail * This is useful for creating a vectorizable data layout and getting * coalesced memory accesses or avoiding shared memory bank conflicts in cuda. */ -template > +template class SoAPtr { using value_type = T; public: SoAPtr() = default; - explicit SoAPtr(size_t size) - : mem(mempool::getInstance().template malloc(size)) + template < typename Res > + SoAPtr(size_t size, Res& res) + : mem(res.template allocate(size, ::RAJA::resources::MemoryAccess::Device)) { } - SoAPtr& allocate(size_t size) + template < typename Res > + SoAPtr& allocate(size_t size, Res& res) { - mem = mempool::getInstance().template malloc(size); + mem = res.template allocate(size, ::RAJA::resources::MemoryAccess::Device); return *this; } - SoAPtr& deallocate() + template < typename Res > + SoAPtr& deallocate(Res& res) { - mempool::getInstance().free(mem); + res.deallocate(mem, ::RAJA::resources::MemoryAccess::Device); mem = nullptr; return *this; } @@ -75,8 +76,8 @@ class SoAPtr /*! * @brief Specialization for RAJA::reduce::detail::ValueLoc. */ -template -class SoAPtr, mempool> +template +class SoAPtr> { using value_type = RAJA::reduce::detail::ValueLoc; using first_type = T; @@ -84,24 +85,27 @@ class SoAPtr, mempool> public: SoAPtr() = default; - explicit SoAPtr(size_t size) - : mem(mempool::getInstance().template malloc(size)), - mem_idx(mempool::getInstance().template malloc(size)) + template < typename Res > + explicit SoAPtr(size_t size, Res& res) + : mem(res.template allocate(size, ::RAJA::resources::MemoryAccess::Device)), + mem_idx(res.template allocate(size, ::RAJA::resources::MemoryAccess::Device)) { } - SoAPtr& allocate(size_t size) + template < typename Res > + SoAPtr& allocate(size_t size, Res& res) { - mem = mempool::getInstance().template malloc(size); - mem_idx = mempool::getInstance().template malloc(size); + mem = res.template allocate(size, ::RAJA::resources::MemoryAccess::Device); + mem_idx = res.template allocate(size, ::RAJA::resources::MemoryAccess::Device); return *this; } - SoAPtr& deallocate() + template < typename Res > + SoAPtr& deallocate(Res& res) { - mempool::getInstance().free(mem); + res.deallocate(mem, ::RAJA::resources::MemoryAccess::Device); mem = nullptr; - mempool::getInstance().free(mem_idx); + res.deallocate(mem_idx, ::RAJA::resources::MemoryAccess::Device); mem_idx = nullptr; return *this; } diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index be2a3f19b1..4585e33a70 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -151,19 +151,19 @@ namespace RAJA namespace type_traits { - template struct is_resource : std::false_type {}; - template <> struct is_resource : std::true_type {}; + template struct is_resource : std::false_type {}; + template struct is_resource::value>> : std::true_type {}; #if defined(RAJA_CUDA_ACTIVE) - template <> struct is_resource : std::true_type {}; + template struct is_resource::value>> : std::true_type {}; #endif #if defined(RAJA_HIP_ACTIVE) - template <> struct is_resource : std::true_type {}; + template struct is_resource::value>> : std::true_type {}; #endif #if defined(RAJA_SYCL_ACTIVE) - template <> struct is_resource : std::true_type {}; + template struct is_resource::value>> : std::true_type {}; #endif #if defined(RAJA_ENABLE_TARGET_OPENMP) - template <> struct is_resource : std::true_type {}; + template struct is_resource::value>> : std::true_type {}; #endif } // end namespace type_traits diff --git a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp index d498ca89f8..d6057eb7ba 100644 --- a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp +++ b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp @@ -53,10 +53,9 @@ void KernelBasicFissionFusionLoopTestImpl( &test_array_y); - memset(static_cast(test_array_x), - 0, - sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); - + working_res.memset(working_array_x, + 0, + sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); RAJA::kernel( RAJA::make_tuple(seg, seg), diff --git a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp index c259827241..460d47b66b 100644 --- a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp +++ b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp @@ -53,9 +53,9 @@ void KernelConditionalFissionFusionLoopTestImpl( &test_array_y); - memset(static_cast(test_array_x), - 0, - sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); + working_res.memset(working_array_x, + 0, + sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); for (int param = 0; param < 2; ++param) {