diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index d094a729c1..bdc7405779 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -95,7 +95,7 @@ namespace detail
 #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
     // Device related attributes.
     value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+    RAJA::detail::SoAPtr<value_type> device_mem;
     unsigned int * device_count = nullptr;
 #endif
 
diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index c49ec46049..024dffe9e4 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -118,7 +118,8 @@ namespace detail
 struct cudaInfo {
   cuda_dim_t gridDim{0, 0, 0};
   cuda_dim_t blockDim{0, 0, 0};
-  ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)}; 
+  ::RAJA::resources::Cuda cuda_res{::RAJA::resources::Cuda::CudaFromStream(0,0)};
+  ::RAJA::resources::Resource res{::RAJA::resources::Cuda::CudaFromStream(0,0)};
   bool setup_reducers = false;
 #if defined(RAJA_ENABLE_OPENMP)
   cudaInfo* thread_states = nullptr;
@@ -251,20 +252,26 @@ cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get resource for current launch
 RAJA_INLINE
-::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; }
+::RAJA::resources::Cuda& currentCudaResource() { return detail::tl_status.cuda_res; }
+
+//! get resource for current launch
+RAJA_INLINE
+::RAJA::resources::Resource& currentResource() { return detail::tl_status.res; }
 
 //! create copy of loop_body that is setup for device execution
-template <typename LOOP_BODY>
+template <typename Res, typename LOOP_BODY>
 RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     cuda_dim_t gridDim,
     cuda_dim_t blockDim,
     size_t RAJA_UNUSED_ARG(dynamic_smem),
-    ::RAJA::resources::Cuda res,
+    Res res,
     LOOP_BODY&& loop_body)
 {
   detail::SetterResetter<bool> setup_reducers_srer(
       detail::tl_status.setup_reducers, true);
-  detail::SetterResetter<::RAJA::resources::Cuda> res_srer(
+  detail::SetterResetter<::RAJA::resources::Cuda> cuda_res_srer(
+      detail::tl_status.cuda_res, res);
+  detail::SetterResetter<::RAJA::resources::Resource> res_srer(
       detail::tl_status.res, res);
 
   detail::tl_status.gridDim = gridDim;
diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index ce278c4ee2..5534b56498 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -187,13 +187,14 @@ __launch_bounds__(BlockSize, 1) __global__
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, size_t BlocksPerSM, bool Async, typename ForallParam>
+template <typename Res, typename Iterable, typename LoopBody, size_t BlockSize, size_t BlocksPerSM, bool Async, typename ForallParam>
 RAJA_INLINE 
 concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res>,
   RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
   RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Cuda cuda_res,
+forall_impl(Res cuda_res,
             cuda_exec_explicit<BlockSize, BlocksPerSM, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -250,16 +251,17 @@ forall_impl(resources::Cuda cuda_res,
     RAJA_FT_END;
   }
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, size_t BlocksPerSM, bool Async, typename ForallParam>
+template <typename Res, typename Iterable, typename LoopBody, size_t BlockSize, size_t BlocksPerSM, bool Async, typename ForallParam>
 RAJA_INLINE 
 concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res>,
   RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
   concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Cuda cuda_res,
+forall_impl(Res cuda_res,
             cuda_exec_explicit<BlockSize, BlocksPerSM, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -325,7 +327,7 @@ forall_impl(resources::Cuda cuda_res,
     RAJA_FT_END;
   }
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 
@@ -348,13 +350,17 @@ forall_impl(resources::Cuda cuda_res,
  *
  ******************************************************************************
  */
-template <typename LoopBody,
+template <typename Res,
+          typename LoopBody,
           size_t BlockSize,
           size_t BlocksPerSM,
           bool Async,
           typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Cuda>
-forall_impl(resources::Cuda r,
+RAJA_INLINE
+concepts::enable_if_t<
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res> >
+forall_impl(Res r,
             ExecPolicy<seq_segit, cuda_exec_explicit<BlockSize, BlocksPerSM, Async>>,
             const TypedIndexSet<SegmentTypes...>& iset,
             LoopBody&& loop_body)
@@ -369,7 +375,7 @@ forall_impl(resources::Cuda r,
   }  // iterate over segments of index set
 
   if (!Async) RAJA::cuda::synchronize(r);
-  return resources::EventProxy<resources::Cuda>(r);
+  return resources::EventProxy<Res>(r);
 }
 
 }  // namespace cuda
diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp
index 6d142ca19b..e449a7b5ca 100644
--- a/include/RAJA/policy/cuda/params/reduce.hpp
+++ b/include/RAJA/policy/cuda/params/reduce.hpp
@@ -15,10 +15,10 @@ namespace detail {
   // Init
   template<typename EXEC_POL, typename OP, typename T>
   camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(Reducer<OP, T>& red, const RAJA::cuda::detail::cudaInfo & cs)
+  init(Reducer<OP, T>& red, RAJA::cuda::detail::cudaInfo& cs)
   {
     cudaMalloc( (void**)(&(red.devicetarget)), sizeof(T));
-    red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z);
+    red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z, cs.res);
     red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
   }
 
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 57ff050d2c..9f921e280c 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -711,12 +711,13 @@ class PinnedTally
   //! Object put in Pinned memory with value and pointer to next Node
   struct Node {
     Node* next;
+    ::RAJA::resources::Resource res;
     T value;
   };
   //! Object per resource to keep track of pinned memory nodes
   struct ResourceNode {
     ResourceNode* next;
-    ::RAJA::resources::Cuda res;
+    ::RAJA::resources::Cuda cuda_res;
     Node* node_list;
   };
 
@@ -741,7 +742,7 @@ class PinnedTally
       return ret;
     }
 
-    ::RAJA::resources::Cuda& operator*() { return m_rn->res; }
+    ::RAJA::resources::Cuda& operator*() { return m_rn->cuda_res; }
 
     bool operator==(const ResourceIterator& rhs) const
     {
@@ -823,27 +824,23 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  T* new_value(::RAJA::resources::Cuda res)
+  Node* new_value(::RAJA::resources::Cuda& cuda_res, ::RAJA::resources::Resource& res)
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
     ResourceNode* rn = resource_list;
     while (rn) {
-      if (rn->res.get_stream() == res.get_stream()) break;
+      if (rn->cuda_res.get_stream() == cuda_res.get_stream()) break;
       rn = rn->next;
     }
     if (!rn) {
-      rn = (ResourceNode*)malloc(sizeof(ResourceNode));
-      rn->next = resource_list;
-      rn->res = res;
-      rn->node_list = nullptr;
+      rn = new ResourceNode{resource_list, cuda_res, nullptr};
       resource_list = rn;
     }
-    Node* n = cuda::pinned_mempool_type::getInstance().template malloc<Node>(1);
-    n->next = rn->node_list;
-    rn->node_list = n;
-    return &n->value;
+    Node* n_mem = res.template allocate<Node>(1, ::RAJA::resources::MemoryAccess::Pinned);
+    rn->node_list = new(n_mem) Node{rn->node_list, res, T{}};
+    return rn->node_list;
   }
 
   //! synchronize all resources used
@@ -863,10 +860,12 @@ class PinnedTally
       while (rn->node_list) {
         Node* n = rn->node_list;
         rn->node_list = n->next;
-        cuda::pinned_mempool_type::getInstance().free(n);
+        auto res{std::move(n->res)};
+        n->~Node();
+        res.deallocate(n, ::RAJA::resources::MemoryAccess::Pinned);
       }
       resource_list = rn->next;
-      free(rn);
+      delete rn;
     }
   }
 
@@ -896,7 +895,7 @@ struct Reduce_Data {
   mutable T value;
   T identity;
   unsigned int* device_count;
-  RAJA::detail::SoAPtr<T, device_mempool_type> device;
+  RAJA::detail::SoAPtr<T> device;
   bool own_device_ptr;
 
   Reduce_Data() : Reduce_Data(T(), T()){};
@@ -952,15 +951,15 @@ struct Reduce_Data {
 
   //! check and setup for device
   //  allocate device pointers and get a new result buffer from the pinned tally
-  bool setupForDevice()
+  bool setupForDevice(::RAJA::resources::Resource& res)
   {
     bool act = !device.allocated() && setupReducers();
     if (act) {
       cuda_dim_t gridDim = currentGridDim();
       size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
-      device.allocate(numBlocks);
-      device_count = device_zeroed_mempool_type::getInstance()
-                         .template malloc<unsigned int>(1);
+      device.allocate(numBlocks, res);
+      device_count = static_cast<unsigned int*>(
+          res.calloc(sizeof(unsigned int), ::RAJA::resources::MemoryAccess::Device));
       own_device_ptr = true;
     }
     return act;
@@ -968,12 +967,12 @@ struct Reduce_Data {
 
   //! if own resources teardown device setup
   //  free device pointers
-  bool teardownForDevice()
+  bool teardownForDevice(::RAJA::resources::Resource& res)
   {
     bool act = own_device_ptr;
     if (act) {
-      device.deallocate();
-      device_zeroed_mempool_type::getInstance().free(device_count);
+      device.deallocate(res);
+      res.deallocate(device_count, ::RAJA::resources::MemoryAccess::Device);
       device_count = nullptr;
       own_device_ptr = false;
     }
@@ -1042,13 +1041,13 @@ struct ReduceAtomic_Data {
 
   //! check and setup for device
   //  allocate device pointers and get a new result buffer from the pinned tally
-  bool setupForDevice()
+  bool setupForDevice(::RAJA::resources::Resource& res)
   {
     bool act = !device && setupReducers();
     if (act) {
-      device = device_mempool_type::getInstance().template malloc<T>(1);
-      device_count = device_zeroed_mempool_type::getInstance()
-                         .template malloc<unsigned int>(1);
+      device = res.template allocate<T>(1, ::RAJA::resources::MemoryAccess::Device);
+      device_count = static_cast<unsigned int*>(
+          res.calloc(sizeof(unsigned int), ::RAJA::resources::MemoryAccess::Device));
       own_device_ptr = true;
     }
     return act;
@@ -1056,13 +1055,13 @@ struct ReduceAtomic_Data {
 
   //! if own resources teardown device setup
   //  free device pointers
-  bool teardownForDevice()
+  bool teardownForDevice(::RAJA::resources::Resource& res)
   {
     bool act = own_device_ptr;
     if (act) {
-      device_mempool_type::getInstance().free(device);
+      res.deallocate(device, ::RAJA::resources::MemoryAccess::Device);
       device = nullptr;
-      device_zeroed_mempool_type::getInstance().free(device_count);
+      res.deallocate(device_count, ::RAJA::resources::MemoryAccess::Device);
       device_count = nullptr;
       own_device_ptr = false;
     }
@@ -1081,7 +1080,7 @@ class Reduce
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
       : parent{this},
-        tally_or_val_ptr{new PinnedTally<T>},
+        tally_or_node_ptr{new PinnedTally<T>},
         val(init_val, identity_)
   {
   }
@@ -1093,7 +1092,7 @@ class Reduce
   }
 
   //! copy and on host attempt to setup for device
-  //  init val_ptr to avoid uninitialized read caused by host copy of
+  //  init node_ptr to avoid uninitialized read caused by host copy of
   //  reducer in host device lambda not being used on device.
   RAJA_HOST_DEVICE
   Reduce(const Reduce& other)
@@ -1102,15 +1101,15 @@ class Reduce
 #else
       : parent{&other},
 #endif
-        tally_or_val_ptr{other.tally_or_val_ptr},
+        tally_or_node_ptr{other.tally_or_node_ptr},
         val(other.val)
   {
 #if !defined(RAJA_DEVICE_CODE)
     if (parent) {
-      if (val.setupForDevice()) {
-        tally_or_val_ptr.val_ptr =
-            tally_or_val_ptr.list->new_value(currentResource());
-        val.init_grid_val(tally_or_val_ptr.val_ptr);
+      if (val.setupForDevice(currentResource())) {
+        tally_or_node_ptr.node_ptr =
+            tally_or_node_ptr.list->new_value(currentCudaResource(), currentResource());
+        val.init_grid_val(&tally_or_node_ptr.node_ptr->value);
         parent = nullptr;
       }
     }
@@ -1124,23 +1123,23 @@ class Reduce
   {
 #if !defined(RAJA_DEVICE_CODE)
     if (parent == this) {
-      delete tally_or_val_ptr.list;
-      tally_or_val_ptr.list = nullptr;
+      delete tally_or_node_ptr.list;
+      tally_or_node_ptr.list = nullptr;
     } else if (parent) {
       if (val.value != val.identity) {
 #if defined(RAJA_ENABLE_OPENMP)
-        lock_guard<omp::mutex> lock(tally_or_val_ptr.list->m_mutex);
+        lock_guard<omp::mutex> lock(tally_or_node_ptr.list->m_mutex);
 #endif
         parent->combine(val.value);
       }
     } else {
-      if (val.teardownForDevice()) {
-        tally_or_val_ptr.val_ptr = nullptr;
+      if (val.teardownForDevice(tally_or_node_ptr.node_ptr->res)) {
+        tally_or_node_ptr.node_ptr = nullptr;
       }
     }
 #else
     if (!parent->parent) {
-      val.grid_reduce(tally_or_val_ptr.val_ptr);
+      val.grid_reduce(&tally_or_node_ptr.node_ptr->value);
     } else {
       parent->combine(val.value);
     }
@@ -1150,14 +1149,14 @@ class Reduce
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    auto n = tally_or_val_ptr.list->begin();
-    auto end = tally_or_val_ptr.list->end();
+    auto n = tally_or_node_ptr.list->begin();
+    auto end = tally_or_node_ptr.list->end();
     if (n != end) {
-      tally_or_val_ptr.list->synchronize_resources();
+      tally_or_node_ptr.list->synchronize_resources();
       for (; n != end; ++n) {
         Combiner{}(val.value, *n);
       }
-      tally_or_val_ptr.list->free_list();
+      tally_or_node_ptr.list->free_list();
     }
     return val.value;
   }
@@ -1179,15 +1178,15 @@ class Reduce
   const Reduce* parent;
 
   //! union to hold either pointer to PinnedTally or poiter to value
-  //  only use list before setup for device and only use val_ptr after
+  //  only use list before setup for device and only use node_ptr after
   union tally_u {
     PinnedTally<T>* list;
-    T* val_ptr;
+    typename PinnedTally<T>::Node* node_ptr;
     constexpr tally_u(PinnedTally<T>* l) : list(l){};
-    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){};
+    constexpr tally_u(typename PinnedTally<T>::Node* n_ptr) : node_ptr(n_ptr){};
   };
 
-  tally_u tally_or_val_ptr;
+  tally_u tally_or_node_ptr;
 
   //! cuda reduction data storage class and folding algorithm
   using reduce_data_type = typename std::conditional<
diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp
index dc1cec63cd..18e6f8cfaf 100644
--- a/include/RAJA/policy/cuda/scan.hpp
+++ b/include/RAJA/policy/cuda/scan.hpp
@@ -42,11 +42,13 @@ namespace scan
         \brief explicit inclusive inplace scan given range, function, and
    initial value
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename InputIter, typename Function>
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename InputIter, typename Function>
 RAJA_INLINE
-resources::EventProxy<resources::Cuda>
+concepts::enable_if_t<
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res> >
 inclusive_inplace(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     InputIter begin,
     InputIter end,
@@ -66,9 +68,8 @@ inclusive_inplace(
                                               len,
                                               stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
                                               temp_storage_bytes,
@@ -78,27 +79,30 @@ inclusive_inplace(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief explicit exclusive inplace scan given range, function, and
    initial value
 */
-template <size_t BLOCK_SIZE,
+template <typename Res,
+          size_t BLOCK_SIZE,
           size_t BLOCKS_PER_SM,
           bool Async,
           typename InputIter,
           typename Function,
           typename T>
 RAJA_INLINE
-resources::EventProxy<resources::Cuda>
+concepts::enable_if_t<
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res> >
 exclusive_inplace(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     InputIter begin,
     InputIter end,
@@ -120,9 +124,8 @@ exclusive_inplace(
                                               len,
                                               stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
                                               temp_storage_bytes,
@@ -133,27 +136,30 @@ exclusive_inplace(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief explicit inclusive scan given input range, output, function, and
    initial value
 */
-template <size_t BLOCK_SIZE,
+template <typename Res,
+          size_t BLOCK_SIZE,
           size_t BLOCKS_PER_SM,
           bool Async,
           typename InputIter,
           typename OutputIter,
           typename Function>
 RAJA_INLINE
-resources::EventProxy<resources::Cuda>
+concepts::enable_if_t<
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res> >
 inclusive(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     InputIter begin,
     InputIter end,
@@ -174,9 +180,8 @@ inclusive(
                                               len,
                                               stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
                                               temp_storage_bytes,
@@ -186,18 +191,19 @@ inclusive(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief explicit exclusive scan given input range, output, function, and
    initial value
 */
-template <size_t BLOCK_SIZE,
+template <typename Res,
+          size_t BLOCK_SIZE,
           size_t BLOCKS_PER_SM,
           bool Async,
           typename InputIter,
@@ -205,9 +211,11 @@ template <size_t BLOCK_SIZE,
           typename Function,
           typename T>
 RAJA_INLINE
-resources::EventProxy<resources::Cuda>
+concepts::enable_if_t<
+  resources::EventProxy<Res>,
+  std::is_base_of<resources::Cuda, Res> >
 exclusive(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     InputIter begin,
     InputIter end,
@@ -230,9 +238,8 @@ exclusive(
                                               len,
                                               stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
                                               temp_storage_bytes,
@@ -243,11 +250,11 @@ exclusive(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 }  // namespace scan
diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp
index 9ada9a252b..cf9e4b48f3 100644
--- a/include/RAJA/policy/cuda/sort.hpp
+++ b/include/RAJA/policy/cuda/sort.hpp
@@ -44,8 +44,9 @@ namespace sort
 /*!
         \brief static assert unimplemented stable sort
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter, typename Compare>
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       concepts::negate<concepts::all_of<
                         type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                         std::is_pointer<Iter>,
@@ -53,7 +54,7 @@ concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                           camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
                           camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
 stable(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     Iter,
     Iter,
@@ -69,18 +70,19 @@ stable(
       camp::is_same<Compare, operators::greater<iterval>>>::value,
       "stable_sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief stable sort given range in ascending order
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
 stable(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     Iter begin,
     Iter end,
@@ -95,7 +97,7 @@ stable(
   int end_bit=sizeof(R)*CHAR_BIT;
 
   // Allocate temporary storage for the output array
-  R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
+  R* d_out = cuda_res.template allocate<R>(len, ::RAJA::resources::MemoryAccess::Device);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the begin buffer
@@ -112,9 +114,8 @@ stable(
                                               end_bit,
                                               stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
 
   // Run
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
@@ -125,7 +126,7 @@ stable(
                                               end_bit,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   if (d_keys.Current() == d_out) {
 
@@ -133,22 +134,23 @@ stable(
     cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_out);
+  cuda_res.deallocate(d_out, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief stable sort given range in descending order
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
 stable(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     Iter begin,
     Iter end,
@@ -163,7 +165,7 @@ stable(
   int end_bit=sizeof(R)*CHAR_BIT;
 
   // Allocate temporary storage for the output array
-  R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
+  R* d_out = cuda_res.template allocate<R>(len, ::RAJA::resources::MemoryAccess::Device);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the begin buffer
@@ -180,9 +182,8 @@ stable(
                                                         end_bit,
                                                         stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
 
   // Run
   cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
@@ -193,7 +194,7 @@ stable(
                                                         end_bit,
                                                         stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   if (d_keys.Current() == d_out) {
 
@@ -201,19 +202,20 @@ stable(
     cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_out);
+  cuda_res.deallocate(d_out, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 
 /*!
         \brief static assert unimplemented sort
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter, typename Compare>
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       concepts::negate<concepts::all_of<
                         type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                         std::is_pointer<Iter>,
@@ -221,7 +223,7 @@ concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                           camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
                           camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
 unstable(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     Iter,
     Iter,
@@ -237,18 +239,19 @@ unstable(
       camp::is_same<Compare, operators::greater<iterval>>>::value,
       "sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief sort given range in ascending order
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
 unstable(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> p,
     Iter begin,
     Iter end,
@@ -260,12 +263,13 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async, typename Iter>
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
 unstable(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> p,
     Iter begin,
     Iter end,
@@ -278,9 +282,10 @@ unstable(
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       concepts::negate<concepts::all_of<
                         type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
                         std::is_pointer<KeyIter>,
@@ -289,7 +294,7 @@ concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                           camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
                           camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 stable_pairs(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     KeyIter,
     KeyIter,
@@ -308,20 +313,21 @@ stable_pairs(
       camp::is_same<Compare, operators::greater<K>>>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
                       std::is_pointer<KeyIter>,
                       std::is_pointer<ValIter>>
 stable_pairs(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
@@ -338,8 +344,8 @@ stable_pairs(
   int end_bit=sizeof(K)*CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
-  K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
-  V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
+  K* d_keys_out = cuda_res.template allocate<K>(len, ::RAJA::resources::MemoryAccess::Device);
+  V* d_vals_out = cuda_res.template allocate<V>(len, ::RAJA::resources::MemoryAccess::Device);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the keys_begin and vals_begin buffers
@@ -358,9 +364,8 @@ stable_pairs(
                                                end_bit,
                                                stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
 
   // Run
   cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
@@ -372,7 +377,7 @@ stable_pairs(
                                                end_bit,
                                                stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   if (d_keys.Current() == d_keys_out) {
 
@@ -385,25 +390,26 @@ stable_pairs(
     cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_keys_out);
-  cuda::device_mempool_type::getInstance().free(d_vals_out);
+  cuda_res.deallocate(d_keys_out, ::RAJA::resources::MemoryAccess::Device);
+  cuda_res.deallocate(d_vals_out, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
                       std::is_pointer<KeyIter>,
                       std::is_pointer<ValIter>>
 stable_pairs(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
@@ -420,8 +426,8 @@ stable_pairs(
   int end_bit=sizeof(K)*CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
-  K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
-  V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
+  K* d_keys_out = cuda_res.template allocate<K>(len, ::RAJA::resources::MemoryAccess::Device);
+  V* d_vals_out = cuda_res.template allocate<V>(len, ::RAJA::resources::MemoryAccess::Device);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the keys_begin and vals_begin buffers
@@ -440,9 +446,8 @@ stable_pairs(
                                                          end_bit,
                                                          stream));
   // Allocate temporary storage
-  d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
-          temp_storage_bytes);
+  d_temp_storage = cuda_res.template allocate<unsigned char>(
+      temp_storage_bytes, ::RAJA::resources::MemoryAccess::Device);
 
   // Run
   cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
@@ -454,7 +459,7 @@ stable_pairs(
                                                          end_bit,
                                                          stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda_res.deallocate(d_temp_storage, ::RAJA::resources::MemoryAccess::Device);
 
   if (d_keys.Current() == d_keys_out) {
 
@@ -467,21 +472,22 @@ stable_pairs(
     cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_keys_out);
-  cuda::device_mempool_type::getInstance().free(d_vals_out);
+  cuda_res.deallocate(d_keys_out, ::RAJA::resources::MemoryAccess::Device);
+  cuda_res.deallocate(d_vals_out, ::RAJA::resources::MemoryAccess::Device);
 
   cuda::launch(cuda_res, Async);
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       concepts::negate<concepts::all_of<
                         type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
                         std::is_pointer<KeyIter>,
@@ -490,7 +496,7 @@ concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                           camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
                           camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 unstable_pairs(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     KeyIter,
     KeyIter,
@@ -509,20 +515,21 @@ unstable_pairs(
       camp::is_same<Compare, operators::greater<K>>>::value,
       "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
 
-  return resources::EventProxy<resources::Cuda>(cuda_res);
+  return resources::EventProxy<Res>(cuda_res);
 }
 
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
                       std::is_pointer<KeyIter>,
                       std::is_pointer<ValIter>>
 unstable_pairs(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
@@ -535,14 +542,15 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
+template <typename Res, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
           typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
+concepts::enable_if_t<resources::EventProxy<Res>,
+                      std::is_base_of<resources::Cuda, Res>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
                       std::is_pointer<KeyIter>,
                       std::is_pointer<ValIter>>
 unstable_pairs(
-    resources::Cuda cuda_res,
+    Res cuda_res,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp
index a7c81413fb..5b63144651 100644
--- a/include/RAJA/util/SoAPtr.hpp
+++ b/include/RAJA/util/SoAPtr.hpp
@@ -36,29 +36,30 @@ namespace detail
  * This is useful for creating a vectorizable data layout and getting
  * coalesced memory accesses or avoiding shared memory bank conflicts in cuda.
  */
-template <typename T,
-          typename mempool = RAJA::basic_mempool::MemPool<
-              RAJA::basic_mempool::generic_allocator> >
+template <typename T>
 class SoAPtr
 {
   using value_type = T;
 
 public:
   SoAPtr() = default;
-  explicit SoAPtr(size_t size)
-      : mem(mempool::getInstance().template malloc<value_type>(size))
+  template < typename Res >
+  SoAPtr(size_t size, Res& res)
+      : mem(res.template allocate<value_type>(size, ::RAJA::resources::MemoryAccess::Device))
   {
   }
 
-  SoAPtr& allocate(size_t size)
+  template < typename Res >
+  SoAPtr& allocate(size_t size, Res& res)
   {
-    mem = mempool::getInstance().template malloc<value_type>(size);
+    mem = res.template allocate<value_type>(size, ::RAJA::resources::MemoryAccess::Device);
     return *this;
   }
 
-  SoAPtr& deallocate()
+  template < typename Res >
+  SoAPtr& deallocate(Res& res)
   {
-    mempool::getInstance().free(mem);
+    res.deallocate(mem, ::RAJA::resources::MemoryAccess::Device);
     mem = nullptr;
     return *this;
   }
@@ -75,8 +76,8 @@ class SoAPtr
 /*!
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
-template <typename T, typename IndexType, bool doing_min, typename mempool>
-class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool>
+template <typename T, typename IndexType, bool doing_min>
+class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>>
 {
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
   using first_type = T;
@@ -84,24 +85,27 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool>
 
 public:
   SoAPtr() = default;
-  explicit SoAPtr(size_t size)
-      : mem(mempool::getInstance().template malloc<first_type>(size)),
-        mem_idx(mempool::getInstance().template malloc<second_type>(size))
+  template < typename Res >
+  explicit SoAPtr(size_t size, Res& res)
+      : mem(res.template allocate<first_type>(size, ::RAJA::resources::MemoryAccess::Device)),
+        mem_idx(res.template allocate<second_type>(size, ::RAJA::resources::MemoryAccess::Device))
   {
   }
 
-  SoAPtr& allocate(size_t size)
+  template < typename Res >
+  SoAPtr& allocate(size_t size, Res& res)
   {
-    mem = mempool::getInstance().template malloc<first_type>(size);
-    mem_idx = mempool::getInstance().template malloc<second_type>(size);
+    mem = res.template allocate<first_type>(size, ::RAJA::resources::MemoryAccess::Device);
+    mem_idx = res.template allocate<second_type>(size, ::RAJA::resources::MemoryAccess::Device);
     return *this;
   }
 
-  SoAPtr& deallocate()
+  template < typename Res >
+  SoAPtr& deallocate(Res& res)
   {
-    mempool::getInstance().free(mem);
+    res.deallocate(mem, ::RAJA::resources::MemoryAccess::Device);
     mem = nullptr;
-    mempool::getInstance().free(mem_idx);
+    res.deallocate(mem_idx, ::RAJA::resources::MemoryAccess::Device);
     mem_idx = nullptr;
     return *this;
   }
diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp
index be2a3f19b1..4585e33a70 100644
--- a/include/RAJA/util/resource.hpp
+++ b/include/RAJA/util/resource.hpp
@@ -151,19 +151,19 @@ namespace RAJA
 
   namespace type_traits
   {
-    template <typename T> struct is_resource : std::false_type {};
-    template <> struct is_resource<resources::Host> : std::true_type {};
+    template <typename T, typename=void> struct is_resource : std::false_type {};
+    template <typename T> struct is_resource<T, std::enable_if_t<std::is_base_of<resources::Host, T>::value>> : std::true_type {};
 #if defined(RAJA_CUDA_ACTIVE)
-    template <> struct is_resource<resources::Cuda> : std::true_type {};
+    template <typename T> struct is_resource<T, std::enable_if_t<std::is_base_of<resources::Cuda, T>::value>> : std::true_type {};
 #endif
 #if defined(RAJA_HIP_ACTIVE)
-    template <> struct is_resource<resources::Hip> : std::true_type {};
+    template <typename T> struct is_resource<T, std::enable_if_t<std::is_base_of<resources::Hip, T>::value>> : std::true_type {};
 #endif
 #if defined(RAJA_SYCL_ACTIVE)
-    template <> struct is_resource<resources::Sycl> : std::true_type {};
+    template <typename T> struct is_resource<T, std::enable_if_t<std::is_base_of<resources::Sycl, T>::value>> : std::true_type {};
 #endif
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-    template <> struct is_resource<resources::Omp> : std::true_type {};
+    template <typename T> struct is_resource<T, std::enable_if_t<std::is_base_of<resources::Omp, T>::value>> : std::true_type {};
 #endif
   } // end namespace type_traits
 
diff --git a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp
index d498ca89f8..d6057eb7ba 100644
--- a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp
+++ b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp
@@ -53,10 +53,9 @@ void KernelBasicFissionFusionLoopTestImpl(
                                     &test_array_y);
 
 
-  memset(static_cast<void*>(test_array_x),
-         0,
-         sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
-
+  working_res.memset(working_array_x,
+                     0,
+                     sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
   RAJA::kernel<EXEC_POLICY>(
       RAJA::make_tuple(seg, seg),
diff --git a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp
index c259827241..460d47b66b 100644
--- a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp
+++ b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp
@@ -53,9 +53,9 @@ void KernelConditionalFissionFusionLoopTestImpl(
                                     &test_array_y);
 
 
-  memset(static_cast<void*>(test_array_x),
-         0,
-         sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
+  working_res.memset(working_array_x,
+                     0,
+                     sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len));
 
   for (int param = 0; param < 2; ++param) {