#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <hip/hip_runtime.h>

#if defined(HAVE_UMPIRE)
#include "umpire/interface/c_fortran/umpire.h"
#endif
610/* Blocksize divisible by the warp size */
711#define BLOCKSIZE 64
812
13+ // HIP error checking
14+ #define HIP_ERR (err ) (hip_errchk(err, __FILE__, __LINE__ ))
15+ static inline void hip_errchk (hipError_t err, const char *file, int line) {
16+ if (err != hipSuccess) {
17+ printf (" \n\n %s in %s at line %d\n " , hipGetErrorString (err), file, line);
18+ exit (EXIT_FAILURE);
19+ }
20+ }
21+
922/* GPU kernel definition */
1023__global__ void hipKernel (int * const A, const int size)
1124{
@@ -29,7 +42,7 @@ void ignoreTiming(int nSteps, int size)
2942
3043 int *d_A;
3144 // Allocate pinned device memory
32- hipMalloc ((void **)&d_A, size);
45+ HIP_ERR ( hipMalloc ((void **)&d_A, sizeof ( int ) * size) );
3346
3447 // Start timer and begin stepping loop
3548 clock_t tStart = clock ();
@@ -38,10 +51,10 @@ void ignoreTiming(int nSteps, int size)
3851 // Launch GPU kernel
3952 hipKernel<<<gridsize, blocksize, 0 , 0 >>>(d_A, size);
4053 // Synchronization
41- hipStreamSynchronize (0 );
54+ HIP_ERR ( hipStreamSynchronize (0 ) );
4255 }
4356 // Free allocation
44- hipFree (d_A);
57+ HIP_ERR ( hipFree (d_A) );
4558}
4659
4760/* Run without recurring allocation */
@@ -53,7 +66,7 @@ void noRecurringAlloc(int nSteps, int size)
5366
5467 int *d_A;
5568 // Allocate pinned device memory
56- hipMalloc ((void **)&d_A, size);
69+ HIP_ERR ( hipMalloc ((void **)&d_A, sizeof ( int ) * size) );
5770
5871 // Start timer and begin stepping loop
5972 clock_t tStart = clock ();
@@ -63,12 +76,12 @@ void noRecurringAlloc(int nSteps, int size)
6376 hipKernel<<<gridsize, blocksize, 0 , 0 >>>(d_A, size);
6477 }
6578 // Synchronization
66- hipStreamSynchronize (0 );
79+ HIP_ERR ( hipStreamSynchronize (0 ) );
6780 // Check results and print timings
6881 checkTiming (" noRecurringAlloc" , (double )(clock () - tStart) / CLOCKS_PER_SEC);
6982
7083 // Free allocation
71- hipFree (d_A);
84+ HIP_ERR ( hipFree (d_A) );
7285}
7386
7487/* Do recurring allocation without memory pooling */
@@ -84,24 +97,24 @@ void recurringAllocNoMemPools(int nSteps, int size)
8497 {
8598 int *d_A;
8699 // Allocate pinned device memory
87- hipMalloc ((void **)&d_A, size);
100+ HIP_ERR ( hipMalloc ((void **)&d_A, sizeof ( int ) * size) );
88101 // Launch GPU kernel
89102 hipKernel<<<gridsize, blocksize, 0 , 0 >>>(d_A, size);
90103 // Free allocation
91- hipFree (d_A);
104+ HIP_ERR ( hipFree (d_A) );
92105 }
93106 // Synchronization
94- hipStreamSynchronize (0 );
107+ HIP_ERR ( hipStreamSynchronize (0 ) );
95108 // Check results and print timings
96109 checkTiming (" recurringAllocNoMemPools" , (double )(clock () - tStart) / CLOCKS_PER_SEC);
97110}
98111
99112/* Do recurring allocation with memory pooling */
100- void recurringAllocMemPool (int nSteps, int size)
113+ void recurringAllocMallocAsync (int nSteps, int size)
101114{
102115 // Create HIP stream
103116 hipStream_t stream;
104- hipStreamCreate (&stream);
117+ HIP_ERR ( hipStreamCreate (&stream) );
105118
106119 // Determine grid and block size
107120 const int blocksize = BLOCKSIZE;
@@ -113,32 +126,70 @@ void recurringAllocMemPool(int nSteps, int size)
113126 {
114127 int *d_A;
115128 // Allocate pinned device memory
116- cudaMallocAsync ((void **)&d_A, size, stream);
129+ cudaMallocAsync ((void **)&d_A, sizeof ( int ) * size, stream);
117130 // Launch GPU kernel
118131 hipKernel<<<gridsize, blocksize, 0 , stream>>>(d_A, size);
119132 // Free allocation
120133 cudaFreeAsync (d_A, stream);
121134 }
122135 // Synchronization
123- hipStreamSynchronize (stream);
136+ HIP_ERR ( hipStreamSynchronize (stream) );
124137 // Check results and print timings
125- checkTiming (" recurringAllocMemPoolNoSync " , (double )(clock () - tStart) / CLOCKS_PER_SEC);
138+ checkTiming (" recurringAllocMallocAsync " , (double )(clock () - tStart) / CLOCKS_PER_SEC);
126139
127140 // Destroy the stream
128- hipStreamDestroy (stream);
141+ HIP_ERR (hipStreamDestroy (stream));
142+ }
143+
#if defined(HAVE_UMPIRE)
/* Do recurring allocation with an Umpire memory pool.
 * Builds a quick pool on top of the "DEVICE" resource, then allocates and
 * frees device memory from that pool every step. Because the pool recycles
 * memory, the per-step cost avoids the driver allocator entirely. */
void recurringAllocUmpire(int nSteps, int size)
{
  // Get Umpire device memory pool (resource name must be exactly "DEVICE" —
  // a stray space in the lookup string makes the lookup fail)
  umpire_resourcemanager rm;
  umpire_resourcemanager_get_instance(&rm);
  umpire_allocator allocator;
  umpire_resourcemanager_get_allocator_by_name(&rm, "DEVICE", &allocator);
  umpire_allocator pool;
  // 1024-byte initial pool / 1024-byte growth block; the pool expands on demand
  umpire_resourcemanager_make_allocator_quick_pool(&rm, "pool", allocator, 1024, 1024, &pool);

  // Determine grid and block size
  const int blocksize = BLOCKSIZE;
  const int gridsize = (size - 1 + blocksize) / blocksize;

  // Start timer and begin stepping loop
  // (int loop index: nSteps is int, avoids signed/unsigned comparison)
  clock_t tStart = clock();
  for (int i = 0; i < nSteps; i++)
  {
    int *d_A;
    // Allocate device memory with Umpire
    d_A = (int *)umpire_allocator_allocate(&pool, sizeof(int) * size);
    // Launch GPU kernel on the default stream
    hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
    // Free Umpire allocation (returns memory to the pool, not the driver)
    umpire_allocator_deallocate(&pool, d_A);
  }
  // Synchronization (kernels were launched on the default stream)
  HIP_ERR(hipStreamSynchronize(0));
  // Check results and print timings
  checkTiming("recurringAllocUmpire", (double)(clock() - tStart) / CLOCKS_PER_SEC);
}
#endif
130178
131179/* The main function */
132180int main (int argc, char * argv[])
133181{
134182 // Set the number of steps and 1D grid dimensions
135- int nSteps = 1e6 , size = 1e6 ;
183+ int nSteps = 1e4 , size = 1e6 ;
136184
137185 // Ignore first run, first kernel is slower
138186 ignoreTiming (nSteps, size);
139187
140188 // Run with different memory allocatins strategies
141189 noRecurringAlloc (nSteps, size);
142190 recurringAllocNoMemPools (nSteps, size);
143- recurringAllocMemPool (nSteps, size);
191+ recurringAllocMallocAsync (nSteps, size);
192+ #if defined(HAVE_UMPIRE)
193+ recurringAllocUmpire (nSteps, size);
194+ #endif
144195}
0 commit comments