Skip to content

Commit bdc969a

Browse files
committed
Merge branch 'main' of github.com:csc-training/hip-programming
2 parents 90ebbd1 + a1eb84c commit bdc969a

File tree

3 files changed

+93
-25
lines changed

3 files changed

+93
-25
lines changed

memory/02-mempools/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,20 @@ IMPORTANT NOTE! Unfortunately, the support for memory pools was only recently ad
1212

1313
* `hipMallocAsync` -> `cudaMallocAsync`
1414
* `hipFreeAsync` -> `cudaFreeAsync`
15+
16+
### Bonus (optional) - Implement an additional case using Umpire library
17+
18+
Umpire is available at https://github.com/LLNL/Umpire/. Install Umpire with
19+
20+
```
21+
git clone --recursive https://github.com/LLNL/Umpire.git
22+
cd Umpire && mkdir build && cd build
23+
cmake ../ -DUMPIRE_ENABLE_C=On -DENABLE_CUDA=On -DCMAKE_INSTALL_PREFIX=/path
24+
make
25+
make install
26+
```
27+
28+
Compile the exercise with
29+
```
30+
hipcc --gpu-architecture=sm_70 -DHAVE_UMPIRE=1 mempools.cpp -I/path/umpire/include/ -L/path/umpire/lib/ -lcamp -lumpire
31+
```

memory/02-mempools/mempools.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ void ignoreTiming(int nSteps, int size)
2929

3030
int *d_A;
3131
// Allocate pinned device memory
32-
hipMalloc((void**)&d_A, size);
32+
hipMalloc((void**)&d_A, sizeof(int) * size);
3333

3434
// Start timer and begin stepping loop
3535
clock_t tStart = clock();
@@ -53,7 +53,7 @@ void noRecurringAlloc(int nSteps, int size)
5353

5454
int *d_A;
5555
// Allocate pinned device memory
56-
#error allocate memory with hipMalloc for d_A of size
56+
#error allocate memory with hipMalloc for d_A of size ints
5757

5858
// Start timer and begin stepping loop
5959
clock_t tStart = clock();
@@ -84,7 +84,7 @@ void recurringAllocNoMemPools(int nSteps, int size)
8484
{
8585
int *d_A;
8686
// Allocate pinned device memory
87-
#error allocate memory with hipMalloc for d_A of size
87+
#error allocate memory with hipMalloc for d_A of size ints
8888
// Launch GPU kernel
8989
hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
9090
// Free allocation
@@ -97,7 +97,7 @@ void recurringAllocNoMemPools(int nSteps, int size)
9797
}
9898

9999
/* Do recurring allocation with memory pooling */
100-
void recurringAllocMemPool(int nSteps, int size)
100+
void recurringAllocMallocAsync(int nSteps, int size)
101101
{
102102
// Create HIP stream
103103
hipStream_t stream;
@@ -113,7 +113,7 @@ void recurringAllocMemPool(int nSteps, int size)
113113
{
114114
int *d_A;
115115
// Allocate pinned device memory
116-
#error allocate memory with cudaMallocAsync for d_A of size in stream
116+
#error allocate memory with cudaMallocAsync for d_A of size ints in stream
117117
// Launch GPU kernel
118118
hipKernel<<<gridsize, blocksize, 0, stream>>>(d_A, size);
119119
// Free allocation
@@ -122,7 +122,7 @@ void recurringAllocMemPool(int nSteps, int size)
122122
// Synchronization
123123
#error synchronize stream here
124124
// Check results and print timings
125-
checkTiming("recurringAllocMemPoolNoSync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
125+
checkTiming("recurringAllocMallocAsync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
126126

127127
// Destroy the stream
128128
hipStreamDestroy(stream);
@@ -132,13 +132,13 @@ void recurringAllocMemPool(int nSteps, int size)
132132
int main(int argc, char* argv[])
133133
{
134134
// Set the number of steps and 1D grid dimensions
135-
int nSteps = 1e6, size = 1e6;
135+
int nSteps = 1e4, size = 1e6;
136136

137137
// Ignore first run, first kernel is slower
138138
ignoreTiming(nSteps, size);
139139

140140
// Run with different memory allocatins strategies
141141
noRecurringAlloc(nSteps, size);
142142
recurringAllocNoMemPools(nSteps, size);
143-
recurringAllocMemPool(nSteps, size);
143+
recurringAllocMallocAsync(nSteps, size);
144144
}

memory/02-mempools/solution/mempools.cpp

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,22 @@
33
#include <time.h>
44
#include <hip/hip_runtime.h>
55

6+
#if defined(HAVE_UMPIRE)
7+
#include "umpire/interface/c_fortran/umpire.h"
8+
#endif
9+
610
/* Blocksize divisible by the warp size */
711
#define BLOCKSIZE 64
812

13+
// HIP error checking
14+
#define HIP_ERR(err) (hip_errchk(err, __FILE__, __LINE__ ))
15+
static inline void hip_errchk(hipError_t err, const char *file, int line) {
16+
if (err != hipSuccess) {
17+
printf("\n\n%s in %s at line %d\n", hipGetErrorString(err), file, line);
18+
exit(EXIT_FAILURE);
19+
}
20+
}
21+
922
/* GPU kernel definition */
1023
__global__ void hipKernel(int* const A, const int size)
1124
{
@@ -29,7 +42,7 @@ void ignoreTiming(int nSteps, int size)
2942

3043
int *d_A;
3144
// Allocate pinned device memory
32-
hipMalloc((void**)&d_A, size);
45+
HIP_ERR(hipMalloc((void**)&d_A, sizeof(int) * size));
3346

3447
// Start timer and begin stepping loop
3548
clock_t tStart = clock();
@@ -38,10 +51,10 @@ void ignoreTiming(int nSteps, int size)
3851
// Launch GPU kernel
3952
hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
4053
// Synchronization
41-
hipStreamSynchronize(0);
54+
HIP_ERR(hipStreamSynchronize(0));
4255
}
4356
// Free allocation
44-
hipFree(d_A);
57+
HIP_ERR(hipFree(d_A));
4558
}
4659

4760
/* Run without recurring allocation */
@@ -53,7 +66,7 @@ void noRecurringAlloc(int nSteps, int size)
5366

5467
int *d_A;
5568
// Allocate pinned device memory
56-
hipMalloc((void**)&d_A, size);
69+
HIP_ERR(hipMalloc((void**)&d_A, sizeof(int) * size));
5770

5871
// Start timer and begin stepping loop
5972
clock_t tStart = clock();
@@ -63,12 +76,12 @@ void noRecurringAlloc(int nSteps, int size)
6376
hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
6477
}
6578
// Synchronization
66-
hipStreamSynchronize(0);
79+
HIP_ERR(hipStreamSynchronize(0));
6780
// Check results and print timings
6881
checkTiming("noRecurringAlloc", (double)(clock() - tStart) / CLOCKS_PER_SEC);
6982

7083
// Free allocation
71-
hipFree(d_A);
84+
HIP_ERR(hipFree(d_A));
7285
}
7386

7487
/* Do recurring allocation without memory pooling */
@@ -84,24 +97,24 @@ void recurringAllocNoMemPools(int nSteps, int size)
8497
{
8598
int *d_A;
8699
// Allocate pinned device memory
87-
hipMalloc((void**)&d_A, size);
100+
HIP_ERR(hipMalloc((void**)&d_A, sizeof(int) * size));
88101
// Launch GPU kernel
89102
hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
90103
// Free allocation
91-
hipFree(d_A);
104+
HIP_ERR(hipFree(d_A));
92105
}
93106
// Synchronization
94-
hipStreamSynchronize(0);
107+
HIP_ERR(hipStreamSynchronize(0));
95108
// Check results and print timings
96109
checkTiming("recurringAllocNoMemPools", (double)(clock() - tStart) / CLOCKS_PER_SEC);
97110
}
98111

99112
/* Do recurring allocation with memory pooling */
100-
void recurringAllocMemPool(int nSteps, int size)
113+
void recurringAllocMallocAsync(int nSteps, int size)
101114
{
102115
// Create HIP stream
103116
hipStream_t stream;
104-
hipStreamCreate(&stream);
117+
HIP_ERR(hipStreamCreate(&stream));
105118

106119
// Determine grid and block size
107120
const int blocksize = BLOCKSIZE;
@@ -113,32 +126,70 @@ void recurringAllocMemPool(int nSteps, int size)
113126
{
114127
int *d_A;
115128
// Allocate pinned device memory
116-
cudaMallocAsync((void**)&d_A, size, stream);
129+
cudaMallocAsync((void**)&d_A, sizeof(int) * size, stream);
117130
// Launch GPU kernel
118131
hipKernel<<<gridsize, blocksize, 0, stream>>>(d_A, size);
119132
// Free allocation
120133
cudaFreeAsync(d_A, stream);
121134
}
122135
// Synchronization
123-
hipStreamSynchronize(stream);
136+
HIP_ERR(hipStreamSynchronize(stream));
124137
// Check results and print timings
125-
checkTiming("recurringAllocMemPoolNoSync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
138+
checkTiming("recurringAllocMallocAsync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
126139

127140
// Destroy the stream
128-
hipStreamDestroy(stream);
141+
HIP_ERR(hipStreamDestroy(stream));
142+
}
143+
144+
#if defined(HAVE_UMPIRE)
145+
/* Do recurring allocation with Umpire memory pool */
146+
void recurringAllocUmpire(int nSteps, int size)
147+
{
148+
// Get Umpire pinned device memory pool
149+
umpire_resourcemanager rm;
150+
umpire_resourcemanager_get_instance(&rm);
151+
umpire_allocator allocator;
152+
umpire_resourcemanager_get_allocator_by_name(&rm, "DEVICE", &allocator);
153+
umpire_allocator pool;
154+
umpire_resourcemanager_make_allocator_quick_pool(&rm, "pool", allocator, 1024, 1024, &pool);
155+
156+
// Determine grid and block size
157+
const int blocksize = BLOCKSIZE;
158+
const int gridsize = (size - 1 + blocksize) / blocksize;
159+
160+
// Start timer and begin stepping loop
161+
clock_t tStart = clock();
162+
for(unsigned int i = 0; i < nSteps; i++)
163+
{
164+
int *d_A;
165+
// Allocate pinned device memory with Umpire
166+
d_A = (int*) umpire_allocator_allocate(&pool, sizeof(int) * size);
167+
// Launch GPU kernel
168+
hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
169+
// Free Umpire allocation
170+
umpire_allocator_deallocate(&pool, d_A);
171+
}
172+
// Synchronization
173+
HIP_ERR(hipStreamSynchronize(0));
174+
// Check results and print timings
175+
checkTiming("recurringAllocUmpire", (double)(clock() - tStart) / CLOCKS_PER_SEC);
129176
}
177+
#endif
130178

131179
/* The main function */
132180
int main(int argc, char* argv[])
133181
{
134182
// Set the number of steps and 1D grid dimensions
135-
int nSteps = 1e6, size = 1e6;
183+
int nSteps = 1e4, size = 1e6;
136184

137185
// Ignore first run, first kernel is slower
138186
ignoreTiming(nSteps, size);
139187

140188
// Run with different memory allocatins strategies
141189
noRecurringAlloc(nSteps, size);
142190
recurringAllocNoMemPools(nSteps, size);
143-
recurringAllocMemPool(nSteps, size);
191+
recurringAllocMallocAsync(nSteps, size);
192+
#if defined(HAVE_UMPIRE)
193+
recurringAllocUmpire(nSteps, size);
194+
#endif
144195
}

0 commit comments

Comments
 (0)