Merge branch 'main' of github.com:csc-training/hip-programming

csccva · csccva · commit bdc969a8927b · 2022-11-11T13:31:48.000+02:00
diff --git a/memory/02-mempools/README.md b/memory/02-mempools/README.md
@@ -12,3 +12,20 @@ IMPORTANT NOTE! Unfortunately, the support for memory pools was only recently ad
 
 * `hipMallocAsync` -> `cudaMallocAsync`
 * `hipFreeAsync` -> `cudaFreeAsync`
+
+### Bonus (optional) - Implement an additional case using the Umpire library
+
+Umpire is available at https://github.com/LLNL/Umpire/. Install Umpire with:
+
+```
+git clone --recursive https://github.com/LLNL/Umpire.git
+cd Umpire && mkdir build && cd build
+cmake ../ -DUMPIRE_ENABLE_C=On -DENABLE_CUDA=On -DCMAKE_INSTALL_PREFIX=/path/umpire
+make
+make install
+```
+
+Compile the exercise with:
+```
+hipcc --gpu-architecture=sm_70 -DHAVE_UMPIRE=1 mempools.cpp -I/path/umpire/include/ -L/path/umpire/lib/ -lcamp -lumpire
+```
diff --git a/memory/02-mempools/mempools.cpp b/memory/02-mempools/mempools.cpp
@@ -29,7 +29,7 @@ void ignoreTiming(int nSteps, int size)
 
   int *d_A;
   // Allocate pinned device memory
-  hipMalloc((void**)&d_A, size);
+  hipMalloc((void**)&d_A, sizeof(int) * size);
 
   // Start timer and begin stepping loop
   clock_t tStart = clock();
@@ -53,7 +53,7 @@ void noRecurringAlloc(int nSteps, int size)
 
   int *d_A;
   // Allocate pinned device memory
-  #error allocate memory with hipMalloc for d_A of size
+  #error allocate memory with hipMalloc for d_A of size ints
 
   // Start timer and begin stepping loop
   clock_t tStart = clock();
@@ -84,7 +84,7 @@ void recurringAllocNoMemPools(int nSteps, int size)
   {
     int *d_A;
     // Allocate pinned device memory
-    #error allocate memory with hipMalloc for d_A of size
+    #error allocate memory with hipMalloc for d_A of size ints
     // Launch GPU kernel
     hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
     // Free allocation
@@ -97,7 +97,7 @@ void recurringAllocNoMemPools(int nSteps, int size)
 }
 
 /* Do recurring allocation with memory pooling */
-void recurringAllocMemPool(int nSteps, int size)
+void recurringAllocMallocAsync(int nSteps, int size)
 {
   // Create HIP stream
   hipStream_t stream;
@@ -113,7 +113,7 @@ void recurringAllocMemPool(int nSteps, int size)
   {
     int *d_A;
     // Allocate pinned device memory
-    #error allocate memory with cudaMallocAsync for d_A of size in stream
+    #error allocate memory with cudaMallocAsync for d_A of size ints in stream
     // Launch GPU kernel
     hipKernel<<<gridsize, blocksize, 0, stream>>>(d_A, size);
     // Free allocation
@@ -122,7 +122,7 @@ void recurringAllocMemPool(int nSteps, int size)
   // Synchronization
   #error synchronize stream here
   // Check results and print timings
-  checkTiming("recurringAllocMemPoolNoSync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
+  checkTiming("recurringAllocMallocAsync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
 
   // Destroy the stream
   hipStreamDestroy(stream);
@@ -132,13 +132,13 @@ void recurringAllocMemPool(int nSteps, int size)
 int main(int argc, char* argv[])
 {
   // Set the number of steps and 1D grid dimensions
-  int nSteps = 1e6, size = 1e6;
+  int nSteps = 1e4, size = 1e6;
   
   // Ignore first run, first kernel is slower
   ignoreTiming(nSteps, size);
 
   // Run with different memory allocatins strategies
   noRecurringAlloc(nSteps, size);
   recurringAllocNoMemPools(nSteps, size);
-  recurringAllocMemPool(nSteps, size);
+  recurringAllocMallocAsync(nSteps, size);
 }
diff --git a/memory/02-mempools/solution/mempools.cpp b/memory/02-mempools/solution/mempools.cpp
@@ -3,9 +3,22 @@
 #include <time.h>
 #include <hip/hip_runtime.h>
 
+#if defined(HAVE_UMPIRE)
+  #include "umpire/interface/c_fortran/umpire.h"
+#endif
+
 /* Blocksize divisible by the warp size */
 #define BLOCKSIZE 64
 
+// HIP error checking: wrap a HIP call in HIP_ERR(...); on any status other than hipSuccess, hip_errchk prints the error string with the calling file and line, then exits with EXIT_FAILURE
+#define HIP_ERR(err) (hip_errchk(err, __FILE__, __LINE__ ))
+static inline void hip_errchk(hipError_t err, const char *file, int line) {
+  if (err != hipSuccess) {
+    printf("\n\n%s in %s at line %d\n", hipGetErrorString(err), file, line);
+    exit(EXIT_FAILURE);
+  }
+}
+
 /* GPU kernel definition */
 __global__ void hipKernel(int* const A, const int size)
 {
@@ -29,7 +42,7 @@ void ignoreTiming(int nSteps, int size)
 
   int *d_A;
   // Allocate pinned device memory
-  hipMalloc((void**)&d_A, size);
+  HIP_ERR(hipMalloc((void**)&d_A, sizeof(int) * size));
 
   // Start timer and begin stepping loop
   clock_t tStart = clock();
@@ -38,10 +51,10 @@ void ignoreTiming(int nSteps, int size)
     // Launch GPU kernel
     hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
     // Synchronization
-    hipStreamSynchronize(0);
+    HIP_ERR(hipStreamSynchronize(0));
   }
   // Free allocation
-  hipFree(d_A);
+  HIP_ERR(hipFree(d_A));
 }
 
 /* Run without recurring allocation */
@@ -53,7 +66,7 @@ void noRecurringAlloc(int nSteps, int size)
 
   int *d_A;
   // Allocate pinned device memory
-  hipMalloc((void**)&d_A, size);
+  HIP_ERR(hipMalloc((void**)&d_A, sizeof(int) * size));
 
   // Start timer and begin stepping loop
   clock_t tStart = clock();
@@ -63,12 +76,12 @@ void noRecurringAlloc(int nSteps, int size)
     hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
   }
   // Synchronization
-  hipStreamSynchronize(0);
+  HIP_ERR(hipStreamSynchronize(0));
   // Check results and print timings
   checkTiming("noRecurringAlloc", (double)(clock() - tStart) / CLOCKS_PER_SEC);
 
   // Free allocation
-  hipFree(d_A);
+  HIP_ERR(hipFree(d_A));
 }
 
 /* Do recurring allocation without memory pooling */
@@ -84,24 +97,24 @@ void recurringAllocNoMemPools(int nSteps, int size)
   {
     int *d_A;
     // Allocate pinned device memory
-    hipMalloc((void**)&d_A, size);
+    HIP_ERR(hipMalloc((void**)&d_A, sizeof(int) * size));
     // Launch GPU kernel
     hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
     // Free allocation
-    hipFree(d_A);
+    HIP_ERR(hipFree(d_A));
   }
   // Synchronization
-  hipStreamSynchronize(0);
+  HIP_ERR(hipStreamSynchronize(0));
   // Check results and print timings
   checkTiming("recurringAllocNoMemPools", (double)(clock() - tStart) / CLOCKS_PER_SEC);
 }
 
 /* Do recurring allocation with memory pooling */
-void recurringAllocMemPool(int nSteps, int size)
+void recurringAllocMallocAsync(int nSteps, int size)
 {
   // Create HIP stream
   hipStream_t stream;
-  hipStreamCreate(&stream);
+  HIP_ERR(hipStreamCreate(&stream));
 
   // Determine grid and block size
   const int blocksize = BLOCKSIZE;
@@ -113,32 +126,70 @@ void recurringAllocMemPool(int nSteps, int size)
   {
     int *d_A;
     // Allocate pinned device memory
-    cudaMallocAsync((void**)&d_A, size, stream);
+    cudaMallocAsync((void**)&d_A, sizeof(int) * size, stream);
     // Launch GPU kernel
     hipKernel<<<gridsize, blocksize, 0, stream>>>(d_A, size);
     // Free allocation
     cudaFreeAsync(d_A, stream);
   }
   // Synchronization
-  hipStreamSynchronize(stream);
+  HIP_ERR(hipStreamSynchronize(stream));
   // Check results and print timings
-  checkTiming("recurringAllocMemPoolNoSync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
+  checkTiming("recurringAllocMallocAsync", (double)(clock() - tStart) / CLOCKS_PER_SEC);
 
   // Destroy the stream
-  hipStreamDestroy(stream);
+  HIP_ERR(hipStreamDestroy(stream));
+}
+
+#if defined(HAVE_UMPIRE)
+/* Run nSteps allocate / kernel launch / deallocate cycles through an Umpire quick pool and pass the elapsed clock() time to checkTiming; compiled only when HAVE_UMPIRE is defined */
+void recurringAllocUmpire(int nSteps, int size)
+{
+  // Look up the Umpire allocator named DEVICE and build a quick pool named pool on top of it. NOTE(review): the two 1024 arguments are presumably pool sizing parameters - confirm against the Umpire C API
+  umpire_resourcemanager rm;
+  umpire_resourcemanager_get_instance(&rm);
+  umpire_allocator allocator;
+  umpire_resourcemanager_get_allocator_by_name(&rm, "DEVICE", &allocator);
+  umpire_allocator pool;
+  umpire_resourcemanager_make_allocator_quick_pool(&rm, "pool", allocator, 1024, 1024, &pool);
+
+  // Determine grid and block size: BLOCKSIZE threads per block, with the block count rounded up so that size elements are covered
+  const int blocksize = BLOCKSIZE;
+  const int gridsize = (size - 1 + blocksize) / blocksize;
+
+  // Start the timer and run nSteps iterations, each with its own allocation
+  clock_t tStart = clock();
+  for(unsigned int i = 0; i < nSteps; i++)
+  {
+    int *d_A;
+    // Request sizeof(int) * size bytes from the Umpire pool (the returned pointer is not checked)
+    d_A = (int*) umpire_allocator_allocate(&pool, sizeof(int) * size);
+    // Launch GPU kernel on stream 0 with the freshly allocated buffer
+    hipKernel<<<gridsize, blocksize, 0, 0>>>(d_A, size);
+    // Hand the buffer back to the pool. NOTE(review): this happens before the stream synchronization below - confirm that reuse while the kernel may still be running is acceptable
+    umpire_allocator_deallocate(&pool, d_A);
+  }
+  // Wait for stream 0 to finish before reading the clock
+  HIP_ERR(hipStreamSynchronize(0));
+  // Pass the label and the elapsed clock() time in seconds to checkTiming
+  checkTiming("recurringAllocUmpire", (double)(clock() - tStart) / CLOCKS_PER_SEC);
 }
+#endif
 
 /* The main function */
 int main(int argc, char* argv[])
 {
   // Set the number of steps and 1D grid dimensions
-  int nSteps = 1e6, size = 1e6;
+  int nSteps = 1e4, size = 1e6;
   
   // Ignore first run, first kernel is slower
   ignoreTiming(nSteps, size);
 
   // Run with different memory allocatins strategies
   noRecurringAlloc(nSteps, size);
   recurringAllocNoMemPools(nSteps, size);
-  recurringAllocMemPool(nSteps, size);
+  recurringAllocMallocAsync(nSteps, size);
+  #if defined(HAVE_UMPIRE)
+    recurringAllocUmpire(nSteps, size);
+  #endif
 }