@@ -293,13 +293,13 @@ int main(int argc, char* argv[]) {
         NCCL_CALL(ncclGroupEnd());
         CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
 #else
-        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
                               a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
                               MPI_STATUS_IGNORE));
         MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-                              MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+                              MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-        std::swap(a_new, a);
+        std::swap(a_new, a);
     }
     POP_RANGE

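Only the closing NCCL_CALL(ncclGroupEnd()) and the cudaStreamSynchronize(compute_stream) of the SOLUTION branch are visible in this hunk. A minimal sketch of the halo exchange that presumably precedes them, reconstructed from the MPI_Sendrecv pair in the #else branch (nccl_comm, NCCL_REAL_TYPE, top, bottom, iy_start, and iy_end are taken from the surrounding code; the exact ordering of the calls inside the group is an assumption):

    // Sketch (assumption): NCCL halo exchange mirroring the MPI_Sendrecv pair above.
    NCCL_CALL(ncclGroupStart());
    // Receive the top halo row from `top`, send the first interior row to `top`.
    NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream));
    NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream));
    // Receive the bottom halo row from `bottom`, send the last interior row to `bottom`.
    NCCL_CALL(ncclRecv(a_new + iy_end * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
    NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream));
    NCCL_CALL(ncclGroupEnd());
    // Block until the exchange on compute_stream has completed.
    CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));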
@@ -326,7 +326,7 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
         calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-        launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+        launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
                              compute_stream);

         launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,
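This hunk restricts the main launch_jacobi_kernel call to the interior rows (iy_start + 1 through iy_end - 1) on compute_stream, with separate launches for the boundary rows so the halo exchange can begin as soon as those rows are updated. A hedged sketch of the boundary launches that presumably follow, assuming they are issued on push_stream (the stream argument and the second boundary launch are cut off by the hunk):

    // Sketch (assumption): boundary-row updates on push_stream so the halo
    // exchange can overlap with the interior update on compute_stream.
    launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,
                         push_stream);
    launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm,
                         push_stream);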
@@ -346,7 +346,7 @@ int main(int argc, char* argv[]) {
         const int bottom = (rank + 1) % size;

         // Apply periodic boundary conditions
-        // TODO: Modify the label for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+        // TODO: Modify the label for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
         // using the nccl communicator and push_stream.
         // Remember to use ncclGroupStart() and ncclGroupEnd()
 #ifdef SOLUTION
@@ -358,14 +358,14 @@ int main(int argc, char* argv[]) {
         NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
         NCCL_CALL(ncclGroupEnd());
 #else
-        PUSH_RANGE("MPI", 5)
+        PUSH_RANGE("MPI", 5)
         MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
                               a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
                               MPI_STATUS_IGNORE));
         MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
                               MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-        CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+        CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
        POP_RANGE

        if (calculate_norm) {
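The TODO in the previous hunk asks for the MPI_Sendrecv pair to be replaced with ncclSend/ncclRecv calls on push_stream, wrapped in ncclGroupStart()/ncclGroupEnd(). Only the ncclSend to top and the ncclGroupEnd() of the SOLUTION branch are visible above; a sketch of what the complete replacement could look like, with the "NCCL" range label as an assumption and everything else taken from the MPI branch:

    // Sketch (assumption): full SOLUTION branch for the overlapped halo exchange.
    // The "NCCL" label is hypothetical; buffers, peers, and push_stream come from
    // the surrounding code.
    PUSH_RANGE("NCCL", 5)
    NCCL_CALL(ncclGroupStart());
    // Halo rows come from the neighbors ...
    NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
    NCCL_CALL(ncclRecv(a_new + iy_end * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
    // ... while the first and last interior rows are sent to them.
    NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
    NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream));
    NCCL_CALL(ncclGroupEnd());
    // Mark completion of the exchange so compute_stream can wait on push_done.
    CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
    POP_RANGE

Posting all four point-to-point operations inside a single group lets NCCL schedule the sends and receives together, avoiding the rank-ordering deadlocks that individually blocking calls could otherwise introduce.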
@@ -410,13 +410,13 @@ int main(int argc, char* argv[]) {

     if (rank == 0 && result_correct) {
         if (csv) {
-            // TODO: Don't forget to change your output label from mpi_overlap to nccl_overlap
+            // TODO: Don't forget to change your output label from mpi_overlap to nccl_overlap
 #ifdef SOLUTION
            printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #else
-           printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+           printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #endif
-                  (stop - start), runtime_serial);
+                  (stop - start), runtime_serial);
         } else {
            printf("Num GPUs: %d.\n", size);
            printf(