@@ -97,15 +97,15 @@ def _config_gpu_memory(self, gpu_mem_cap):
                     tf.config.set_memory_growth(gpu, True)
                 except AttributeError:
                     tf.config.experimental.set_memory_growth(gpu, True)
-
+
             else:
                 try:
                     set_virtual_device_configuration = tf.config.set_virtual_device_configuration
                     device_config = tf.config.LogicalDeviceConfiguration(memory_limit=gpu_mem_cap)
                 except AttributeError:
                     set_virtual_device_configuration = tf.config.experimental.set_virtual_device_configuration
                     device_config = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=gpu_mem_cap)
-
+
                 set_virtual_device_configuration(gpu, [device_config])
         except RuntimeError as e:
             print('Can not set GPU memory config', e)
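For reference, the try/except AttributeError fallbacks above exist because these calls graduated out of `tf.config.experimental` across TF 2.x releases. A minimal standalone sketch of the same memory setup on a recent TF (illustrative only, not part of this PR):

```python
import tensorflow as tf

def configure_gpu_memory(gpu_mem_cap=None):
    # Sketch only: cap GPU memory (in MB) or enable on-demand growth.
    for gpu in tf.config.list_physical_devices('GPU'):
        if not gpu_mem_cap:
            # Allocate memory lazily instead of reserving the whole GPU.
            tf.config.experimental.set_memory_growth(gpu, True)
        else:
            # Expose a single logical device with a hard memory limit.
            tf.config.set_logical_device_configuration(
                gpu,
                [tf.config.LogicalDeviceConfiguration(memory_limit=gpu_mem_cap)]
            )
```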
@@ -378,18 +378,40 @@ def infer_batch(x):
         )

         iter_times = []
+        memcopy_times = []

-        def log_step(step_idx, display_every, iter_time):
+        def log_step(step_idx, display_every, iter_time, memcpyHtoD_time):
             if step_idx % display_every == 0:
                 print(
-                    f"step {step_idx:04d}, iter_time(ms)={iter_time:.3f}"
+                    f"step {step_idx:04d}, iter_time(ms)={iter_time:.3f}, memcpyHtoD_time(ms)={memcpyHtoD_time:.3f}"
                 )

         dataset = timed_dataset(
             dataset, activate=self._args.debug_performance
         )

+        @force_gpu_resync
+        @tf.function()
+        def force_data_on_gpu(data, device="/gpu:0"):
+            with tf.device(device):
+                if isinstance(data, (list, tuple)):
+                    output_data = list()
+                    for t in data:
+                        output_data.append(tf.identity(t))
+                elif isinstance(data, dict):
+                    output_data = dict()
+                    for k, v in data.items():
+                        output_data[k] = tf.identity(v)
+                else:
+                    output_data = tf.identity(data)
+            return output_data
+
         for step_idx, data_batch in enumerate(dataset):
+
+            start_time = time.time()
+            data_batch = force_data_on_gpu(data_batch)
+            memcopy_times.append(time.time() - start_time)
+
             x, y = self.preprocess_model_inputs(data_batch)

             start_time = time.time()
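Note that `force_gpu_resync` is not defined in this hunk. Since a `tf.function` call returns before the GPU work completes, the wall-clock timing around `force_data_on_gpu` only measures the host-to-device copy if the decorator blocks until the device is idle. A sketch of one common way to do that (assumed implementation; only the decorator name comes from this diff):

```python
import tensorflow as tf

def force_gpu_resync(func):
    # Sketch: wrap an async TF call so it only returns once the GPU is idle.
    p = tf.constant(0.0)  # small device-resident tensor used as a sync point

    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        (p + 1.0).numpy()  # .numpy() blocks until queued GPU work has finished
        return result

    return wrapper
```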
@@ -398,11 +420,14 @@ def log_step(step_idx, display_every, iter_time):

             if not self._args.debug_performance:
                 log_step(
-                    step_idx + 1, self._args.display_every,
-                    np.mean(iter_times[-self._args.display_every:]) * 1000
+                    step_idx + 1,
+                    display_every=self._args.display_every,
+                    iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
+                    memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
                 )
             else:
                 print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s")
+                print(f"{'MemCopyHtoD Iteration Time':18s}: {memcopy_times[-1]:.4f}s")

             if not self._args.use_synthetic_data:
                 data_aggregator.aggregate_data(y_pred, y)
@@ -411,15 +436,15 @@ def log_step(step_idx, display_every, iter_time):
                     step_idx + 1 >= self._args.num_iterations):
                 break

-        if (not self._args.debug_performance and
-                step_idx % self._args.display_every !=
-                0):  # avoids double printing
+        if (
+            not self._args.debug_performance and
+            step_idx % self._args.display_every != 0
+        ):  # avoids double printing
             log_step(
                 step_idx + 1,
                 display_every=1,  # force print
-                iter_time=(
-                    np.mean(iter_times[-self._args.display_every:]) * 1000
-                )
+                iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
+                memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
             )

         with timed_section("Metric Computation"):
@@ -440,26 +465,35 @@ def log_step(step_idx, display_every, iter_time):
             # Skipping last batch. Might have different batch_size
             run_times = np.array(iter_times)
             run_times = run_times[self._args.num_warmup_iterations:-1]
+            mem_times = np.array(memcopy_times)
+            mem_times = mem_times[self._args.num_warmup_iterations:-1]

             metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
             metrics['Throughput (samples/sec)'] = np.mean(
                 self._args.batch_size / run_times
             )
-            metrics['99th_percentile (ms)'] = np.percentile(
-                run_times, q=99, interpolation='lower'
-            ) * 1000
-            metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
-            metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
-            metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
-            metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000
+
+            def timing_metrics(time_arr, log_prefix):
+                data = dict()
+                data[f"{log_prefix} 99th_percentile (ms)"] = np.percentile(
+                    time_arr, q=99, interpolation='lower'
+                ) * 1000
+                data[f"{log_prefix} Mean (ms)"] = np.mean(time_arr) * 1000
+                data[f"{log_prefix} Median (ms)"] = np.median(time_arr) * 1000
+                data[f"{log_prefix} Min (ms)"] = np.min(time_arr) * 1000
+                data[f"{log_prefix} Max (ms)"] = np.max(time_arr) * 1000
+                return data
+
+            metrics.update(timing_metrics(run_times, "GPU Latency"))
+            metrics.update(timing_metrics(mem_times, "MemCopyHtoD Time"))

             self._export_runtime_metrics_to_json(metrics)

             def log_value(key, val):
                 if isinstance(val, int):
-                    print(f"- {key:35s}: {val}")
+                    print(f"- {key:40s}: {val}")
                 else:
-                    print(f"- {key:35s}: {val:.2f}")
+                    print(f"- {key:40s}: {val:.2f}")

             for key, val in sorted(metrics.items()):
                 if isinstance(val, dict):
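With the `timing_metrics` helper, both timing series share one reporting path, so adding another measurement later is a one-line `metrics.update(...)`. A quick usage sketch, assuming the helper is in scope and using made-up numbers:

```python
import numpy as np

# Hypothetical per-iteration timings in seconds.
run_times = np.array([0.0101, 0.0098, 0.0105, 0.0099])
mem_times = np.array([0.0011, 0.0012, 0.0010, 0.0011])

metrics = {}
metrics.update(timing_metrics(run_times, "GPU Latency"))       # helper from the diff above
metrics.update(timing_metrics(mem_times, "MemCopyHtoD Time"))
print(metrics["GPU Latency Mean (ms)"])      # ~10.1
print(metrics["MemCopyHtoD Time Max (ms)"])  # ~1.2
```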