
Commit 8d2c59f

Author: DEKHTIARJonathan
Parent: 6128d0b

Performance Measurement Fix to avoid MemCopyHtoD Measuring
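The gist of the fix: if the input batch still lives in host memory when inference is timed, the measured iteration time silently includes the host-to-device (HtoD) copy. The commit stages each batch on the GPU first and times that copy separately. A minimal, self-contained sketch of the idea (names, shapes, and the placeholder model are illustrative, not from this commit; assumes a machine with at least one visible GPU):

    import time
    import tensorflow as tf

    @tf.function
    def infer(x):
        return tf.nn.relu(x)  # stand-in for the real model

    with tf.device("/cpu:0"):
        x_host = tf.random.uniform([32, 224, 224, 3])  # batch starts on the host

    # Stage the batch on the GPU and time the copy on its own.
    t0 = time.time()
    with tf.device("/gpu:0"):
        x_gpu = tf.identity(x_host)  # tf.identity under a device scope forces the HtoD copy
    memcopy_time = time.time() - t0

    # The timed region below now measures GPU compute only.
    t0 = time.time()
    y = infer(x_gpu)
    _ = y.numpy()  # fetching the result blocks until the GPU finishes
    iter_time = time.time() - t0

    print(f"memcpyHtoD(ms)={memcopy_time * 1000:.3f}, iter_time(ms)={iter_time * 1000:.3f}")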

File tree: 1 file changed (+55, -21 lines)


tftrt/examples/benchmark_runner.py (55 additions, 21 deletions)
@@ -97,15 +97,15 @@ def _config_gpu_memory(self, gpu_mem_cap):
                     tf.config.set_memory_growth(gpu, True)
                 except AttributeError:
                     tf.config.experimental.set_memory_growth(gpu, True)
-
+
             else:
                 try:
                     set_virtual_device_configuration = tf.config.set_virtual_device_configuration
                     device_config = tf.config.LogicalDeviceConfiguration(memory_limit=gpu_mem_cap)
                 except AttributeError:
                     set_virtual_device_configuration = tf.config.experimental.set_virtual_device_configuration
                     device_config = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=gpu_mem_cap)
-
+
                 set_virtual_device_configuration(gpu, [device_config])
         except RuntimeError as e:
             print('Can not set GPU memory config', e)
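Both branches in this hunk rely on the same try/except AttributeError idiom so the runner works on TF builds where these calls still live under tf.config.experimental. A standalone sketch of the pattern (a minimal sketch, assuming TF 2.x; the exact version where each API was promoted is not stated in this commit):

    import tensorflow as tf

    # Prefer the stable API; fall back to the experimental one on older TF 2.x.
    try:
        set_memory_growth = tf.config.set_memory_growth
    except AttributeError:
        set_memory_growth = tf.config.experimental.set_memory_growth

    for gpu in tf.config.list_physical_devices("GPU"):
        set_memory_growth(gpu, True)  # grow allocations instead of grabbing all VRAM up front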
@@ -378,18 +378,40 @@ def infer_batch(x):
             )
 
             iter_times = []
+            memcopy_times = []
 
-            def log_step(step_idx, display_every, iter_time):
+            def log_step(step_idx, display_every, iter_time, memcpyHtoD_time):
                 if step_idx % display_every == 0:
                     print(
-                        f" step {step_idx:04d}, iter_time(ms)={iter_time:.3f}"
+                        f" step {step_idx:04d}, iter_time(ms)={iter_time:.3f}, memcpyHtoD_time(ms)={memcpyHtoD_time:.3f}"
                     )
 
             dataset = timed_dataset(
                 dataset, activate=self._args.debug_performance
             )
 
+            @force_gpu_resync
+            @tf.function()
+            def force_data_on_gpu(data, device="/gpu:0"):
+                with tf.device(device):
+                    if isinstance(data, (list, tuple)):
+                        output_data = list()
+                        for t in data:
+                            output_data.append(tf.identity(t))
+                    elif isinstance(data, dict):
+                        output_data = dict()
+                        for k, v in data.items():
+                            output_data[k] = tf.identity(v)
+                    else:
+                        output_data = tf.identity(data)
+                return output_data
+
             for step_idx, data_batch in enumerate(dataset):
+
+                start_time = time.time()
+                data_batch = force_data_on_gpu(data_batch)
+                memcopy_times.append(time.time() - start_time)
+
                 x, y = self.preprocess_model_inputs(data_batch)
 
                 start_time = time.time()
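The force_gpu_resync decorator is defined elsewhere in this repository, not in this diff. It matters here: GPU work is enqueued asynchronously, so without a sync force_data_on_gpu would return as soon as the copy is queued and the time.time() delta would undercount the copy. A minimal sketch of what such a decorator can look like (an assumption about its shape, not the file's actual code):

    import functools
    import tensorflow as tf

    def force_gpu_resync(func):
        p = tf.constant(0.0)  # tiny tensor used purely for synchronization

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            (p + 1.0).numpy()  # a blocking device-to-host fetch drains the GPU stream
            return result

        return wrapper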
@@ -398,11 +420,14 @@ def log_step(step_idx, display_every, iter_time):
 
                 if not self._args.debug_performance:
                     log_step(
-                        step_idx + 1, self._args.display_every,
-                        np.mean(iter_times[-self._args.display_every:]) * 1000
+                        step_idx + 1,
+                        display_every=self._args.display_every,
+                        iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
+                        memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
                     )
                 else:
                     print(f"{'GPU Iteration Time':18s}: {iter_times[-1]:.4f}s")
+                    print(f"{'MemCopyHtoD Iteration Time':18s}: {memcopy_times[-1]:.4f}s")
 
                 if not self._args.use_synthetic_data:
                     data_aggregator.aggregate_data(y_pred, y)
@@ -411,15 +436,15 @@ def log_step(step_idx, display_every, iter_time):
                         step_idx + 1 >= self._args.num_iterations):
                     break
 
-            if (not self._args.debug_performance and
-                    step_idx % self._args.display_every !=
-                    0):  # avoids double printing
+            if (
+                not self._args.debug_performance and
+                step_idx % self._args.display_every != 0
+            ):  # avoids double printing
                 log_step(
                     step_idx + 1,
                     display_every=1,  # force print
-                    iter_time=(
-                        np.mean(iter_times[-self._args.display_every:]) * 1000
-                    )
+                    iter_time=np.mean(iter_times[-self._args.display_every:]) * 1000,
+                    memcpyHtoD_time=np.mean(memcopy_times[-self._args.display_every:]) * 1000
                 )
 
             with timed_section("Metric Computation"):
@@ -440,26 +465,35 @@ def log_step(step_idx, display_every, iter_time):
                 # Skipping last batch. Might have different batch_size
                 run_times = np.array(iter_times)
                 run_times = run_times[self._args.num_warmup_iterations:-1]
+                mem_times = np.array(memcopy_times)
+                mem_times = mem_times[self._args.num_warmup_iterations:-1]
 
                 metrics['Total GPU Time (s)'] = int(np.ceil(np.sum(iter_times)))
                 metrics['Throughput (samples/sec)'] = np.mean(
                     self._args.batch_size / run_times
                 )
-                metrics['99th_percentile (ms)'] = np.percentile(
-                    run_times, q=99, interpolation='lower'
-                ) * 1000
-                metrics['GPU Latency Mean (ms)'] = np.mean(run_times) * 1000
-                metrics['GPU Latency Median (ms)'] = np.median(run_times) * 1000
-                metrics['GPU Latency Min (ms)'] = np.min(run_times) * 1000
-                metrics['GPU Latency Max (ms)'] = np.max(run_times) * 1000
+
+                def timing_metrics(time_arr, log_prefix):
+                    data = dict()
+                    data[f"{log_prefix} 99th_percentile (ms)"] = np.percentile(
+                        time_arr, q=99, interpolation='lower'
+                    ) * 1000
+                    data[f"{log_prefix} Mean (ms)"] = np.mean(time_arr) * 1000
+                    data[f"{log_prefix} Median (ms)"] = np.median(time_arr) * 1000
+                    data[f"{log_prefix} Min (ms)"] = np.min(time_arr) * 1000
+                    data[f"{log_prefix} Max (ms)"] = np.max(time_arr) * 1000
+                    return data
+
+                metrics.update(timing_metrics(run_times, "GPU Latency"))
+                metrics.update(timing_metrics(mem_times, "MemCopyHtoD Time"))
 
                 self._export_runtime_metrics_to_json(metrics)
 
                 def log_value(key, val):
                     if isinstance(val, int):
-                        print(f"- {key:35s}: {val}")
+                        print(f"- {key:40s}: {val}")
                     else:
-                        print(f"- {key:35s}: {val:.2f}")
+                        print(f"- {key:40s}: {val:.2f}")
 
                 for key, val in sorted(metrics.items()):
                     if isinstance(val, dict):
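With this refactor both latency distributions, GPU compute and HtoD copy, flow through one helper, so the JSON export and the console report stay consistent. A usage sketch with made-up numbers (assuming timing_metrics were hoisted to module scope; in the commit it is a nested function):

    import numpy as np

    run_times = np.array([0.0102, 0.0098, 0.0101, 0.0100])  # seconds, made-up values
    stats = timing_metrics(run_times, "GPU Latency")
    # stats now holds:
    #   "GPU Latency 99th_percentile (ms)", "GPU Latency Mean (ms)",
    #   "GPU Latency Median (ms)", "GPU Latency Min (ms)", "GPU Latency Max (ms)"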
