CACHE_CLEAR_KERNEL = "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor<int>, std::array<char*, 1ul> >(int, at::native::FillFunctor<int>, std::array<char*, 1ul>)"


-def _kineto_events_to_latency(prof):
+def _kineto_events_to_latency(prof, n_repeat):
    prof_averages = prof.key_averages(group_by_input_shape=False)
    cuda_event_names = [
        event.key
@@ -33,22 +33,16 @@ def _kineto_events_to_latency(prof):
            kernel_duration_name_map[event.name()] = []
        kernel_duration_name_map[event.name()].append(event.duration_ns() / 1e6)

-    kernel_hits = [len(kernel_duration_name_map[k]) for k in kernel_duration_name_map]
-    assert all(
-        x == kernel_hits[0] for x in kernel_hits
-    ), "Error: Not all kernels run the same time."
+    op_time = 0.0
+    for name in kernel_duration_name_map:
+        op_time += sum(kernel_duration_name_map[name])

-    op_latencies = []
-    for x in range(kernel_hits[0]):
-        op_time = 0.0
-        for name in kernel_duration_name_map:
-            op_time += kernel_duration_name_map[name][x]
-        op_latencies.append(op_time)
+    op_time = op_time / n_repeat

    print(
        prof.key_averages(group_by_input_shape=False).table(sort_by="cuda_time_total")
    )
-    return Latency(times=op_latencies)
+    return op_time


def _do_bench_cuda_time_cudagraph(
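Note on the hunk above: the old code asserted that every kernel fires the same number of times and then built one latency entry per iteration; the new code simply sums every recorded kernel duration and divides by n_repeat to produce a single mean CUDA time in milliseconds. A minimal sketch of the new aggregation in isolation (the helper name is illustrative, not part of the module):

def _mean_cuda_time_ms(kernel_duration_name_map, n_repeat):
    # kernel_duration_name_map: dict[str, list[float]] of per-kernel durations
    # in milliseconds, as collected from the Kineto events above.
    total_ms = sum(sum(durations) for durations in kernel_duration_name_map.values())
    # Average over the profiled iterations to get one mean time per launch.
    return total_ms / n_repeat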
@@ -59,7 +53,7 @@ def _do_bench_cuda_time_cudagraph(
    n_repeat: int,
    grad_to_none: bool,
    bypass_fail: bool = False,
-) -> Latency:
+) -> float:
    with torch.cuda.stream(torch.cuda.Stream()):
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
@@ -87,7 +81,7 @@ def _do_bench_cuda_time_cudagraph(
            prof.step()
        synchronize_with_timing()

-    return _kineto_events_to_latency(prof)
+    return _kineto_events_to_latency(prof, n_repeat)


def do_bench_cuda_time(
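The two hunks above only change the declared return type and pass n_repeat through; the CUDA-graph path itself is unchanged. For orientation, a simplified sketch of that path (illustrative only; the real code also drives a profiler around the replay loop): warm the op up on a side stream, capture fn once into a CUDA graph, then replay the graph n_repeat times so the measured kernels come from graph replays rather than eager launches.

import torch

def capture_and_replay(fn, n_repeat: int = 10) -> None:
    side_stream = torch.cuda.Stream()
    with torch.cuda.stream(side_stream):
        fn()  # warm up outside of capture; lazy initializations are not capturable
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            fn()
        for _ in range(n_repeat):
            g.replay()
    torch.cuda.synchronize()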
@@ -97,7 +91,7 @@ def do_bench_cuda_time(
    grad_to_none: bool,
    use_cuda_graphs: bool = False,
    bypass_fail: bool = False,
-) -> Latency:
+) -> float:
    """
    Return the aggregated CUDA time of a benchmarked operator backend.
    """
@@ -156,4 +150,4 @@ def synchronize_with_timing():
            prof.step()
        synchronize_with_timing()

-    return _kineto_events_to_latency(prof)
+    return _kineto_events_to_latency(prof, n_repeat)
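End-to-end, the helpers now return a plain float, the mean CUDA time per launch in milliseconds averaged over n_repeat profiled iterations, instead of a Latency object with per-iteration entries. A standalone sketch of the same measurement idea using only public torch.profiler APIs (not the repository's code; fn, the helper name, and the activity set are assumptions, and unlike the module this sketch does not exclude the cache-clearing fill kernel):

import torch
from torch.autograd import DeviceType
from torch.profiler import ProfilerActivity, profile

def bench_mean_cuda_time_ms(fn, n_repeat: int = 10) -> float:
    # Profile n_repeat launches of fn and average the on-device kernel time.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        for _ in range(n_repeat):
            fn()
        torch.cuda.synchronize()
    # Keep only device-side entries so each kernel is counted exactly once;
    # the profiler reports these times in microseconds.
    total_us = sum(
        evt.self_cuda_time_total
        for evt in prof.key_averages()
        if evt.device_type == DeviceType.CUDA
    )
    return total_us / 1e3 / n_repeat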