Skip to content

Commit 81bd19f

Browse files
authored
device timing histograms (#440)
* initial version of device timing histograms * fix Linux build * c++20 fixes, final cleanup * move the number of histogram bins
1 parent 128272a commit 81bd19f

File tree

7 files changed

+141
-4
lines changed

7 files changed

+141
-4
lines changed

docs/controls.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,10 @@ If set to a nonzero value, the Intercept Layer for OpenCL Applications will incl
299299

300300
If set to a nonzero value, the Intercept Layer for OpenCL Applications will add event profiling to track the minimum, maximum, and average device time for each OpenCL command. This operation may be fairly intrusive and may have side effects; in particular it forces all command queues to be created with PROFILING\_ENABLED and may increment the reference count for application events. When the process exits, this information will be included in the file "clIntercept\_report.txt".
301301

302+
##### `DevicePerformanceTimingHistogram` (bool)
303+
304+
If set to a nonzero value, the Intercept Layer for OpenCL Applications will report a histogram of device times in addition to the table of device times for each OpenCL command.
305+
302306
##### `DevicePerformanceTimeKernelInfoTracking` (bool)
303307

304308
If set to a nonzero value, the Intercept Layer for OpenCL Applications will distinguish between OpenCL NDRange kernels using information such as the kernel's Preferred Work Group Size Multiple (AKA SIMD size).

intercept/src/chrometracer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,13 +202,13 @@ void CChromeTracer::writeDeviceTiming(
202202
}
203203

204204
// Shared lookup tables:
205-
static const size_t cNumStates = 3;
206-
static const char* colours[cNumStates] = {
205+
static constexpr size_t cNumStates = 3;
206+
static constexpr const char* colours[cNumStates] = {
207207
"thread_state_runnable",
208208
"cq_build_running",
209209
"thread_state_iowait"
210210
};
211-
static const char* suffixes[cNumStates] = {
211+
static constexpr const char* suffixes[cNumStates] = {
212212
"(Queued)",
213213
"(Submitted)",
214214
"(Execution)"

intercept/src/controls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ CLI_CONTROL_SEPARATOR( Performance Timing Controls: )
5858
CLI_CONTROL( bool, HostPerformanceTiming, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will track the minimum, maximum, and average host CPU time for each OpenCL entry point. When the process exits, this information will be included in the file \"clIntercept_report.txt\"." )
5959
CLI_CONTROL( bool, ToolOverheadTiming, true, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will include some types of tool overhead in timing reports and some types of logging." )
6060
CLI_CONTROL( bool, DevicePerformanceTiming, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will add event profiling to track the minimum, maximum, and average device time for each OpenCL command. This operation may be fairly intrusive and may have side effects; in particular it forces all command queues to be created with PROFILING_ENABLED and may increment the reference count for application events. When the process exits, this information will be included in the file \"clIntercept_report.txt\"." )
61+
CLI_CONTROL( bool, DevicePerformanceTimingHistogram, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will report a histogram of device times in addition to the table of device times for each OpenCL command." )
6162
CLI_CONTROL( bool, DevicePerformanceTimeKernelInfoTracking,false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will distinguish between OpenCL NDRange kernels using information such as the kernel's Preferred Work Group Size Multiple (AKA SIMD size)." )
6263
CLI_CONTROL( bool, DevicePerformanceTimeGWOTracking, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will distinguish between OpenCL NDRange kernels with different global work offsets for the purpose of device performance timing." )
6364
CLI_CONTROL( bool, DevicePerformanceTimeGWSTracking, false, "If set to a nonzero value, the Intercept Layer for OpenCL Applications will distinguish between OpenCL NDRange kernels with different global work sizes for the purpose of device performance timing." )

intercept/src/intercept.cpp

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ CLIntercept::CLIntercept( void* pGlobalData )
136136

137137
m_LoggedCLInfo = false;
138138

139-
m_EnqueueCounter.store(0, std::memory_order::memory_order_relaxed);
139+
m_EnqueueCounter.store(0, std::memory_order_relaxed);
140140

141141
m_EventsChromeTraced = 0;
142142
m_ProgramNumber = 0;
@@ -957,6 +957,55 @@ void CLIntercept::writeReport(
957957
}
958958
}
959959

960+
// Optionally append one histogram of device execution times per device to
// the report.
//
// Events are binned by the number of significant bits in their duration:
// bin 0 holds zero-length events, bin k (k >= 1) holds durations in
// [2^(k-1), 2^k) ns, and the last bin additionally absorbs every longer
// duration.  The last bin is therefore labelled with its true lower bound,
// 2^(cNumBins-2) ns, rather than 2^(cNumBins-1) ns.
if( config().DevicePerformanceTimingHistogram )
{
    constexpr uint32_t cNumBins = SDeviceTimingHistogram::cNumBins;
    constexpr uint32_t cLastBin = cNumBins - 1;
    constexpr uint64_t cMaxDots = 64;   // bar length for a bin holding every event

    // The last bin's label is derived from the preceding bin, and every
    // label must be representable as a shift of a 64-bit value.
    static_assert( cNumBins >= 2 && cNumBins <= 64, "unsupported number of histogram bins" );

    for( const auto& entry : m_DeviceTimingHistogramMap )
    {
        const cl_device_id device = entry.first;
        const SDeviceTimingHistogram& histogram = entry.second;

        const SDeviceInfo& deviceInfo = m_DeviceInfoMap[device];

        os << std::endl << "Device Performance Timing Histogram for " << deviceInfo.NameForReport << ":" << std::endl;

        // Accumulate in 64 bits so the sum of the 32-bit bins cannot wrap.
        uint64_t total = 0;
        for( uint32_t bin = 0; bin < cNumBins; bin++ )
        {
            total += histogram.Bins[bin];
        }

        os << std::endl << "Total Events: " << total << std::endl << std::endl;

        for( uint32_t bin = 0; bin < cNumBins; bin++ )
        {
            const bool isLastBin = ( bin == cLastBin );
            const uint64_t boundary = 1ULL << ( isLastBin ? bin - 1 : bin );
            const uint32_t count = histogram.Bins[bin];

            os << ( isLastBin ? " >= " : " < " );
            os << std::setw(9) << boundary << " ns: " << std::setw(9) << count << " : ";

            // Scale the bar to the bin's share of all events using exact
            // integer math; an empty histogram draws no bars.
            uint64_t dots = ( total != 0 ) ? ( cMaxDots * count / total ) : 0;
            if( count != 0 && dots == 0 )
            {
                // Keep every non-empty bin visible.
                dots = 1;
            }
            for( uint64_t d = 0; d < dots; d++ )
            {
                os << "*";
            }
            os << std::endl;
        }
    }
}
1008+
9601009
#if defined(USE_MDAPI)
9611010
if( config().DevicePerfCounterEventBasedSampling )
9621011
{
@@ -6626,6 +6675,20 @@ void CLIntercept::checkTimingEvents()
66266675
commandStart,
66276676
commandEnd );
66286677
}
6678+
6679+
// Record this event's device time in the histogram for its device.
if( config().DevicePerformanceTimingHistogram )
{
    constexpr uint32_t cNumBins = SDeviceTimingHistogram::cNumBins;
    constexpr uint32_t cLastBin = cNumBins - 1;

    // The bin index is the number of significant bits in the delta, so a
    // zero delta lands in bin 0.  Anything that would index past the end
    // of the histogram is clamped into the last bin.
    const uint32_t significantBits = 64 - Utils::CountLeadingZeroes( delta );
    const uint32_t bin = ( significantBits < cLastBin ) ? significantBits : cLastBin;

    CLI_ASSERT( bin < cNumBins );
    m_DeviceTimingHistogramMap[node.Device].Bins[bin]++;
}
66296692
}
66306693
}
66316694

intercept/src/intercept.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,6 +1183,18 @@ class CLIntercept
11831183
typedef std::map< cl_device_id, CDeviceTimingStatsMap > CDeviceDeviceTimingStatsMap;
11841184
CDeviceDeviceTimingStatsMap m_DeviceTimingStatsMap;
11851185

1186+
// These structures define a mapping between a device ID and histogram
1187+
// bins for kernel execution on that device.
1188+
1189+
// Histogram of device times: each bin counts the timed events whose
// duration was assigned to it.
struct SDeviceTimingHistogram
{
    // Number of bins in the histogram.
    static constexpr uint32_t cNumBins = 30;

    // Per-bin event counts; every bin starts at zero.
    uint32_t Bins[cNumBins] = {};
};
1194+
1195+
typedef std::map< cl_device_id, SDeviceTimingHistogram > CDeviceTimingHistogramMap;
1196+
CDeviceTimingHistogramMap m_DeviceTimingHistogramMap;
1197+
11861198
// This defines a mapping between the kernel handle and information
11871199
// about the kernel.
11881200

intercept/src/utils.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,18 @@
1010
#include <fstream>
1111
#include <string>
1212

13+
#ifdef __cpp_lib_bitops
14+
#include <bit>
15+
#endif
16+
17+
#if defined(_WIN32) && !defined(__MINGW32__)
18+
#include <intrin.h>
19+
#endif
20+
21+
#ifndef __has_builtin
22+
#define __has_builtin(x) 0
23+
#endif
24+
1325
namespace Utils
1426
{
1527

@@ -55,4 +67,47 @@ std::string GetUniqueFileName(const std::string& fileName)
5567
return newFileName;
5668
}
5769

70+
// Returns the number of consecutive zero bits in a 64-bit value, counting
// down from the most significant bit.  A value of zero yields 64.
uint32_t CountLeadingZeroes(uint64_t value)
{
#ifdef __cpp_lib_bitops
    // The C++20 library routine handles zero without a special case.
    return static_cast<uint32_t>(std::countl_zero(value));
#else
    // None of the fallbacks below are defined for a zero input.
    if( value == 0 )
    {
        return 64;
    }

    // Reduce the problem to the 32-bit half that holds the highest set bit;
    // when that is the lower half, the upper 32 bits are all leading zeroes.
    const uint32_t upperHalf = static_cast<uint32_t>(value >> 32);
    const bool inLowerHalf = ( upperHalf == 0 );
    const uint32_t word = inLowerHalf ? static_cast<uint32_t>(value) : upperHalf;
    const uint32_t bias = inLowerHalf ? 32 : 0;

#if defined(_WIN32) && !defined(__MINGW32__)
    // _BitScanReverse reports the index of the highest set bit.
    unsigned long msb = 0;
    _BitScanReverse(&msb, static_cast<unsigned long>(word));
    return bias + ( 31 - static_cast<uint32_t>(msb) );
#elif __has_builtin(__builtin_clz)
    return bias + static_cast<uint32_t>(__builtin_clz(word));
#else
    // Portable fallback: walk a single-bit probe down from the top bit.
    uint32_t leading = 0;
    for( uint32_t probe = 1U << 31; (word & probe) == 0; probe >>= 1 )
    {
        leading++;
    }
    return bias + leading;
#endif
#endif
}
112+
113+
} // namespace Utils

intercept/src/utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66

77
#pragma once
88

9+
#include <cstdint>
910
#include <string>
1011

1112
namespace Utils
1213
{
1314

1415
std::string GetUniqueFileName(const std::string& fileName);
16+
uint32_t CountLeadingZeroes(uint64_t value);
1517

1618
}

0 commit comments

Comments
 (0)