Skip to content

Commit 1007ca3

Browse files
committed
[UR][SYCL] Add support for zeCommandListAppendLaunchKernelWithArguments()
Signed-off-by: Lukasz Dorau <[email protected]>
1 parent 7413519 commit 1007ca3

File tree

8 files changed

+262
-23
lines changed

8 files changed

+262
-23
lines changed

unified-runtime/cmake/FetchLevelZero.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ if(L0_COMPUTE_RUNTIME_HEADERS)
125125
set(COMPUTE_RUNTIME_REPO_PATH "${L0_COMPUTE_RUNTIME_HEADERS}")
126126
else()
127127
set(UR_COMPUTE_RUNTIME_REPO "https://github.com/intel/compute-runtime.git")
128-
set(UR_COMPUTE_RUNTIME_TAG 25.31.34666.3)
128+
set(UR_COMPUTE_RUNTIME_TAG 25.35.35096.9)
129129

130130
include(FetchContent)
131131
# Sparse fetch only the dir with level zero headers for experimental features to avoid pulling in the entire compute-runtime.

unified-runtime/source/adapters/level_zero/common.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,11 @@ template <>
324324
ze_structure_type_t getZeStructureType<ze_device_cache_line_size_ext_t>() {
325325
return ZE_STRUCTURE_TYPE_DEVICE_CACHELINE_SIZE_EXT;
326326
}
327+
template <>
328+
ze_structure_type_t getZeStructureType<
329+
ze_command_list_append_launch_kernel_param_cooperative_desc_t>() {
330+
return ZE_STRUCTURE_TYPE_COMMAND_LIST_APPEND_PARAM_COOPERATIVE_DESC;
331+
}
327332

328333
#ifdef ZE_INTEL_DEVICE_BLOCK_ARRAY_EXP_NAME
329334
template <>

unified-runtime/source/adapters/level_zero/platform.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,22 @@ ur_result_t ur_platform_handle_t_::initialize() {
526526
ZeMemGetPitchFor2dImageExt.Supported =
527527
ZeMemGetPitchFor2dImageExt.zeMemGetPitchFor2dImage != nullptr;
528528

529+
ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress,
530+
(ZeDriver, "zeCommandListAppendLaunchKernelWithArguments",
531+
reinterpret_cast<void **>(
532+
&ZeCommandListAppendLaunchKernelWithArgumentsExt
533+
.zeCommandListAppendLaunchKernelWithArguments)));
534+
535+
ZeCommandListAppendLaunchKernelWithArgumentsExt.Supported =
536+
ZeCommandListAppendLaunchKernelWithArgumentsExt
537+
.zeCommandListAppendLaunchKernelWithArguments != nullptr;
538+
539+
// Check if the driver supports zeCommandListAppendLaunchKernelWithArguments()
540+
// with cooperative mode (version >= 1.6.35005)
541+
ZeCommandListAppendLaunchKernelWithArgumentsExt
542+
.DriverSupportsCooperativeKernelLaunchWithArgs =
543+
this->isDriverVersionNewerOrSimilar(1, 6, 35005);
544+
529545
return UR_RESULT_SUCCESS;
530546
}
531547

unified-runtime/source/adapters/level_zero/platform.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,4 +162,13 @@ struct ur_platform_handle_t_ : ur::handle_base<ur::level_zero::ddi_getter>,
162162
ze_device_handle_t, size_t, size_t,
163163
unsigned int, size_t *);
164164
} ZeMemGetPitchFor2dImageExt;
165+
166+
struct ZeCommandListAppendLaunchKernelWithArgumentsExtension {
167+
bool Supported = false;
168+
bool DriverSupportsCooperativeKernelLaunchWithArgs = false;
169+
ze_result_t (*zeCommandListAppendLaunchKernelWithArguments)(
170+
ze_command_list_handle_t, ze_kernel_handle_t, const ze_group_count_t,
171+
const ze_group_size_t, void **, void *, ze_event_handle_t, uint32_t,
172+
ze_event_handle_t *);
173+
} ZeCommandListAppendLaunchKernelWithArgumentsExt;
165174
};

unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp

Lines changed: 177 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "../sampler.hpp"
1515
#include "../ur_interface_loader.hpp"
1616
#include "command_buffer.hpp"
17+
#include "common.hpp"
1718
#include "context.hpp"
1819
#include "kernel.hpp"
1920
#include "memory.hpp"
@@ -149,21 +150,13 @@ ur_command_list_manager::getSignalEvent(ur_event_handle_t hUserEvent,
149150
}
150151
}
151152

152-
ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked(
153-
ur_kernel_handle_t hKernel, uint32_t workDim,
153+
// must be called with hKernel->Mutex held
154+
ur_result_t ur_command_list_manager::appendKernelLaunchLocked(
155+
ur_kernel_handle_t hKernel, ze_kernel_handle_t hZeKernel, uint32_t workDim,
154156
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
155157
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
156158
const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent,
157-
bool cooperative) {
158-
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
159-
UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER);
160-
161-
UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
162-
UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
163-
164-
ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice.get());
165-
166-
std::scoped_lock<ur_shared_mutex> Lock(hKernel->Mutex);
159+
bool cooperative, std::vector<void *> &kMemObj, void *pNext) {
167160

168161
ze_group_count_t zeThreadGroupDimensions{1, 1, 1};
169162
uint32_t WG[3]{};
@@ -176,15 +169,28 @@ ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked(
176169

177170
UR_CALL(hKernel->prepareForSubmission(
178171
hContext.get(), hDevice.get(), pGlobalWorkOffset, workDim, WG[0], WG[1],
179-
WG[2], getZeCommandList(), waitListView));
172+
WG[2], getZeCommandList(), waitListView, kMemObj));
180173

181-
if (cooperative) {
174+
if (!kMemObj.empty()) {
175+
// zeCommandListAppendLaunchKernelWithArguments
176+
TRACK_SCOPE_LATENCY("ur_command_list_manager::"
177+
"zeCommandListAppendLaunchKernelWithArguments");
178+
ze_group_size_t groupSize = {WG[0], WG[1], WG[2]};
179+
ZE2UR_CALL(hContext->getPlatform()
180+
->ZeCommandListAppendLaunchKernelWithArgumentsExt
181+
.zeCommandListAppendLaunchKernelWithArguments,
182+
(getZeCommandList(), hZeKernel, zeThreadGroupDimensions,
183+
groupSize, hKernel->kernelArgs.data(), pNext, zeSignalEvent,
184+
waitListView.num, waitListView.handles));
185+
} else if (cooperative) {
186+
// zeCommandListAppendLaunchCooperativeKernel
182187
TRACK_SCOPE_LATENCY("ur_command_list_manager::"
183188
"zeCommandListAppendLaunchCooperativeKernel");
184189
ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel,
185190
(getZeCommandList(), hZeKernel, &zeThreadGroupDimensions,
186191
zeSignalEvent, waitListView.num, waitListView.handles));
187192
} else {
193+
// zeCommandListAppendLaunchKernel
188194
TRACK_SCOPE_LATENCY("ur_command_list_manager::"
189195
"zeCommandListAppendLaunchKernel");
190196
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
@@ -199,6 +205,39 @@ ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked(
199205
return UR_RESULT_SUCCESS;
200206
}
201207

208+
// Argument validation shared by the kernel-launch entry points.
// Returns UR_RESULT_SUCCESS when hKernel is non-null, has a program handle,
// and workDim is 1, 2 or 3; otherwise returns the matching error code.
static ur_result_t kernelLaunchChecks(ur_kernel_handle_t hKernel,
                                      uint32_t workDim) {
  UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
  UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER);

  // Only one-, two- and three-dimensional ranges are accepted.
  const bool dimensionInRange = workDim >= 1 && workDim <= 3;
  UR_ASSERT(dimensionInRange, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);

  return UR_RESULT_SUCCESS;
}
217+
218+
// Validates the launch arguments, takes hKernel->Mutex and appends the kernel
// launch through appendKernelLaunchLocked() without any argument array or
// pNext extension.
ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked(
    ur_kernel_handle_t hKernel, uint32_t workDim,
    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent,
    bool cooperative) {
  if (const ur_result_t status = kernelLaunchChecks(hKernel, workDim);
      status != UR_RESULT_SUCCESS) {
    return status;
  }

  ze_kernel_handle_t zeKernel = hKernel->getZeHandle(hDevice.get());

  // An empty vector keeps appendKernelLaunchLocked() off the
  // zeCommandListAppendLaunchKernelWithArguments() branch.
  std::vector<void *> noMemObjSlots;
  void *const noExtension = nullptr;

  std::scoped_lock<ur_shared_mutex> kernelGuard(hKernel->Mutex);

  return appendKernelLaunchLocked(hKernel, zeKernel, workDim,
                                  pGlobalWorkOffset, pGlobalWorkSize,
                                  pLocalWorkSize, numEventsInWaitList,
                                  phEventWaitList, phEvent, cooperative,
                                  noMemObjSlots, noExtension);
}
240+
202241
ur_result_t ur_command_list_manager::appendKernelLaunch(
203242
ur_kernel_handle_t hKernel, uint32_t workDim,
204243
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -1039,7 +1078,7 @@ ur_result_t ur_command_list_manager::releaseSubmittedKernels() {
10391078
return UR_RESULT_SUCCESS;
10401079
}
10411080

1042-
ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp(
1081+
ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExpOld(
10431082
ur_kernel_handle_t hKernel, uint32_t workDim,
10441083
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
10451084
const size_t *pLocalWorkSize, uint32_t numArgs,
@@ -1048,8 +1087,6 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp(
10481087
const ur_kernel_launch_property_t *launchPropList,
10491088
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
10501089
ur_event_handle_t phEvent) {
1051-
TRACK_SCOPE_LATENCY(
1052-
"ur_queue_immediate_in_order_t::enqueueKernelLaunchWithArgsExp");
10531090
{
10541091
std::scoped_lock<ur_shared_mutex> guard(hKernel->Mutex);
10551092
for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) {
@@ -1091,7 +1128,129 @@ ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp(
10911128
numPropsInLaunchPropList, launchPropList,
10921129
numEventsInWaitList, phEventWaitList, phEvent));
10931130

1094-
recordSubmittedKernel(hKernel);
1131+
return UR_RESULT_SUCCESS;
1132+
}
1133+
1134+
// Launches hKernel through the driver extension
// zeCommandListAppendLaunchKernelWithArguments(): builds the array of
// argument pointers in hKernel->kernelArgs and hands it, together with an
// optional cooperative descriptor, to appendKernelLaunchLocked().
//
// Returns the error from kernelLaunchChecks() for an invalid kernel or work
// dimension, UR_RESULT_ERROR_INVALID_NULL_POINTER when numArgs > 0 but pArgs
// is null, UR_RESULT_ERROR_INVALID_ENUMERATION for an unknown argument type,
// or the result of the launch itself.
//
// NOTE(review): this path returns straight from appendKernelLaunchLocked()
// and never calls recordSubmittedKernel(hKernel); confirm that the kernel is
// kept alive for the duration of the submission elsewhere.
ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExpNew(
    ur_kernel_handle_t hKernel, uint32_t workDim,
    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
    const size_t *pLocalWorkSize, uint32_t numArgs,
    const ur_exp_kernel_arg_properties_t *pArgs, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent,
    bool cooperativeKernelLaunchRequested) {

  ur_result_t checkResult = kernelLaunchChecks(hKernel, workDim);
  if (checkResult != UR_RESULT_SUCCESS) {
    return checkResult;
  }

  // The loop below dereferences pArgs for every argument.
  UR_ASSERT(numArgs == 0 || pArgs != nullptr,
            UR_RESULT_ERROR_INVALID_NULL_POINTER);

  // Needed for UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: the descriptor is
  // chained as pNext only when a cooperative launch was requested. It lives
  // on this stack frame, which outlives the launch call below.
  ZeStruct<ze_command_list_append_launch_kernel_param_cooperative_desc_t>
      cooperativeDesc;
  cooperativeDesc.isCooperative = static_cast<ze_bool_t>(true);
  void *pNext = cooperativeKernelLaunchRequested ? &cooperativeDesc : nullptr;

  ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice.get());

  std::scoped_lock<ur_shared_mutex> Lock(hKernel->Mutex);

  // kernelMemObj holds the device pointers that the
  // UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ entries of kernelArgs point to.
  // assign() (rather than resize()) also resets slots left over from a
  // previous launch, so no stale pointer can be picked up by accident.
  hKernel->kernelMemObj.assign(numArgs, nullptr);
  hKernel->kernelArgs.assign(numArgs, nullptr);

  // The argument array is declared as void **, while the caller's argument
  // descriptions are const; drop the qualifier in one explicit place.
  const auto asArgSlot = [](const void *ptr) {
    return const_cast<void *>(ptr);
  };

  for (uint32_t argIndex = 0; argIndex < numArgs; argIndex++) {
    const ur_exp_kernel_arg_properties_t &arg = pArgs[argIndex];
    switch (arg.type) {
    case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
      // Local arguments are described by a pointer to their size.
      hKernel->kernelArgs[argIndex] = asArgSlot(&arg.size);
      break;
    case UR_EXP_KERNEL_ARG_TYPE_VALUE:
      hKernel->kernelArgs[argIndex] = asArgSlot(arg.value.value);
      break;
    case UR_EXP_KERNEL_ARG_TYPE_POINTER:
      hKernel->kernelArgs[argIndex] = asArgSlot(&arg.value.pointer);
      break;
    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ:
      // prepareForSubmission() will store the device pointer in kernelMemObj.
      // NOTE(review): the pending allocation is registered under arg.index
      // while kernelArgs points at slot argIndex; the two agree only when
      // arg.index equals the argument's position in pArgs. Confirm against
      // the callers.
      hKernel->kernelArgs[argIndex] = &hKernel->kernelMemObj[argIndex];
      UR_CALL(hKernel->addPendingMemoryAllocation(
          {arg.value.memObjTuple.hMem,
           ur_mem_buffer_t::device_access_mode_t::read_write, arg.index}));
      break;
    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER:
      hKernel->kernelArgs[argIndex] = &arg.value.sampler->ZeSampler;
      break;
    default:
      return UR_RESULT_ERROR_INVALID_ENUMERATION;
    }
  }

  return appendKernelLaunchLocked(
      hKernel, hZeKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
      pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent,
      cooperativeKernelLaunchRequested, hKernel->kernelMemObj, pNext);
}
1199+
1200+
// Launches hKernel with the arguments described by pArgs.
//
// Scans the launch properties for a cooperative-launch request, then picks
// the implementation:
//  - appendKernelLaunchWithArgsExpNew() when the driver exposes
//    zeCommandListAppendLaunchKernelWithArguments() and, for a cooperative
//    launch, also supports cooperative mode through that entry point;
//  - appendKernelLaunchWithArgsExpOld() otherwise.
//
// Returns UR_RESULT_ERROR_INVALID_NULL_POINTER when properties are announced
// but launchPropList is null, UR_RESULT_ERROR_UNSUPPORTED_FEATURE for any
// property other than IGNORE and COOPERATIVE, or the result of the selected
// implementation.
ur_result_t ur_command_list_manager::appendKernelLaunchWithArgsExp(
    ur_kernel_handle_t hKernel, uint32_t workDim,
    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
    const size_t *pLocalWorkSize, uint32_t numArgs,
    const ur_exp_kernel_arg_properties_t *pArgs,
    uint32_t numPropsInLaunchPropList,
    const ur_kernel_launch_property_t *launchPropList,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_event_handle_t phEvent) {
  TRACK_SCOPE_LATENCY(
      "ur_queue_immediate_in_order_t::enqueueKernelLaunchWithArgsExp");

  // The property loop below dereferences launchPropList.
  UR_ASSERT(numPropsInLaunchPropList == 0 || launchPropList != nullptr,
            UR_RESULT_ERROR_INVALID_NULL_POINTER);

  bool cooperativeKernelLaunchRequested = false;

  for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList;
       propIndex++) {
    switch (launchPropList[propIndex].id) {
    case UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE:
      break;
    case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE:
      if (launchPropList[propIndex].value.cooperative) {
        cooperativeKernelLaunchRequested = true;
      }
      break;
    default:
      // We don't support any other properties.
      return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
  }

  const auto &withArgsExt =
      hContext->getPlatform()->ZeCommandListAppendLaunchKernelWithArgumentsExt;

  // The extension path is usable when the entry point exists and, if a
  // cooperative launch was requested, the driver accepts cooperative mode
  // through it.
  const bool runNewPath =
      withArgsExt.Supported &&
      (!cooperativeKernelLaunchRequested ||
       withArgsExt.DriverSupportsCooperativeKernelLaunchWithArgs);

  if (runNewPath) {
    return appendKernelLaunchWithArgsExpNew(
        hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
        numArgs, pArgs, numEventsInWaitList, phEventWaitList, phEvent,
        cooperativeKernelLaunchRequested);
  }

  // We cannot pass cooperativeKernelLaunchRequested to
  // appendKernelLaunchWithArgsExpOld() because appendKernelLaunch() must
  // check it on its own since it is called also from enqueueKernelLaunch().
  return appendKernelLaunchWithArgsExpOld(
      hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
      numArgs, pArgs, numPropsInLaunchPropList, launchPropList,
      numEventsInWaitList, phEventWaitList, phEvent);
}

unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,24 @@ struct ur_command_list_manager {
244244
ur_event_handle_t phEvent);
245245

246246
private:
247+
ur_result_t appendKernelLaunchWithArgsExpOld(
248+
ur_kernel_handle_t hKernel, uint32_t workDim,
249+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
250+
const size_t *pLocalWorkSize, uint32_t numArgs,
251+
const ur_exp_kernel_arg_properties_t *pArgs,
252+
uint32_t numPropsInLaunchPropList,
253+
const ur_kernel_launch_property_t *launchPropList,
254+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
255+
ur_event_handle_t phEvent);
256+
257+
ur_result_t appendKernelLaunchWithArgsExpNew(
258+
ur_kernel_handle_t hKernel, uint32_t workDim,
259+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
260+
const size_t *pLocalWorkSize, uint32_t numArgs,
261+
const ur_exp_kernel_arg_properties_t *pArgs, uint32_t numEventsInWaitList,
262+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent,
263+
bool cooperativeKernelLaunchRequested);
264+
247265
ur_result_t appendGenericCommandListsExp(
248266
uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists,
249267
ur_event_handle_t phEvent, uint32_t numEventsInWaitList,
@@ -258,6 +276,14 @@ struct ur_command_list_manager {
258276
ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent,
259277
ur_command_t commandType);
260278

279+
ur_result_t appendKernelLaunchLocked(
280+
ur_kernel_handle_t hKernel, ze_kernel_handle_t hZeKernel,
281+
uint32_t workDim, const size_t *pGlobalWorkOffset,
282+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
283+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
284+
ur_event_handle_t phEvent, bool cooperative, std::vector<void *> &kMemObj,
285+
void *pNext);
286+
261287
ur_result_t appendKernelLaunchUnlocked(
262288
ur_kernel_handle_t hKernel, uint32_t workDim,
263289
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,

unified-runtime/source/adapters/level_zero/v2/kernel.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,8 @@ ur_result_t ur_kernel_handle_t_::prepareForSubmission(
270270
ur_context_handle_t hContext, ur_device_handle_t hDevice,
271271
const size_t *pGlobalWorkOffset, uint32_t workDim, uint32_t groupSizeX,
272272
uint32_t groupSizeY, uint32_t groupSizeZ,
273-
ze_command_list_handle_t commandList, wait_list_view &waitListView) {
273+
ze_command_list_handle_t commandList, wait_list_view &waitListView,
274+
std::vector<void *> &kMemObj) {
274275
auto &deviceKernelOpt = deviceKernels[deviceIndex(hDevice)];
275276
if (!deviceKernelOpt.has_value())
276277
return UR_RESULT_ERROR_INVALID_KERNEL;
@@ -298,8 +299,23 @@ ur_result_t ur_kernel_handle_t_::prepareForSubmission(
298299
zePtr = reinterpret_cast<void *>(hImage->getZeImage());
299300
}
300301
}
301-
// Set the argument only on this device's kernel.
302-
UR_CALL(deviceKernel.setArgPointer(pending.argIndex, zePtr));
302+
303+
// kMemObj must be a non-empty vector in the path of
304+
// zeCommandListAppendLaunchKernelWithArguments()
305+
if (!kMemObj.empty()) {
306+
// zeCommandListAppendLaunchKernelWithArguments()
307+
// (==CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithArguments())
308+
// calls setArgumentValue(i, argSize, argValue) for all arguments on its
309+
// own so do not call it here, but save the zePtr pointer in kMemObj
310+
// for this future call.
311+
if (pending.argIndex > kMemObj.size() - 1) {
312+
return UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX;
313+
}
314+
kMemObj[pending.argIndex] = zePtr;
315+
} else {
316+
// Set the argument only on this device's kernel.
317+
UR_CALL(deviceKernel.setArgPointer(pending.argIndex, zePtr));
318+
}
303319
}
304320
pending_allocations.clear();
305321

0 commit comments

Comments
 (0)