
Commit a9f7541

jeffbolznv and 0cc4m authored
vulkan: optimizations for direct convolution (#14933)
* vulkan: optimizations for direct convolution

  - Empirically choose a better tile size. Reducing BS_K/BS_NPQ helps fill the GPU. The new size should be amenable to using coopmat, too.
  - Fix shmem bank conflicts. 16B padding should work with coopmat.
  - Some explicit loop unrolling.
  - Skip math/stores work for parts of the tile that are OOB.
  - Apply fastdiv opt.
  - Disable shuffles for NV.

* Three tile sizes for CONV_2D, and a heuristic to choose

* reallow collectives for pre-Turing

* make SHMEM_PAD a spec constant

* fixes for intel perf - no shmem padding, placeholder shader core count

* shader variants with/without unrolling

* 0cc4m's fixes for AMD perf

Co-authored-by: 0cc4m <[email protected]>

---------

Co-authored-by: 0cc4m <[email protected]>
1 parent 9c35706 commit a9f7541

File tree: 3 files changed, +233 -106 lines changed


ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 175 additions & 69 deletions
@@ -222,6 +222,7 @@ enum vk_device_architecture {
     AMD_RDNA2,
     AMD_RDNA3,
     INTEL_XE2,
+    NVIDIA_PRE_TURING,
 };
 
 // HSK x HSV
@@ -315,10 +316,33 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
             // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
             return vk_device_architecture::INTEL_XE2;
         }
+    } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
+        const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+
+        bool cooperative_matrix = false;
+
+        // Detect "pre-turing" based on lack of coopmat support.
+        for (const auto& properties : ext_props) {
+            if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
+                cooperative_matrix = true;
+                break;
+            }
+        }
+
+        if (!cooperative_matrix) {
+            return vk_device_architecture::NVIDIA_PRE_TURING;
+        }
     }
     return vk_device_architecture::OTHER;
 }
 
+enum vk_conv_shapes {
+    CONV_SHAPE_128x128,
+    CONV_SHAPE_64x32,
+    CONV_SHAPE_32x256,
+    CONV_SHAPE_COUNT,
+};
+
 struct vk_device_struct {
     std::recursive_mutex mutex;
 
@@ -483,8 +507,8 @@ struct vk_device_struct {
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
     vk_pipeline pipeline_opt_step_adamw_f32;
-    vk_pipeline pipeline_conv2d_f32;
-    vk_pipeline pipeline_conv2d_f16_f32;
+    vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
+    vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
     vk_pipeline pipeline_conv2d_dw_whcn_f32;
     vk_pipeline pipeline_conv2d_dw_cwhn_f32;
 
@@ -908,8 +932,22 @@ struct vk_op_conv2d_push_constants {
     uint32_t nb1;
     uint32_t nb2;
     uint32_t nb3;
+
+    // init_fastdiv_values constants for dividing by KW, KW*KH, OW, OW*OH
+    uint32_t KWmp; uint32_t KWL;
+    uint32_t KWKHmp; uint32_t KWKHL;
+    uint32_t OWmp; uint32_t OWL;
+    uint32_t OWOHmp; uint32_t OWOHL;
 };
 
+template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
+    // Compute magic values to divide by KW, KW*KH, OW, OW*OH
+    init_fastdiv_values(p.KW, p.KWmp, p.KWL);
+    init_fastdiv_values(p.KW*p.KH, p.KWKHmp, p.KWKHL);
+    init_fastdiv_values(p.OW, p.OWmp, p.OWL);
+    init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
+}
+
 struct vk_op_conv2d_dw_push_constants {
     uint32_t ne;
     uint32_t batches;
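
Note: `init_fastdiv_values` itself is not part of this diff. As background, here is a minimal sketch of the magic-number scheme it presumably follows (a Granlund-Montgomery style round-up reciprocal; the helper names are hypothetical and the real ggml implementation may differ in detail). The host precomputes a multiplier/shift pair (mp, L) per divisor so the shader can replace `n / d` with a multiply-high, an add, and a shift:

    #include <cstdint>

    // Sketch: precompute (mp, L) for a fixed divisor d >= 1 so that, for all
    // 32-bit n, n / d == (umulhi(n, mp) + n) >> L (the add widened past 32 bits).
    static void init_fastdiv_values_sketch(uint32_t d, uint32_t &mp, uint32_t &L) {
        L = 0; // L = ceil(log2(d))
        while (L < 32 && (uint64_t{1} << L) < d) {
            L++;
        }
        mp = uint32_t((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
    }

    static uint32_t fastdiv_sketch(uint32_t n, uint32_t mp, uint32_t L) {
        uint64_t hi = (uint64_t{n} * mp) >> 32; // umulhi(n, mp)
        return uint32_t((hi + n) >> L);         // e.g. fastdiv_sketch(9, ...) == 3 for d == 3
    }

The four pairs above let the shader unflatten linear indices (dividing by KW, KW*KH, OW, and OW*OH) without hardware integer division, which is the "fastdiv opt" from the commit message.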
@@ -3048,48 +3086,89 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
     // conv2d
-    uint32_t conv2d_WG_SIZE = 256;
-    uint32_t conv2d_BS_K = 128;
-    uint32_t conv2d_BS_CRS = 16;
-    uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
-    if (device->subgroup_shuffle &&
-        device->vendor_id != VK_VENDOR_ID_INTEL) { // Do not enable collectives on Intel, see PR 14316
-        use_collectives = 1;
-        conv2d_BS_CRS = std::min(
-            device->subgroup_size,
-            conv2d_BS_CRS); // CRS block size should be capped at sugroup size for correctness when shuffle is used.
-    }
-    uint32_t conv2d_BS_NPQ = 128;
-    uint32_t conv2d_TS_K = 8;
-    uint32_t conv2d_shmem_req =
-        (conv2d_BS_K * (conv2d_BS_CRS + 1) + conv2d_BS_CRS * (conv2d_BS_NPQ + 1)) * sizeof(float);
-    if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
-        conv2d_BS_CRS = 8;
-        if (use_collectives) {
-            conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
-        }
-    }
-
-    if (use_collectives) {
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true, true);
-    } else {
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f32, "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
-            false);
-        ggml_vk_create_pipeline(
-            device, device->pipeline_conv2d_f16_f32, "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
-            sizeof(vk_op_conv2d_push_constants), { conv2d_BS_K, conv2d_BS_NPQ, 1 },
-            { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives }, 1, true,
-            false);
+    for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
+        uint32_t conv2d_WG_SIZE = 256;
+        uint32_t conv2d_BS_K = 128;
+        uint32_t conv2d_BS_CRS = 16;
+        uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
+        uint32_t conv2d_BS_NPQ = 128;
+        uint32_t conv2d_TS_K = 8;
+        uint32_t conv2d_SHMEM_PAD = 4;
+        bool conv2d_UNROLL = true;
+
+        if (device->vendor_id == VK_VENDOR_ID_INTEL) {
+            conv2d_SHMEM_PAD = 0;
+            conv2d_UNROLL = false;
+        } else if (device->vendor_id == VK_VENDOR_ID_AMD) {
+            conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
+        }
+
+        switch (s) {
+        default:
+        case CONV_SHAPE_128x128:
+            conv2d_BS_K = 128;
+            conv2d_BS_NPQ = 128;
+            conv2d_BS_CRS = 16;
+            if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
+                conv2d_UNROLL = false;
+            }
+            break;
+        case CONV_SHAPE_64x32:
+            conv2d_BS_K = 64;
+            conv2d_BS_NPQ = 32;
+            conv2d_BS_CRS = 32;
+            conv2d_TS_K = 4;
+            break;
+        case CONV_SHAPE_32x256:
+            conv2d_BS_K = 32;
+            conv2d_BS_NPQ = 256;
+            conv2d_BS_CRS = 16;
+            break;
+        }
+
+        // Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
+        bool allow_collectives_nv = device->vendor_id != VK_VENDOR_ID_NVIDIA ||
+                                    device->architecture == vk_device_architecture::NVIDIA_PRE_TURING;
+        bool allow_collectives_amd = device->vendor_id != VK_VENDOR_ID_AMD ||
+                                     device->architecture == vk_device_architecture::AMD_GCN;
+
+        if (device->subgroup_shuffle &&
+            device->vendor_id != VK_VENDOR_ID_INTEL && // Do not enable collectives on Intel, see PR 14316.
+            allow_collectives_nv &&
+            allow_collectives_amd) {
+            use_collectives = 1;
+            conv2d_BS_CRS = std::min(
+                device->subgroup_size,
+                conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
+        }
+
+        uint32_t conv2d_shmem_req =
+            (conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
+        if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
+            conv2d_BS_CRS = 8;
+            if (use_collectives) {
+                conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
+            }
+        }
+
+        std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
+        std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
+
+        if (conv2d_UNROLL) {
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_unroll_len, conv2d_f16_f32_unroll_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+        } else {
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_len, conv2d_f32_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_len, conv2d_f16_f32_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+        }
     }
 
     ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
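
Note on SHMEM_PAD: with the default 128x128 shape (BS_CRS = 16, SHMEM_PAD = 4), the shared-memory request above works out to (128 * (16 + 4) + 16 * (128 + 4)) * 4 = 18688 bytes per workgroup. The padding is what fixes the bank conflicts mentioned in the commit message; below is a rough illustration of the idea with made-up constants, not the actual shader's tile layout or access pattern:

    #include <cstdint>

    // Shared memory on most GPUs is split into 32 four-byte banks; accesses by a
    // group of threads serialize when they hit the same bank at different addresses.
    constexpr uint32_t BS_CRS    = 16;
    constexpr uint32_t BS_NPQ    = 128;
    constexpr uint32_t SHMEM_PAD = 4; // 4 floats = 16B, the coopmat-friendly padding

    inline uint32_t bank_of(uint32_t float_index)        { return float_index % 32; }
    inline uint32_t idx_unpadded(uint32_t r, uint32_t c) { return r * BS_NPQ + c; }
    inline uint32_t idx_padded(uint32_t r, uint32_t c)   { return r * (BS_NPQ + SHMEM_PAD) + c; }

    // Reading down a column (fixed c, r = 0..15) of a row-major BS_CRS x BS_NPQ
    // tile: with stride 128 (and 128 % 32 == 0) every row lands in bank c % 32,
    // a 16-way conflict. With stride 132 the bank is (c + 4*r) % 32, spreading
    // the 16 accesses over 8 banks (at worst 2-way); a pad of 1, as used for
    // AMD GCN above, would spread them over 16 distinct banks.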
@@ -6641,6 +6720,34 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     }
 }
 
+static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
+    const ggml_tensor *src0 = dst->src[0];
+    const ggml_tensor *src1 = dst->src[1];
+
+    // src0 - kernel: [KW, KH, Cin, Cout]
+    // src1 - input:  [W, H, Cin, N]
+    // dst  - result: [OW, OH, Cout, N]
+
+    // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
+    auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
+        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
+    };
+    // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
+    int64_t W = src1->ne[0];
+    int64_t H = src1->ne[1];
+    int64_t KW = src0->ne[0];
+    int64_t KH = src0->ne[1];
+    int64_t Cout = src0->ne[3];
+    int64_t N = src1->ne[3];
+    int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
+    int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
+    int64_t NPQ = N * OW * OH;
+
+    // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
+    std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
+    return elements;
+}
+
 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
     switch (op) {
     case GGML_OP_GET_ROWS:
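
The lambda is the same output-size formula as ggml's CPU reference. A quick worked check with illustrative values:

    #include <cassert>
    #include <cstdint>

    // Mirrors calc_conv_output_size in ggml_vk_get_conv_elements above.
    static int64_t calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    }

    int main() {
        assert(calc_conv_output_size(224, 3, 1, 1, 1) == 224); // 3x3, stride 1, pad 1: "same" size
        assert(calc_conv_output_size(224, 3, 2, 1, 1) == 112); // stride 2 halves each dimension
        // With N = 1 and a 224x224 output: NPQ = 1 * 224 * 224 = 50176,
        // so elements = { Cout, 50176, 1 } before tiling into workgroups.
        return 0;
    }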
@@ -6970,10 +7077,30 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_CONV_2D:
         if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
             ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+            auto elements = ggml_vk_get_conv_elements(dst);
+            vk_conv_shapes shape;
+
+            uint32_t tiles[CONV_SHAPE_COUNT];
+            for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
+                tiles[i] = CEIL_DIV(elements[0], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[0]) * CEIL_DIV(elements[1], ctx->device->pipeline_conv2d_f32[i]->wg_denoms[1]);
+            }
+
+            // We can't query number of shader cores on Intel, use 32 as a placeholder
+            // so small convolutions will still choose a smaller tile.
+            const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
+
+            if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
+                shape = CONV_SHAPE_128x128;
+            } else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
+                shape = CONV_SHAPE_32x256;
+            } else {
+                shape = CONV_SHAPE_64x32;
+            }
+
             if (src0->type == GGML_TYPE_F32) {
-                return ctx->device->pipeline_conv2d_f32;
+                return ctx->device->pipeline_conv2d_f32[shape];
             } else if (src0->type == GGML_TYPE_F16) {
-                return ctx->device->pipeline_conv2d_f16_f32;
+                return ctx->device->pipeline_conv2d_f16_f32[shape];
             }
         }
         return nullptr;
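
To see how the selection plays out, here is a hypothetical walk-through with invented sizes (`ceil_div` stands in for ggml's CEIL_DIV macro):

    #include <cstdint>

    static uint32_t ceil_div(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

    int main() {
        // Large conv: Cout = 256, NPQ = 50176, shader_core_count = 32 (the placeholder).
        uint32_t tiles_128x128 = ceil_div(256, 128) * ceil_div(50176, 128); // 2 * 392 = 784
        // Cout > 64 and 784 >= 2 * 32, so the big 128x128 tile still fills the GPU.

        // Small conv: Cout = 32, NPQ = 1024.
        uint32_t tiles_32x256 = ceil_div(32, 32) * ceil_div(1024, 256);     // 1 * 4 = 4
        // 4 < 2 * 32: too few 32x256 tiles to occupy the GPU, so the heuristic
        // falls back to CONV_SHAPE_64x32 (ceil_div(32,64) * ceil_div(1024,32) = 32 tiles).
        (void) tiles_128x128;
        (void) tiles_32x256;
        return 0;
    }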
@@ -7301,29 +7428,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         } break;
     case GGML_OP_CONV_2D:
         {
-            // src0 - kernel: [KW, KH, Cin, Cout]
-            // src1 - input:  [W, H, Cin, N]
-            // dst  - result: [OW, OH, Cout, N]
-
-            // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
-            auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
-                return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
-            };
-            // parallelize in {OW/BS_K, OH/BS_NPQ, 1}
-            int64_t W = src1->ne[0];
-            int64_t H = src1->ne[1];
-            int64_t KW = src0->ne[0];
-            int64_t KH = src0->ne[1];
-            int64_t Cout = src0->ne[3];
-            int64_t N = src1->ne[3];
-            int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
-            int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
-            int64_t NPQ = N * OW * OH;
-
-            // Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
-            elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
-        }
-        break;
+            elements = ggml_vk_get_conv_elements(dst);
+        } break;
     case GGML_OP_ADD:
     case GGML_OP_SUB:
     case GGML_OP_DIV:
