@@ -1072,6 +1072,8 @@ Syntax:
1072
1072
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1073
1073
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1074
1074
1075
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1076
+
1075
1077
Overview:
1076
1078
"""""""""
1077
1079
@@ -1082,7 +1084,13 @@ global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
1082
1084
in ``tile `` mode. In tile mode, the multi-dimensional layout of the
1083
1085
source tensor is preserved at the destination. The dimension of the
1084
1086
tensor data ranges from 1d to 5d with the coordinates specified
1085
- by the ``i32 %d0 ... i32 %d4 `` arguments.
1087
+ by the ``i32 %d0 ... i32 %d4 `` arguments. In ``tile.gather4 `` mode,
1088
+ four rows in a 2D tensor are combined to form a single 2D destination
1089
+ tensor. The first coordinate ``i32 %x0 `` denotes the column index
1090
+ followed by four coordinates indicating the four row-indices.
1091
+ So, this mode takes a total of 5 coordinates as input arguments.
1092
+ For more information on ``gather4 `` mode, refer PTX ISA
1093
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
1086
1094
1087
1095
* The last three arguments to these intrinsics are flags
1088
1096
indicating support for multicast, cache_hint and cta_group::1/2
@@ -1116,10 +1124,18 @@ Syntax:
1116
1124
1117
1125
.. code-block :: llvm
1118
1126
1119
- declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3 ) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1127
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7 ) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1120
1128
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
1121
1129
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
1122
1130
1131
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1132
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1133
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1134
+
1135
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
1136
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1137
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1138
+
1123
1139
Overview:
1124
1140
"""""""""
1125
1141
@@ -1131,10 +1147,105 @@ in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
1131
1147
are unrolled into a single dimensional column at the destination. In this
1132
1148
mode, the tensor has to be at least three-dimensional. Along with the tensor
1133
1149
coordinates, im2col offsets are also specified (denoted by
1134
- ``i16 im2col0...i16 %im2col2 ``). The number of im2col offsets is two less
1135
- than the number of dimensions of the tensor operation. The last three arguments
1136
- to these intrinsics are flags, with the same functionality as described
1137
- in the ``tile `` mode intrinsics above.
1150
+ ``i16 %im2col0...i16 %im2col2 ``). For the ``im2col `` mode, the number of offsets
1151
+ is two less than the number of dimensions of the tensor operation. For the
1152
+ ``im2col.w `` and ``im2col.w.128 `` mode, the number of offsets is always 2,
1153
+ denoted by ``i16 %wHalo `` and ``i16 %wOffset `` arguments. For more information
1154
+ on ``im2col.w `` and ``im2col.w.128 `` modes, refer PTX ISA
1155
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes >`_.
1156
+
1157
+ The last three arguments to these intrinsics are flags, with the same functionality
1158
+ as described in the ``tile `` mode intrinsics above.
1159
+
1160
+ For more information, refer PTX ISA
1161
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
1162
+
1163
+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d ``'
1164
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1165
+
1166
+ Syntax:
1167
+ """""""
1168
+
1169
+ .. code-block :: llvm
1170
+
1171
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
1172
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(..., i32 %d0, i32 %d1, ...)
1173
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
1174
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1175
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1176
+
1177
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
1178
+
1179
+ Overview:
1180
+ """""""""
1181
+
1182
+ The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d ``' intrinsics
1183
+ correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.* ``
1184
+ set of PTX instructions. These instructions initiate an asynchronous
1185
+ copy of tensor data from global memory to shared::cta memory in
1186
+ ``tile `` mode. In tile mode, the multi-dimensional layout of the
1187
+ source tensor is preserved at the destination. The dimension of the
1188
+ tensor data ranges from 1d to 5d with the coordinates specified
1189
+ by the ``i32 %d0 ... i32 %d4 `` arguments. In ``tile.gather4 `` mode,
1190
+ four rows in a 2D tensor are combined to form a single 2D destination
1191
+ tensor. The first coordinate ``i32 %x0 `` denotes the column index
1192
+ followed by four coordinates indicating the four row-indices.
1193
+ So, this mode takes a total of 5 coordinates as input arguments.
1194
+ For more information on ``gather4 `` mode, refer PTX ISA
1195
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
1196
+
1197
+ * The last argument to these intrinsics is a boolean flag
1198
+ indicating support for cache_hint. This flag argument must
1199
+ be a compile-time constant. When set, it indicates a valid
1200
+ cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
1201
+ variant of the PTX instruction.
1202
+
1203
+ For more information, refer PTX ISA
1204
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
1205
+
1206
+ '``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d ``'
1207
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1208
+
1209
+ Syntax:
1210
+ """""""
1211
+
1212
+ .. code-block :: llvm
1213
+
1214
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
1215
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
1216
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
1217
+
1218
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1219
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1220
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1221
+
1222
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1223
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1224
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1225
+
1226
+ Overview:
1227
+ """""""""
1228
+
1229
+ The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d ``' intrinsics
1230
+ correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.* ``
1231
+ set of PTX instructions. These instructions initiate an asynchronous copy
1232
+ of tensor data from global memory to shared::cta memory in ``im2col `` mode.
1233
+ In im2col mode, some dimensions of the source tensor are unrolled into a
1234
+ single dimensional column at the destination. In this mode, the tensor has
1235
+ to be at least three-dimensional. Along with the tensor coordinates, im2col
1236
+ offsets are also specified (denoted by ``i16 %im2col0...i16 %im2col2 ``).
1237
+ For the ``im2col `` mode, the number of offsets is two less than the number
1238
+ of dimensions of the tensor operation. For the ``im2col.w `` and ``im2col.w.128 ``
1239
+ mode, the number of offsets is always 2, denoted by ``i16 %wHalo `` and
1240
+ ``i16 %wOffset `` arguments. For more information on ``im2col.w `` and
1241
+ ``im2col.w.128 `` modes, refer PTX ISA
1242
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes >`_.
1243
+
1244
+ * The last argument to these intrinsics is a boolean flag
1245
+ indicating support for cache_hint. This flag argument must
1246
+ be a compile-time constant. When set, it indicates a valid
1247
+ cache_hint (``i64 %ch ``) and generates the ``.L2::cache_hint ``
1248
+ variant of the PTX instruction.
1138
1249
1139
1250
For more information, refer PTX ISA
1140
1251
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor >`_.
@@ -1153,6 +1264,8 @@ Syntax:
1153
1264
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1154
1265
declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1155
1266
1267
+ declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
1268
+
1156
1269
Overview:
1157
1270
"""""""""
1158
1271
@@ -1162,6 +1275,12 @@ These instructions initiate an asynchronous copy of tensor data from
1162
1275
shared::cta to global memory (indicated by the ``s2g `` prefix)
1163
1276
in ``tile `` mode. The dimension of the tensor data ranges from 1d to 5d
1164
1277
with the coordinates specified by the ``i32 %d0 ... i32 %d4 `` arguments.
1278
+ In ``tile.scatter4 `` mode, a single 2D source tensor is divided into
1279
+ four rows in the 2D destination tensor. The first coordinate ``i32 %x0 ``
1280
+ denotes the column index followed by four coordinates indicating the
1281
+ four row-indices. So, this mode takes a total of 5 coordinates as input arguments.
1282
+ For more information on ``scatter4 `` mode, refer PTX ISA
1283
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
1165
1284
1166
1285
* The last argument to these intrinsics is a boolean flag
1167
1286
indicating support for cache_hint. This flag argument must
@@ -1214,6 +1333,8 @@ Syntax:
1214
1333
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1215
1334
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1216
1335
1336
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
1337
+
1217
1338
Overview:
1218
1339
"""""""""
1219
1340
@@ -1225,6 +1346,13 @@ multi-dimensional layout of the source tensor is preserved at the destination.
1225
1346
The dimension of the tensor data ranges from 1d to 5d with the coordinates
1226
1347
specified by the ``i32 %d0 ... i32 %d4 `` arguments.
1227
1348
1349
+ In ``tile.gather4 `` mode, four rows in the 2-dimensional source tensor are
1350
+ fetched to the L2 cache. The first coordinate ``i32 %x0 `` denotes the column index
1351
+ followed by four coordinates indicating the four row-indices. So, this mode takes
1352
+ a total of 5 coordinates as input arguments.
1353
+ For more information on ``gather4 `` mode, refer PTX ISA
1354
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes >`_.
1355
+
1228
1356
* The last argument to these intrinsics is a boolean flag
1229
1357
indicating support for cache_hint. This flag argument must
1230
1358
be a compile-time constant. When set, it indicates a valid
@@ -1246,6 +1374,14 @@ Syntax:
1246
1374
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
1247
1375
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
1248
1376
1377
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1378
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1379
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1380
+
1381
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
1382
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
1383
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
1384
+
1249
1385
Overview:
1250
1386
"""""""""
1251
1387
@@ -1256,9 +1392,16 @@ of tensor data from global memory to the L2 cache. In im2col mode, some
1256
1392
dimensions of the source tensor are unrolled into a single dimensional
1257
1393
column at the destination. In this mode, the tensor has to be at least
1258
1394
three-dimensional. Along with the tensor coordinates, im2col offsets are
1259
- also specified (denoted by ``i16 im2col0...i16 %im2col2 ``). The number
1260
- of im2col offsets is two less than the number of dimensions of the tensor
1261
- operation. The last argument to these intrinsics is a boolean flag, with
1395
+ also specified (denoted by ``i16 %im2col0...i16 %im2col2 ``). For ``im2col ``
1396
+ mode, the number of offsets is two less than the number of dimensions of
1397
+ the tensor operation. For the ``im2col.w `` and ``im2col.w.128 `` modes,
1398
+ the number of offsets is always 2, denoted by ``i16 %wHalo `` and
1399
+ ``i16 %wOffset `` arguments. For more information on ``im2col.w `` and
1400
+ ``im2col.w.128 `` modes, refer PTX ISA
1401
+ `<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes >`_.
1402
+
1403
+
1404
+ The last argument to these intrinsics is a boolean flag, with
1262
1405
the same functionality as described in the ``tile `` mode intrinsics above.
1263
1406
1264
1407
For more information, refer PTX ISA
0 commit comments