Skip to content

Commit e2399be

Browse files
committed
add macro
1 parent 16be28a commit e2399be

File tree

1 file changed

+58
-208
lines changed

1 file changed

+58
-208
lines changed

kernel/power/dgemm_kernel_power10.c

Lines changed: 58 additions & 208 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,18 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
9292
rowC = (v4sf_t *) &CO[1* ldc+J]; \
9393
rowC[0] += result[1] * alpha;
9494
#endif
95-
95+
/* KERNEL(i): one rank-update step of the 8x8 DGEMM micro-kernel.
 * Loads the i-th 8-double column slice of the packed A panel (four
 * 128-bit vectors) and the i-th 8-double row slice of the packed B
 * panel (two __vector_pair halves), then accumulates the 8x8 outer
 * product into the eight MMA accumulators acc0..acc7 with
 * __builtin_mma_xvf64gerpp (POWER10 MMA, double precision).
 *
 * Relies on rowA, rowB, rowB1, AO, BO and acc0..acc7 being in scope
 * at the expansion site.  Every use of the parameter is parenthesized
 * so call sites like KERNEL(l+1) index (l+1)<<3 regardless of the
 * argument's internal precedence, and the body is wrapped in
 * do { ... } while (0) so KERNEL(l); behaves as a single statement
 * even under an unbraced if/else.  */
#define KERNEL(i)                                                      \
  do {                                                                 \
    rowA  = (vec_t *)&AO[(i) << 3];                                    \
    rowB  = *((__vector_pair *)((void *)&BO[(i) << 3]));               \
    rowB1 = *((__vector_pair *)((void *)&BO[((i) << 3) + 4]));         \
    __builtin_mma_xvf64gerpp (&acc0, rowB,  rowA[0]);                  \
    __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);                  \
    __builtin_mma_xvf64gerpp (&acc2, rowB,  rowA[1]);                  \
    __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);                  \
    __builtin_mma_xvf64gerpp (&acc4, rowB,  rowA[2]);                  \
    __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);                  \
    __builtin_mma_xvf64gerpp (&acc6, rowB,  rowA[3]);                  \
    __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);                  \
  } while (0)
96107
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
97108

98109
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -203,214 +214,53 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
203214
__builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
204215
for (l = 1; l + 15 < temp; l += 16)
205216
{
206-
207-
vec_t *rowA0 = (vec_t *)&AO[(l + 0) << 3];
208-
__vector_pair rowB0 = *((__vector_pair *)((void *)&BO[(l + 0) << 3]));
209-
__vector_pair rowB0_1 = *((__vector_pair *)((void *)&BO[((l + 0) << 3) + 4]));
210-
__builtin_mma_xvf64gerpp(&acc0, rowB0, rowA0[0]);
211-
__builtin_mma_xvf64gerpp(&acc1, rowB0_1, rowA0[0]);
212-
__builtin_mma_xvf64gerpp(&acc2, rowB0, rowA0[1]);
213-
__builtin_mma_xvf64gerpp(&acc3, rowB0_1, rowA0[1]);
214-
__builtin_mma_xvf64gerpp(&acc4, rowB0, rowA0[2]);
215-
__builtin_mma_xvf64gerpp(&acc5, rowB0_1, rowA0[2]);
216-
__builtin_mma_xvf64gerpp(&acc6, rowB0, rowA0[3]);
217-
__builtin_mma_xvf64gerpp(&acc7, rowB0_1, rowA0[3]);
218-
219-
vec_t *rowA1 = (vec_t *)&AO[(l + 1) << 3];
220-
__vector_pair rowB1 = *((__vector_pair *)((void *)&BO[(l + 1) << 3]));
221-
__vector_pair rowB1_1 = *((__vector_pair *)((void *)&BO[((l + 1) << 3) + 4]));
222-
__builtin_mma_xvf64gerpp(&acc0, rowB1, rowA1[0]);
223-
__builtin_mma_xvf64gerpp(&acc1, rowB1_1, rowA1[0]);
224-
__builtin_mma_xvf64gerpp(&acc2, rowB1, rowA1[1]);
225-
__builtin_mma_xvf64gerpp(&acc3, rowB1_1, rowA1[1]);
226-
__builtin_mma_xvf64gerpp(&acc4, rowB1, rowA1[2]);
227-
__builtin_mma_xvf64gerpp(&acc5, rowB1_1, rowA1[2]);
228-
__builtin_mma_xvf64gerpp(&acc6, rowB1, rowA1[3]);
229-
__builtin_mma_xvf64gerpp(&acc7, rowB1_1, rowA1[3]);
230-
231-
vec_t *rowA2 = (vec_t *)&AO[(l + 2) << 3];
232-
__vector_pair rowB2 = *((__vector_pair *)((void *)&BO[(l + 2) << 3]));
233-
__vector_pair rowB2_1 = *((__vector_pair *)((void *)&BO[((l + 2) << 3) + 4]));
234-
__builtin_mma_xvf64gerpp(&acc0, rowB2, rowA2[0]);
235-
__builtin_mma_xvf64gerpp(&acc1, rowB2_1, rowA2[0]);
236-
__builtin_mma_xvf64gerpp(&acc2, rowB2, rowA2[1]);
237-
__builtin_mma_xvf64gerpp(&acc3, rowB2_1, rowA2[1]);
238-
__builtin_mma_xvf64gerpp(&acc4, rowB2, rowA2[2]);
239-
__builtin_mma_xvf64gerpp(&acc5, rowB2_1, rowA2[2]);
240-
__builtin_mma_xvf64gerpp(&acc6, rowB2, rowA2[3]);
241-
__builtin_mma_xvf64gerpp(&acc7, rowB2_1, rowA2[3]);
242-
243-
vec_t *rowA3 = (vec_t *)&AO[(l + 3) << 3];
244-
__vector_pair rowB3 = *((__vector_pair *)((void *)&BO[(l + 3) << 3]));
245-
__vector_pair rowB3_1 = *((__vector_pair *)((void *)&BO[((l + 3) << 3) + 4]));
246-
__builtin_mma_xvf64gerpp(&acc0, rowB3, rowA3[0]);
247-
__builtin_mma_xvf64gerpp(&acc1, rowB3_1, rowA3[0]);
248-
__builtin_mma_xvf64gerpp(&acc2, rowB3, rowA3[1]);
249-
__builtin_mma_xvf64gerpp(&acc3, rowB3_1, rowA3[1]);
250-
__builtin_mma_xvf64gerpp(&acc4, rowB3, rowA3[2]);
251-
__builtin_mma_xvf64gerpp(&acc5, rowB3_1, rowA3[2]);
252-
__builtin_mma_xvf64gerpp(&acc6, rowB3, rowA3[3]);
253-
__builtin_mma_xvf64gerpp(&acc7, rowB3_1, rowA3[3]);
254-
255-
vec_t *rowA4 = (vec_t *)&AO[(l + 4) << 3];
256-
__vector_pair rowB4 = *((__vector_pair *)((void *)&BO[(l + 4) << 3]));
257-
__vector_pair rowB4_1 = *((__vector_pair *)((void *)&BO[((l + 4) << 3) + 4]));
258-
__builtin_mma_xvf64gerpp(&acc0, rowB4, rowA4[0]);
259-
__builtin_mma_xvf64gerpp(&acc1, rowB4_1, rowA4[0]);
260-
__builtin_mma_xvf64gerpp(&acc2, rowB4, rowA4[1]);
261-
__builtin_mma_xvf64gerpp(&acc3, rowB4_1, rowA4[1]);
262-
__builtin_mma_xvf64gerpp(&acc4, rowB4, rowA4[2]);
263-
__builtin_mma_xvf64gerpp(&acc5, rowB4_1, rowA4[2]);
264-
__builtin_mma_xvf64gerpp(&acc6, rowB4, rowA4[3]);
265-
__builtin_mma_xvf64gerpp(&acc7, rowB4_1, rowA4[3]);
266-
267-
vec_t *rowA5 = (vec_t *)&AO[(l + 5) << 3];
268-
__vector_pair rowB5 = *((__vector_pair *)((void *)&BO[(l + 5) << 3]));
269-
__vector_pair rowB5_1 = *((__vector_pair *)((void *)&BO[((l + 5) << 3) + 4]));
270-
__builtin_mma_xvf64gerpp(&acc0, rowB5, rowA5[0]);
271-
__builtin_mma_xvf64gerpp(&acc1, rowB5_1, rowA5[0]);
272-
__builtin_mma_xvf64gerpp(&acc2, rowB5, rowA5[1]);
273-
__builtin_mma_xvf64gerpp(&acc3, rowB5_1, rowA5[1]);
274-
__builtin_mma_xvf64gerpp(&acc4, rowB5, rowA5[2]);
275-
__builtin_mma_xvf64gerpp(&acc5, rowB5_1, rowA5[2]);
276-
__builtin_mma_xvf64gerpp(&acc6, rowB5, rowA5[3]);
277-
__builtin_mma_xvf64gerpp(&acc7, rowB5_1, rowA5[3]);
278-
279-
vec_t *rowA6 = (vec_t *)&AO[(l + 6) << 3];
280-
__vector_pair rowB6 = *((__vector_pair *)((void *)&BO[(l + 6) << 3]));
281-
__vector_pair rowB6_1 = *((__vector_pair *)((void *)&BO[((l + 6) << 3) + 4]));
282-
__builtin_mma_xvf64gerpp(&acc0, rowB6, rowA6[0]);
283-
__builtin_mma_xvf64gerpp(&acc1, rowB6_1, rowA6[0]);
284-
__builtin_mma_xvf64gerpp(&acc2, rowB6, rowA6[1]);
285-
__builtin_mma_xvf64gerpp(&acc3, rowB6_1, rowA6[1]);
286-
__builtin_mma_xvf64gerpp(&acc4, rowB6, rowA6[2]);
287-
__builtin_mma_xvf64gerpp(&acc5, rowB6_1, rowA6[2]);
288-
__builtin_mma_xvf64gerpp(&acc6, rowB6, rowA6[3]);
289-
__builtin_mma_xvf64gerpp(&acc7, rowB6_1, rowA6[3]);
290-
291-
vec_t *rowA7 = (vec_t *)&AO[(l + 7) << 3];
292-
__vector_pair rowB7 = *((__vector_pair *)((void *)&BO[(l + 7) << 3]));
293-
__vector_pair rowB7_1 = *((__vector_pair *)((void *)&BO[((l + 7) << 3) + 4]));
294-
__builtin_mma_xvf64gerpp(&acc0, rowB7, rowA7[0]);
295-
__builtin_mma_xvf64gerpp(&acc1, rowB7_1, rowA7[0]);
296-
__builtin_mma_xvf64gerpp(&acc2, rowB7, rowA7[1]);
297-
__builtin_mma_xvf64gerpp(&acc3, rowB7_1, rowA7[1]);
298-
__builtin_mma_xvf64gerpp(&acc4, rowB7, rowA7[2]);
299-
__builtin_mma_xvf64gerpp(&acc5, rowB7_1, rowA7[2]);
300-
__builtin_mma_xvf64gerpp(&acc6, rowB7, rowA7[3]);
301-
__builtin_mma_xvf64gerpp(&acc7, rowB7_1, rowA7[3]);
302-
303-
vec_t *rowA8 = (vec_t *)&AO[(l + 8) << 3];
304-
__vector_pair rowB8 = *((__vector_pair *)((void *)&BO[(l + 8) << 3]));
305-
__vector_pair rowB8_1 = *((__vector_pair *)((void *)&BO[((l + 8) << 3) + 4]));
306-
__builtin_mma_xvf64gerpp(&acc0, rowB8, rowA8[0]);
307-
__builtin_mma_xvf64gerpp(&acc1, rowB8_1, rowA8[0]);
308-
__builtin_mma_xvf64gerpp(&acc2, rowB8, rowA8[1]);
309-
__builtin_mma_xvf64gerpp(&acc3, rowB8_1, rowA8[1]);
310-
__builtin_mma_xvf64gerpp(&acc4, rowB8, rowA8[2]);
311-
__builtin_mma_xvf64gerpp(&acc5, rowB8_1, rowA8[2]);
312-
__builtin_mma_xvf64gerpp(&acc6, rowB8, rowA8[3]);
313-
__builtin_mma_xvf64gerpp(&acc7, rowB8_1, rowA8[3]);
314-
315-
vec_t *rowA9 = (vec_t *)&AO[(l + 9) << 3];
316-
__vector_pair rowB9 = *((__vector_pair *)((void *)&BO[(l + 9) << 3]));
317-
__vector_pair rowB9_1 = *((__vector_pair *)((void *)&BO[((l + 9) << 3) + 4]));
318-
__builtin_mma_xvf64gerpp(&acc0, rowB9, rowA9[0]);
319-
__builtin_mma_xvf64gerpp(&acc1, rowB9_1, rowA9[0]);
320-
__builtin_mma_xvf64gerpp(&acc2, rowB9, rowA9[1]);
321-
__builtin_mma_xvf64gerpp(&acc3, rowB9_1, rowA9[1]);
322-
__builtin_mma_xvf64gerpp(&acc4, rowB9, rowA9[2]);
323-
__builtin_mma_xvf64gerpp(&acc5, rowB9_1, rowA9[2]);
324-
__builtin_mma_xvf64gerpp(&acc6, rowB9, rowA9[3]);
325-
__builtin_mma_xvf64gerpp(&acc7, rowB9_1, rowA9[3]);
326-
327-
vec_t *rowA10 = (vec_t *)&AO[(l + 10) << 3];
328-
__vector_pair rowB10 = *((__vector_pair *)((void *)&BO[(l + 10) << 3]));
329-
__vector_pair rowB10_1 = *((__vector_pair *)((void *)&BO[((l + 10) << 3) + 4]));
330-
__builtin_mma_xvf64gerpp(&acc0, rowB10, rowA10[0]);
331-
__builtin_mma_xvf64gerpp(&acc1, rowB10_1, rowA10[0]);
332-
__builtin_mma_xvf64gerpp(&acc2, rowB10, rowA10[1]);
333-
__builtin_mma_xvf64gerpp(&acc3, rowB10_1, rowA10[1]);
334-
__builtin_mma_xvf64gerpp(&acc4, rowB10, rowA10[2]);
335-
__builtin_mma_xvf64gerpp(&acc5, rowB10_1, rowA10[2]);
336-
__builtin_mma_xvf64gerpp(&acc6, rowB10, rowA10[3]);
337-
__builtin_mma_xvf64gerpp(&acc7, rowB10_1, rowA10[3]);
338-
339-
vec_t *rowA11 = (vec_t *)&AO[(l + 11) << 3];
340-
__vector_pair rowB11 = *((__vector_pair *)((void *)&BO[(l + 11) << 3]));
341-
__vector_pair rowB11_1 = *((__vector_pair *)((void *)&BO[((l + 11) << 3) + 4]));
342-
__builtin_mma_xvf64gerpp(&acc0, rowB11, rowA11[0]);
343-
__builtin_mma_xvf64gerpp(&acc1, rowB11_1, rowA11[0]);
344-
__builtin_mma_xvf64gerpp(&acc2, rowB11, rowA11[1]);
345-
__builtin_mma_xvf64gerpp(&acc3, rowB11_1, rowA11[1]);
346-
__builtin_mma_xvf64gerpp(&acc4, rowB11, rowA11[2]);
347-
__builtin_mma_xvf64gerpp(&acc5, rowB11_1, rowA11[2]);
348-
__builtin_mma_xvf64gerpp(&acc6, rowB11, rowA11[3]);
349-
__builtin_mma_xvf64gerpp(&acc7, rowB11_1, rowA11[3]);
350-
351-
vec_t *rowA12 = (vec_t *)&AO[(l + 12) << 3];
352-
__vector_pair rowB12 = *((__vector_pair *)((void *)&BO[(l + 12) << 3]));
353-
__vector_pair rowB12_1 = *((__vector_pair *)((void *)&BO[((l + 12) << 3) + 4]));
354-
__builtin_mma_xvf64gerpp(&acc0, rowB12, rowA12[0]);
355-
__builtin_mma_xvf64gerpp(&acc1, rowB12_1, rowA12[0]);
356-
__builtin_mma_xvf64gerpp(&acc2, rowB12, rowA12[1]);
357-
__builtin_mma_xvf64gerpp(&acc3, rowB12_1, rowA12[1]);
358-
__builtin_mma_xvf64gerpp(&acc4, rowB12, rowA12[2]);
359-
__builtin_mma_xvf64gerpp(&acc5, rowB12_1, rowA12[2]);
360-
__builtin_mma_xvf64gerpp(&acc6, rowB12, rowA12[3]);
361-
__builtin_mma_xvf64gerpp(&acc7, rowB12_1, rowA12[3]);
362-
363-
vec_t *rowA13 = (vec_t *)&AO[(l + 13) << 3];
364-
__vector_pair rowB13 = *((__vector_pair *)((void *)&BO[(l + 13) << 3]));
365-
__vector_pair rowB13_1 = *((__vector_pair *)((void *)&BO[((l + 13) << 3) + 4]));
366-
__builtin_mma_xvf64gerpp(&acc0, rowB13, rowA13[0]);
367-
__builtin_mma_xvf64gerpp(&acc1, rowB13_1, rowA13[0]);
368-
__builtin_mma_xvf64gerpp(&acc2, rowB13, rowA13[1]);
369-
__builtin_mma_xvf64gerpp(&acc3, rowB13_1, rowA13[1]);
370-
__builtin_mma_xvf64gerpp(&acc4, rowB13, rowA13[2]);
371-
__builtin_mma_xvf64gerpp(&acc5, rowB13_1, rowA13[2]);
372-
__builtin_mma_xvf64gerpp(&acc6, rowB13, rowA13[3]);
373-
__builtin_mma_xvf64gerpp(&acc7, rowB13_1, rowA13[3]);
374-
375-
vec_t *rowA14 = (vec_t *)&AO[(l + 14) << 3];
376-
__vector_pair rowB14 = *((__vector_pair *)((void *)&BO[(l + 14) << 3]));
377-
__vector_pair rowB14_1 = *((__vector_pair *)((void *)&BO[((l + 14) << 3) + 4]));
378-
__builtin_mma_xvf64gerpp(&acc0, rowB14, rowA14[0]);
379-
__builtin_mma_xvf64gerpp(&acc1, rowB14_1, rowA14[0]);
380-
__builtin_mma_xvf64gerpp(&acc2, rowB14, rowA14[1]);
381-
__builtin_mma_xvf64gerpp(&acc3, rowB14_1, rowA14[1]);
382-
__builtin_mma_xvf64gerpp(&acc4, rowB14, rowA14[2]);
383-
__builtin_mma_xvf64gerpp(&acc5, rowB14_1, rowA14[2]);
384-
__builtin_mma_xvf64gerpp(&acc6, rowB14, rowA14[3]);
385-
__builtin_mma_xvf64gerpp(&acc7, rowB14_1, rowA14[3]);
386-
387-
vec_t *rowA15 = (vec_t *)&AO[(l + 15) << 3];
388-
__vector_pair rowB15 = *((__vector_pair *)((void *)&BO[(l + 15) << 3]));
389-
__vector_pair rowB15_1 = *((__vector_pair *)((void *)&BO[((l + 15) << 3) + 4]));
390-
__builtin_mma_xvf64gerpp(&acc0, rowB15, rowA15[0]);
391-
__builtin_mma_xvf64gerpp(&acc1, rowB15_1, rowA15[0]);
392-
__builtin_mma_xvf64gerpp(&acc2, rowB15, rowA15[1]);
393-
__builtin_mma_xvf64gerpp(&acc3, rowB15_1, rowA15[1]);
394-
__builtin_mma_xvf64gerpp(&acc4, rowB15, rowA15[2]);
395-
__builtin_mma_xvf64gerpp(&acc5, rowB15_1, rowA15[2]);
396-
__builtin_mma_xvf64gerpp(&acc6, rowB15, rowA15[3]);
397-
__builtin_mma_xvf64gerpp(&acc7, rowB15_1, rowA15[3]);
398-
217+
KERNEL (l);
218+
KERNEL (l+1);
219+
KERNEL (l+2);
220+
KERNEL (l+3);
221+
KERNEL (l+4);
222+
KERNEL (l+5);
223+
KERNEL (l+6);
224+
KERNEL (l+7);
225+
KERNEL (l+8);
226+
KERNEL (l+9);
227+
KERNEL (l+10);
228+
KERNEL (l+11);
229+
KERNEL (l+12);
230+
KERNEL (l+13);
231+
KERNEL (l+14);
232+
KERNEL (l+15);
233+
}
234+
if ((temp - l) & 8)
235+
{
236+
KERNEL(l);
237+
KERNEL(l+1);
238+
KERNEL(l+2);
239+
KERNEL(l+3);
240+
KERNEL(l+4);
241+
KERNEL(l+5);
242+
KERNEL(l+6);
243+
KERNEL(l+7);
244+
l += 8;
245+
}
246+
if ((temp - l) & 4)
247+
{
248+
KERNEL(l);
249+
KERNEL(l+1);
250+
KERNEL(l+2);
251+
KERNEL(l+3);
252+
l += 4;
253+
}
254+
if ((temp - l) & 2)
255+
{
256+
KERNEL(l);
257+
KERNEL(l+1);
258+
l += 2;
259+
}
260+
if ((temp - l) & 1)
261+
{
262+
KERNEL(l);
399263
}
400-
for (; l < temp; l++)
401-
{
402-
rowA = (vec_t *) & AO[l << 3];
403-
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
404-
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
405-
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
406-
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
407-
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
408-
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
409-
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
410-
__builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
411-
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
412-
__builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
413-
}
414264
SAVE_ACC (&acc0, 0);
415265
SAVE_ACC1 (&acc1, 0);
416266
SAVE_ACC (&acc2, 2);

0 commit comments

Comments
 (0)