@@ -92,7 +92,18 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
     rowC = (v4sf_t *) &CO[1* ldc+J]; \
     rowC[0] += result[1] * alpha;
 #endif
-
+#define KERNEL(i) \
+  rowA = (vec_t *) &AO[i << 3]; \
+  rowB = *((__vector_pair *)((void *)&BO[i << 3])); \
+  rowB1 = *((__vector_pair *)((void *)&BO[(i << 3) + 4])); \
+  __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); \
+  __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); \
+  __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); \
+  __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); \
+  __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); \
+  __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); \
+  __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); \
+  __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
 #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
 
 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -203,214 +214,53 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
     __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
     for (l = 1; l + 15 < temp; l += 16)
       {
-
-        vec_t *rowA0 = (vec_t *) &AO[(l + 0) << 3];
-        __vector_pair rowB0 = *((__vector_pair *)((void *)&BO[(l + 0) << 3]));
-        __vector_pair rowB0_1 = *((__vector_pair *)((void *)&BO[((l + 0) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB0, rowA0[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB0_1, rowA0[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB0, rowA0[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB0_1, rowA0[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB0, rowA0[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB0_1, rowA0[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB0, rowA0[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB0_1, rowA0[3]);
-
-        vec_t *rowA1 = (vec_t *) &AO[(l + 1) << 3];
-        __vector_pair rowB1 = *((__vector_pair *)((void *)&BO[(l + 1) << 3]));
-        __vector_pair rowB1_1 = *((__vector_pair *)((void *)&BO[((l + 1) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB1, rowA1[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB1_1, rowA1[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB1, rowA1[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB1_1, rowA1[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB1, rowA1[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB1_1, rowA1[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB1, rowA1[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB1_1, rowA1[3]);
-
-        vec_t *rowA2 = (vec_t *) &AO[(l + 2) << 3];
-        __vector_pair rowB2 = *((__vector_pair *)((void *)&BO[(l + 2) << 3]));
-        __vector_pair rowB2_1 = *((__vector_pair *)((void *)&BO[((l + 2) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB2, rowA2[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB2_1, rowA2[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB2, rowA2[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB2_1, rowA2[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB2, rowA2[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB2_1, rowA2[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB2, rowA2[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB2_1, rowA2[3]);
-
-        vec_t *rowA3 = (vec_t *) &AO[(l + 3) << 3];
-        __vector_pair rowB3 = *((__vector_pair *)((void *)&BO[(l + 3) << 3]));
-        __vector_pair rowB3_1 = *((__vector_pair *)((void *)&BO[((l + 3) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB3, rowA3[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB3_1, rowA3[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB3, rowA3[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB3_1, rowA3[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB3, rowA3[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB3_1, rowA3[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB3, rowA3[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB3_1, rowA3[3]);
-
-        vec_t *rowA4 = (vec_t *) &AO[(l + 4) << 3];
-        __vector_pair rowB4 = *((__vector_pair *)((void *)&BO[(l + 4) << 3]));
-        __vector_pair rowB4_1 = *((__vector_pair *)((void *)&BO[((l + 4) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB4, rowA4[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB4_1, rowA4[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB4, rowA4[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB4_1, rowA4[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB4, rowA4[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB4_1, rowA4[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB4, rowA4[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB4_1, rowA4[3]);
-
-        vec_t *rowA5 = (vec_t *) &AO[(l + 5) << 3];
-        __vector_pair rowB5 = *((__vector_pair *)((void *)&BO[(l + 5) << 3]));
-        __vector_pair rowB5_1 = *((__vector_pair *)((void *)&BO[((l + 5) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB5, rowA5[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB5_1, rowA5[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB5, rowA5[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB5_1, rowA5[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB5, rowA5[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB5_1, rowA5[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB5, rowA5[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB5_1, rowA5[3]);
-
-        vec_t *rowA6 = (vec_t *) &AO[(l + 6) << 3];
-        __vector_pair rowB6 = *((__vector_pair *)((void *)&BO[(l + 6) << 3]));
-        __vector_pair rowB6_1 = *((__vector_pair *)((void *)&BO[((l + 6) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB6, rowA6[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB6_1, rowA6[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB6, rowA6[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB6_1, rowA6[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB6, rowA6[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB6_1, rowA6[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB6, rowA6[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB6_1, rowA6[3]);
-
-        vec_t *rowA7 = (vec_t *) &AO[(l + 7) << 3];
-        __vector_pair rowB7 = *((__vector_pair *)((void *)&BO[(l + 7) << 3]));
-        __vector_pair rowB7_1 = *((__vector_pair *)((void *)&BO[((l + 7) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB7, rowA7[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB7_1, rowA7[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB7, rowA7[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB7_1, rowA7[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB7, rowA7[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB7_1, rowA7[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB7, rowA7[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB7_1, rowA7[3]);
-
-        vec_t *rowA8 = (vec_t *) &AO[(l + 8) << 3];
-        __vector_pair rowB8 = *((__vector_pair *)((void *)&BO[(l + 8) << 3]));
-        __vector_pair rowB8_1 = *((__vector_pair *)((void *)&BO[((l + 8) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB8, rowA8[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB8_1, rowA8[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB8, rowA8[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB8_1, rowA8[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB8, rowA8[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB8_1, rowA8[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB8, rowA8[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB8_1, rowA8[3]);
-
-        vec_t *rowA9 = (vec_t *) &AO[(l + 9) << 3];
-        __vector_pair rowB9 = *((__vector_pair *)((void *)&BO[(l + 9) << 3]));
-        __vector_pair rowB9_1 = *((__vector_pair *)((void *)&BO[((l + 9) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB9, rowA9[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB9_1, rowA9[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB9, rowA9[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB9_1, rowA9[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB9, rowA9[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB9_1, rowA9[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB9, rowA9[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB9_1, rowA9[3]);
-
-        vec_t *rowA10 = (vec_t *) &AO[(l + 10) << 3];
-        __vector_pair rowB10 = *((__vector_pair *)((void *)&BO[(l + 10) << 3]));
-        __vector_pair rowB10_1 = *((__vector_pair *)((void *)&BO[((l + 10) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB10, rowA10[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB10_1, rowA10[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB10, rowA10[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB10_1, rowA10[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB10, rowA10[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB10_1, rowA10[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB10, rowA10[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB10_1, rowA10[3]);
-
-        vec_t *rowA11 = (vec_t *) &AO[(l + 11) << 3];
-        __vector_pair rowB11 = *((__vector_pair *)((void *)&BO[(l + 11) << 3]));
-        __vector_pair rowB11_1 = *((__vector_pair *)((void *)&BO[((l + 11) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB11, rowA11[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB11_1, rowA11[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB11, rowA11[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB11_1, rowA11[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB11, rowA11[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB11_1, rowA11[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB11, rowA11[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB11_1, rowA11[3]);
-
-        vec_t *rowA12 = (vec_t *) &AO[(l + 12) << 3];
-        __vector_pair rowB12 = *((__vector_pair *)((void *)&BO[(l + 12) << 3]));
-        __vector_pair rowB12_1 = *((__vector_pair *)((void *)&BO[((l + 12) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB12, rowA12[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB12_1, rowA12[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB12, rowA12[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB12_1, rowA12[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB12, rowA12[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB12_1, rowA12[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB12, rowA12[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB12_1, rowA12[3]);
-
-        vec_t *rowA13 = (vec_t *) &AO[(l + 13) << 3];
-        __vector_pair rowB13 = *((__vector_pair *)((void *)&BO[(l + 13) << 3]));
-        __vector_pair rowB13_1 = *((__vector_pair *)((void *)&BO[((l + 13) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB13, rowA13[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB13_1, rowA13[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB13, rowA13[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB13_1, rowA13[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB13, rowA13[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB13_1, rowA13[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB13, rowA13[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB13_1, rowA13[3]);
-
-        vec_t *rowA14 = (vec_t *) &AO[(l + 14) << 3];
-        __vector_pair rowB14 = *((__vector_pair *)((void *)&BO[(l + 14) << 3]));
-        __vector_pair rowB14_1 = *((__vector_pair *)((void *)&BO[((l + 14) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB14, rowA14[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB14_1, rowA14[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB14, rowA14[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB14_1, rowA14[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB14, rowA14[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB14_1, rowA14[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB14, rowA14[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB14_1, rowA14[3]);
-
-        vec_t *rowA15 = (vec_t *) &AO[(l + 15) << 3];
-        __vector_pair rowB15 = *((__vector_pair *)((void *)&BO[(l + 15) << 3]));
-        __vector_pair rowB15_1 = *((__vector_pair *)((void *)&BO[((l + 15) << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB15, rowA15[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB15_1, rowA15[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB15, rowA15[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB15_1, rowA15[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB15, rowA15[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB15_1, rowA15[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB15, rowA15[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB15_1, rowA15[3]);
-
+        KERNEL (l);
+        KERNEL (l + 1);
+        KERNEL (l + 2);
+        KERNEL (l + 3);
+        KERNEL (l + 4);
+        KERNEL (l + 5);
+        KERNEL (l + 6);
+        KERNEL (l + 7);
+        KERNEL (l + 8);
+        KERNEL (l + 9);
+        KERNEL (l + 10);
+        KERNEL (l + 11);
+        KERNEL (l + 12);
+        KERNEL (l + 13);
+        KERNEL (l + 14);
+        KERNEL (l + 15);
+      }
+    if ((temp - l) & 8)
+      {
+        KERNEL (l);
+        KERNEL (l + 1);
+        KERNEL (l + 2);
+        KERNEL (l + 3);
+        KERNEL (l + 4);
+        KERNEL (l + 5);
+        KERNEL (l + 6);
+        KERNEL (l + 7);
+        l += 8;
+      }
+    if ((temp - l) & 4)
+      {
+        KERNEL (l);
+        KERNEL (l + 1);
+        KERNEL (l + 2);
+        KERNEL (l + 3);
+        l += 4;
+      }
+    if ((temp - l) & 2)
+      {
+        KERNEL (l);
+        KERNEL (l + 1);
+        l += 2;
+      }
+    if ((temp - l) & 1)
+      {
+        KERNEL (l);
       }
-    for (; l < temp; l++)
-      {
-        rowA = (vec_t *) &AO[l << 3];
-        rowB = *((__vector_pair *)((void *)&BO[l << 3]));
-        rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
-        __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
-        __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
-        __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
-        __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
-        __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
-        __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
-        __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
-        __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
-      }
     SAVE_ACC (&acc0, 0);
     SAVE_ACC1 (&acc1, 0);
     SAVE_ACC (&acc2, 2);
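
The refactor above keeps the same 16-way unrolling but expresses each step through the KERNEL(i) macro, and it finishes the leftover iterations by testing the bits of the remaining count instead of looping one step at a time. What follows is a minimal, self-contained C sketch of that control-flow pattern only, not the GEMM kernel itself: a plain running sum stands in for the MMA accumulators, and the names STEP, partial_sum, and x are invented for the illustration.

#include <stdio.h>

/* One unrolled step; stands in for one KERNEL(i) expansion. */
#define STEP(i) sum += x[(i)];

static double partial_sum (const double *x, long n)
{
  double sum = 0.0;
  long l = 0;

  /* Main loop: 16 steps per trip, mirroring KERNEL (l) .. KERNEL (l + 15). */
  for (; l + 15 < n; l += 16)
    {
      STEP (l);      STEP (l + 1);  STEP (l + 2);  STEP (l + 3);
      STEP (l + 4);  STEP (l + 5);  STEP (l + 6);  STEP (l + 7);
      STEP (l + 8);  STEP (l + 9);  STEP (l + 10); STEP (l + 11);
      STEP (l + 12); STEP (l + 13); STEP (l + 14); STEP (l + 15);
    }
  /* Fewer than 16 steps remain, so (n - l) fits in 4 bits and each set
     bit selects one fixed-size block instead of a one-at-a-time tail loop. */
  if ((n - l) & 8)
    {
      STEP (l);     STEP (l + 1); STEP (l + 2); STEP (l + 3);
      STEP (l + 4); STEP (l + 5); STEP (l + 6); STEP (l + 7);
      l += 8;
    }
  if ((n - l) & 4)
    {
      STEP (l); STEP (l + 1); STEP (l + 2); STEP (l + 3);
      l += 4;
    }
  if ((n - l) & 2)
    {
      STEP (l); STEP (l + 1);
      l += 2;
    }
  if ((n - l) & 1)
    STEP (l);

  return sum;
}

int main (void)
{
  double x[37];
  for (int i = 0; i < 37; i++)
    x[i] = 1.0;
  /* 37 = 2*16 + 4 + 1: exercises the main loop and two remainder blocks. */
  printf ("%f\n", partial_sum (x, 37));
  return 0;
}

Because the main loop leaves fewer than 16 iterations, the remainder is finished in at most four fixed-size blocks (8, 4, 2, 1), and the compiler still sees straight-line macro expansions, just as in the patch.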