FFTLIB User Guide
FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_ci.cpp
/******************************************************************************/
/* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the
 * distribution.
 *
 * Neither the name of Texas Instruments Incorporated nor the names of
 * its contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/


/*******************************************************************************
 *
 * INCLUDES
 *
 ******************************************************************************/

#include "../FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_priv.h"
#include "../../../common/c71/FFTLIB_inlines.h"


/*******************************************************************************
 *
 * DEFINES
 *
 ******************************************************************************/

#define SE_PARAM_BASE (0x0000)
#define SE_SE0_PARAM_OFFSET (SE_PARAM_BASE)
#define SE_SE1_PARAM_OFFSET (SE_SE0_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_SA0_PARAM_OFFSET (SE_SE1_PARAM_OFFSET + SE_PARAM_SIZE)

#define FFTLIB_PARAM_BASE (SE_SA0_PARAM_OFFSET + SE_PARAM_SIZE)

#define FFTLIB_CONFIGREG_OFFSET (FFTLIB_PARAM_BASE)
#define FFTLIB_OFFSETREG_OFFSET (FFTLIB_CONFIGREG_OFFSET + sizeof(FFTLIB_MMA_CONFIG_REG))
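
// A sketch of the parameter-block layout implied by the offsets above. The
// field names match the local variables written by the init function below;
// the table is descriptive, not normative:
//
//   SE_SE0_PARAM_OFFSET     : __SE_TEMPLATE_v1       se0Params  (A-matrix stream)
//   SE_SE1_PARAM_OFFSET     : __SE_TEMPLATE_v1       se1Params  (B-matrix stream)
//   SE_SA0_PARAM_OFFSET     : __SA_TEMPLATE_v1       sa0Params  (C-matrix store)
//   FFTLIB_CONFIGREG_OFFSET : FFTLIB_MMA_CONFIG_REG  mmaConfig
//   FFTLIB_OFFSETREG_OFFSET : __HWA_OFFSET_REG       mmaOffset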


/*******************************************************************************
 *
 * INITIALIZATION FUNCTIONS
 *
 ******************************************************************************/

void FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_perfEst(
    FFTLIB_kernelHandle handle,
    const FFTLIB_bufParams2D_t *src0_addr,
    const FFTLIB_bufParams2D_t *src1_addr,
    const FFTLIB_bufParams2D_t *dst_addr,
    uint64_t *archCycles,
    uint64_t *estCycles,
    int32_t *caseNumber)
{
    // determine the MMA width based upon the output data type
    int32_t MMA_SIZE;
    if (dst_addr->data_type == FFTLIB_INT8 || dst_addr->data_type == FFTLIB_UINT8) {
        MMA_SIZE = FFTLIB_MMA_SIZE_8_BIT;
    } else if (dst_addr->data_type == FFTLIB_INT16 || dst_addr->data_type == FFTLIB_UINT16) {
        MMA_SIZE = FFTLIB_MMA_SIZE_16_BIT;
    } else {
        MMA_SIZE = FFTLIB_MMA_SIZE_32_BIT;
    }

    int32_t M    = src0_addr->dim_y / MMA_SIZE;
    int32_t remM = src0_addr->dim_y % MMA_SIZE;
    int32_t K    = src0_addr->dim_x / MMA_SIZE;
    int32_t remK = src0_addr->dim_x % MMA_SIZE;
    int32_t N    = src1_addr->dim_x / MMA_SIZE;
    int32_t remN = src1_addr->dim_x % MMA_SIZE;

    // round partial blocks up to whole MMA-sized blocks
    if (remM > 0) {
        M++;
    }

    if (remK > 0) {
        K++;
    }

    if (remN > 0) {
        N++;
    }

    int32_t MN = M*N;
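
    // Worked example (illustrative numbers only, assuming the 16-bit block size
    // FFTLIB_MMA_SIZE_16_BIT is 32): a 40x70 A matrix multiplied by a 70x33
    // B matrix gives M = ceil(40/32) = 2, K = ceil(70/32) = 3, and
    // N = ceil(33/32) = 2, so the kernel covers MN = 4 output blocks.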
//  *idealCycles = (src0_addr->dim_y * src0_addr->dim_x * src1_addr->dim_x)/(MMA_SIZE*MMA_SIZE);
//  if(*idealCycles == 0){
//      *idealCycles = 1;
//  }

    uint64_t startupCycles, teardownCycles, caseCycles, overheadCycles;
    uint64_t storeLatency = 24;

    startupCycles =
        5 +   // FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec overhead
        5 +   // kernel function call
        5 +   // load SE1 params
        17;   // SEOPEN latency on SE1 (the first SE needed)

    teardownCycles =
        2 +   // SE/SA/MMA closes
        3;    // stack operations

    // 1 output block
    if (MN == 1)
    {
        // case 1: M*N = 1, K = 1
        if (K == 1)
        {
            *caseNumber = 1;

            caseCycles =
                MMA_SIZE +      // load Bload
                MMA_SIZE +      // load A, compute
                MMA_SIZE +      // store Cback
                storeLatency;   // store latency
            overheadCycles =
                1 +   // load Bload
                0 +   // load A, compute
                0 +   // store Cback
                1;    // store latency (PROT)
        }
        // case 2: M*N = 1, K > 1
        else
        {
            *caseNumber = 2;
            caseCycles =
                MMA_SIZE +         // load Bload
                (K-1)*MMA_SIZE +   // load A, load Bload, compute
                MMA_SIZE +         // load A, compute
                MMA_SIZE +         // store Cback
                storeLatency;      // store latency
            overheadCycles =
                2 +   // load Bload
                3 +   // load A, load Bload, compute (branch around, PROT)
                1 +   // load A, compute
                0 +   // store Cback
                1;    // store latency (PROT)
        }
    }
    // > 1 output block
    else
    {
        // case 3: M*N > 1, K = 1
        if (K == 1)
        {
            *caseNumber = 3;
            caseCycles =
                MMA_SIZE +   // load Bload
                MMA_SIZE;    // load A, load Bload, compute

            overheadCycles =
                1 + 0 +   // load Bload
                0 + 2 +   // load A, load Bload, compute
                3;        // test MN condition below

            if (MN > 2) {
                caseCycles +=
                    (M*N-2)*MMA_SIZE +   // load A, load Bload, compute, store Cback
                    MMA_SIZE +           // load A, compute, store Cback
                    MMA_SIZE +           // store Cback
                    storeLatency;        // store latency

                overheadCycles +=
                    1 +               // UNPROT before MN loop
                    (M*N-2)*0 + 1 +   // load A, load Bload, compute, store Cback (UNPROT)
                    0 +               // load A, compute, store Cback (unroll)
                    0 +               // store Cback (unroll)
                    0;                // store latency (unroll)
            } else {
                caseCycles +=
                    MMA_SIZE +      // load A, compute, store Cback
                    MMA_SIZE +      // store Cback
                    storeLatency;   // store latency

                overheadCycles +=
                    0 +   // load A, compute, store Cback (unroll)
                    0 +   // store Cback (unroll)
                    0;    // store latency
            }
        }
        // case 4: M*N > 1, K > 1
        else
        {
            *caseNumber = 4;

            caseCycles =
                MMA_SIZE +    // load Bload
                K*MMA_SIZE;   // load A, load Bload, compute

            overheadCycles =
                1 +   // branch from case 3 to case 4
                1 +   // UNPROT
                0 +   // load Bload
                0 +   // load A, load Bload, compute
                1 +   // PROT and branch prep
                2;    // more branch prep and possible branch to K==2

            if (K >= 3) {
                caseCycles +=
                    0 +                        // (MN-2) loop setup
                    (M*N-2)*MMA_SIZE +         // load A, load Bload, compute, store Cback
                    (M*N-2)*MMA_SIZE +         // load A, load Bload, compute
                    (M*N-2)*(K-2)*MMA_SIZE +   // load A, load Bload, compute
                    0 +                        // loop end
                    MMA_SIZE +                 // load A, load Bload, compute, store Cback
                    MMA_SIZE +                 // load A, load Bload, compute
                    (K-3)*MMA_SIZE +           // load A, load Bload, compute
                    MMA_SIZE;                  // load A, compute

                overheadCycles +=
                    2 +           // (MN-2) loop setup
                    (M*N-2)*1 +   // UNPROT
                    (M*N-2)*0 +   // load A, load Bload, compute, store Cback
                    0 +           // load A, load Bload, compute
                    (M*N-2)*1 +   // load A, load Bload, compute, PROT
                    (M*N-2)*1 +   // loop end (branch)
                    1 +           // load A, load Bload, compute, store Cback, UNPROT
                    0 +           // load A, load Bload, compute (unroll)
                    2 +           // load A, load Bload, compute (useless PROT/UNPROT)
                    2;            // load A, compute

            } else { // K == 2
                caseCycles +=
                    0 +                  // (MN-2) loop setup
                    (M*N-2)*MMA_SIZE +   // load A, load Bload, compute, store Cback
                    (M*N-2)*MMA_SIZE +   // load A, load Bload, compute
                    0 +                  // loop end
                    MMA_SIZE +           // load A, load Bload, compute, store Cback
                    MMA_SIZE;            // load A, compute

                overheadCycles +=
                    1 +           // UNPROT
                    0 +           // (MN-2) loop setup
                    0 +           // load A, load Bload, compute
                    (M*N-2)*1 +   // load A, load Bload, compute
                    1 +           // loop end, UNPROT
                    0 +           // load A, load Bload, compute, store Cback
                    0;            // load A, compute

            } // end of if(K >= 3){} else{};
            caseCycles +=
                MMA_SIZE +      // store Cback
                storeLatency;   // store latency
            overheadCycles +=
                0 +   // store Cback
                1;    // store latency


        }
    }

    *archCycles = startupCycles + caseCycles + teardownCycles;
    *estCycles  = startupCycles + caseCycles + overheadCycles + teardownCycles;

    return;
}
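
#if 0
// Usage sketch (illustrative only, not part of the library): querying the
// cycle model for a hypothetical 64x64 by 64x64 8-bit multiply. Only the
// fields perfEst() reads are set here; a real caller would initialize the
// buffer descriptors completely.
static void FFTLIB_example_perfQuery(FFTLIB_kernelHandle handle)
{
    FFTLIB_bufParams2D_t srcA, srcB, dstC;
    srcA.dim_y = 64; srcA.dim_x = 64; srcA.data_type = FFTLIB_INT8;
    srcB.dim_y = 64; srcB.dim_x = 64; srcB.data_type = FFTLIB_INT8;
    dstC.dim_y = 64; dstC.dim_x = 64; dstC.data_type = FFTLIB_INT8;

    uint64_t archCycles, estCycles;
    int32_t  caseNumber;
    // assuming FFTLIB_MMA_SIZE_8_BIT == 64, M = K = N = 1, so case 1 is reported
    FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_perfEst(
        handle, &srcA, &srcB, &dstC, &archCycles, &estCycles, &caseNumber);
}
#endif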

template <uint32_t dataType>
FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci(
    FFTLIB_kernelHandle handle,
    const FFTLIB_bufParams2D_t *src0_addr,
    const FFTLIB_bufParams2D_t *src1_addr,
    const FFTLIB_bufParams2D_t *dst_addr,
    const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs)
{
    FFTLIB_STATUS status = FFTLIB_SUCCESS;

    // dataType-dependent values
    uint32_t MMA_SIZE;
    __SE_ELETYPE SE_ELETYPE;
    __SE_VECLEN SE_VECLEN;
    uint32_t MMA_TYPE_BYTEWIDTH;
    __MMA_A_CONFIG_ATYPE ATYPE;
    __MMA_C_CONFIG_BTYPE BTYPE;
    __MMA_X_CONFIG_XTYPE XTYPE;
    FFTLIB_MMA_CONFIG_REG mmaConfig;

    // The if/else structure is removed by the compiler: the conditions are all
    // known at compile time and only one branch is valid per template value.
    if (dataType == FFTLIB_INT32) {
        MMA_SIZE = FFTLIB_MMA_SIZE_32_BIT;
        SE_ELETYPE = __SE_ELETYPE_32BIT;
        SE_VECLEN = __SE_VECLEN_16ELEMS;
        MMA_TYPE_BYTEWIDTH = 4;
        ATYPE = __MMA_A_CONFIG_ATYPE_INT32;
        BTYPE = __MMA_C_CONFIG_BTYPE_INT32;
        XTYPE = __MMA_X_CONFIG_XTYPE_INT32;

        // initialize the config to one of the common configurations
        mmaConfig = configRegisterStruct_i32s_i32s_o32s;

    } else if (dataType == FFTLIB_INT16) {
        MMA_SIZE = FFTLIB_MMA_SIZE_16_BIT;
        SE_ELETYPE = __SE_ELETYPE_16BIT;
        SE_VECLEN = __SE_VECLEN_32ELEMS;
        MMA_TYPE_BYTEWIDTH = 2;
        ATYPE = __MMA_A_CONFIG_ATYPE_INT16;
        BTYPE = __MMA_C_CONFIG_BTYPE_INT16;
        XTYPE = __MMA_X_CONFIG_XTYPE_INT16;

        // initialize the config to one of the common configurations
        mmaConfig = configRegisterStruct_i16s_i16s_o16s;

    } else if (dataType == FFTLIB_INT8) {
        MMA_SIZE = FFTLIB_MMA_SIZE_8_BIT;
        SE_ELETYPE = __SE_ELETYPE_8BIT;
        SE_VECLEN = __SE_VECLEN_64ELEMS;
        MMA_TYPE_BYTEWIDTH = 1;
        ATYPE = __MMA_A_CONFIG_ATYPE_INT8;
        BTYPE = __MMA_C_CONFIG_BTYPE_INT8;
        XTYPE = __MMA_X_CONFIG_XTYPE_INT8;

        // initialize the config to one of the common configurations
        mmaConfig = configRegisterStruct_i8s_i8s_o8s;
    } else {
        status = FFTLIB_ERR_INVALID_TYPE;
    }


    /*************************************************************************/
    /* Matrix multiply cases                                                 */
    /*************************************************************************/

    int32_t M    = src0_addr->dim_y / MMA_SIZE;
    int32_t remM = src0_addr->dim_y % MMA_SIZE;
    int32_t K    = src0_addr->dim_x / MMA_SIZE;
    int32_t remK = src0_addr->dim_x % MMA_SIZE;
    int32_t N    = src1_addr->dim_x / MMA_SIZE;
    int32_t remN = src1_addr->dim_x % MMA_SIZE;

    if (remM > 0) {
        M++;
    }

    if (remK > 0) {
        K++;
    }

    if (remN > 0) {
        N++;
    }

    // store parameters in the private arguments (the private-argument struct
    // name is not visible in this listing; the cast below is a reconstruction
    // following the library's handle convention)
    FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_PrivArgs *pKerPrivArgs =
        (FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_PrivArgs *) handle;
    pKerPrivArgs->M = M;
    pKerPrivArgs->K = K;
    pKerPrivArgs->N = N;

    int32_t strideAElements = pKerPrivArgs->strideAElements;
    int32_t strideBElements = pKerPrivArgs->strideBElements;


    /*************************************************************************/
    /* Allocate SE/SA parameters and flags                                   */
    /*************************************************************************/

    // SE0 for the A matrix
    __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1(); // SE parameter vector

    se0Params.DIMFMT    = __SE_DIMFMT_5D;
    se0Params.ELETYPE   = SE_ELETYPE;
    se0Params.VECLEN    = SE_VECLEN;
    se0Params.DECDIM1   = __SE_DECDIM_DIM2;
    se0Params.DECDIM2   = __SE_DECDIM_DIM4;
    se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
    se0Params.DECDIM2SD = __SE_DECDIMSD_DIM1;

    se0Params.ICNT0 = (uint32_t) MMA_SIZE;
    se0Params.ICNT1 = (uint32_t) MMA_SIZE;
    se0Params.DIM1  = strideAElements;
    se0Params.DECDIM1_WIDTH = (uint32_t) src0_addr->dim_x;
    se0Params.ICNT2 = (uint32_t) K;
    se0Params.DIM2  = (int32_t) MMA_SIZE;
    se0Params.ICNT3 = (uint32_t) N;
    se0Params.DIM3  = (int32_t) 0;
    se0Params.DECDIM2_WIDTH = (uint32_t) src0_addr->dim_y * strideAElements;
    se0Params.ICNT4 = (uint32_t) M;
    se0Params.DIM4  = (int32_t) MMA_SIZE * strideAElements;
    //se0Params = __set_SE_FLAGS(se0Params, &se0Params, NULL);
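
    // How to read the SE0 template above (a descriptive note, not normative):
    // ICNT0/ICNT1 fetch one MMA_SIZE x MMA_SIZE block of A row by row, ICNT2/DIM2
    // walk the K blocks across a block row, ICNT3 with DIM3 = 0 replays that
    // block row once per output block column (N times), and ICNT4/DIM4 advance
    // down the M block rows. The DECDIM widths clamp the pattern at the true
    // matrix extents so remainder blocks are padded rather than read out of bounds.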


    /*************************************************************************/
    /* Prepare streaming engine to fetch B matrix input                      */
    /*************************************************************************/

    // SE1 for the B matrix
    __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1(); // SE parameter vector

    se1Params.DIMFMT    = __SE_DIMFMT_5D;
    se1Params.ELETYPE   = SE_ELETYPE;
    se1Params.VECLEN    = SE_VECLEN;
    se1Params.DECDIM1   = __SE_DECDIM_DIM3;
    se1Params.DECDIM2   = __SE_DECDIM_DIM2;
    se1Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
    se1Params.DECDIM2SD = __SE_DECDIMSD_DIM1;

    // __set_DIMx(params, icnt, dim);
    se1Params.ICNT0 = (uint32_t) MMA_SIZE;
    se1Params.ICNT1 = (uint32_t) MMA_SIZE;
    se1Params.DIM1  = (int32_t) strideBElements;
    se1Params.DECDIM2_WIDTH = (uint32_t) src0_addr->dim_x * strideBElements;
    se1Params.ICNT2 = (uint32_t) K;
    se1Params.DIM2  = (int32_t) MMA_SIZE*strideBElements;
    se1Params.DECDIM1_WIDTH = (uint32_t) src1_addr->dim_x;
    se1Params.ICNT3 = (uint32_t) N;
    se1Params.DIM3  = (int32_t) MMA_SIZE;
    se1Params.ICNT4 = (uint32_t) M;
    se1Params.DIM4  = (int32_t) 0;
    //se1Params = __set_SE_FLAGS(se1Params, &se1Params, NULL);


    /*************************************************************************/
    /* Prepare streaming address generator to store C matrix output          */
    /*************************************************************************/

    // SA flags for storing the C matrix
    __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();

    sa0Params.VECLEN    = __SA_VECLEN_64ELEMS;
    sa0Params.DIMFMT    = __SA_DIMFMT_4D;
    sa0Params.DECDIM1   = __SA_DECDIM_DIM2;
    sa0Params.DECDIM2   = __SA_DECDIM_DIM3;
    sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
    sa0Params.DECDIM2SD = __SA_DECDIMSD_DIM1;

    // __set_DIMx(params, icnt, dim);
    sa0Params.ICNT0 = (uint32_t) FFTLIB_BYTE_WIDTH;
    sa0Params.ICNT1 = (uint32_t) MMA_SIZE;
    sa0Params.DIM1  = (int32_t) dst_addr->stride_y;
    sa0Params.DECDIM1_WIDTH = (uint32_t) dst_addr->dim_x*MMA_TYPE_BYTEWIDTH;
    sa0Params.ICNT2 = (uint32_t) N;
    sa0Params.DIM2  = (int32_t) FFTLIB_BYTE_WIDTH;
    sa0Params.DECDIM2_WIDTH = (uint32_t) src0_addr->dim_y*dst_addr->stride_y;
    sa0Params.ICNT3 = (uint32_t) M;
    sa0Params.DIM3  = (int32_t) MMA_SIZE*dst_addr->stride_y;

    //sa0Params = __set_SA_FLAGS(sa0Params, &saFlags);
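
    // How to read the SA0 template above (a descriptive note, not normative):
    // unlike the element-addressed SE reads, the store pattern is byte-addressed.
    // ICNT0 covers one MMA row of FFTLIB_BYTE_WIDTH bytes, ICNT1 steps MMA_SIZE
    // rows down C at stride_y bytes per row, ICNT2 steps across the N block
    // columns, and ICNT3 steps down the M block rows. The DECDIM widths clamp
    // the stores at the true output extents so remainder blocks are written
    // only partially.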

    /*************************************************************************/
    /* Prepare the MMA for C = A*B                                           */
    /*************************************************************************/

    // initialize the offset register
    __HWA_OFFSET_REG mmaOffset = offsetRegStruct_zeros;

    // update data-dependent configuration parameters
    mmaConfig.A_ATYPE = ATYPE;

    mmaConfig.C_BTYPE   = BTYPE;
    mmaConfig.C_OP1PER  = (K-1)*MMA_SIZE;
    mmaConfig.C_CRSWPER = K*MMA_SIZE;
    mmaConfig.C_CWSWPER = K*MMA_SIZE;

    mmaConfig.X_XTYPE = XTYPE;
    mmaConfig.X_SHIFT = pKerInitArgs->shift;
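
    // A note on the period fields above (an interpretation, not normative):
    // each output block accumulates over K sub-block multiplies, i.e. K*MMA_SIZE
    // MMA rows. C_CRSWPER and C_CWSWPER therefore swap the C read and write
    // banks once per output block (every K*MMA_SIZE cycles), while C_OP1PER
    // separates the first sub-block (C = A*B) from the remaining K - 1
    // accumulating sub-blocks (C += A*B).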

    *(__SE_TEMPLATE_v1 *)((uint8_t *)pKerPrivArgs->bufPblock + SE_SE0_PARAM_OFFSET) = se0Params;
    *(__SE_TEMPLATE_v1 *)((uint8_t *)pKerPrivArgs->bufPblock + SE_SE1_PARAM_OFFSET) = se1Params;
    *(__SA_TEMPLATE_v1 *)((uint8_t *)pKerPrivArgs->bufPblock + SE_SA0_PARAM_OFFSET) = sa0Params;
    *((FFTLIB_MMA_CONFIG_REG *)((uint8_t *)pKerPrivArgs->bufPblock + FFTLIB_CONFIGREG_OFFSET)) = mmaConfig;
    *((__HWA_OFFSET_REG *)((uint8_t *)pKerPrivArgs->bufPblock + FFTLIB_OFFSETREG_OFFSET)) = mmaOffset;

    return status;
}


// explicit instantiations for the supported input data types
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci<FFTLIB_INT8>(
    FFTLIB_kernelHandle handle,
    const FFTLIB_bufParams2D_t *src0_addr,
    const FFTLIB_bufParams2D_t *src1_addr,
    const FFTLIB_bufParams2D_t *dst_addr,
    const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs);

template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci<FFTLIB_INT16>(
    FFTLIB_kernelHandle handle,
    const FFTLIB_bufParams2D_t *src0_addr,
    const FFTLIB_bufParams2D_t *src1_addr,
    const FFTLIB_bufParams2D_t *dst_addr,
    const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs);

template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci<FFTLIB_INT32>(
    FFTLIB_kernelHandle handle,
    const FFTLIB_bufParams2D_t *src0_addr,
    const FFTLIB_bufParams2D_t *src1_addr,
    const FFTLIB_bufParams2D_t *dst_addr,
    const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs);


/*******************************************************************************
 *
 * EXECUTION FUNCTIONS
 *
 ******************************************************************************/

template <int32_t MMA_SIZE>
FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci(
    FFTLIB_kernelHandle handle,
    const void *src0,
    const void *src1,
    void *dst)
{
    // mn and k count the MN output blocks and the K accumulation sub-blocks
    int32_t mn, k;

    // load private arguments into variables (the private-argument struct name
    // is not visible in this listing; the cast below is a reconstruction)
    FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_PrivArgs *pKerPrivArgs =
        (FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_PrivArgs *) handle;
    int32_t M = pKerPrivArgs->M;
    int32_t K = pKerPrivArgs->K;
    int32_t N = pKerPrivArgs->N;

    int32_t MN = M*N;


    /*************************************************************************/
    /* Allocate SE/SA parameters and flags                                   */
    /*************************************************************************/

    // SE0 for the src0 matrix
    __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1(); // SE parameter vector

    // SE1 for the src1 matrix
    __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1(); // SE parameter vector

    // SA0 for storing the dst matrix
    __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();

    /*************************************************************************/
    /* Matrix multiply                                                       */
    /*************************************************************************/

    // load streaming engine and MMA parameters
    se0Params = *(__SE_TEMPLATE_v1 *)((uint8_t *)pKerPrivArgs->bufPblock + SE_SE0_PARAM_OFFSET);
    se1Params = *(__SE_TEMPLATE_v1 *)((uint8_t *)pKerPrivArgs->bufPblock + SE_SE1_PARAM_OFFSET);
    sa0Params = *(__SA_TEMPLATE_v1 *)((uint8_t *)pKerPrivArgs->bufPblock + SE_SA0_PARAM_OFFSET);
    FFTLIB_MMA_CONFIG_REG mmaConfig = *((FFTLIB_MMA_CONFIG_REG *)((uint8_t *)pKerPrivArgs->bufPblock + FFTLIB_CONFIGREG_OFFSET));
    __HWA_OFFSET_REG mmaOffset = *((__HWA_OFFSET_REG *)((uint8_t *)pKerPrivArgs->bufPblock + FFTLIB_OFFSETREG_OFFSET));

    // open SE0 for loading the A matrix
    __SE0_OPEN(src0, se0Params);
    // open SE1 for loading the B matrix
    __SE1_OPEN(src1, se1Params);
    // open SA0 for storing the C matrix
    __SA0_OPEN(sa0Params);
    // open the MMA
    __HWAOPEN(mmaConfig, mmaOffset, __MMA_OPEN_FSM_RESET);
    __HWAADV();
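
    // Naming convention used in the comments below (descriptive only): the MMA
    // double-buffers both B and C. "Bload" is the B bank currently being filled
    // by SE1 while "Bfore" is the bank feeding the multiplies; likewise "Cfore"
    // is the C bank being accumulated into and "Cback" is the bank being stored
    // out through SA0. "swap" marks the point where the two banks exchange roles.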

    // 1 output block
    if ((M <= 1) && (N <= 1))
    {
        // case 1: M*N = 1, K = 1
        if (K == 1)
        {
            FFTLIB_DEBUGPRINTFN(1, "Case 1%s", "\n");
            // load Bload(0, 0)
            FFTLIB_UTIL_SE1Bload(MMA_SIZE);

            // swap Bload(0, 0) to Bfore(0, 0)
            // load Afore(0, 0)
            // compute Cfore(0, 0) = Afore(0, 0)*Bfore(0, 0)
            FFTLIB_UTIL_SE0AloadComputeC(MMA_SIZE);

            // swap Cfore(0, 0) to Cback(0, 0)
            // store Cback(0, 0)
            FFTLIB_UTIL_SA0Cstore(MMA_SIZE, (uint8_t *)dst);
        }
        // case 2: M*N = 1, K > 1
        else
        {
            FFTLIB_DEBUGPRINTFN(1, "Case 2%s", "\n");
            // load Bload(0, 0)
            FFTLIB_UTIL_SE1Bload(MMA_SIZE);

            // swap Bload(k, 0) to Bfore(k, 0)
            // load Afore(0, k)
            // load Bload(k + 1, 0)
            // compute Cfore(0, 0) += Afore(0, k)*Bfore(k, 0)
            for (k = 0; k < (K - 1); k++)
            {
                FFTLIB_UTIL_SE0AloadSE1BloadComputeC(MMA_SIZE);
            }

            // swap Bload(K - 1, 0) to Bfore(K - 1, 0)
            // load Afore(0, K - 1)
            // compute Cfore(0, 0) += Afore(0, K - 1)*Bfore(K - 1, 0)
            FFTLIB_UTIL_SE0AloadComputeC(MMA_SIZE);

            // swap Cfore(0, 0) to Cback(0, 0)
            // store Cback(0, 0)
            FFTLIB_UTIL_SA0Cstore(MMA_SIZE, (uint8_t *)dst);
        }
    }
    // > 1 output block
    else
    {
        // case 3: M*N > 1, K = 1
        if (K == 1)
        {
            FFTLIB_DEBUGPRINTFN(1, "Case 3%s", "\n");

            // load Bload(0, 0)
            FFTLIB_UTIL_SE1Bload(MMA_SIZE);

            // swap Bload(0, 0) to Bfore(0, 0)
            // load Afore(0, 0)
            // load Bload(0, 1)
            // compute Cfore(0, 0) = Afore(0, 0)*Bfore(0, 0)
            FFTLIB_UTIL_SE0AloadSE1BloadComputeC(MMA_SIZE);

            // Guarantee this loop runs at least 2 iterations. The compiler needs the
            // total trip count to be >= 24 due to the MMA store latency. For 8- and
            // 16-bit data a single block is sufficient; however, for 32-bit data a
            // block is only 16 iterations, and the compiler generates separate cases
            // for MN == 3 and MN > 3 to deal with this. That branch-around then breaks
            // the prolog-epilog (PE) scheduling that would otherwise overlap the
            // unrolled loop. By explicitly coding this case we allow PE scheduling.
            // Adding the extra case added zero cycles to the 8- and 16-bit
            // implementations, so a single template can still serve all three sizes.
            if (MN > 3) {
                _nassert(MN > 3);
                for (mn = 1; mn < (MN - 1); mn++)
                {
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(MMA_SIZE, (uint8_t *)dst);
                }

                // swap Bload(0, N - 1) to Bfore(0, N - 1)
                // swap Cfore(M - 1, N - 2) to Cback(M - 1, N - 2)
                // load Afore(M - 1, 0)
                // store Cback(M - 1, N - 2)
                // compute Cfore(M - 1, N - 1) = Afore(M - 1, 0)*Bfore(0, N - 1)
                // Note: this call is fully unrolled to allow for prolog-epilog (PE) merging with the loop above it.
                FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);

                // swap Cfore(M - 1, N - 1) to Cback(M - 1, N - 1)
                // store Cback(M - 1, N - 1)
                // Note: this call is fully unrolled to allow for prolog-epilog (PE) merging with the call above it.
                FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);

            } else if (MN == 3) {
                _nassert(MN == 3);
                // Work-around for JIRA CODEGEN-4452:
                // In the 32-bit case, the utility function would only iterate 16 times
                // while the latency of the store is 24 cycles. The compiler uses a
                // strategy called iteration pruning to schedule this loop, but it is
                // still not able to fully PE schedule. Instead, we fully unroll the
                // loop so it can PE schedule with the function following it (which is
                // also unrolled).
                //
                // However, for the 8- and 16-bit versions, the loop runs 64 and 32
                // iterations respectively, enough to cover the 24-cycle store latency.
                // From a program cache perspective, it is better to keep these as a
                // loop. In fact, the 8-bit case may have worse performance if fully
                // unrolled because the compiler has a limit on the number of store
                // operations in a single block of code; for 8-bit data, this code has
                // 3*64 store operations (all three unrolled functions), which ran into
                // this limit.
                if (MMA_SIZE == FFTLIB_MMA_SIZE_32_BIT) {
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
                } else {
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(MMA_SIZE, (uint8_t *)dst);
                }
                // swap Bload(0, N - 1) to Bfore(0, N - 1)
                // swap Cfore(M - 1, N - 2) to Cback(M - 1, N - 2)
                // load Afore(M - 1, 0)
                // store Cback(M - 1, N - 2)
                // compute Cfore(M - 1, N - 1) = Afore(M - 1, 0)*Bfore(0, N - 1)
                FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);

                // swap Cfore(M - 1, N - 1) to Cback(M - 1, N - 1)
                // store Cback(M - 1, N - 1)
                FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
            } else { // effectively MN == 2
                _nassert(MN == 2);
                // swap Bload(0, N - 1) to Bfore(0, N - 1)
                // swap Cfore(M - 1, N - 2) to Cback(M - 1, N - 2)
                // load Afore(M - 1, 0)
                // store Cback(M - 1, N - 2)
                // compute Cfore(M - 1, N - 1) = Afore(M - 1, 0)*Bfore(0, N - 1)
                FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);

                // swap Cfore(M - 1, N - 1) to Cback(M - 1, N - 1)
                // store Cback(M - 1, N - 1)
                FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
            }

        }
        // case 4: M*N > 1, K > 1
        else
        {
            FFTLIB_DEBUGPRINTFN(1, "Case 4%s", "\n");
            _nassert(K >= 2);
            // load Bload(0, 0)
            FFTLIB_UTIL_SE1Bload(MMA_SIZE);

            //
            // mn = 0
            //

            // swap Bload(k, 0) to Bfore(k, 0)
            // load Afore(0, k)
            // load Bload(k + 1, 0); note that Bload(K, 0) -> Bload(0, 1)
            // compute Cfore(0, 0) += Afore(0, k)*Bfore(k, 0)
            _nassert(K >= 2);
            for (k = 0; k < K; k++)
            {
                FFTLIB_UTIL_SE0AloadSE1BloadComputeC(MMA_SIZE);
            }

            if (K >= 3) {
                _nassert(K >= 3);
                //
                // mn = 1 to MN - 2
                //
                for (mn = 1; mn < (MN - 1); mn++)
                {
                    // swap Bload(curr) to Bfore(curr)
                    // swap Cfore(prev) to Cback(prev)
                    // load Afore(curr)
                    // load Bload(next)
                    // store Cback(prev)
                    // compute Cfore(curr) = Afore(curr)*Bfore(curr)
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(MMA_SIZE, (uint8_t *)dst);

                    // unroll one iteration of K to allow for loop merging with the prior loop
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);

                    // swap Bload(curr) to Bfore(curr)
                    // load Afore(curr)
                    // load Bload(next)
                    // compute Cfore(curr) += Afore(curr)*Bfore(curr)
                    for (k = 2; k < K; k++)
                    {
                        FFTLIB_UTIL_SE0AloadSE1BloadComputeC(MMA_SIZE);
                    }
                }

                //
                // mn = MN - 1
                //

                // swap Bload(0, N - 1) to Bfore(0, N - 1)
                // swap Cfore(M - 1, N - 2) to Cback(M - 1, N - 2)
                // load Afore(M - 1, 0)
                // load Bload(1, N - 1)
                // store Cback(M - 1, N - 2)
                // compute Cfore(M - 1, N - 1) = Afore(M - 1, 0)*Bfore(0, N - 1)
                FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(MMA_SIZE, (uint8_t *)dst);

                // swap Bload(k, N - 1) to Bfore(k, N - 1)
                // load Afore(M - 1, k)
                // load Bload(k + 1, N - 1)
                // compute Cfore(M - 1, N - 1) += Afore(M - 1, k)*Bfore(k, N - 1)
                // peel off an iteration for loop merging with the call above
                FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);

                for (k = 2; k < (K - 1); k++)
                {
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeC(MMA_SIZE);
                }

                // swap Bload(K - 1, N - 1) to Bfore(K - 1, N - 1)
                // load Afore(M - 1, K - 1)
                // compute Cfore(M - 1, N - 1) += Afore(M - 1, K - 1)*Bfore(K - 1, N - 1)
                FFTLIB_UTIL_SE0AloadComputeC(MMA_SIZE);
            } else { // K == 2

                //
                // mn = 1 to MN - 2
                //
                //#pragma NO_COALESCE_LOOP // resolves a performance issue introduced with the alpha-5 compiler, where the compiler makes a poor coalescing choice
                for (mn = 1; mn < (MN - 1); mn++)
                {
                    // swap Bload(curr) to Bfore(curr)
                    // swap Cfore(prev) to Cback(prev)
                    // load Afore(curr)
                    // load Bload(next)
                    // store Cback(prev)
                    // compute Cfore(curr) = Afore(curr)*Bfore(curr)
                    if (MMA_SIZE == FFTLIB_MMA_SIZE_32_BIT) {
                        // For the 32-bit case, the xfer-rcv-store latency is too long to
                        // be covered by the subsequent block. Use this if condition to
                        // unroll only the 32-bit case. The compiler should recognize that
                        // the "if" can be evaluated at compile time during templating and
                        // only include the appropriate code in the generated template.
                        FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
                    } else {
                        FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(MMA_SIZE, (uint8_t *)dst);
                    }

                    // only one iteration of K remains; fully unroll it to allow for loop merging with the prior loop
                    FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);
                }

                //
                // mn = MN - 1
                //

                // swap Bload(0, N - 1) to Bfore(0, N - 1)
                // swap Cfore(M - 1, N - 2) to Cback(M - 1, N - 2)
                // load Afore(M - 1, 0)
                // load Bload(1, N - 1)
                // store Cback(M - 1, N - 2)
                // compute Cfore(M - 1, N - 1) = Afore(M - 1, 0)*Bfore(0, N - 1)
                FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(MMA_SIZE, (uint8_t *)dst);

                // swap Bload(K - 1, N - 1) to Bfore(K - 1, N - 1)
                // load Afore(M - 1, K - 1)
                // compute Cfore(M - 1, N - 1) += Afore(M - 1, K - 1)*Bfore(K - 1, N - 1)
                FFTLIB_UTIL_SE0AloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);

            } // end of if(K >= 3){} else{};

            // swap Cfore(M - 1, N - 1) to Cback(M - 1, N - 1)
            // store Cback(M - 1, N - 1)
            FFTLIB_UTIL_SA0Cstore(MMA_SIZE, (uint8_t *)dst);
        }
    }

    __SE0_CLOSE();
    __SE1_CLOSE();
    __SA0_CLOSE();
    __HWACLOSE(0);

    return FFTLIB_SUCCESS;
}

// explicit instantiations for the different execute functions
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci<FFTLIB_MMA_SIZE_8_BIT>(
    FFTLIB_kernelHandle handle,
    const void *src0,
    const void *src1,
    void *dst);

template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci<FFTLIB_MMA_SIZE_16_BIT>(
    FFTLIB_kernelHandle handle,
    const void *src0,
    const void *src1,
    void *dst);

template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci<FFTLIB_MMA_SIZE_32_BIT>(
    FFTLIB_kernelHandle handle,
    const void *src0,
    const void *src1,
    void *dst);
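
#if 0
// End-to-end usage sketch (illustrative only): running the 8-bit kernel through
// the instantiations above. The handle is assumed to have been configured
// beforehand by FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci<FFTLIB_INT8>().
static FFTLIB_STATUS FFTLIB_example_exec(FFTLIB_kernelHandle handle,
                                         const int8_t *A,
                                         const int8_t *B,
                                         int8_t *C)
{
    // exec_ci is templated on the MMA block size, not the data type, so the
    // 8-bit path uses the FFTLIB_MMA_SIZE_8_BIT instantiation.
    return FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci<FFTLIB_MMA_SIZE_8_BIT>(
        handle, A, B, C);
}
#endif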