44 #include "../FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_priv.h"
45 #include "../../../common/c71/FFTLIB_inlines.h"
/*
 * Offsets of the parameter blocks used by this kernel.
 *
 * The blocks are laid out back to back starting at SE_PARAM_BASE:
 *   SE0 template, SE1 template, SA0 template (SE_PARAM_SIZE apart each),
 *   then the MMA configuration register, then the MMA offset register
 *   (placed sizeof(FFTLIB_MMA_CONFIG_REG) after the configuration register).
 *
 * NOTE(review): presumably these index into the kernel's bufPblock array
 * and SE_PARAM_SIZE is expressed in bytes - confirm against init_ci/exec_ci.
 *
 * Every replacement list is fully parenthesized so that each macro behaves
 * as a single operand wherever it is used (e.g. `2 * FFTLIB_OFFSETREG_OFFSET`
 * or `FFTLIB_OFFSETREG_OFFSET * n`).
 */
#define SE_PARAM_BASE (0x0000)
#define SE_SE0_PARAM_OFFSET (SE_PARAM_BASE)
#define SE_SE1_PARAM_OFFSET (SE_SE0_PARAM_OFFSET + SE_PARAM_SIZE)
#define SE_SA0_PARAM_OFFSET (SE_SE1_PARAM_OFFSET + SE_PARAM_SIZE)

/* First offset past the streaming-engine / streaming-address templates. */
#define FFTLIB_PARAM_BASE (SE_SA0_PARAM_OFFSET + SE_PARAM_SIZE)

#define FFTLIB_CONFIGREG_OFFSET (FFTLIB_PARAM_BASE)
/* Parenthesized: the bare `a + sizeof(b)` form binds incorrectly next to
 * higher-precedence operators at the point of use. */
#define FFTLIB_OFFSETREG_OFFSET (FFTLIB_CONFIGREG_OFFSET + sizeof(FFTLIB_MMA_CONFIG_REG))
89 int32_t M = src0_addr->
dim_y / MMA_SIZE;
90 int32_t remM = src0_addr->
dim_y % MMA_SIZE;
91 int32_t K = src0_addr->
dim_x / MMA_SIZE;
92 int32_t remK = src0_addr->
dim_x % MMA_SIZE;
93 int32_t N = src1_addr->
dim_x / MMA_SIZE;
94 int32_t remN = src1_addr->
dim_x % MMA_SIZE;
114 uint64_t startupCycles, teardownCycles, caseCycles, overheadCycles;
115 uint64_t storeLatency = 24;
227 (M*N-2)*(K-2)*MMA_SIZE +
276 *archCycles = startupCycles + caseCycles + teardownCycles;
277 *estCycles = startupCycles + caseCycles + overheadCycles + teardownCycles;
282 template <u
int32_t dataType>
293 __SE_ELETYPE SE_ELETYPE;
294 __SE_VECLEN SE_VECLEN;
295 uint32_t MMA_TYPE_BYTEWIDTH;
296 __MMA_A_CONFIG_ATYPE ATYPE;
297 __MMA_C_CONFIG_BTYPE BTYPE;
298 __MMA_X_CONFIG_XTYPE XTYPE;
304 SE_ELETYPE = __SE_ELETYPE_32BIT;
305 SE_VECLEN = __SE_VECLEN_16ELEMS;
306 MMA_TYPE_BYTEWIDTH = 4;
307 ATYPE = __MMA_A_CONFIG_ATYPE_INT32;
308 BTYPE = __MMA_C_CONFIG_BTYPE_INT32;
309 XTYPE = __MMA_X_CONFIG_XTYPE_INT32;
316 SE_ELETYPE = __SE_ELETYPE_16BIT;
317 SE_VECLEN = __SE_VECLEN_32ELEMS;
318 MMA_TYPE_BYTEWIDTH = 2;
319 ATYPE = __MMA_A_CONFIG_ATYPE_INT16;
320 BTYPE = __MMA_C_CONFIG_BTYPE_INT16;
321 XTYPE = __MMA_X_CONFIG_XTYPE_INT16;
328 SE_ELETYPE = __SE_ELETYPE_8BIT;
329 SE_VECLEN = __SE_VECLEN_64ELEMS;
330 MMA_TYPE_BYTEWIDTH = 1;
331 ATYPE = __MMA_A_CONFIG_ATYPE_INT8;
332 BTYPE = __MMA_C_CONFIG_BTYPE_INT8;
333 XTYPE = __MMA_X_CONFIG_XTYPE_INT8;
346 int32_t M = src0_addr->
dim_y / MMA_SIZE;
347 int32_t remM = src0_addr->
dim_y % MMA_SIZE;
348 int32_t K = src0_addr->
dim_x / MMA_SIZE;
349 int32_t remK = src0_addr->
dim_x % MMA_SIZE;
350 int32_t N = src1_addr->
dim_x / MMA_SIZE;
351 int32_t remN = src1_addr->
dim_x % MMA_SIZE;
381 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
383 se0Params.DIMFMT = __SE_DIMFMT_5D;
384 se0Params.ELETYPE = SE_ELETYPE;
385 se0Params.VECLEN = SE_VECLEN;
386 se0Params.DECDIM1 = __SE_DECDIM_DIM2;
387 se0Params.DECDIM2 = __SE_DECDIM_DIM4;
388 se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
389 se0Params.DECDIM2SD = __SE_DECDIMSD_DIM1;
391 se0Params.ICNT0 = (uint32_t) MMA_SIZE;
392 se0Params.ICNT1 = (uint32_t) MMA_SIZE;
393 se0Params.DIM1 = strideAElements;
394 se0Params.DECDIM1_WIDTH = (uint32_t) src0_addr->
dim_x;
395 se0Params.ICNT2 = (uint32_t) K;
396 se0Params.DIM2 = (int32_t) MMA_SIZE;
397 se0Params.ICNT3 = (uint32_t) N;
398 se0Params.DIM3 = (int32_t) 0;
399 se0Params.DECDIM2_WIDTH = (uint32_t) src0_addr->
dim_y * strideAElements;
400 se0Params.ICNT4 = (uint32_t) M;
401 se0Params.DIM4 = (int32_t) MMA_SIZE * strideAElements;
410 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
412 se1Params.DIMFMT = __SE_DIMFMT_5D;
413 se1Params.ELETYPE = SE_ELETYPE;
414 se1Params.VECLEN = SE_VECLEN;
415 se1Params.DECDIM1 = __SE_DECDIM_DIM3;
416 se1Params.DECDIM2 = __SE_DECDIM_DIM2;
417 se1Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
418 se1Params.DECDIM2SD = __SE_DECDIMSD_DIM1;
421 se1Params.ICNT0 = (uint32_t) MMA_SIZE;
422 se1Params.ICNT1 = (uint32_t) MMA_SIZE;
423 se1Params.DIM1 = (int32_t) strideBElements;
424 se1Params.DECDIM2_WIDTH = (uint32_t) src0_addr->
dim_x * strideBElements;
425 se1Params.ICNT2 = (uint32_t) K;
426 se1Params.DIM2 = (int32_t) MMA_SIZE*strideBElements;
427 se1Params.DECDIM1_WIDTH = (uint32_t) src1_addr->
dim_x;
428 se1Params.ICNT3 = (uint32_t) N;
429 se1Params.DIM3 = (int32_t) MMA_SIZE;
430 se1Params.ICNT4 = (uint32_t) M;
431 se1Params.DIM4 = (int32_t) 0;
440 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
442 sa0Params.VECLEN = __SA_VECLEN_64ELEMS;
443 sa0Params.DIMFMT = __SA_DIMFMT_4D;
444 sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
445 sa0Params.DECDIM2 = __SA_DECDIM_DIM3;
446 sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
447 sa0Params.DECDIM2SD = __SA_DECDIMSD_DIM1;
451 sa0Params.ICNT1 = (uint32_t) MMA_SIZE;
452 sa0Params.DIM1 = (int32_t) dst_addr->
stride_y;
453 sa0Params.DECDIM1_WIDTH = (uint32_t) dst_addr->
dim_x*MMA_TYPE_BYTEWIDTH;
454 sa0Params.ICNT2 = (uint32_t) N;
456 sa0Params.DECDIM2_WIDTH = (uint32_t) src0_addr->
dim_y*dst_addr->
stride_y;
457 sa0Params.ICNT3 = (uint32_t) M;
458 sa0Params.DIM3 = (int32_t) MMA_SIZE*dst_addr->
stride_y;
470 mmaConfig.A_ATYPE = ATYPE;
472 mmaConfig.C_BTYPE = BTYPE;
473 mmaConfig.C_OP1PER = (K-1)*MMA_SIZE;
474 mmaConfig.C_CRSWPER = K*MMA_SIZE;
475 mmaConfig.C_CWSWPER = K*MMA_SIZE;
477 mmaConfig.X_XTYPE = XTYPE;
478 mmaConfig.X_SHIFT = pKerInitArgs->
shift;
515 template <
int32_t MMA_SIZE>
528 int32_t M = pKerPrivArgs->
M;
529 int32_t K = pKerPrivArgs->
K;
530 int32_t N = pKerPrivArgs->
N;
540 __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
543 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
546 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
560 __SE0_OPEN(src0, se0Params);
562 __SE1_OPEN(src1, se1Params);
564 __SA0_OPEN(sa0Params);
566 __HWAOPEN(mmaConfig, mmaOffset, __MMA_OPEN_FSM_RESET);
570 if ((M <= 1) && (N <= 1))
599 for (k = 0; k < (K - 1); k++)
641 for (mn = 1; mn < (MN - 1); mn++)
652 FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
658 FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
674 FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
683 FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
688 FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
696 FFTLIB_UTIL_SE0AloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
700 FFTLIB_UTIL_SA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
721 for (k = 0; k < K; k++)
731 for (mn = 1; mn < (MN - 1); mn++)
742 FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);
749 for (k = 2; k < K; k++)
772 FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);
774 for (k = 2; k < (K - 1); k++)
789 for (mn = 1; mn < (MN - 1); mn++)
802 FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore_unroll<MMA_SIZE>(MMA_SIZE, (uint8_t *)dst);
808 FFTLIB_UTIL_SE0AloadSE1BloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);
826 FFTLIB_UTIL_SE0AloadComputeC_unroll<MMA_SIZE>(MMA_SIZE);
#define FFTLIB_CONFIGREG_OFFSET
void FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_perfEst(FFTLIB_kernelHandle handle, const FFTLIB_bufParams2D_t *src0_addr, const FFTLIB_bufParams2D_t *src1_addr, const FFTLIB_bufParams2D_t *dst_addr, uint64_t *archCycles, uint64_t *estCycles, int32_t *caseNumber)
This function estimates the cycles consumed for the kernel execution.
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci< FFTLIB_MMA_SIZE_8_BIT >(FFTLIB_kernelHandle handle, const void *src0, const void *src1, void *dst)
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci< FFTLIB_MMA_SIZE_16_BIT >(FFTLIB_kernelHandle handle, const void *src0, const void *src1, void *dst)
FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci(FFTLIB_kernelHandle handle, const FFTLIB_bufParams2D_t *src0_addr, const FFTLIB_bufParams2D_t *src1_addr, const FFTLIB_bufParams2D_t *dst_addr, const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs)
This function is the initialization function for the optimized C implementation of the kernel.
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci< FFTLIB_INT16 >(FFTLIB_kernelHandle handle, const FFTLIB_bufParams2D_t *src0_addr, const FFTLIB_bufParams2D_t *src1_addr, const FFTLIB_bufParams2D_t *dst_addr, const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs)
FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci(FFTLIB_kernelHandle handle, const void *src0, const void *src1, void *dst)
This function is the main execution function for the optimized C7x-MMA implementation of the matrix-matrix multiplication.
#define SE_SE0_PARAM_OFFSET
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci< FFTLIB_INT8 >(FFTLIB_kernelHandle handle, const FFTLIB_bufParams2D_t *src0_addr, const FFTLIB_bufParams2D_t *src1_addr, const FFTLIB_bufParams2D_t *dst_addr, const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs)
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_exec_ci< FFTLIB_MMA_SIZE_32_BIT >(FFTLIB_kernelHandle handle, const void *src0, const void *src1, void *dst)
#define SE_SE1_PARAM_OFFSET
#define FFTLIB_OFFSETREG_OFFSET
template FFTLIB_STATUS FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci< FFTLIB_INT32 >(FFTLIB_kernelHandle handle, const FFTLIB_bufParams2D_t *src0_addr, const FFTLIB_bufParams2D_t *src1_addr, const FFTLIB_bufParams2D_t *dst_addr, const FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_InitArgs *pKerInitArgs)
#define SE_SA0_PARAM_OFFSET
static void FFTLIB_UTIL_SA0Cstore(int32_t numRows, uint8_t *CbackDestination)
static void FFTLIB_UTIL_SE0AloadComputeC(int32_t numRows)
Use streaming engine 0 to load numRows of A and compute C(row) = A(row) x Bfore(:,:)
static void FFTLIB_UTIL_SE0AloadSE1BloadComputeCSA0Cstore(int32_t numRows, uint8_t *CbackDestination)
static void FFTLIB_UTIL_SE0AloadSE1BloadComputeC(int32_t numRows)
static void FFTLIB_UTIL_SE1Bload(int32_t numRows)
load numRows of B into the Bload buffer
void * FFTLIB_kernelHandle
Handle type for FFTLIB operations.
FFTLIB_STATUS_NAME
The enumeration of all status codes.
@ FFTLIB_ERR_INVALID_TYPE
#define FFTLIB_DEBUGPRINTFN(N, fmt,...)
const __HWA_OFFSET_REG offsetRegStruct_zeros
const FFTLIB_MMA_CONFIG_REG configRegisterStruct_i16s_i16s_o16s
const FFTLIB_MMA_CONFIG_REG configRegisterStruct_i32s_i32s_o32s
const FFTLIB_MMA_CONFIG_REG configRegisterStruct_i8s_i8s_o8s
#define FFTLIB_MMA_CONFIG_REG
#define FFTLIB_MMA_SIZE_16_BIT
type is 16-bit integers
#define FFTLIB_BYTE_WIDTH
MMA width in bytes.
#define FFTLIB_MMA_SIZE_8_BIT
MMA size as a function of precision.
#define FFTLIB_MMA_SIZE_32_BIT
type is 32-bit integers
This structure holds all the initialization parameters for matrix-matrix multiplication.
int8_t shift
Output shift value.
This structure holds all private arguments.
int32_t strideBElements
Row stride for the B matrix.
int32_t K
Number of blocks in the horizontal dimension of the first input matrix. In the case of natural C code...
int32_t M
Number of blocks in the vertical dimension of the first input matrix. In the case of natural C code,...
int32_t strideAElements
Row stride for the A matrix.
int32_t N
Number of blocks in the horizontal dimension of the second input matrix. In the case of natural C cod...
uint8_t bufPblock[FFTLIB_LINALG_MATRIXMATRIXMULTIPLY_IXX_IXX_OXX_PBLOCK_SIZE]
Array to store the configuration prepared by FFTLIB_LINALG_matrixMatrixMultiply_ixX_ixX_oxX_init_ci t...
A structure for a 2 dimensional buffer descriptor.
uint32_t dim_y
Height of buffer in Y dimension in elements.
uint32_t dim_x
Width of buffer in X dimension in elements.
int32_t stride_y
Stride in Y dimension in bytes.
uint32_t data_type
Values are of type FFTLIB_data_type_e.