40 #include "../common/c71/DSPLIB_inlines.h"
// Byte offsets of the SE0, SE1 and SA0 parameter templates. Each entry sits
// SE_PARAM_SIZE bytes after the previous one (SE_PARAM_SIZE is defined
// outside this listing). The exec routine adds these offsets to a uint8_t
// view of pBlock to read the three templates back.
43 #define SE_PARAM_BASE (0x0000)
44 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + SE_PARAM_SIZE)
// Not referenced by any line visible in this listing.
47 #define DSPLIB_MATMUL_FIXED_UNROLL (32)
// Helper that fills in the VECLEN / PROMOTE / TRANSPOSE fields of the two
// streaming-engine templates. The init routine calls it with &se0Params and
// &se1Params. Only the template header of the primary declaration and the
// last parameter line of each specialization header are present in this
// listing.
52 template <
typename dataTypeIn,
typename dataTypeOut>
// First specialization body: vector length derived from a full vector of
// int16_t, no PROMOTE set, 128-bit transpose on SE0 and 64-bit on SE1.
// NOTE(review): presumably the I16S_O16S specialization - confirm against
// the full source.
56 __SE_TEMPLATE_v1 *se1Params)
58 typedef typename c7x::make_full_vector<int16_t>::type vec;
59 __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
60 se0Params->VECLEN = SE_VECLEN;
61 se0Params->TRANSPOSE = __SE_TRANSPOSE_128BIT;
62 se1Params->VECLEN = SE_VECLEN;
63 se1Params->TRANSPOSE = __SE_TRANSPOSE_64BIT;
// Second specialization body: fixed 32-element vectors with 2x sign-extending
// promotion on both streams, 64-bit transpose on SE0 and 32-bit on SE1.
// NOTE(review): presumably the I8S_O8S specialization - confirm against the
// full source.
67 __SE_TEMPLATE_v1 *se1Params)
69 se0Params->VECLEN = __SE_VECLEN_32ELEMS;
70 se0Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
71 se0Params->TRANSPOSE = __SE_TRANSPOSE_64BIT;
72 se1Params->VECLEN = __SE_VECLEN_32ELEMS;
73 se1Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
74 se1Params->TRANSPOSE = __SE_TRANSPOSE_32BIT;
// Init routine of the C7x matMul_fixed kernel. Its signature, the stride
// variables (strideIn0, strideIn1, strideOut) and everything after the SA0
// setup fall on lines missing from this listing. The visible part derives the
// block counts from M, N and K and fills in the SE0, SE1 and SA0 templates.
76 template <
typename dataTypeIn,
typename dataTypeOut>
// Parameter-block buffer held in the kernel's private arguments; the lines
// that use pBlock are not part of this listing.
87 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// M, N and K as stored in the private arguments.
89 uint32_t M = pKerPrivArgs->
M;
90 uint32_t N = pKerPrivArgs->
N;
91 uint32_t K = pKerPrivArgs->
K;
// Block counts via DSPLIB_ceilingDiv: M in groups of 4, N in groups of 32,
// K in groups of 8.
92 uint32_t MBlocks = DSPLIB_ceilingDiv(M, 4);
93 uint32_t NBlocks = DSPLIB_ceilingDiv(N, 32);
94 uint32_t KBlocks = DSPLIB_ceilingDiv(K, 8);
// Saved in the private arguments; the exec routine reads them back from there.
95 pKerPrivArgs->
MBlocks = MBlocks;
96 pKerPrivArgs->
NBlocks = NBlocks;
97 pKerPrivArgs->
KBlocks = KBlocks;
102 __SE_TEMPLATE_v1 se0Params;
103 __SE_TEMPLATE_v1 se1Params;
104 __SA_TEMPLATE_v1 sa0Params;
// Stream element type follows a full vector of dataTypeIn.
105 typedef typename c7x::make_full_vector<dataTypeIn>::type vec;
106 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
// SE0 template, 4-D. ICNT0 and DIM2 are not assigned on any visible line.
108 se0Params = __gen_SE_TEMPLATE_v1();
// ICNT1 = min(4, M); consecutive steps of that dimension are strideIn0 apart.
111 se0Params.ICNT1 = (4u < M) ? 4 : M;
112 se0Params.DIM1 = strideIn0;
113 se0Params.ICNT2 = NBlocks;
// Outermost dimension: MBlocks steps of 4 * strideIn0.
115 se0Params.ICNT3 = MBlocks;
116 se0Params.DIM3 = strideIn0 * 4;
117 se0Params.DIMFMT = __SE_DIMFMT_4D;
118 se0Params.ELETYPE = SE_ELETYPE;
// SE1 template, 5-D. DIM3 and DIM4 are not assigned on any visible line.
120 se1Params = __gen_SE_TEMPLATE_v1();
121 se1Params.ICNT0 = 32;
// ICNT1 = min(8, K) with step strideIn1; then KBlocks steps of 8 * strideIn1.
122 se1Params.ICNT1 = (8u < K) ? 8 : K;
123 se1Params.DIM1 = strideIn1;
124 se1Params.ICNT2 = KBlocks;
125 se1Params.DIM2 = strideIn1 * 8;
126 se1Params.ICNT3 = NBlocks;
128 se1Params.ICNT4 = MBlocks;
130 se1Params.DIMFMT = __SE_DIMFMT_5D;
131 se1Params.ELETYPE = SE_ELETYPE;
// Decrement-dimension setting on DIM3 with a width of N.
133 se1Params.DECDIM2 = __SE_DECDIM_DIM3;
134 se1Params.DECDIM2_WIDTH = N;
// The type-specific helper sets VECLEN and TRANSPOSE (and PROMOTE in one of
// its specializations) on both SE templates.
136 DSPLIB_matMul_fixed_PromoteTranspose_init_ci<dataTypeIn, dataTypeOut>(&se0Params, &se1Params);
// SA0 template, 4-D, 32-element vectors. ICNT1 and DIM2 are not assigned on
// any visible line.
138 sa0Params = __gen_SA_TEMPLATE_v1();
139 sa0Params.ICNT0 = 32;
141 sa0Params.DIM1 = strideOut;
142 sa0Params.ICNT2 = NBlocks;
144 sa0Params.ICNT3 = MBlocks;
145 sa0Params.DIM3 = 4 * strideOut;
146 sa0Params.DIMFMT = __SA_DIMFMT_4D;
147 sa0Params.VECLEN = __SA_VECLEN_32ELEMS;
// First decrement dimension: DIM3, secondary DIM1, width M * strideOut.
149 sa0Params.DECDIM1 = __SA_DECDIM_DIM3;
150 sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM1;
151 sa0Params.DECDIM1_WIDTH = M * strideOut;
// Second decrement dimension: DIM2, secondary DIM0, width N.
153 sa0Params.DECDIM2 = __SA_DECDIM_DIM2;
154 sa0Params.DECDIM2SD = __SA_DECDIMSD_DIM0;
155 sa0Params.DECDIM2_WIDTH = N;
// Returns a 64-entry byte-index table as a c7x::uchar_vec. The exec routine
// obtains it with setMask<dataTypeIn>() and hands it to vecPermutePack, which
// uses it as the first argument of __permute_low_low / __permute_high_high.
// The specialization headers and return statements are missing from this
// listing.
181 template <
typename V>
static inline c7x::uchar_vec
setMask();
// First table: indices are rearranged in runs of four consecutive bytes.
// NOTE(review): presumably the int16_t specialization - confirm.
184 uint8_t mask[64] = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 32, 33, 34, 35, 40, 41,
185 42, 43, 36, 37, 38, 39, 44, 45, 46, 47, 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23,
186 28, 29, 30, 31, 48, 49, 50, 51, 56, 57, 58, 59, 52, 53, 54, 55, 60, 61, 62, 63};
// Load the local table as one uchar_vec through stov_ptr.
187 c7x::uchar_vec vMaskOut = *stov_ptr(c7x::uchar_vec, (uint8_t *) &mask[0]);
// Second table: indices are rearranged in runs of two consecutive bytes.
// NOTE(review): presumably the int8_t specialization - confirm.
193 uint8_t mask[64] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 32, 33, 36, 37, 40, 41,
194 44, 45, 34, 35, 38, 39, 42, 43, 46, 47, 16, 17, 20, 21, 24, 25, 28, 29, 18, 19, 22, 23,
195 26, 27, 30, 31, 48, 49, 52, 53, 56, 57, 60, 61, 50, 51, 54, 55, 58, 59, 62, 63};
196 c7x::uchar_vec vMaskOut = *stov_ptr(c7x::uchar_vec, (uint8_t *) &mask[0]);
// Matrix-multiply step on the two streaming-engine registers. It is
// specialized on the accumulator vector type and on which SE register forms
// are read. The lines that fold the product into outSum0 / outSum1 are
// missing from this listing.
200 template <
typename V, __SE_REG, __SE_REG>
static inline void vecMulAcc(V outSum0, V outSum1);
// long_vec, SE0 read without _ADV, SE1 read with _ADV: __vmatmpyhd_vvw yields
// two result vectors.
203 inline void vecMulAcc<c7x::long_vec &, __SE_REG_0, __SE_REG_1_ADV>(c7x::long_vec &outSum0, c7x::long_vec &outSum1)
205 c7x::long_vec vOut0, vOut1;
206 __vmatmpyhd_vvw(__SE_REG_0, __SE_REG_1_ADV, vOut0, vOut1);
// long_vec, both registers read with _ADV.
212 inline void vecMulAcc<c7x::long_vec &, __SE_REG_0_ADV, __SE_REG_1_ADV>(c7x::long_vec &outSum0, c7x::long_vec &outSum1)
214 c7x::long_vec vOut0, vOut1;
215 __vmatmpyhd_vvw(__SE_REG_0_ADV, __SE_REG_1_ADV, vOut0, vOut1);
// int_vec, SE0 without _ADV: __matmpy returns a single result vector.
221 inline void vecMulAcc<c7x::int_vec &, __SE_REG_0, __SE_REG_1_ADV>(c7x::int_vec &outSum0, c7x::int_vec &outSum1)
223 c7x::int_vec vOut = __matmpy(__SE_REG_0, __SE_REG_1_ADV);
// int_vec, both registers read with _ADV.
228 inline void vecMulAcc<c7x::int_vec &, __SE_REG_0_ADV, __SE_REG_1_ADV>(c7x::int_vec &outSum0, c7x::int_vec &outSum1)
230 c7x::int_vec vOut = __matmpy(__SE_REG_0_ADV, __SE_REG_1_ADV);
// In-place right shift of the accumulator vectors by shiftVec, specialized on
// the accumulator vector type.
234 template <
typename V>
static inline void vecShiftRight(V vecSR0, V vecSR1, V shiftVec);
// long_vec: both vectors are shifted.
237 inline void vecShiftRight<c7x::long_vec &>(c7x::long_vec &vecSRL0, c7x::long_vec &vecSRL1, c7x::long_vec &shiftVec)
239 vecSRL0 = __shift_right(vecSRL0, shiftVec);
240 vecSRL1 = __shift_right(vecSRL1, shiftVec);
// int_vec: only the first vector is shifted on the lines visible here;
// vecSRI1 is not touched.
244 inline void vecShiftRight<c7x::int_vec &>(c7x::int_vec &vecSRI0, c7x::int_vec &vecSRI1, c7x::int_vec &shiftVec)
246 vecSRI0 = __shift_right(vecSRI0, shiftVec);
// Zeroes accumulator vectors in place, specialized on the accumulator vector
// type. The exec routine calls it after each set of stores.
249 template <
typename V>
static inline void resetVec(V vecRes0, V vecRes1);
// long_vec: both vectors are cleared.
251 template <>
inline void resetVec<c7x::long_vec &>(c7x::long_vec &vecResL0, c7x::long_vec &vecResL1)
253 vecResL0 = (c7x::long_vec) 0;
254 vecResL1 = (c7x::long_vec) 0;
// int_vec: only the first vector is cleared on the lines visible here;
// vecResI1 is not touched.
257 template <>
inline void resetVec<c7x::int_vec &>(c7x::int_vec &vecResI0, c7x::int_vec &vecResI1)
259 vecResI0 = (c7x::int_vec) 0;
// Packs pairs of accumulator vectors into output vectors, specialized on the
// accumulator vector type. Inputs 0/2 feed the first output and inputs 1/3
// feed the second.
262 template <
typename V>
263 static inline void packAlternate(V vecPA0, V vecPA1, V vecPA2, V vecPA3, V vecPAOut0, V vecPAOut1);
// long_vec: inputs are viewed as int_vec and packed with __vpackw_vvv; the
// higher-numbered input is passed as the first operand.
266 inline void packAlternate<c7x::long_vec &>(c7x::long_vec &vecPAL0,
267 c7x::long_vec &vecPAL1,
268 c7x::long_vec &vecPAL2,
269 c7x::long_vec &vecPAL3,
270 c7x::long_vec &vecPALOut0,
271 c7x::long_vec &vecPALOut1)
273 vecPALOut0 = c7x::as_long_vec(__vpackw_vvv(c7x::as_int_vec(vecPAL2), c7x::as_int_vec(vecPAL0)));
274 vecPALOut1 = c7x::as_long_vec(__vpackw_vvv(c7x::as_int_vec(vecPAL3), c7x::as_int_vec(vecPAL1)));
// int_vec: inputs are viewed as short_vec and packed with __vpackl2_vvv.
// Only the first output is written on the lines visible here; vecPAI1,
// vecPAI3 and vecPAIOut1 are not used.
278 inline void packAlternate<c7x::int_vec &>(c7x::int_vec &vecPAI0,
279 c7x::int_vec &vecPAI1,
280 c7x::int_vec &vecPAI2,
281 c7x::int_vec &vecPAI3,
282 c7x::int_vec &vecPAIOut0,
283 c7x::int_vec &vecPAIOut1)
285 vecPAIOut0 = c7x::as_int_vec(__vpackl2_vvv(c7x::as_short_vec(vecPAI2), c7x::as_short_vec(vecPAI0)));
// Byte-permutes pairs of packed vectors with the supplied mask and recombines
// their halves, overwriting the input vectors in place. Most of the primary
// declaration, the output-pointer parameter line of each specialization and
// the left-hand sides of the permute assignments are missing from this
// listing.
288 template <
typename V,
typename W>
298 c7x::uchar_vec vMask);
// long_vec / int16_t* specialization: works on all eight input vectors.
301 inline void vecPermutePack<c7x::long_vec &, int16_t *>(c7x::long_vec &vecPermL1,
302 c7x::long_vec &vecPermL2,
303 c7x::long_vec &vecPermL3,
304 c7x::long_vec &vecPermL4,
305 c7x::long_vec &vecPermL5,
306 c7x::long_vec &vecPermL6,
307 c7x::long_vec &vecPermL7,
308 c7x::long_vec &vecPermL8,
310 c7x::uchar_vec vMaskPerm)
312 c7x::long_vec vecPermL9, vecPermL10, vecPermL11, vecPermL12, vecPermL13, vecPermL14, vecPermL15, vecPermL16;
// Right-hand sides of eight assignments: four __permute_low_low results over
// the pairs (3,1), (4,2), (7,5), (8,6), then four __permute_high_high results
// over the same pairs.
// NOTE(review): the targets are presumably vecPermL9..vecPermL16 in order -
// confirm against the full source.
315 c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL3), c7x::as_uchar_vec(vecPermL1)));
317 c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL4), c7x::as_uchar_vec(vecPermL2)));
319 c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL7), c7x::as_uchar_vec(vecPermL5)));
321 c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL8), c7x::as_uchar_vec(vecPermL6)));
323 c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL3), c7x::as_uchar_vec(vecPermL1)));
325 c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL4), c7x::as_uchar_vec(vecPermL2)));
327 c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL7), c7x::as_uchar_vec(vecPermL5)));
329 c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL8), c7x::as_uchar_vec(vecPermL6)));
// Recombine: outputs 1-4 are built from the .lo() halves of the temporaries,
// outputs 5-8 from the .hi() halves.
331 vecPermL1 = c7x::long_vec(vecPermL9.lo(), vecPermL11.lo());
332 vecPermL2 = c7x::long_vec(vecPermL10.lo(), vecPermL12.lo());
333 vecPermL3 = c7x::long_vec(vecPermL13.lo(), vecPermL15.lo());
334 vecPermL4 = c7x::long_vec(vecPermL14.lo(), vecPermL16.lo());
335 vecPermL5 = c7x::long_vec(vecPermL9.hi(), vecPermL11.hi());
336 vecPermL6 = c7x::long_vec(vecPermL10.hi(), vecPermL12.hi());
337 vecPermL7 = c7x::long_vec(vecPermL13.hi(), vecPermL15.hi());
338 vecPermL8 = c7x::long_vec(vecPermL14.hi(), vecPermL16.hi());
// int_vec / int8_t* specialization: only the odd-numbered inputs (1, 3, 5, 7)
// are read and rewritten; the even-numbered ones are untouched on the lines
// visible here.
342 inline void vecPermutePack<c7x::int_vec &, int8_t *>(c7x::int_vec &vecPermI1,
343 c7x::int_vec &vecPermI2,
344 c7x::int_vec &vecPermI3,
345 c7x::int_vec &vecPermI4,
346 c7x::int_vec &vecPermI5,
347 c7x::int_vec &vecPermI6,
348 c7x::int_vec &vecPermI7,
349 c7x::int_vec &vecPermI8,
351 c7x::uchar_vec vMaskPerm)
353 c7x::int_vec vecPermI9, vecPermI11, vecPermI13, vecPermI15;
// Right-hand sides of four assignments over the pairs (3,1) and (7,5).
// NOTE(review): the targets are presumably vecPermI9, I11, I13, I15 in
// order - confirm against the full source.
356 c7x::as_int_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermI3), c7x::as_uchar_vec(vecPermI1)));
358 c7x::as_int_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermI7), c7x::as_uchar_vec(vecPermI5)));
360 c7x::as_int_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermI3), c7x::as_uchar_vec(vecPermI1)));
362 c7x::as_int_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermI7), c7x::as_uchar_vec(vecPermI5)));
// Recombine the .lo() halves into outputs 1 and 3, the .hi() halves into
// outputs 5 and 7.
364 vecPermI1 = c7x::int_vec(vecPermI9.lo(), vecPermI11.lo());
365 vecPermI3 = c7x::int_vec(vecPermI13.lo(), vecPermI15.lo());
366 vecPermI5 = c7x::int_vec(vecPermI9.hi(), vecPermI11.hi());
367 vecPermI7 = c7x::int_vec(vecPermI13.hi(), vecPermI15.hi());
// Predicated store of result vectors through streaming address generator 0
// (strm_agen<0, ...>), specialized on the output element pointer type.
370 template <
typename dataTypeIn>
static inline void writeOutSA0(dataTypeIn *pOut, c7x::short_vec v1, c7x::short_vec v2);
// int16_t: fetch the predicate, then the store address via get_adv, and store
// v1 and v2 with __vstore_pred_interleave.
372 template <>
inline void writeOutSA0(int16_t *pOut, c7x::short_vec v1, c7x::short_vec v2)
374 __vpred tmp = c7x::strm_agen<0, c7x::short_vec>::get_vpred();
375 c7x::short_vec *storevec = c7x::strm_agen<0, c7x::short_vec>::get_adv(pOut);
376 __vstore_pred_interleave(tmp, storevec, v1, v2);
// int8_t: the predicate is taken for short_vec while the address is taken for
// char_hvec; only v1 is stored (with __vstore_pred_packl) and v2 is unused.
379 template <>
inline void writeOutSA0(int8_t *pOut, c7x::short_vec v1, c7x::short_vec v2)
381 __vpred tmp = c7x::strm_agen<0, c7x::short_vec>::get_vpred();
382 c7x::char_hvec *storevec = c7x::strm_agen<0, c7x::char_hvec>::get_adv(pOut);
383 __vstore_pred_packl(tmp, storevec, v1);
// Exec routine of the C7x matMul_fixed kernel. Its signature, braces, any
// preprocessor guards and the declaration of `count` fall on lines missing
// from this listing. The visible part opens SE0/SE1/SA0 from the templates
// held in pBlock, then for each of MBlocks * NBlocks iterations runs KBlocks
// multiply steps, shifts by qs, packs, permutes, stores through SA0 and
// clears the accumulators.
386 template <
typename dataTypeIn,
typename dataTypeOut>
// Cycle-count bookkeeping based on __TSC.
// NOTE(review): `start` is assigned only here in the visible lines, so
// loopCycle below is measured from this point unless `start` is re-armed on a
// missing line - confirm.
391 uint64_t start = __TSC;
393 uint64_t loopCycle = 0;
394 uint64_t accloopCycle = 0;
// Values read back from the kernel's private arguments.
401 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
402 int32_t MBlocks = pKerPrivArgs->
MBlocks;
403 int32_t NBlocks = pKerPrivArgs->
NBlocks;
404 int32_t KBlocks = pKerPrivArgs->
KBlocks;
405 int32_t qs = pKerPrivArgs->
qs;
// Typed views of the buffers; note the output pointer is also cast to
// dataTypeIn *.
407 dataTypeIn *pIn0Local = (dataTypeIn *) pIn0;
408 dataTypeIn *pIn1Local = (dataTypeIn *) pIn1;
409 dataTypeIn *pOutLocal = (dataTypeIn *) pOut;
// Copy the three templates out of pBlock at their byte offsets.
411 __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SE0_PARAM_OFFSET);
412 __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SE1_PARAM_OFFSET);
413 __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SA0_PARAM_OFFSET);
// SE0 streams pIn0, SE1 streams pIn1; SA0 is opened for the stores.
415 __SE0_OPEN(pIn0Local, se0Params);
416 __SE1_OPEN(pIn1Local, se1Params);
417 __SA0_OPEN(sa0Params);
// Permutation table handed to vecPermutePack.
419 c7x::uchar_vec vMask = setMask<dataTypeIn>();
// dataTypeOut serves as the accumulator vector type here: it is passed to the
// helpers that are specialized for c7x::long_vec & and c7x::int_vec &.
421 dataTypeOut shiftVec = (dataTypeOut) qs;
422 dataTypeOut zeroVec = (dataTypeOut) 0;
// Sixteen accumulators, all starting at zero, plus eight packed outputs.
423 dataTypeOut outSum0 = zeroVec, outSum1 = zeroVec, outSum2 = zeroVec, outSum3 = zeroVec;
424 dataTypeOut outSum4 = zeroVec, outSum5 = zeroVec, outSum6 = zeroVec, outSum7 = zeroVec;
425 dataTypeOut outSum8 = zeroVec, outSum9 = zeroVec, outSum10 = zeroVec, outSum11 = zeroVec;
426 dataTypeOut outSum12 = zeroVec, outSum13 = zeroVec, outSum14 = zeroVec, outSum15 = zeroVec;
427 dataTypeOut out1, out2, out3, out4, out5, out6, out7, out8;
// Outer loop: one iteration per (M block, N block) combination.
429 for (int32_t l = 0; l < MBlocks * NBlocks; l++) {
// Inner loop over the K blocks: eight multiply steps per iteration. The first
// seven read SE0 without _ADV; only the last uses __SE_REG_0_ADV. SE1 is read
// with _ADV every time.
433 for (int32_t p = 0; p < KBlocks; p++) {
434 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum0, outSum1);
435 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum2, outSum3);
436 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum4, outSum5);
437 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum6, outSum7);
438 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum8, outSum9);
439 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum10, outSum11);
440 vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum12, outSum13);
441 vecMulAcc<dataTypeOut &, __SE_REG_0_ADV, __SE_REG_1_ADV>(outSum14, outSum15);
// Cycle report; `count` is declared on a line missing from this listing.
// NOTE(review): the position of these lines relative to the inner loop's
// closing brace cannot be determined from this listing.
444 loopCycle = __TSC - start;
445 accloopCycle += loopCycle;
446 printf(
"|INNER LOOP CNT : %d| %lu |\n", count++, loopCycle);
// Scale every accumulator pair by a right shift of qs.
449 vecShiftRight<dataTypeOut &>(outSum0, outSum1, shiftVec);
450 vecShiftRight<dataTypeOut &>(outSum2, outSum3, shiftVec);
451 vecShiftRight<dataTypeOut &>(outSum4, outSum5, shiftVec);
452 vecShiftRight<dataTypeOut &>(outSum6, outSum7, shiftVec);
453 vecShiftRight<dataTypeOut &>(outSum8, outSum9, shiftVec);
454 vecShiftRight<dataTypeOut &>(outSum10, outSum11, shiftVec);
455 vecShiftRight<dataTypeOut &>(outSum12, outSum13, shiftVec);
456 vecShiftRight<dataTypeOut &>(outSum14, outSum15, shiftVec);
// Pack each group of four accumulators into two output vectors.
458 packAlternate<dataTypeOut &>(outSum0, outSum1, outSum2, outSum3, out1, out2);
459 packAlternate<dataTypeOut &>(outSum4, outSum5, outSum6, outSum7, out3, out4);
460 packAlternate<dataTypeOut &>(outSum8, outSum9, outSum10, outSum11, out5, out6);
461 packAlternate<dataTypeOut &>(outSum12, outSum13, outSum14, outSum15, out7, out8);
// Permute and recombine the packed vectors in place.
463 vecPermutePack<dataTypeOut &, dataTypeIn *>(out1, out2, out3, out4, out5, out6, out7, out8, pOutLocal, vMask);
// Four stores through SA0; note the order: (1,2), (5,6), (3,4), (7,8).
465 writeOutSA0(pOutLocal, c7x::as_short_vec(out1), c7x::as_short_vec(out2));
466 writeOutSA0(pOutLocal, c7x::as_short_vec(out5), c7x::as_short_vec(out6));
467 writeOutSA0(pOutLocal, c7x::as_short_vec(out3), c7x::as_short_vec(out4));
468 writeOutSA0(pOutLocal, c7x::as_short_vec(out7), c7x::as_short_vec(out8));
// Clear the accumulators before the next outer iteration.
470 resetVec<dataTypeOut &>(outSum0, outSum1);
471 resetVec<dataTypeOut &>(outSum2, outSum3);
472 resetVec<dataTypeOut &>(outSum4, outSum5);
473 resetVec<dataTypeOut &>(outSum6, outSum7);
474 resetVec<dataTypeOut &>(outSum8, outSum9);
475 resetVec<dataTypeOut &>(outSum10, outSum11);
476 resetVec<dataTypeOut &>(outSum12, outSum13);
477 resetVec<dataTypeOut &>(outSum14, outSum15);
// Prints the accumulated cycle figure.
481 printf(
"|CORE LOOP IN16_T | %lu |\n", accloopCycle);
493 void *restrict pOut);
498 void *restrict pOut);
static void vecMulAcc(V outSum0, V outSum1)
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
static void vecShiftRight(V vecSR0, V vecSR1, V shiftVec)
static c7x::uchar_vec setMask()
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
static void writeOutSA0(dataTypeIn *pOut, c7x::short_vec v1, c7x::short_vec v2)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
static void vecPermutePack(V vecPerm1, V vecPerm2, V vecPerm3, V vecPerm4, V vecPerm5, V vecPerm6, V vecPerm7, V vecPerm8, W pOutLocal, c7x::uchar_vec vMask)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
static void resetVec(V vecRes0, V vecRes1)
static void packAlternate(V vecPA0, V vecPA1, V vecPA2, V vecPA3, V vecPAOut0, V vecPAOut1)
DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
c7x::uchar_vec setMask< int8_t >()
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
c7x::uchar_vec setMask< int16_t >()
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_matMul_fixed.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MATMUL_FIXED_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t strideIn0Elements
int32_t strideOutElements
int32_t strideIn1Elements