40 #include "../common/c71/DSPLIB_inlines.h"
// Byte offsets of the SE0, SE1 and SA0 parameter templates inside the kernel's
// parameter block. The exec code in this file reads the three templates back
// from `pBlock + <offset>` using exactly these macros.
// SE_PARAM_SIZE is not defined in the visible part of this file.
43 #define SE_PARAM_BASE (0x0000)
// SE0 template sits at the very start of the block.
44 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
// SE1 template follows SE0, one SE_PARAM_SIZE later.
45 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + SE_PARAM_SIZE)
// SA0 template follows SE1; note the step is also SE_PARAM_SIZE.
46 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + SE_PARAM_SIZE)
// NOTE(review): not referenced anywhere in the visible part of this file.
47 #define DSPLIB_MATMUL_FIXED_UNROLL (128)
// Type-specific PROMOTE / TRANSPOSE settings for the SE0 and SE1 templates.
// NOTE(review): this listing is an extract -- the first line of each signature
// and the surrounding braces are not visible here. The two bodies below are
// presumably the two specializations of
// DSPLIB_matMul_fixed_PromoteTranspose_init_ci (one per supported data type);
// confirm against the full file which body belongs to which type.
52 template <
typename dataTypeIn,
typename dataTypeOut>
56 __SE_TEMPLATE_v1 *se1Params)
// Variant 1: SE0 performs no promotion; SE1 uses the 64-bit transpose mode.
58 se0Params->PROMOTE = __SE_PROMOTE_OFF;
59 se1Params->TRANSPOSE = __SE_TRANSPOSE_64BIT;
63 __SE_TEMPLATE_v1 *se1Params)
// Variant 2: both SE0 and SE1 promote 2x with sign extension; SE1 uses the
// 32-bit transpose mode.
65 se0Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
66 se1Params->TRANSPOSE = __SE_TRANSPOSE_32BIT;
67 se1Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
// Builds the SE0 / SE1 / SA0 templates for the matrix multiply and records the
// block counts in pKerPrivArgs.
// NOTE(review): this is an extract of the init function body (presumably
// DSPLIB_matMul_fixed_init_ci). Its signature, braces, the definitions of
// strideIn0 / strideIn1 / strideOut, and the code that writes the finished
// templates into pBlock are not visible here.
70 template <
typename dataTypeIn,
typename dataTypeOut>
78 __SE_TEMPLATE_v1 se0Params;
79 __SE_TEMPLATE_v1 se1Params;
80 __SA_TEMPLATE_v1 sa0Params;
// The SE element type is derived from dataTypeOut (not dataTypeIn), while the
// SE vector length is that of c7x::short_vec.
81 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<dataTypeOut>::value;
82 __SE_VECLEN SE_VECLEN = c7x::se_veclen<c7x::short_vec>::value;
// SA vector length and per-vector element count both come from dataTypeOut.
84 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<dataTypeOut>::value;
85 int32_t elementCount = c7x::element_count_of<dataTypeOut>::value;
87 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Problem dimensions as stored in the kernel's private arguments.
88 int32_t M = pKerPrivArgs->
M;
89 int32_t K = pKerPrivArgs->
K;
90 int32_t N = pKerPrivArgs->
N;
// Fixed at 1 here, so every "* unrollFactor" below is a no-op in this version.
94 int32_t unrollFactor = 1;
// Ceiling divisions: K is tiled by 4, M by 2, and N by
// elementCount * unrollFactor.
99 int32_t KBlocks = ((K + 4 - 1)) / (4);
100 int32_t MBlocks = ((M + 2 - 1)) / (2);
101 int32_t NBlocks = ((N + (elementCount * unrollFactor) - 1)) / (elementCount * unrollFactor);
// Saved so the exec function can read the same block counts back.
104 pKerPrivArgs->
KBlocks = KBlocks;
105 pKerPrivArgs->
NBlocks = NBlocks;
106 pKerPrivArgs->
MBlocks = MBlocks;
// ---- SE0 template ----
// 5D stream with two decrementing dimensions: DECDIM1 is tied to DIM2 with
// width K, DECDIM2 is tied to DIM4 with width M * strideIn0.
// (ICNT0, ICNT1, DIM2 and DIM3 are not set in the visible lines.)
111 se0Params = __gen_SE_TEMPLATE_v1();
112 se0Params.ELETYPE = SE_ELETYPE;
113 se0Params.DIMFMT = __SE_DIMFMT_5D;
114 se0Params.DECDIM1 = __SE_DECDIM_DIM2;
115 se0Params.DECDIM2 = __SE_DECDIM_DIM4;
116 se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
117 se0Params.DECDIM2SD = __SE_DECDIMSD_DIM1;
// Fetch 4 elements per vector with group duplication enabled.
119 se0Params.VECLEN = __SE_VECLEN_4ELEMS;
120 se0Params.GRPDUP = __SE_GRPDUP_ON;
123 se0Params.DIM1 = strideIn0;
124 se0Params.ICNT2 = KBlocks;
126 se0Params.ICNT3 = NBlocks;
128 se0Params.DECDIM2_WIDTH = (uint32_t) M * strideIn0;
129 se0Params.ICNT4 = MBlocks;
// Dimension 4 steps by two strides of input 0, matching MBlocks = ceil(M / 2).
130 se0Params.DIM4 = 2 * strideIn0;
131 se0Params.DECDIM1_WIDTH = (uint32_t) K;
// ---- SE1 template ----
// 5D stream; ICNT1 is not set in the visible lines.
136 se1Params = __gen_SE_TEMPLATE_v1();
137 se1Params.ELETYPE = SE_ELETYPE;
138 se1Params.VECLEN = SE_VECLEN;
139 se1Params.DIMFMT = __SE_DIMFMT_5D;
141 se1Params.ICNT0 = 4 * 4 * unrollFactor;
143 se1Params.DIM1 = strideIn1;
// Dimension 2 steps by four strides of input 1, matching KBlocks = ceil(K / 4).
144 se1Params.DIM2 = 4 * strideIn1;
145 se1Params.ICNT2 = KBlocks;
// Dimension 3 steps by the same amount as ICNT0 (16 * unrollFactor).
146 se1Params.DIM3 = 4 * 4 * unrollFactor;
147 se1Params.ICNT3 = NBlocks;
149 se1Params.ICNT4 = MBlocks;
// Apply the data-type-specific PROMOTE / TRANSPOSE settings to both templates.
151 DSPLIB_matMul_fixed_PromoteTranspose_init_ci<dataTypeIn, dataTypeOut>(&se0Params, &se1Params);
// ---- SA0 template ----
// 4D address generator with two decrementing dimensions: DECDIM1 is tied to
// DIM2 with width N, DECDIM2 is tied to DIM3 with width M * strideOut.
// (ICNT1 is not set in the visible lines.)
155 sa0Params = __gen_SA_TEMPLATE_v1();
156 sa0Params.VECLEN = SA_VECLEN;
157 sa0Params.DIMFMT = __SA_DIMFMT_4D;
158 sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
159 sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
160 sa0Params.DECDIM2 = __SA_DECDIM_DIM3;
161 sa0Params.DECDIM2SD = __SA_DECDIMSD_DIM1;
163 sa0Params.ICNT0 = elementCount * unrollFactor;
165 sa0Params.ICNT2 = NBlocks;
166 sa0Params.DIM1 = strideOut;
167 sa0Params.DECDIM1_WIDTH = N;
168 sa0Params.DECDIM2_WIDTH = M * strideOut;
169 sa0Params.DIM2 = elementCount * unrollFactor;
170 sa0Params.ICNT3 = MBlocks;
// Dimension 3 steps by two output strides, matching MBlocks = ceil(M / 2).
171 sa0Params.DIM3 = 2 * strideOut;
// Reads the next c7x::short_vec from streaming engine `id`, advances that
// stream, and stores the vector through `a`.
// NOTE(review): the template header below is split/garbled by the listing
// extraction ("u" / "int32_t" / "id>"); presumably it reads
// `template <uint32_t id>` in the real file -- confirm. The function's braces
// are likewise not visible in this extract.
196 template <u
int32_t
id>
static inline void loadMatSE(c7x::short_vec *a)
198 *a = c7x::strm_eng<id, c7x::short_vec>::get_adv();
// Generic store helper: takes the current predicate and the next store address
// from streaming address generator 0 (based at pOut, advancing SA0), then
// issues a predicated store of out1 / out2 via __vstore_pred_packl_2src.
// vPred and addr are passed by value and overwritten before first use, so the
// caller's values are ignored; they serve only as typed scratch variables.
// (The function's braces are not visible in this extract.)
200 template <
typename T,
typename pVec,
typename vecIn>
201 static inline void writeOutSA0(__vpred vPred, pVec *addr, T pOut, vecIn out1, vecIn out2)
// The predicate is fetched before get_adv() advances SA0.
203 vPred = c7x::strm_agen<0, pVec>::get_vpred();
204 addr = c7x::strm_agen<0, pVec>::get_adv(pOut);
205 __vstore_pred_packl_2src(vPred, addr, out1, out2);
// Overload of writeOutSA0 for a c7x::char_hvec destination: same SA0
// predicate / address sequence as the generic version, but the store uses
// __vstore_pred_pack_byte_2src instead of __vstore_pred_packl_2src.
// vPred and addr are by-value parameters that are overwritten before use.
// (The function's braces are not visible in this extract.)
208 template <
typename T,
typename vecIn>
209 static inline void writeOutSA0(__vpred vPred, c7x::char_hvec *addr, T pOut, vecIn out1, vecIn out2)
// The predicate is fetched before get_adv() advances SA0.
211 vPred = c7x::strm_agen<0, c7x::char_hvec>::get_vpred();
212 addr = c7x::strm_agen<0, c7x::char_hvec>::get_adv(pOut);
213 __vstore_pred_pack_byte_2src(vPred, addr, out1, out2);
// Main compute loop of the kernel: accumulates dot products of SE0 and SE1
// data into 64-bit accumulators, shifts them right by qs, and packs the
// results for output.
// NOTE(review): this is an extract of the exec function body (presumably
// DSPLIB_matMul_fixed_exec_ci). The signature, all braces, the loads of
// a0/a1/b0.., the output stores, and the `else` between the two main loops
// are not visible here, so loop boundaries below are inferred -- confirm
// against the full file.
217 template <
typename dataTypeIn,
typename dataTypeOut>
// pOut is cast to int16_t* regardless of dataTypeOut; the use of pOutLocal is
// not visible in this extract.
222 int16_t *pOutLocal = (int16_t *) pOut;
// Block counts were computed and stored by the init function.
223 int32_t KBlocks = pKerPrivArgs->
KBlocks;
224 int32_t NBlocks = pKerPrivArgs->
NBlocks;
225 int32_t MBlocks = pKerPrivArgs->
MBlocks;
// Right-shift amount applied to every accumulator before packing.
226 int32_t qs = pKerPrivArgs->
qs;
227 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Copy the three templates out of the parameter block at their fixed offsets.
228 __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SE0_PARAM_OFFSET);
229 __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SE1_PARAM_OFFSET);
230 __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock +
SA0_PARAM_OFFSET);
// Byte-permute mask. Read as 16 two-byte elements it selects elements
// 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15 -- i.e. a 4x4 transpose of 16-bit
// elements within 32 bytes.
233 uchar32 vMask = uchar32(0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7,
234 14, 15, 22, 23, 30, 31);
// SE0 streams from pIn0, SE1 from pIn1; SA0 is opened without a base pointer.
236 __SE0_OPEN(pIn0, se0Params);
237 __SE1_OPEN((pIn1), se1Params);
238 __SA0_OPEN(sa0Params);
// Wide path (N >= 32): 2 x 8 accumulators per (m, n) block.
240 if (pKerPrivArgs->
N >= 32) {
241 for (int32_t mn = 0; mn < MBlocks * NBlocks; mn++) {
// cRC: R selects the a-vector (a0 / a1), C selects the b-vector (b0..b7).
242 c7x::long_vec c00 = (c7x::long_vec) 0;
243 c7x::long_vec c01 = (c7x::long_vec) 0;
244 c7x::long_vec c02 = (c7x::long_vec) 0;
245 c7x::long_vec c03 = (c7x::long_vec) 0;
246 c7x::long_vec c04 = (c7x::long_vec) 0;
247 c7x::long_vec c05 = (c7x::long_vec) 0;
248 c7x::long_vec c06 = (c7x::long_vec) 0;
249 c7x::long_vec c07 = (c7x::long_vec) 0;
250 c7x::long_vec c10 = (c7x::long_vec) 0;
251 c7x::long_vec c11 = (c7x::long_vec) 0;
252 c7x::long_vec c12 = (c7x::long_vec) 0;
253 c7x::long_vec c13 = (c7x::long_vec) 0;
254 c7x::long_vec c14 = (c7x::long_vec) 0;
255 c7x::long_vec c15 = (c7x::long_vec) 0;
256 c7x::long_vec c16 = (c7x::long_vec) 0;
257 c7x::long_vec c17 = (c7x::long_vec) 0;
258 c7x::short_vec b0, b1, b2, b3, b4, b5, b6, b7;
259 c7x::short_vec a0, a1;
// Reduction over K blocks. The loads that fill a0, a1 and b0..b7 are not
// visible in this extract.
260 for (int32_t k = 0; k < KBlocks; k++) {
// Reorder each b vector with vMask before it is used in the dot products.
274 b0 = __as_short16(__permute(vMask, __as_uchar32(b0)));
275 b1 = __as_short16(__permute(vMask, __as_uchar32(b1)));
276 b2 = __as_short16(__permute(vMask, __as_uchar32(b2)));
277 b3 = __as_short16(__permute(vMask, __as_uchar32(b3)));
279 b4 = __as_short16(__permute(vMask, __as_uchar32(b4)));
280 b5 = __as_short16(__permute(vMask, __as_uchar32(b5)));
281 b6 = __as_short16(__permute(vMask, __as_uchar32(b6)));
282 b7 = __as_short16(__permute(vMask, __as_uchar32(b7)));
// Accumulate a0 against every b vector ...
284 c00 += __dotp4_ext(a0, b0);
285 c01 += __dotp4_ext(a0, b1);
286 c02 += __dotp4_ext(a0, b2);
287 c03 += __dotp4_ext(a0, b3);
289 c04 += __dotp4_ext(a0, b4);
290 c05 += __dotp4_ext(a0, b5);
291 c06 += __dotp4_ext(a0, b6);
292 c07 += __dotp4_ext(a0, b7);
// ... and a1 against every b vector.
294 c10 += __dotp4_ext(a1, b0);
295 c11 += __dotp4_ext(a1, b1);
296 c12 += __dotp4_ext(a1, b2);
297 c13 += __dotp4_ext(a1, b3);
299 c14 += __dotp4_ext(a1, b4);
300 c15 += __dotp4_ext(a1, b5);
301 c16 += __dotp4_ext(a1, b6);
302 c17 += __dotp4_ext(a1, b7);
// Scale every accumulator right by qs (presumably after the k loop has closed;
// the closing brace is not visible here).
304 c7x::long_vec c00shfited = __shift_right(c00, (c7x::long_vec) qs);
305 c7x::long_vec c01shfited = __shift_right(c01, (c7x::long_vec) qs);
306 c7x::long_vec c02shfited = __shift_right(c02, (c7x::long_vec) qs);
307 c7x::long_vec c03shfited = __shift_right(c03, (c7x::long_vec) qs);
308 c7x::long_vec c04shfited = __shift_right(c04, (c7x::long_vec) qs);
309 c7x::long_vec c05shfited = __shift_right(c05, (c7x::long_vec) qs);
310 c7x::long_vec c06shfited = __shift_right(c06, (c7x::long_vec) qs);
311 c7x::long_vec c07shfited = __shift_right(c07, (c7x::long_vec) qs);
312 c7x::long_vec c10shfited = __shift_right(c10, (c7x::long_vec) qs);
313 c7x::long_vec c11shfited = __shift_right(c11, (c7x::long_vec) qs);
314 c7x::long_vec c12shfited = __shift_right(c12, (c7x::long_vec) qs);
315 c7x::long_vec c13shfited = __shift_right(c13, (c7x::long_vec) qs);
316 c7x::long_vec c14shfited = __shift_right(c14, (c7x::long_vec) qs);
317 c7x::long_vec c15shfited = __shift_right(c15, (c7x::long_vec) qs);
318 c7x::long_vec c16shfited = __shift_right(c16, (c7x::long_vec) qs);
319 c7x::long_vec c17shfited = __shift_right(c17, (c7x::long_vec) qs);
// Pack adjacent accumulator pairs into int vectors; note the odd-numbered
// accumulator is passed first. The stores of out01..out14 are not visible in
// this extract.
321 c7x::int_vec out01 = __vwpackl_vvv(c7x::as_int_vec(c01shfited), c7x::as_int_vec(c00shfited));
322 c7x::int_vec out02 = __vwpackl_vvv(c7x::as_int_vec(c03shfited), c7x::as_int_vec(c02shfited));
323 c7x::int_vec out03 = __vwpackl_vvv(c7x::as_int_vec(c05shfited), c7x::as_int_vec(c04shfited));
324 c7x::int_vec out04 = __vwpackl_vvv(c7x::as_int_vec(c07shfited), c7x::as_int_vec(c06shfited));
325 c7x::int_vec out11 = __vwpackl_vvv(c7x::as_int_vec(c11shfited), c7x::as_int_vec(c10shfited));
326 c7x::int_vec out12 = __vwpackl_vvv(c7x::as_int_vec(c13shfited), c7x::as_int_vec(c12shfited));
327 c7x::int_vec out13 = __vwpackl_vvv(c7x::as_int_vec(c15shfited), c7x::as_int_vec(c14shfited));
328 c7x::int_vec out14 = __vwpackl_vvv(c7x::as_int_vec(c17shfited), c7x::as_int_vec(c16shfited));
// Narrow path (presumably the `else` of the N >= 32 test): same scheme with
// 2 x 4 accumulators and four b vectors per (m, n) block.
337 for (int32_t mn = 0; mn < MBlocks * NBlocks; mn++) {
338 c7x::long_vec c00 = (c7x::long_vec) 0;
339 c7x::long_vec c01 = (c7x::long_vec) 0;
340 c7x::long_vec c02 = (c7x::long_vec) 0;
341 c7x::long_vec c03 = (c7x::long_vec) 0;
342 c7x::long_vec c10 = (c7x::long_vec) 0;
343 c7x::long_vec c11 = (c7x::long_vec) 0;
344 c7x::long_vec c12 = (c7x::long_vec) 0;
345 c7x::long_vec c13 = (c7x::long_vec) 0;
346 c7x::short_vec b0, b1, b2, b3;
347 c7x::short_vec a0, a1;
// Reduction over K blocks; the loads of a0, a1 and b0..b3 are not visible here.
349 for (int32_t k = 0; k < KBlocks; k++) {
358 b0 = __as_short16(__permute(vMask, __as_uchar32(b0)));
359 b1 = __as_short16(__permute(vMask, __as_uchar32(b1)));
360 b2 = __as_short16(__permute(vMask, __as_uchar32(b2)));
361 b3 = __as_short16(__permute(vMask, __as_uchar32(b3)));
363 c00 += __dotp4_ext(a0, b0);
364 c01 += __dotp4_ext(a0, b1);
365 c02 += __dotp4_ext(a0, b2);
366 c03 += __dotp4_ext(a0, b3);
368 c10 += __dotp4_ext(a1, b0);
369 c11 += __dotp4_ext(a1, b1);
370 c12 += __dotp4_ext(a1, b2);
371 c13 += __dotp4_ext(a1, b3);
// Scale and pack the a0 accumulators ...
373 c7x::long_vec c00shfited = __shift_right(c00, (c7x::long_vec) qs);
374 c7x::long_vec c01shfited = __shift_right(c01, (c7x::long_vec) qs);
375 c7x::long_vec c02shfited = __shift_right(c02, (c7x::long_vec) qs);
376 c7x::long_vec c03shfited = __shift_right(c03, (c7x::long_vec) qs);
378 c7x::int_vec out01 = __vwpackl_vvv(c7x::as_int_vec(c01shfited), c7x::as_int_vec(c00shfited));
379 c7x::int_vec out02 = __vwpackl_vvv(c7x::as_int_vec(c03shfited), c7x::as_int_vec(c02shfited));
// ... then the a1 accumulators. The output stores are not visible in this
// extract.
381 c7x::long_vec c10shfited = __shift_right(c10, (c7x::long_vec) qs);
382 c7x::long_vec c11shfited = __shift_right(c11, (c7x::long_vec) qs);
383 c7x::long_vec c12shfited = __shift_right(c12, (c7x::long_vec) qs);
384 c7x::long_vec c13shfited = __shift_right(c13, (c7x::long_vec) qs);
386 c7x::int_vec out11 = __vwpackl_vvv(c7x::as_int_vec(c11shfited), c7x::as_int_vec(c10shfited));
387 c7x::int_vec out12 = __vwpackl_vvv(c7x::as_int_vec(c13shfited), c7x::as_int_vec(c12shfited));
403 void *restrict pOut);
408 void *restrict pOut);
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
static void loadMatSE(c7x::short_vec *a)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
static void writeOutSA0(__vpred vPred, pVec *addr, T pOut, vecIn out1, vecIn out2)
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_matMul_fixed.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MATMUL_FIXED_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t strideIn0Elements
int32_t strideOutElements
int32_t strideIn1Elements