DSPLIB User Guide
DSPLIB_matMul_fixed_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
2 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 /*******************************************************************************
35  *
36  * INCLUDES
37  *
38  ******************************************************************************/
39 
40 #include "../common/c71/DSPLIB_inlines.h"
42 
// Byte offsets of the stream templates inside the kernel's parameter block.
// The init routine in this file stores the SE0, SE1 and SA0 templates at
// these offsets and the exec routine loads them back from the same places.
// SE_PARAM_SIZE is not defined in this file.
43 #define SE_PARAM_BASE (0x0000)
44 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + SE_PARAM_SIZE)
// NOTE(review): not referenced anywhere in the visible part of this file.
47 #define DSPLIB_MATMUL_FIXED_UNROLL (32)
48 
49 /***********************************
50  * Initialization
51  ***********************************/
// Fills in the VECLEN / PROMOTE / TRANSPOSE fields of the two streaming-engine
// templates. Only declared here; the behaviour comes from the explicit
// specializations that follow.
52 template <typename dataTypeIn, typename dataTypeOut>
53 inline void DSPLIB_matMul_fixed_PromoteTranspose_init_ci(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params);
// First specialization: both engines get the VECLEN of a full int16_t vector;
// SE0 is set to 128-bit transpose and SE1 to 64-bit transpose. PROMOTE is not
// written here.
// NOTE(review): the line carrying this specialization's name and first
// parameter is absent from this listing (its numbering skips 55). Presumably
// this is the 16-bit variant, given the int16_t vector type -- confirm.
54 template <>
56  __SE_TEMPLATE_v1 *se1Params)
57 {
58  typedef typename c7x::make_full_vector<int16_t>::type vec;
59  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
60  se0Params->VECLEN = SE_VECLEN;
61  se0Params->TRANSPOSE = __SE_TRANSPOSE_128BIT;
62  se1Params->VECLEN = SE_VECLEN;
63  se1Params->TRANSPOSE = __SE_TRANSPOSE_64BIT;
64 }
// Second specialization: both engines use a fixed 32-element VECLEN with 2x
// sign-extending promotion; SE0 is set to 64-bit transpose and SE1 to 32-bit
// transpose.
// NOTE(review): the line carrying this specialization's name and first
// parameter is absent from this listing (its numbering skips 66). Presumably
// this is the 8-bit variant, given the 2x promotion -- confirm.
65 template <>
67  __SE_TEMPLATE_v1 *se1Params)
68 {
69  se0Params->VECLEN = __SE_VECLEN_32ELEMS;
70  se0Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
71  se0Params->TRANSPOSE = __SE_TRANSPOSE_64BIT;
72  se1Params->VECLEN = __SE_VECLEN_32ELEMS;
73  se1Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
74  se1Params->TRANSPOSE = __SE_TRANSPOSE_32BIT;
75 }
// Initialization routine of the C7x implementation: derives the block counts
// from M, N and K, builds the SE0 / SE1 / SA0 stream templates and stores them
// in the kernel's parameter block.
// NOTE(review): the lines that name this function and that declare
// pKerPrivArgs and status are absent from this listing (its numbering skips
// 77 and 84-85), so their exact types cannot be confirmed from here.
76 template <typename dataTypeIn, typename dataTypeOut>
78  const DSPLIB_bufParams2D_t *bufParamsIn0,
79  const DSPLIB_bufParams2D_t *bufParamsIn1,
80  const DSPLIB_bufParams2D_t *bufParamsOut,
81  const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
82 {
83 
86 
   // Parameter block that receives the three templates at the end of this
   // function.
87  uint8_t *pBlock = pKerPrivArgs->bufPblock;
88 
   // Dimensions come from the private args. Block counts use steps of 4 for M,
   // 32 for N and 8 for K (presumably rounded up, going by the helper's name)
   // and are written back into the private args.
89  uint32_t M = pKerPrivArgs->M;
90  uint32_t N = pKerPrivArgs->N;
91  uint32_t K = pKerPrivArgs->K;
92  uint32_t MBlocks = DSPLIB_ceilingDiv(M, 4);
93  uint32_t NBlocks = DSPLIB_ceilingDiv(N, 32);
94  uint32_t KBlocks = DSPLIB_ceilingDiv(K, 8);
95  pKerPrivArgs->MBlocks = MBlocks;
96  pKerPrivArgs->NBlocks = NBlocks;
97  pKerPrivArgs->KBlocks = KBlocks;
   // Strides are read from fields named in elements, not bytes.
98  int32_t strideIn0 = pKerPrivArgs->strideIn0Elements;
99  int32_t strideIn1 = pKerPrivArgs->strideIn1Elements;
100  int32_t strideOut = pKerPrivArgs->strideOutElements;
101 
102  __SE_TEMPLATE_v1 se0Params;
103  __SE_TEMPLATE_v1 se1Params;
104  __SA_TEMPLATE_v1 sa0Params;
   // The stream element type is derived from the input data type.
105  typedef typename c7x::make_full_vector<dataTypeIn>::type vec;
106  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
107 
   // SE0 template (the exec routine opens SE0 on pIn0): ICNT0 = K,
   // ICNT1 = min(4, M) with stride strideIn0, ICNT2 = NBlocks with zero stride,
   // ICNT3 = MBlocks with stride 4 * strideIn0.
108  se0Params = __gen_SE_TEMPLATE_v1();
109 
110  se0Params.ICNT0 = K;
111  se0Params.ICNT1 = (4u < M) ? 4 : M; // 4;
112  se0Params.DIM1 = strideIn0;
113  se0Params.ICNT2 = NBlocks;
114  se0Params.DIM2 = 0;
115  se0Params.ICNT3 = MBlocks;
116  se0Params.DIM3 = strideIn0 * 4;
117  se0Params.DIMFMT = __SE_DIMFMT_4D;
118  se0Params.ELETYPE = SE_ELETYPE;
119 
   // SE1 template (the exec routine opens SE1 on pIn1): ICNT0 = 32,
   // ICNT1 = min(8, K) with stride strideIn1, ICNT2 = KBlocks with stride
   // 8 * strideIn1, ICNT3 = NBlocks with stride 32, ICNT4 = MBlocks with zero
   // stride.
120  se1Params = __gen_SE_TEMPLATE_v1();
121  se1Params.ICNT0 = 32;
122  se1Params.ICNT1 = (8u < K) ? 8 : K; // 8;
123  se1Params.DIM1 = strideIn1;
124  se1Params.ICNT2 = KBlocks;
125  se1Params.DIM2 = strideIn1 * 8;
126  se1Params.ICNT3 = NBlocks;
127  se1Params.DIM3 = 32;
128  se1Params.ICNT4 = MBlocks;
129  se1Params.DIM4 = 0;
130  se1Params.DIMFMT = __SE_DIMFMT_5D;
131  se1Params.ELETYPE = SE_ELETYPE;
132 
   // Decrement dimension tied to DIM3 with width N.
   // NOTE(review): presumably this keeps the stream inside the N valid columns
   // when N is not a multiple of 32 -- confirm against the SE documentation.
133  se1Params.DECDIM2 = __SE_DECDIM_DIM3;
134  se1Params.DECDIM2_WIDTH = N;
135 
   // Type-specific VECLEN / PROMOTE / TRANSPOSE settings for both engines.
136  DSPLIB_matMul_fixed_PromoteTranspose_init_ci<dataTypeIn, dataTypeOut>(&se0Params, &se1Params);
137 
   // SA0 template (the exec routine uses SA0 for its stores): ICNT0 = 32,
   // ICNT1 = 4 with stride strideOut, ICNT2 = NBlocks with stride 32,
   // ICNT3 = MBlocks with stride 4 * strideOut.
138  sa0Params = __gen_SA_TEMPLATE_v1();
139  sa0Params.ICNT0 = 32;
140  sa0Params.ICNT1 = 4;
141  sa0Params.DIM1 = strideOut;
142  sa0Params.ICNT2 = NBlocks;
143  sa0Params.DIM2 = 32;
144  sa0Params.ICNT3 = MBlocks;
145  sa0Params.DIM3 = 4 * strideOut;
146  sa0Params.DIMFMT = __SA_DIMFMT_4D;
147  sa0Params.VECLEN = __SA_VECLEN_32ELEMS;
148 
   // Two decrement dimensions: widths M * strideOut and N.
   // NOTE(review): presumably these produce the store predicates for partial
   // blocks at the bottom and right edges of the output -- confirm.
149  sa0Params.DECDIM1 = __SA_DECDIM_DIM3;
150  sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM1;
151  sa0Params.DECDIM1_WIDTH = M * strideOut;
152 
153  sa0Params.DECDIM2 = __SA_DECDIM_DIM2;
154  sa0Params.DECDIM2SD = __SA_DECDIMSD_DIM0;
155  sa0Params.DECDIM2_WIDTH = N;
156 
   // Persist the templates at their fixed offsets in the parameter block.
157  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET) = se0Params;
158  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET) = se1Params;
159  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET) = sa0Params;
160  return status;
161 }
162 
// Remaining parameters of the two explicit instantiations of
// DSPLIB_matMul_fixed_init_ci.
// NOTE(review): the opening line of each instantiation is absent from this
// listing (its numbering skips 163-164 and 170-171); which data-type variant
// comes first cannot be confirmed from here.
165  const DSPLIB_bufParams2D_t *bufParamsIn0,
166  const DSPLIB_bufParams2D_t *bufParamsIn1,
167  const DSPLIB_bufParams2D_t *bufParamsOut,
168  const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs);
169 
172  const DSPLIB_bufParams2D_t *bufParamsIn0,
173  const DSPLIB_bufParams2D_t *bufParamsIn1,
174  const DSPLIB_bufParams2D_t *bufParamsOut,
175  const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs);
176 
177 /***********************************
178  * Implementation
179  ***********************************/
180 
181 template <typename V> static inline c7x::uchar_vec setMask();
182 template <> inline c7x::uchar_vec setMask<int16_t>()
183 {
184  uint8_t mask[64] = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 32, 33, 34, 35, 40, 41,
185  42, 43, 36, 37, 38, 39, 44, 45, 46, 47, 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23,
186  28, 29, 30, 31, 48, 49, 50, 51, 56, 57, 58, 59, 52, 53, 54, 55, 60, 61, 62, 63};
187  c7x::uchar_vec vMaskOut = *stov_ptr(c7x::uchar_vec, (uint8_t *) &mask[0]);
188  return vMaskOut;
189 }
190 
191 template <> inline c7x::uchar_vec setMask<int8_t>()
192 {
193  uint8_t mask[64] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 32, 33, 36, 37, 40, 41,
194  44, 45, 34, 35, 38, 39, 42, 43, 46, 47, 16, 17, 20, 21, 24, 25, 28, 29, 18, 19, 22, 23,
195  26, 27, 30, 31, 48, 49, 52, 53, 56, 57, 60, 61, 50, 51, 54, 55, 58, 59, 62, 63};
196  c7x::uchar_vec vMaskOut = *stov_ptr(c7x::uchar_vec, (uint8_t *) &mask[0]);
197  return vMaskOut;
198 }
199 
200 template <typename V, __SE_REG, __SE_REG> static inline void vecMulAcc(V outSum0, V outSum1);
201 
202 template <>
203 inline void vecMulAcc<c7x::long_vec &, __SE_REG_0, __SE_REG_1_ADV>(c7x::long_vec &outSum0, c7x::long_vec &outSum1)
204 {
205  c7x::long_vec vOut0, vOut1;
206  __vmatmpyhd_vvw(__SE_REG_0, __SE_REG_1_ADV, vOut0, vOut1);
207  outSum0 += vOut0;
208  outSum1 += vOut1;
209 }
210 
211 template <>
212 inline void vecMulAcc<c7x::long_vec &, __SE_REG_0_ADV, __SE_REG_1_ADV>(c7x::long_vec &outSum0, c7x::long_vec &outSum1)
213 {
214  c7x::long_vec vOut0, vOut1;
215  __vmatmpyhd_vvw(__SE_REG_0_ADV, __SE_REG_1_ADV, vOut0, vOut1);
216  outSum0 += vOut0;
217  outSum1 += vOut1;
218 }
219 
220 template <>
221 inline void vecMulAcc<c7x::int_vec &, __SE_REG_0, __SE_REG_1_ADV>(c7x::int_vec &outSum0, c7x::int_vec &outSum1)
222 {
223  c7x::int_vec vOut = __matmpy(__SE_REG_0, __SE_REG_1_ADV);
224  outSum0 += vOut;
225 }
226 
227 template <>
228 inline void vecMulAcc<c7x::int_vec &, __SE_REG_0_ADV, __SE_REG_1_ADV>(c7x::int_vec &outSum0, c7x::int_vec &outSum1)
229 {
230  c7x::int_vec vOut = __matmpy(__SE_REG_0_ADV, __SE_REG_1_ADV);
231  outSum0 += vOut;
232 }
233 
234 template <typename V> static inline void vecShiftRight(V vecSR0, V vecSR1, V shiftVec);
235 
236 template <>
237 inline void vecShiftRight<c7x::long_vec &>(c7x::long_vec &vecSRL0, c7x::long_vec &vecSRL1, c7x::long_vec &shiftVec)
238 {
239  vecSRL0 = __shift_right(vecSRL0, shiftVec);
240  vecSRL1 = __shift_right(vecSRL1, shiftVec);
241 }
242 
243 template <>
244 inline void vecShiftRight<c7x::int_vec &>(c7x::int_vec &vecSRI0, c7x::int_vec &vecSRI1, c7x::int_vec &shiftVec)
245 {
246  vecSRI0 = __shift_right(vecSRI0, shiftVec);
247 }
248 
249 template <typename V> static inline void resetVec(V vecRes0, V vecRes1);
250 
251 template <> inline void resetVec<c7x::long_vec &>(c7x::long_vec &vecResL0, c7x::long_vec &vecResL1)
252 {
253  vecResL0 = (c7x::long_vec) 0;
254  vecResL1 = (c7x::long_vec) 0;
255 }
256 
257 template <> inline void resetVec<c7x::int_vec &>(c7x::int_vec &vecResI0, c7x::int_vec &vecResI1)
258 {
259  vecResI0 = (c7x::int_vec) 0;
260 }
261 
262 template <typename V>
263 static inline void packAlternate(V vecPA0, V vecPA1, V vecPA2, V vecPA3, V vecPAOut0, V vecPAOut1);
264 
265 template <>
266 inline void packAlternate<c7x::long_vec &>(c7x::long_vec &vecPAL0,
267  c7x::long_vec &vecPAL1,
268  c7x::long_vec &vecPAL2,
269  c7x::long_vec &vecPAL3,
270  c7x::long_vec &vecPALOut0,
271  c7x::long_vec &vecPALOut1)
272 {
273  vecPALOut0 = c7x::as_long_vec(__vpackw_vvv(c7x::as_int_vec(vecPAL2), c7x::as_int_vec(vecPAL0)));
274  vecPALOut1 = c7x::as_long_vec(__vpackw_vvv(c7x::as_int_vec(vecPAL3), c7x::as_int_vec(vecPAL1)));
275 }
276 
277 template <>
278 inline void packAlternate<c7x::int_vec &>(c7x::int_vec &vecPAI0,
279  c7x::int_vec &vecPAI1,
280  c7x::int_vec &vecPAI2,
281  c7x::int_vec &vecPAI3,
282  c7x::int_vec &vecPAIOut0,
283  c7x::int_vec &vecPAIOut1)
284 {
285  vecPAIOut0 = c7x::as_int_vec(__vpackl2_vvv(c7x::as_short_vec(vecPAI2), c7x::as_short_vec(vecPAI0)));
286 }
287 
288 template <typename V, typename W>
289 static inline void vecPermutePack(V vecPerm1,
290  V vecPerm2,
291  V vecPerm3,
292  V vecPerm4,
293  V vecPerm5,
294  V vecPerm6,
295  V vecPerm7,
296  V vecPerm8,
297  W pOutLocal,
298  c7x::uchar_vec vMask);
299 
300 template <>
301 inline void vecPermutePack<c7x::long_vec &, int16_t *>(c7x::long_vec &vecPermL1,
302  c7x::long_vec &vecPermL2,
303  c7x::long_vec &vecPermL3,
304  c7x::long_vec &vecPermL4,
305  c7x::long_vec &vecPermL5,
306  c7x::long_vec &vecPermL6,
307  c7x::long_vec &vecPermL7,
308  c7x::long_vec &vecPermL8,
309  int16_t *pOutLocal,
310  c7x::uchar_vec vMaskPerm)
311 {
312  c7x::long_vec vecPermL9, vecPermL10, vecPermL11, vecPermL12, vecPermL13, vecPermL14, vecPermL15, vecPermL16;
313 
314  vecPermL9 =
315  c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL3), c7x::as_uchar_vec(vecPermL1)));
316  vecPermL10 =
317  c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL4), c7x::as_uchar_vec(vecPermL2)));
318  vecPermL11 =
319  c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL7), c7x::as_uchar_vec(vecPermL5)));
320  vecPermL12 =
321  c7x::as_long_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermL8), c7x::as_uchar_vec(vecPermL6)));
322  vecPermL13 =
323  c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL3), c7x::as_uchar_vec(vecPermL1)));
324  vecPermL14 =
325  c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL4), c7x::as_uchar_vec(vecPermL2)));
326  vecPermL15 =
327  c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL7), c7x::as_uchar_vec(vecPermL5)));
328  vecPermL16 =
329  c7x::as_long_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermL8), c7x::as_uchar_vec(vecPermL6)));
330 
331  vecPermL1 = c7x::long_vec(vecPermL9.lo(), vecPermL11.lo());
332  vecPermL2 = c7x::long_vec(vecPermL10.lo(), vecPermL12.lo());
333  vecPermL3 = c7x::long_vec(vecPermL13.lo(), vecPermL15.lo());
334  vecPermL4 = c7x::long_vec(vecPermL14.lo(), vecPermL16.lo());
335  vecPermL5 = c7x::long_vec(vecPermL9.hi(), vecPermL11.hi());
336  vecPermL6 = c7x::long_vec(vecPermL10.hi(), vecPermL12.hi());
337  vecPermL7 = c7x::long_vec(vecPermL13.hi(), vecPermL15.hi());
338  vecPermL8 = c7x::long_vec(vecPermL14.hi(), vecPermL16.hi());
339 }
340 
341 template <>
342 inline void vecPermutePack<c7x::int_vec &, int8_t *>(c7x::int_vec &vecPermI1,
343  c7x::int_vec &vecPermI2,
344  c7x::int_vec &vecPermI3,
345  c7x::int_vec &vecPermI4,
346  c7x::int_vec &vecPermI5,
347  c7x::int_vec &vecPermI6,
348  c7x::int_vec &vecPermI7,
349  c7x::int_vec &vecPermI8,
350  int8_t *pOutLocal,
351  c7x::uchar_vec vMaskPerm)
352 {
353  c7x::int_vec vecPermI9, vecPermI11, vecPermI13, vecPermI15;
354 
355  vecPermI9 =
356  c7x::as_int_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermI3), c7x::as_uchar_vec(vecPermI1)));
357  vecPermI11 =
358  c7x::as_int_vec(__permute_low_low(vMaskPerm, c7x::as_uchar_vec(vecPermI7), c7x::as_uchar_vec(vecPermI5)));
359  vecPermI13 =
360  c7x::as_int_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermI3), c7x::as_uchar_vec(vecPermI1)));
361  vecPermI15 =
362  c7x::as_int_vec(__permute_high_high(vMaskPerm, c7x::as_uchar_vec(vecPermI7), c7x::as_uchar_vec(vecPermI5)));
363 
364  vecPermI1 = c7x::int_vec(vecPermI9.lo(), vecPermI11.lo());
365  vecPermI3 = c7x::int_vec(vecPermI13.lo(), vecPermI15.lo());
366  vecPermI5 = c7x::int_vec(vecPermI9.hi(), vecPermI11.hi());
367  vecPermI7 = c7x::int_vec(vecPermI13.hi(), vecPermI15.hi());
368 }
369 
370 template <typename dataTypeIn> static inline void writeOutSA0(dataTypeIn *pOut, c7x::short_vec v1, c7x::short_vec v2);
371 
372 template <> inline void writeOutSA0(int16_t *pOut, c7x::short_vec v1, c7x::short_vec v2)
373 {
374  __vpred tmp = c7x::strm_agen<0, c7x::short_vec>::get_vpred();
375  c7x::short_vec *storevec = c7x::strm_agen<0, c7x::short_vec>::get_adv(pOut);
376  __vstore_pred_interleave(tmp, storevec, v1, v2);
377 }
378 
379 template <> inline void writeOutSA0(int8_t *pOut, c7x::short_vec v1, c7x::short_vec v2)
380 {
381  __vpred tmp = c7x::strm_agen<0, c7x::short_vec>::get_vpred();
382  c7x::char_hvec *storevec = c7x::strm_agen<0, c7x::char_hvec>::get_adv(pOut);
383  __vstore_pred_packl(tmp, storevec, v1);
384 }
385 
// Execution routine of the C7x implementation: loads the stream templates
// saved in the parameter block, opens SE0 on pIn0, SE1 on pIn1 and SA0 for the
// output, then produces one output block per outer-loop iteration.
// NOTE(review): the return-type line and the declaration of pKerPrivArgs are
// absent from this listing (its numbering skips 387 and 400).
386 template <typename dataTypeIn, typename dataTypeOut>
388 DSPLIB_matMul_fixed_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
389 {
390 #if ENABLE_PROFILE
   // Profiling counters, compiled in only when ENABLE_PROFILE is set.
391  uint64_t start = __TSC;
392  uint64_t overhead = __TSC - start;
393  uint64_t loopCycle = 0;
394  uint64_t accloopCycle = 0;
395  uint64_t count = 0;
396 #endif
397 
398  DSPLIB_STATUS status = DSPLIB_SUCCESS;
399 
   // Block counts and the shift amount (qs) come from the private args.
401  uint8_t *pBlock = pKerPrivArgs->bufPblock;
402  int32_t MBlocks = pKerPrivArgs->MBlocks;
403  int32_t NBlocks = pKerPrivArgs->NBlocks;
404  int32_t KBlocks = pKerPrivArgs->KBlocks;
405  int32_t qs = pKerPrivArgs->qs;
406 
   // The output pointer is typed with the input element type.
407  dataTypeIn *pIn0Local = (dataTypeIn *) pIn0;
408  dataTypeIn *pIn1Local = (dataTypeIn *) pIn1;
409  dataTypeIn *pOutLocal = (dataTypeIn *) pOut;
410 
   // Reload the templates from the same offsets the init routine wrote them to.
411  __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
412  __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET);
413  __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
414 
415  __SE0_OPEN(pIn0Local, se0Params);
416  __SE1_OPEN(pIn1Local, se1Params);
417  __SA0_OPEN(sa0Params);
418 
   // Permute control for vecPermutePack, chosen by the input element type.
419  c7x::uchar_vec vMask = setMask<dataTypeIn>();
420 
   // Here dataTypeOut is used as the accumulator vector type: qs and 0 are
   // cast to it to form the shift vector and the zero vector.
421  dataTypeOut shiftVec = (dataTypeOut) qs;
422  dataTypeOut zeroVec = (dataTypeOut) 0;
423  dataTypeOut outSum0 = zeroVec, outSum1 = zeroVec, outSum2 = zeroVec, outSum3 = zeroVec;
424  dataTypeOut outSum4 = zeroVec, outSum5 = zeroVec, outSum6 = zeroVec, outSum7 = zeroVec;
425  dataTypeOut outSum8 = zeroVec, outSum9 = zeroVec, outSum10 = zeroVec, outSum11 = zeroVec;
426  dataTypeOut outSum12 = zeroVec, outSum13 = zeroVec, outSum14 = zeroVec, outSum15 = zeroVec;
427  dataTypeOut out1, out2, out3, out4, out5, out6, out7, out8;
428 
   // One iteration per output block (MBlocks * NBlocks in total).
429  for (int32_t l = 0; l < MBlocks * NBlocks; l++) {
430 #if ENABLE_PROFILE
431  start = __TSC;
432 #endif
   // Accumulate over KBlocks steps. Each step issues eight multiply-accumulates:
   // SE1 advances on every call, SE0 only on the eighth.
433  for (int32_t p = 0; p < KBlocks; p++) {
434  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum0, outSum1);
435  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum2, outSum3);
436  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum4, outSum5);
437  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum6, outSum7);
438  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum8, outSum9);
439  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum10, outSum11);
440  vecMulAcc<dataTypeOut &, __SE_REG_0, __SE_REG_1_ADV>(outSum12, outSum13);
441  vecMulAcc<dataTypeOut &, __SE_REG_0_ADV, __SE_REG_1_ADV>(outSum14, outSum15);
442  }
443 #if ENABLE_PROFILE
444  loopCycle = __TSC - start;
445  accloopCycle += loopCycle;
   // NOTE(review): count is a uint64_t but is printed with %d -- the
   // conversion specifier does not match the argument type.
446  printf("|INNER LOOP CNT : %d| %lu |\n", count++, loopCycle);
447 #endif
448 
   // Scale the accumulators by shifting right by qs.
449  vecShiftRight<dataTypeOut &>(outSum0, outSum1, shiftVec);
450  vecShiftRight<dataTypeOut &>(outSum2, outSum3, shiftVec);
451  vecShiftRight<dataTypeOut &>(outSum4, outSum5, shiftVec);
452  vecShiftRight<dataTypeOut &>(outSum6, outSum7, shiftVec);
453  vecShiftRight<dataTypeOut &>(outSum8, outSum9, shiftVec);
454  vecShiftRight<dataTypeOut &>(outSum10, outSum11, shiftVec);
455  vecShiftRight<dataTypeOut &>(outSum12, outSum13, shiftVec);
456  vecShiftRight<dataTypeOut &>(outSum14, outSum15, shiftVec);
457 
   // Pack the sixteen accumulators into the eight vectors out1..out8.
458  packAlternate<dataTypeOut &>(outSum0, outSum1, outSum2, outSum3, out1, out2);
459  packAlternate<dataTypeOut &>(outSum4, outSum5, outSum6, outSum7, out3, out4);
460  packAlternate<dataTypeOut &>(outSum8, outSum9, outSum10, outSum11, out5, out6);
461  packAlternate<dataTypeOut &>(outSum12, outSum13, outSum14, outSum15, out7, out8);
462 
   // Rearrange out1..out8 in place before storing.
463  vecPermutePack<dataTypeOut &, dataTypeIn *>(out1, out2, out3, out4, out5, out6, out7, out8, pOutLocal, vMask);
464 
   // Four stores through SA0; note the order of the pairs: (1,2), (5,6), (3,4), (7,8).
465  writeOutSA0(pOutLocal, c7x::as_short_vec(out1), c7x::as_short_vec(out2));
466  writeOutSA0(pOutLocal, c7x::as_short_vec(out5), c7x::as_short_vec(out6));
467  writeOutSA0(pOutLocal, c7x::as_short_vec(out3), c7x::as_short_vec(out4));
468  writeOutSA0(pOutLocal, c7x::as_short_vec(out7), c7x::as_short_vec(out8));
469 
   // Clear the accumulators for the next output block.
470  resetVec<dataTypeOut &>(outSum0, outSum1);
471  resetVec<dataTypeOut &>(outSum2, outSum3);
472  resetVec<dataTypeOut &>(outSum4, outSum5);
473  resetVec<dataTypeOut &>(outSum6, outSum7);
474  resetVec<dataTypeOut &>(outSum8, outSum9);
475  resetVec<dataTypeOut &>(outSum10, outSum11);
476  resetVec<dataTypeOut &>(outSum12, outSum13);
477  resetVec<dataTypeOut &>(outSum14, outSum15);
478  }
479 
480 #if ENABLE_PROFILE
   // NOTE(review): this label is printed for every instantiation, whatever
   // the data type.
481  printf("|CORE LOOP IN16_T | %lu |\n", accloopCycle);
482 #endif
483 
484  __SE0_CLOSE();
485  __SE1_CLOSE();
486  __SA0_CLOSE();
487  return (status);
488 }
489 
// Remaining parameters of the two explicit instantiations of
// DSPLIB_matMul_fixed_exec_ci.
// NOTE(review): the opening line of each instantiation is absent from this
// listing (its numbering skips 490 and 495); which data-type variant comes
// first cannot be confirmed from here.
491  void *restrict pIn0,
492  void *restrict pIn1,
493  void *restrict pOut);
494 
496  void *restrict pIn0,
497  void *restrict pIn1,
498  void *restrict pOut);
static void vecMulAcc(V outSum0, V outSum1)
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
#define SE0_PARAM_OFFSET
static void vecShiftRight(V vecSR0, V vecSR1, V shiftVec)
static c7x::uchar_vec setMask()
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
static void writeOutSA0(dataTypeIn *pOut, c7x::short_vec v1, c7x::short_vec v2)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
static void vecPermutePack(V vecPerm1, V vecPerm2, V vecPerm3, V vecPerm4, V vecPerm5, V vecPerm6, V vecPerm7, V vecPerm8, W pOutLocal, c7x::uchar_vec vMask)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
static void resetVec(V vecRes0, V vecRes1)
static void packAlternate(V vecPA0, V vecPA1, V vecPA2, V vecPA3, V vecPAOut0, V vecPAOut1)
DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
c7x::uchar_vec setMask< int8_t >()
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
#define SA0_PARAM_OFFSET
#define SE1_PARAM_OFFSET
DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
c7x::uchar_vec setMask< int16_t >()
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_matMul_fixed.
uint64_t overhead
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MATMUL_FIXED_IXX_IXX_OXX_PBLOCK_SIZE]