DSPLIB User Guide
DSPLIB_matMul_fixed_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
2 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 /*******************************************************************************
35  *
36  * INCLUDES
37  *
38  ******************************************************************************/
39 
40 #include "../common/c71/DSPLIB_inlines.h"
42 
// Byte offsets of the saved stream templates inside the kernel's parameter
// block (pKerPrivArgs->bufPblock): the SE0 template sits at the base, the SE1
// template one SE_PARAM_SIZE later, and the SA0 template one further
// SE_PARAM_SIZE after that. SE_PARAM_SIZE is not defined in this file.
43 #define SE_PARAM_BASE (0x0000)
44 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
45 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + SE_PARAM_SIZE)
46 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + SE_PARAM_SIZE)
// NOTE(review): not referenced anywhere in the visible part of this file.
47 #define DSPLIB_MATMUL_FIXED_UNROLL (128)
48 
49 /***********************************
50  * Initialization
51  ***********************************/
// Primary template (declaration only): hook that sets the data-type-dependent
// PROMOTE / TRANSPOSE fields of the two streaming-engine templates. In this
// listing only the explicit specializations below provide a body.
52 template <typename dataTypeIn, typename dataTypeOut>
53 inline void DSPLIB_matMul_fixed_PromoteTranspose_init_ci(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params);
// Explicit specialization: disables promotion on SE0 and selects the 64-bit
// transpose mode on SE1.
// NOTE(review): the listing dropped numbered line 55, which carries the
// function name and template arguments; presumably this is the
// DSPLIB_MATMAPY_FXD_I16S_O16S variant -- confirm against the real source.
54 template <>
56  __SE_TEMPLATE_v1 *se1Params)
57 {
58  se0Params->PROMOTE = __SE_PROMOTE_OFF;
59  se1Params->TRANSPOSE = __SE_TRANSPOSE_64BIT;
60 }
// Explicit specialization: enables 2x sign-extending promotion on both SE0 and
// SE1 and selects the 32-bit transpose mode on SE1.
// NOTE(review): the listing dropped numbered line 62, which carries the
// function name and template arguments; presumably this is the
// DSPLIB_MATMAPY_FXD_I8S_O8S variant -- confirm against the real source.
61 template <>
63  __SE_TEMPLATE_v1 *se1Params)
64 {
65  se0Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
66  se1Params->TRANSPOSE = __SE_TRANSPOSE_32BIT;
67  se1Params->PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
68 }
69 
// Initialization for the C7x matMul_fixed kernel: derives the block counts from
// M, K and N, builds the SE0 (A matrix), SE1 (B matrix) and SA0 (C matrix)
// stream templates, and stores them in pKerPrivArgs->bufPblock at the
// SE0/SE1/SA0_PARAM_OFFSET positions. Returns `status`.
// NOTE(review): this listing dropped numbered lines 71, 77 and 86, so the
// function-name line and the declarations of `status` and `pKerPrivArgs` are
// not visible here. The bufParams* and pKerInitArgs parameters are not read in
// the visible body.
70 template <typename dataTypeIn, typename dataTypeOut>
72  const DSPLIB_bufParams2D_t *bufParamsIn0,
73  const DSPLIB_bufParams2D_t *bufParamsIn1,
74  const DSPLIB_bufParams2D_t *bufParamsOut,
75  const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
76 {
78  __SE_TEMPLATE_v1 se0Params;
79  __SE_TEMPLATE_v1 se1Params;
80  __SA_TEMPLATE_v1 sa0Params;
// Element type and SA vector length follow dataTypeOut; the SE vector length is
// always taken from c7x::short_vec.
81  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<dataTypeOut>::value;
82  __SE_VECLEN SE_VECLEN = c7x::se_veclen<c7x::short_vec>::value;
83  ;
84  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<dataTypeOut>::value;
85  int32_t elementCount = c7x::element_count_of<dataTypeOut>::value;
87  uint8_t *pBlock = pKerPrivArgs->bufPblock;
88  int32_t M = pKerPrivArgs->M;
89  int32_t K = pKerPrivArgs->K;
90  int32_t N = pKerPrivArgs->N;
91  int32_t strideIn0 = pKerPrivArgs->strideIn0Elements;
92  int32_t strideIn1 = pKerPrivArgs->strideIn1Elements;
93  int32_t strideOut = pKerPrivArgs->strideOutElements;
// Output column blocks are elementCount * unrollFactor wide: twice the vector
// width once N reaches 32, one vector width otherwise. The exec function
// branches on the same N >= 32 condition.
94  int32_t unrollFactor = 1;
95  if (N >= 32) {
96  unrollFactor = 2;
97  }
98 
// Ceiling divisions: K is covered in steps of 4, M in steps of 2 rows, and N in
// steps of elementCount * unrollFactor columns.
99  int32_t KBlocks = ((K + 4 - 1)) / (4);
100  int32_t MBlocks = ((M + 2 - 1)) / (2);
101  int32_t NBlocks = ((N + (elementCount * unrollFactor) - 1)) / (elementCount * unrollFactor);
102  ;
103 
// Saved so that the exec function can read the loop trip counts back.
104  pKerPrivArgs->KBlocks = KBlocks;
105  pKerPrivArgs->NBlocks = NBlocks;
106  pKerPrivArgs->MBlocks = MBlocks;
107 
108  /**********************************************************************/
109  /* Prepare streaming engine 0 to fetch A matrix */
110  /**********************************************************************/
111  se0Params = __gen_SE_TEMPLATE_v1();
112  se0Params.ELETYPE = SE_ELETYPE;
113  se0Params.DIMFMT = __SE_DIMFMT_5D;
114  se0Params.DECDIM1 = __SE_DECDIM_DIM2;
115  se0Params.DECDIM2 = __SE_DECDIM_DIM4;
116  se0Params.DECDIM1SD = __SE_DECDIMSD_DIM0;
117  se0Params.DECDIM2SD = __SE_DECDIMSD_DIM1;
118 
// Dimensions, innermost first: 4 elements; 2 rows (stride strideIn0); KBlocks
// steps of 4 elements; NBlocks repeats with stride 0; MBlocks steps of 2 rows.
// The zero stride on the NBlocks dimension presumably re-fetches the same rows
// of A for every column block -- confirm against the SE documentation.
119  se0Params.VECLEN = __SE_VECLEN_4ELEMS;
120  se0Params.GRPDUP = __SE_GRPDUP_ON;
121  se0Params.ICNT0 = 4;
122  se0Params.ICNT1 = 2;
123  se0Params.DIM1 = strideIn0;
124  se0Params.ICNT2 = KBlocks;
125  se0Params.DIM2 = 4;
126  se0Params.ICNT3 = NBlocks;
127  se0Params.DIM3 = 0;
// DECDIM widths are set to K and to M * strideIn0; presumably they stop the
// rounded-up K and M block counts from reading past the matrix -- confirm.
128  se0Params.DECDIM2_WIDTH = (uint32_t) M * strideIn0;
129  se0Params.ICNT4 = MBlocks;
130  se0Params.DIM4 = 2 * strideIn0;
131  se0Params.DECDIM1_WIDTH = (uint32_t) K;
132 
133  /**********************************************************************/
134  /* Prepare streaming engine 1 to fetch B matrix */
135  /**********************************************************************/
136  se1Params = __gen_SE_TEMPLATE_v1();
137  se1Params.ELETYPE = SE_ELETYPE;
138  se1Params.VECLEN = SE_VECLEN;
139  se1Params.DIMFMT = __SE_DIMFMT_5D;
140 
// Dimensions, innermost first: 16 * unrollFactor elements; 4 rows (stride
// strideIn1); KBlocks steps of 4 rows; NBlocks steps of 16 * unrollFactor
// columns; MBlocks repeats with stride 0.
// NOTE(review): the column-block width is hard-coded as 4 * 4 here, while
// NBlocks and the SA0 template use elementCount; this assumes
// elementCount == 16 -- confirm for every instantiated dataTypeOut.
141  se1Params.ICNT0 = 4 * 4 * unrollFactor;
142  se1Params.ICNT1 = 4;
143  se1Params.DIM1 = strideIn1;
144  se1Params.DIM2 = 4 * strideIn1;
145  se1Params.ICNT2 = KBlocks;
146  se1Params.DIM3 = 4 * 4 * unrollFactor;
147  se1Params.ICNT3 = NBlocks;
148  se1Params.DIM4 = 0;
149  se1Params.ICNT4 = MBlocks;
150 
// Data-type-specific PROMOTE / TRANSPOSE settings are applied last, through the
// explicit specializations of the helper.
151  DSPLIB_matMul_fixed_PromoteTranspose_init_ci<dataTypeIn, dataTypeOut>(&se0Params, &se1Params);
152  /**********************************************************************/
153  /* Prepare SA template to write C matrix */
154  /**********************************************************************/
155  sa0Params = __gen_SA_TEMPLATE_v1();
156  sa0Params.VECLEN = SA_VECLEN;
157  sa0Params.DIMFMT = __SA_DIMFMT_4D;
158  sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
159  sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
160  sa0Params.DECDIM2 = __SA_DECDIM_DIM3;
161  sa0Params.DECDIM2SD = __SA_DECDIMSD_DIM1;
162 
// Dimensions, innermost first: elementCount * unrollFactor columns; 2 rows
// (stride strideOut); NBlocks column blocks; MBlocks steps of 2 rows.
// DECDIM widths are N and M * strideOut; presumably they make the SA predicate
// mask off columns and rows that fall outside the M x N output -- confirm.
163  sa0Params.ICNT0 = elementCount * unrollFactor;
164  sa0Params.ICNT1 = 2;
165  sa0Params.ICNT2 = NBlocks;
166  sa0Params.DIM1 = strideOut;
167  sa0Params.DECDIM1_WIDTH = N;
168  sa0Params.DECDIM2_WIDTH = M * strideOut;
169  sa0Params.DIM2 = elementCount * unrollFactor;
170  sa0Params.ICNT3 = MBlocks;
171  sa0Params.DIM3 = 2 * strideOut;
172 
// Save the three templates in the parameter block for the exec function.
173  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET) = se0Params;
174  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET) = se1Params;
175  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET) = sa0Params;
176 
177  return status;
178 }
179 
// Parameter lists of two declarations that end in ';'.
// NOTE(review): the listing dropped numbered lines 180-181 and 187-188, which
// carry the leading part of each declaration. Presumably these are the explicit
// instantiations of DSPLIB_matMul_fixed_init_ci for
// DSPLIB_MATMAPY_FXD_I16S_O16S and DSPLIB_MATMAPY_FXD_I8S_O8S -- confirm the
// order against the real source.
182  const DSPLIB_bufParams2D_t *bufParamsIn0,
183  const DSPLIB_bufParams2D_t *bufParamsIn1,
184  const DSPLIB_bufParams2D_t *bufParamsOut,
185  const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs);
186 
189  const DSPLIB_bufParams2D_t *bufParamsIn0,
190  const DSPLIB_bufParams2D_t *bufParamsIn1,
191  const DSPLIB_bufParams2D_t *bufParamsOut,
192  const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs);
193 /***********************************
194  * Implementation
195  ***********************************/
// Fetches the next c7x::short_vec from streaming engine `id` with get_adv()
// and writes it to *a.
196 template <uint32_t id> static inline void loadMatSE(c7x::short_vec *a)
197 {
198  *a = c7x::strm_eng<id, c7x::short_vec>::get_adv();
199 }
// Generic store helper: takes the predicate and the next destination address
// from stream address generator 0 (relative to pOut) and stores out1 / out2
// through __vstore_pred_packl_2src.
// NOTE(review): vPred and addr are passed by value and overwritten before
// use, so the caller's values are never read and never updated; only the
// pointee type of addr matters, for deducing pVec. The predicate is read
// before get_adv() is called -- keep that order.
200 template <typename T, typename pVec, typename vecIn>
201 static inline void writeOutSA0(__vpred vPred, pVec *addr, T pOut, vecIn out1, vecIn out2)
202 {
203  vPred = c7x::strm_agen<0, pVec>::get_vpred();
204  addr = c7x::strm_agen<0, pVec>::get_adv(pOut);
205  __vstore_pred_packl_2src(vPred, addr, out1, out2);
206  return;
207 }
// Overload selected when addr is a c7x::char_hvec pointer: same sequence as the
// generic helper (predicate first, then address from stream address
// generator 0), but the store uses __vstore_pred_pack_byte_2src.
// NOTE(review): vPred and addr are by-value scratch parameters here as well;
// the caller's values are never read.
208 template <typename T, typename vecIn>
209 static inline void writeOutSA0(__vpred vPred, c7x::char_hvec *addr, T pOut, vecIn out1, vecIn out2)
210 {
211  vPred = c7x::strm_agen<0, c7x::char_hvec>::get_vpred();
212  addr = c7x::strm_agen<0, c7x::char_hvec>::get_adv(pOut);
213  __vstore_pred_pack_byte_2src(vPred, addr, out1, out2);
214  return;
215 }
216 
// Execution function for the C7x matMul_fixed kernel. Reloads the three stream
// templates saved in pKerPrivArgs->bufPblock, opens SE0 on pIn0, SE1 on pIn1 and
// SA0, and then, for each of the MBlocks * NBlocks output blocks, accumulates
// __dotp4_ext products over KBlocks steps, shifts the sums right by qs, packs
// them and stores them through SA0 relative to pOut. Always returns
// DSPLIB_SUCCESS.
// NOTE(review): numbered lines 218 and 221 are missing from this listing, so
// the line before the function name (presumably the return type) and the
// declaration of `pKerPrivArgs` are not visible here.
217 template <typename dataTypeIn, typename dataTypeOut>
219 DSPLIB_matMul_fixed_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
220 {
// NOTE(review): the output pointer is cast to int16_t * for every
// instantiation, independent of dataTypeOut; it is only used as the base
// passed to the SA0 get_adv() call -- confirm this is intended for the 8-bit
// variant.
222  int16_t *pOutLocal = (int16_t *) pOut;
223  int32_t KBlocks = pKerPrivArgs->KBlocks;
224  int32_t NBlocks = pKerPrivArgs->NBlocks;
225  int32_t MBlocks = pKerPrivArgs->MBlocks;
226  int32_t qs = pKerPrivArgs->qs;
227  uint8_t *pBlock = pKerPrivArgs->bufPblock;
228  __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
229  __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET);
230  __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
// NOTE(review): vPred and addr are never initialised here; they are passed by
// value to writeOutSA0, which overwrites its own copies. addr's type is what
// selects the writeOutSA0 overload.
231  __vpred vPred;
232  dataTypeOut *addr;
// Byte indices listed as pairs, i.e. 16-bit lanes in the order 0,4,8,12,
// 1,5,9,13, 2,6,10,14, 3,7,11,15. Presumably this regroups each B vector so
// that __dotp4_ext sees four K-consecutive values per output column -- confirm
// against the __permute documentation.
233  uchar32 vMask = uchar32(0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7,
234  14, 15, 22, 23, 30, 31);
235 
236  __SE0_OPEN(pIn0, se0Params);
237  __SE1_OPEN((pIn1), se1Params);
238  __SA0_OPEN(sa0Params);
239 
// Same N >= 32 condition that the init function uses to pick unrollFactor = 2:
// this branch consumes eight B vectors per K step and issues four stores per
// block.
240  if (pKerPrivArgs->N >= 32) {
241  for (int32_t mn = 0; mn < MBlocks * NBlocks; mn++) {
// Accumulators c0x take products with a0, c1x with a1; all start at zero for
// every output block.
242  c7x::long_vec c00 = (c7x::long_vec) 0;
243  c7x::long_vec c01 = (c7x::long_vec) 0;
244  c7x::long_vec c02 = (c7x::long_vec) 0;
245  c7x::long_vec c03 = (c7x::long_vec) 0;
246  c7x::long_vec c04 = (c7x::long_vec) 0;
247  c7x::long_vec c05 = (c7x::long_vec) 0;
248  c7x::long_vec c06 = (c7x::long_vec) 0;
249  c7x::long_vec c07 = (c7x::long_vec) 0;
250  c7x::long_vec c10 = (c7x::long_vec) 0;
251  c7x::long_vec c11 = (c7x::long_vec) 0;
252  c7x::long_vec c12 = (c7x::long_vec) 0;
253  c7x::long_vec c13 = (c7x::long_vec) 0;
254  c7x::long_vec c14 = (c7x::long_vec) 0;
255  c7x::long_vec c15 = (c7x::long_vec) 0;
256  c7x::long_vec c16 = (c7x::long_vec) 0;
257  c7x::long_vec c17 = (c7x::long_vec) 0;
258  c7x::short_vec b0, b1, b2, b3, b4, b5, b6, b7;
259  c7x::short_vec a0, a1;
260  for (int32_t k = 0; k < KBlocks; k++) {
// Two consecutive SE0 fetches (presumably the two A rows of this block) and
// eight consecutive SE1 fetches per K step.
261  loadMatSE<0>(&a0);
262  loadMatSE<0>(&a1);
263 
264  loadMatSE<1>(&b0);
265  loadMatSE<1>(&b1);
266  loadMatSE<1>(&b2);
267  loadMatSE<1>(&b3);
268 
269  loadMatSE<1>(&b4);
270  loadMatSE<1>(&b5);
271  loadMatSE<1>(&b6);
272  loadMatSE<1>(&b7);
273 
// Reorder the 16-bit lanes of every B vector with vMask before the dot product.
274  b0 = __as_short16(__permute(vMask, __as_uchar32(b0)));
275  b1 = __as_short16(__permute(vMask, __as_uchar32(b1)));
276  b2 = __as_short16(__permute(vMask, __as_uchar32(b2)));
277  b3 = __as_short16(__permute(vMask, __as_uchar32(b3)));
278 
279  b4 = __as_short16(__permute(vMask, __as_uchar32(b4)));
280  b5 = __as_short16(__permute(vMask, __as_uchar32(b5)));
281  b6 = __as_short16(__permute(vMask, __as_uchar32(b6)));
282  b7 = __as_short16(__permute(vMask, __as_uchar32(b7)));
283 
284  c00 += __dotp4_ext(a0, b0);
285  c01 += __dotp4_ext(a0, b1);
286  c02 += __dotp4_ext(a0, b2);
287  c03 += __dotp4_ext(a0, b3);
288 
289  c04 += __dotp4_ext(a0, b4);
290  c05 += __dotp4_ext(a0, b5);
291  c06 += __dotp4_ext(a0, b6);
292  c07 += __dotp4_ext(a0, b7);
293 
294  c10 += __dotp4_ext(a1, b0);
295  c11 += __dotp4_ext(a1, b1);
296  c12 += __dotp4_ext(a1, b2);
297  c13 += __dotp4_ext(a1, b3);
298 
299  c14 += __dotp4_ext(a1, b4);
300  c15 += __dotp4_ext(a1, b5);
301  c16 += __dotp4_ext(a1, b6);
302  c17 += __dotp4_ext(a1, b7);
303  }
// Scale every accumulator by shifting right by qs (read from pKerPrivArgs->qs).
304  c7x::long_vec c00shfited = __shift_right(c00, (c7x::long_vec) qs);
305  c7x::long_vec c01shfited = __shift_right(c01, (c7x::long_vec) qs);
306  c7x::long_vec c02shfited = __shift_right(c02, (c7x::long_vec) qs);
307  c7x::long_vec c03shfited = __shift_right(c03, (c7x::long_vec) qs);
308  c7x::long_vec c04shfited = __shift_right(c04, (c7x::long_vec) qs);
309  c7x::long_vec c05shfited = __shift_right(c05, (c7x::long_vec) qs);
310  c7x::long_vec c06shfited = __shift_right(c06, (c7x::long_vec) qs);
311  c7x::long_vec c07shfited = __shift_right(c07, (c7x::long_vec) qs);
312  c7x::long_vec c10shfited = __shift_right(c10, (c7x::long_vec) qs);
313  c7x::long_vec c11shfited = __shift_right(c11, (c7x::long_vec) qs);
314  c7x::long_vec c12shfited = __shift_right(c12, (c7x::long_vec) qs);
315  c7x::long_vec c13shfited = __shift_right(c13, (c7x::long_vec) qs);
316  c7x::long_vec c14shfited = __shift_right(c14, (c7x::long_vec) qs);
317  c7x::long_vec c15shfited = __shift_right(c15, (c7x::long_vec) qs);
318  c7x::long_vec c16shfited = __shift_right(c16, (c7x::long_vec) qs);
319  c7x::long_vec c17shfited = __shift_right(c17, (c7x::long_vec) qs);
320 
// Each out vector is packed from a pair of neighbouring shifted accumulators.
321  c7x::int_vec out01 = __vwpackl_vvv(c7x::as_int_vec(c01shfited), c7x::as_int_vec(c00shfited));
322  c7x::int_vec out02 = __vwpackl_vvv(c7x::as_int_vec(c03shfited), c7x::as_int_vec(c02shfited));
323  c7x::int_vec out03 = __vwpackl_vvv(c7x::as_int_vec(c05shfited), c7x::as_int_vec(c04shfited));
324  c7x::int_vec out04 = __vwpackl_vvv(c7x::as_int_vec(c07shfited), c7x::as_int_vec(c06shfited));
325  c7x::int_vec out11 = __vwpackl_vvv(c7x::as_int_vec(c11shfited), c7x::as_int_vec(c10shfited));
326  c7x::int_vec out12 = __vwpackl_vvv(c7x::as_int_vec(c13shfited), c7x::as_int_vec(c12shfited));
327  c7x::int_vec out13 = __vwpackl_vvv(c7x::as_int_vec(c15shfited), c7x::as_int_vec(c14shfited));
328  c7x::int_vec out14 = __vwpackl_vvv(c7x::as_int_vec(c17shfited), c7x::as_int_vec(c16shfited));
329 
// Four SA0-addressed stores per block: the two a0 results first, then the two
// a1 results. Each call advances SA0, so the order must match the SA0 template.
330  writeOutSA0(vPred, addr, pOutLocal, out01, out02);
331  writeOutSA0(vPred, addr, pOutLocal, out03, out04);
332  writeOutSA0(vPred, addr, pOutLocal, out11, out12);
333  writeOutSA0(vPred, addr, pOutLocal, out13, out14);
334  }
335  }
// N < 32: same scheme with four B vectors per K step and two stores per block.
336  else {
337  for (int32_t mn = 0; mn < MBlocks * NBlocks; mn++) {
338  c7x::long_vec c00 = (c7x::long_vec) 0;
339  c7x::long_vec c01 = (c7x::long_vec) 0;
340  c7x::long_vec c02 = (c7x::long_vec) 0;
341  c7x::long_vec c03 = (c7x::long_vec) 0;
342  c7x::long_vec c10 = (c7x::long_vec) 0;
343  c7x::long_vec c11 = (c7x::long_vec) 0;
344  c7x::long_vec c12 = (c7x::long_vec) 0;
345  c7x::long_vec c13 = (c7x::long_vec) 0;
346  c7x::short_vec b0, b1, b2, b3;
347  c7x::short_vec a0, a1;
348  // printf("Kloop\n");
349  for (int32_t k = 0; k < KBlocks; k++) {
350  loadMatSE<0>(&a0);
351  loadMatSE<0>(&a1);
352 
353  loadMatSE<1>(&b0);
354  loadMatSE<1>(&b1);
355  loadMatSE<1>(&b2);
356  loadMatSE<1>(&b3);
357 
358  b0 = __as_short16(__permute(vMask, __as_uchar32(b0)));
359  b1 = __as_short16(__permute(vMask, __as_uchar32(b1)));
360  b2 = __as_short16(__permute(vMask, __as_uchar32(b2)));
361  b3 = __as_short16(__permute(vMask, __as_uchar32(b3)));
362 
363  c00 += __dotp4_ext(a0, b0);
364  c01 += __dotp4_ext(a0, b1);
365  c02 += __dotp4_ext(a0, b2);
366  c03 += __dotp4_ext(a0, b3);
367 
368  c10 += __dotp4_ext(a1, b0);
369  c11 += __dotp4_ext(a1, b1);
370  c12 += __dotp4_ext(a1, b2);
371  c13 += __dotp4_ext(a1, b3);
372  }
// Shift and pack the a0 results, then the a1 results.
373  c7x::long_vec c00shfited = __shift_right(c00, (c7x::long_vec) qs);
374  c7x::long_vec c01shfited = __shift_right(c01, (c7x::long_vec) qs);
375  c7x::long_vec c02shfited = __shift_right(c02, (c7x::long_vec) qs);
376  c7x::long_vec c03shfited = __shift_right(c03, (c7x::long_vec) qs);
377 
378  c7x::int_vec out01 = __vwpackl_vvv(c7x::as_int_vec(c01shfited), c7x::as_int_vec(c00shfited));
379  c7x::int_vec out02 = __vwpackl_vvv(c7x::as_int_vec(c03shfited), c7x::as_int_vec(c02shfited));
380 
381  c7x::long_vec c10shfited = __shift_right(c10, (c7x::long_vec) qs);
382  c7x::long_vec c11shfited = __shift_right(c11, (c7x::long_vec) qs);
383  c7x::long_vec c12shfited = __shift_right(c12, (c7x::long_vec) qs);
384  c7x::long_vec c13shfited = __shift_right(c13, (c7x::long_vec) qs);
385 
386  c7x::int_vec out11 = __vwpackl_vvv(c7x::as_int_vec(c11shfited), c7x::as_int_vec(c10shfited));
387  c7x::int_vec out12 = __vwpackl_vvv(c7x::as_int_vec(c13shfited), c7x::as_int_vec(c12shfited));
388 
389  writeOutSA0(vPred, addr, pOutLocal, out01, out02);
390  writeOutSA0(vPred, addr, pOutLocal, out11, out12);
391  }
392  }
// Close all three streams before returning.
393  __SE0_CLOSE();
394  __SE1_CLOSE();
395  __SA0_CLOSE();
396  // printStridedMat<dataTypeIn>("C OPT", pKerPrivArgs->M, pKerPrivArgs->strideOutElements, (dataTypeIn*)pOut);
397 
398  return DSPLIB_SUCCESS;
399 }
// Parameter lists of two declarations that end in ';'.
// NOTE(review): the listing dropped numbered lines 400 and 405, which carry the
// leading part of each declaration. Presumably these are the explicit
// instantiations of DSPLIB_matMul_fixed_exec_ci for
// DSPLIB_MATMAPY_FXD_I16S_O16S and DSPLIB_MATMAPY_FXD_I8S_O8S -- confirm the
// order against the real source.
401  void *restrict pIn0,
402  void *restrict pIn1,
403  void *restrict pOut);
404 
406  void *restrict pIn0,
407  void *restrict pIn1,
408  void *restrict pOut);
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
#define SE0_PARAM_OFFSET
static void loadMatSE(c7x::short_vec *a)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I16S_O16S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
void DSPLIB_matMul_fixed_PromoteTranspose_init_ci(__SE_TEMPLATE_v1 *se0Params, __SE_TEMPLATE_v1 *se1Params)
DSPLIB_STATUS DSPLIB_matMul_fixed_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
static void writeOutSA0(__vpred vPred, pVec *addr, T pOut, vecIn out1, vecIn out2)
template DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci< DSPLIB_MATMAPY_FXD_I8S_O8S >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
#define SA0_PARAM_OFFSET
#define SE1_PARAM_OFFSET
DSPLIB_STATUS DSPLIB_matMul_fixed_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_fixed_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_matMul_fixed.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MATMUL_FIXED_IXX_IXX_OXX_PBLOCK_SIZE]