DSPLIB User Guide
DSPLIB_matMul_generic_size_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
2 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *
11  * Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the
14  * distribution.
15  *
16  * Neither the name of Texas Instruments Incorporated nor the names of
17  * its contributors may be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  *
32  ******************************************************************************/
33 
34 /*******************************************************************************
35  *
36  * INCLUDES
37  *
38  ******************************************************************************/
39 
40 #include "../common/c71/DSPLIB_inlines.h"
41 #include "DSPLIB_matMul_priv.h"
42 #include "DSPLIB_types.h"
43 #include <cstdint>
44 #include <float.h>
45 
46 /*******************************************************************************
47  *
48  * DEFINES
49  *
50  ******************************************************************************/
51 
/* Byte offsets of the five stream templates inside the kernel's parameter
 * block (bufPblock). The init routine stores SE0, SE1, SA0, SA1 and SA2 at
 * these offsets and the exec routines read them back. Every slot, including
 * the SA ones, is spaced SE_PARAM_SIZE bytes from the previous one;
 * SE_PARAM_SIZE itself is defined outside this listing. */
52 #define SE_PARAM_BASE (0x0000)
53 #define SE0_PARAM_OFFSET (SE_PARAM_BASE)
54 #define SE1_PARAM_OFFSET (SE0_PARAM_OFFSET + SE_PARAM_SIZE)
55 #define SA0_PARAM_OFFSET (SE1_PARAM_OFFSET + SE_PARAM_SIZE)
56 #define SA1_PARAM_OFFSET (SA0_PARAM_OFFSET + SE_PARAM_SIZE)
57 #define SA2_PARAM_OFFSET (SA1_PARAM_OFFSET + SE_PARAM_SIZE)
58 
/* Loop constants for double data, handed out by setUnrollFactors<double>():
 * the first divides K into KBlocks, the second becomes ICNT1 of the SA0/SA2
 * and SE0/SE1 templates in the init routine. */
59 #define DSPLIB_MATMUL_DOUBLE_UNROLL_FACTOR (8)
60 #define DSPLIB_MATMUL_SE_DOUBLE_UNROLL_FACTOR (4)
61 
/* Same two constants for float data (setUnrollFactors<float>()). */
62 #define DSPLIB_MATMUL_FLOAT_UNROLL_FACTOR (16)
63 #define DSPLIB_MATMUL_SE_FLOAT_UNROLL_FACTOR (8)
64 
65 template <typename dataType> inline void setUnrollFactors(int32_t *unrollFactor, int32_t *seUnrollFactor);
66 
67 template <> inline void setUnrollFactors<float>(int32_t *unrollFactor, int32_t *seUnrollFactor)
68 {
69  *unrollFactor = DSPLIB_MATMUL_FLOAT_UNROLL_FACTOR;
70  *seUnrollFactor = DSPLIB_MATMUL_SE_FLOAT_UNROLL_FACTOR;
71 }
72 template <> inline void setUnrollFactors<double>(int32_t *unrollFactor, int32_t *seUnrollFactor)
73 {
74  *unrollFactor = DSPLIB_MATMUL_DOUBLE_UNROLL_FACTOR;
75  *seUnrollFactor = DSPLIB_MATMUL_SE_DOUBLE_UNROLL_FACTOR;
76 }
77 
/**
 * Builds the two SE templates (SE0, SE1) and three SA templates (SA0, SA1,
 * SA2) used by the generic-size matrix multiply and stores them in the
 * kernel's parameter block.
 *
 * Reads M, K, N and the three element strides from the private args behind
 * `handle`, derives KBlocks = ceil(K / unrollFactor) and
 * NBlocks = ceil(N / elementCount), writes both back into the private args,
 * and copies the five templates to bufPblock at the *_PARAM_OFFSET positions.
 *
 * bufParamsIn0, bufParamsIn1, bufParamsOut and pKerInitArgs are not read in
 * the visible body.
 *
 * Returns `status`.
 *
 * NOTE(review): this listing jumps from line 78 to 80 and from 84 to 86. The
 * line carrying the return type, the function name and the `handle`
 * parameter, and the declaration of `status`, are not shown here. The index
 * at the end of this page gives the signature as
 * DSPLIB_STATUS DSPLIB_matMul_generic_init_ci(DSPLIB_kernelHandle handle, ...).
 */
78 template <typename dataType>
80  const DSPLIB_bufParams2D_t *bufParamsIn0,
81  const DSPLIB_bufParams2D_t *bufParamsIn1,
82  const DSPLIB_bufParams2D_t *bufParamsOut,
83  const DSPLIB_matMul_InitArgs *pKerInitArgs)
84 {
86  __SE_TEMPLATE_v1 se0Params;
87  __SE_TEMPLATE_v1 se1Params;
88  __SA_TEMPLATE_v1 sa0Params;
89  __SA_TEMPLATE_v1 sa1Params;
90  __SA_TEMPLATE_v1 sa2Params;
91 
92  __SE_ELETYPE SE_ELETYPE;
93  __SE_VECLEN SE_VECLEN;
94  __SA_VECLEN SA_VECLEN;
95 
96  DSPLIB_matMul_PrivArgs *pKerPrivArgs = (DSPLIB_matMul_PrivArgs *) handle;
97 
   /* Destination for the generated templates. */
98  uint8_t *pBlock = pKerPrivArgs->bufPblock;
99 
100  int32_t M = pKerPrivArgs->M;
101  int32_t K = pKerPrivArgs->K;
102  int32_t N = pKerPrivArgs->N;
103  int32_t strideIn0 = pKerPrivArgs->strideIn0Elements;
104  int32_t strideIn1 = pKerPrivArgs->strideIn1Elements;
105  int32_t strideOut = pKerPrivArgs->strideOutElements;
106 
107  int32_t unrollFactor = 0;
108  int32_t seUnrollFactor = 0;
109 
   /* Type-dependent constants: 16/8 for float, 8/4 for double. */
110  setUnrollFactors<dataType>(&unrollFactor, &seUnrollFactor);
111 
112  typedef typename c7x::make_full_vector<dataType>::type vec;
113 
   /* Vector geometry and stream enums are taken from the c7x traits of vec. */
114  int32_t elementCount = c7x::element_count_of<vec>::value;
115  SE_VECLEN = c7x::se_veclen<vec>::value;
116  SA_VECLEN = c7x::sa_veclen<vec>::value;
117  SE_ELETYPE = c7x::se_eletype<vec>::value;
   /* Ceiling divisions: number of inner-loop iterations over K and number of
    * vector-wide column blocks over N. */
118  int32_t KBlocks = ((K + unrollFactor - 1)) / (unrollFactor);
119  int32_t NBlocks = ((N + elementCount - 1)) / (elementCount);
120 
   /* Saved for the exec routines, which read them back from the private args. */
121  pKerPrivArgs->KBlocks = KBlocks;
122  pKerPrivArgs->NBlocks = NBlocks;
123 
124  /**********************************************************************/
125  /* Prepare SA0 template to fetch A matrix */
126  /**********************************************************************/
127 
   /* The exec routines use SA0 with base pIn0. */
128  sa0Params = __gen_SA_TEMPLATE_v1();
129  sa0Params.VECLEN = SA_VECLEN;
130  sa0Params.DIMFMT = __SA_DIMFMT_5D;
131  sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
132  sa0Params.DECDIM1SD = __SA_DECDIMSD_DIM1;
133 
134  sa0Params.ICNT0 = 1;
135  sa0Params.ICNT1 = seUnrollFactor;
   /* Step of 2: SA0 and SA2 (opened one element later) presumably interleave
    * over even and odd k - confirm against the exec kernels. */
136  sa0Params.DIM1 = 2;
137  sa0Params.ICNT2 = KBlocks;
   /* 2 * seUnrollFactor, i.e. ICNT1 * DIM1. */
138  sa0Params.DIM2 = (int32_t) ((uint32_t) seUnrollFactor << (uint32_t) 1);
139  sa0Params.DECDIM1_WIDTH = K;
   /* DIM3 = 0: the same addresses are revisited for each of the NBlocks. */
140  sa0Params.ICNT3 = NBlocks;
141  sa0Params.DIM3 = 0;
142  sa0Params.ICNT4 = M;
143  sa0Params.DIM4 = strideIn0;
144 
145  /**********************************************************************/
146  /* Prepare SA2 template to fetch A matrix */
147  /**********************************************************************/
148 
   /* Identical to SA0 except for DECDIM1_WIDTH. The exec routines use SA2
    * with base pIn0 + 1. */
149  sa2Params = __gen_SA_TEMPLATE_v1();
150  sa2Params.VECLEN = SA_VECLEN;
151  sa2Params.DIMFMT = __SA_DIMFMT_5D;
152  sa2Params.DECDIM1 = __SA_DECDIM_DIM2;
153  sa2Params.DECDIM1SD = __SA_DECDIMSD_DIM1;
154 
155  sa2Params.ICNT0 = 1;
156  sa2Params.ICNT1 = seUnrollFactor;
157  sa2Params.DIM1 = 2;
158  sa2Params.ICNT2 = KBlocks;
159  sa2Params.DIM2 = (int32_t) ((uint32_t) seUnrollFactor << (uint32_t) 1);
   /* K rounded down to an even value. */
160  sa2Params.DECDIM1_WIDTH = (uint32_t) (K % 2 == 0 ? K : K - 1);
161  sa2Params.ICNT3 = NBlocks;
162  sa2Params.DIM3 = 0;
163  sa2Params.ICNT4 = M;
164  sa2Params.DIM4 = strideIn0;
165 
166  /**********************************************************************/
167  /* Prepare streaming engine 0 to fetch B matrix */
168  /**********************************************************************/
169 
   /* The exec routines open SE0 at pIn1. */
170  se0Params = __gen_SE_TEMPLATE_v1();
171  se0Params.ELETYPE = SE_ELETYPE;
172  se0Params.VECLEN = SE_VECLEN;
173  se0Params.DIMFMT = __SE_DIMFMT_5D;
174  se0Params.DECDIM1 = __SE_DECDIM_DIM2;
175  se0Params.DECDIM2 = __SE_DECDIM_DIM3;
176  se0Params.DECDIM1SD = __SE_DECDIMSD_DIM1;
177  se0Params.DECDIM2SD = __SE_DECDIMSD_DIM0;
178 
179  se0Params.ICNT0 = elementCount;
180  se0Params.ICNT1 = seUnrollFactor;
   /* 2 * strideIn1 per step. */
181  se0Params.DIM1 = (int32_t) ((uint32_t) strideIn1 << (uint32_t) 1);
182  se0Params.ICNT2 = KBlocks;
   /* seUnrollFactor * 2 * strideIn1, i.e. ICNT1 * DIM1. */
183  se0Params.DIM2 = seUnrollFactor * (int32_t) ((uint32_t) strideIn1 << (uint32_t) 1);
184  se0Params.DECDIM1_WIDTH = (uint32_t) K * strideIn1;
185  se0Params.ICNT3 = NBlocks;
186  se0Params.DIM3 = elementCount;
187  se0Params.DECDIM2_WIDTH = (uint32_t) N;
   /* DIM4 = 0: the whole pattern is repeated M times from the same base. */
188  se0Params.ICNT4 = M;
189  se0Params.DIM4 = 0;
190 
191  /**********************************************************************/
192  /* Prepare streaming engine 1 to fetch B matrix */
193  /**********************************************************************/
194 
   /* Identical to SE0 except for DECDIM1_WIDTH. The exec routines open SE1
    * at pIn1 + strideIn1. */
195  se1Params = __gen_SE_TEMPLATE_v1();
196  se1Params.ELETYPE = SE_ELETYPE;
197  se1Params.VECLEN = SE_VECLEN;
198  se1Params.DIMFMT = __SE_DIMFMT_5D;
199  se1Params.DECDIM1 = __SE_DECDIM_DIM2;
200  se1Params.DECDIM2 = __SE_DECDIM_DIM3;
201  se1Params.DECDIM1SD = __SE_DECDIMSD_DIM1;
202  se1Params.DECDIM2SD = __SE_DECDIMSD_DIM0;
203 
204  se1Params.ICNT0 = elementCount;
205  se1Params.ICNT1 = seUnrollFactor;
206  se1Params.DIM1 = (int32_t) ((uint32_t) strideIn1 << (uint32_t) 1);
207  se1Params.ICNT2 = KBlocks;
208  se1Params.DIM2 = seUnrollFactor * (int32_t) ((uint32_t) strideIn1 << (uint32_t) 1);
   /* (K rounded down to an even value) * strideIn1. */
209  se1Params.DECDIM1_WIDTH = (uint32_t) (K % 2 == 0 ? K : K - 1) * strideIn1;
210  se1Params.ICNT3 = NBlocks;
211  se1Params.DIM3 = elementCount;
212  se1Params.DECDIM2_WIDTH = (uint32_t) N;
213  se1Params.ICNT4 = M;
214  se1Params.DIM4 = 0;
215 
216  /**********************************************************************/
217  /* Prepare SA template to write C matrix */
218  /**********************************************************************/
219 
   /* SA1: NBlocks vectors of elementCount per output row, M rows spaced
    * strideOut apart. */
220  sa1Params = __gen_SA_TEMPLATE_v1();
221  sa1Params.VECLEN = SA_VECLEN;
222  sa1Params.DIMFMT = __SA_DIMFMT_3D;
223  sa1Params.DECDIM1 = __SA_DECDIM_DIM1;
224  sa1Params.DECDIM1SD = __SA_DECDIMSD_DIM0;
225 
226  sa1Params.ICNT0 = elementCount;
227  sa1Params.ICNT1 = NBlocks;
228  sa1Params.DIM1 = elementCount;
229  sa1Params.DECDIM1_WIDTH = N;
230  sa1Params.ICNT2 = M;
231  sa1Params.DIM2 = strideOut;
232 
   /* Persist all five templates in the parameter block. */
233  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET) = se0Params;
234  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET) = se1Params;
235 
236  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET) = sa0Params;
237  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA1_PARAM_OFFSET) = sa1Params;
238  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA2_PARAM_OFFSET) = sa2Params;
239 
240  return status;
241 }
242 
/* Parameter lists of two explicit instantiations of
 * DSPLIB_matMul_generic_init_ci.
 * NOTE(review): the listing skips lines 243 and 248, so the opening line of
 * each instantiation (template keyword, return type, name and `handle`
 * parameter) is not shown. The index at the end of this page lists one
 * instantiation for float and one for double; which comes first cannot be
 * told from here. */
244  const DSPLIB_bufParams2D_t *bufParamsIn0,
245  const DSPLIB_bufParams2D_t *bufParamsIn1,
246  const DSPLIB_bufParams2D_t *bufParamsOut,
247  const DSPLIB_matMul_InitArgs *pKerInitArgs);
249  const DSPLIB_bufParams2D_t *bufParamsIn0,
250  const DSPLIB_bufParams2D_t *bufParamsIn1,
251  const DSPLIB_bufParams2D_t *bufParamsOut,
252  const DSPLIB_matMul_InitArgs *pKerInitArgs);
253 
254 template <typename T, typename vec, uint32_t id> static inline vec loadAMatSA(__vpred tmp, void *pIn)
255 {
256 
257  vec out;
258 
259  tmp = c7x::strm_agen<id, T>::get_vpred();
260  out = __vload_pred_dup(tmp, (c7x::strm_agen<id, T>::get_adv(pIn)));
261  /* printf("\nOut vector below:\n"); */
262  DSPLIB_debugPrintVector(out);
263 
264  return out;
265 }
266 
267 template <typename T, typename vec> static inline void writeOutSA1(__vpred tmp, vec *addr, T pOut, vec out)
268 {
269  /* printf("\nOut vector below:\n"); */
270  /* DSPLIB_debugPrintVector(out); */
271  tmp = c7x::strm_agen<1, vec>::get_vpred();
272  addr = c7x::strm_agen<1, vec>::get_adv(pOut);
273  __vstore_pred(tmp, addr, out);
274  return;
275 }
276 
/* Primary template declaration of the compute kernel; the double and float
 * specializations follow.
 * NOTE(review): the listing skips line 278, so the line with the return
 * type, the function name and the `handle` parameter is not shown. The index
 * at the end of this page gives it as
 * DSPLIB_STATUS DSPLIB_matMul_generic_core_ci(DSPLIB_kernelHandle handle, ...). */
277 template <typename dataType>
279  void *restrict pIn0,
280  void *restrict pIn1,
281  void *restrict pOut);
282 
/**
 * double specialization of the compute kernel.
 *
 * Reads M, K, KBlocks and NBlocks from the private args behind `handle`,
 * reloads the stream templates saved in bufPblock, and for each of the
 * M * NBlocks output vectors accumulates a * b products over KBlocks
 * inner iterations before storing the sum through SA1.
 *
 * pIn0 is the base for the SA0 loads (and, offset by one element, for the SA2
 * loads); pIn1 is the base for the SE0/SE1 streams; pOut is the base for the
 * SA1 stores. Always returns DSPLIB_SUCCESS.
 *
 * NOTE(review): the listing skips line 284, so the line with the return
 * type, the function name and the `handle` parameter is not shown.
 */
283 template <>
285  void *restrict pIn0,
286  void *restrict pIn1,
287  void *restrict pOut)
288 {
289  DSPLIB_matMul_PrivArgs *pKerPrivArgs = (DSPLIB_matMul_PrivArgs *) handle;
290 
291  int32_t M = pKerPrivArgs->M;
292  int32_t K = pKerPrivArgs->K;
293  int32_t KBlocks = pKerPrivArgs->KBlocks;
294  int32_t NBlocks = pKerPrivArgs->NBlocks;
295 
296  __SE_TEMPLATE_v1 se0Params;
297  __SE_TEMPLATE_v1 se1Params;
298  __SA_TEMPLATE_v1 sa0Params;
299  __SA_TEMPLATE_v1 sa1Params;
300  __SA_TEMPLATE_v1 sa2Params;
301 
302 #if DSPLIB_DEBUGPRINT
303  printf("Enter DSPLIB_matMul_exec_ci\n");
304 #endif
305 
306  typedef typename c7x::make_full_vector<double>::type vec;
307 
308  uint8_t *pBlock = pKerPrivArgs->bufPblock;
   /* K > 1: two-stream path. SE0/SA0 and SE1/SA2 are consumed in pairs and
    * their partial sums are combined at the end. */
309  if (K > 1) {
310  int32_t strideIn1 = pKerPrivArgs->strideIn1Elements;
311 
312  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
313  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET);
314 
315  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
316  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA1_PARAM_OFFSET);
317  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA2_PARAM_OFFSET);
318 
319  // Input samples
   /* SE1 starts strideIn1 elements after SE0. */
320  __SE0_OPEN(pIn1, se0Params);
321  __SE1_OPEN(((double *) pIn1 + strideIn1), se1Params);
322 
323  // Output samples
324  __SA0_OPEN(sa0Params);
325  __SA2_OPEN(sa2Params);
326  __SA1_OPEN(sa1Params);
327 
328  vec a;
329  vec b;
330 
   /* Eight independent accumulators: even-numbered ones take SA0/SE0
    * products, odd-numbered ones take SA2/SE1 products. */
331  vec r00, r01, r03, r02, r04, r05, r06, r07;
332 
333  vec c0 = vec(0);
334  vec c1 = vec(0);
335  vec c2 = vec(0);
336  vec c3 = vec(0);
337 
   /* tmp and addr are passed to writeOutSA1 by value without being set here;
    * writeOutSA1 overwrites its copies before using them. */
338  __vpred tmp;
339  vec *addr;
340 
341  DSPLIB_DEBUGPRINTFN(1, "\nIn _ci.cpp M = %d, NBlocks = %d, KBlocks = %d: \n", M, NBlocks, KBlocks);
342 
343  vec a00, a01, a02, a03;
344 
   /* Base for the SA2 loads: one element past pIn0. */
345  double *pIn0Local = ((double *) pIn0 + 1);
346 
   /* The first four SA2 vectors are loaded ahead of the loop; inside the
    * loop they are refreshed at the end of every k iteration, so each SA2
    * load is issued one iteration before it is used. */
347  __vpred predA;
348  a00 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
349  a01 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
350  a02 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
351  a03 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
352 
353  /* uint64_t startCycle, endCycle; */
354 
   /* One pass per output vector: M rows times NBlocks column blocks. */
355  for (int32_t mn = 0; mn < M * NBlocks; mn++) {
356 
357  r00 = (vec) 0;
358  r01 = (vec) 0;
359  r02 = (vec) 0;
360  r03 = (vec) 0;
361  r04 = (vec) 0;
362  r05 = (vec) 0;
363  r06 = (vec) 0;
364  r07 = (vec) 0;
365 
   /* Each iteration consumes 4 SA0 + 4 SA2 vectors of A and 4 SE0 + 4 SE1
    * vectors of B. */
366  for (int32_t k = 0; k < KBlocks; k++) {
367 
368  a = loadAMatSA<double, vec, 0>(predA, pIn0);
369  b = c7x::strm_eng<0, vec>::get_adv();
370  r00 += a * b;
371 
372  b = c7x::strm_eng<1, vec>::get_adv();
373  r01 += a00 * b;
374 
375  a = loadAMatSA<double, vec, 0>(predA, pIn0);
376  b = c7x::strm_eng<0, vec>::get_adv();
377  r02 += a * b;
378 
379  b = c7x::strm_eng<1, vec>::get_adv();
380  r03 += a01 * b;
381 
382  a = loadAMatSA<double, vec, 0>(predA, pIn0);
383  b = c7x::strm_eng<0, vec>::get_adv();
384  r04 += a * b;
385 
386  b = c7x::strm_eng<1, vec>::get_adv();
387  r05 += a02 * b;
388 
389  a = loadAMatSA<double, vec, 0>(predA, pIn0);
390  b = c7x::strm_eng<0, vec>::get_adv();
391  r06 += a * b;
392 
393  b = c7x::strm_eng<1, vec>::get_adv();
394  r07 += a03 * b;
395 
   /* Refill the SA2 operands for the next iteration. */
396  a00 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
397  a01 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
398  a02 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
399  a03 = loadAMatSA<double, vec, 2>(predA, pIn0Local);
400  }
401 
   /* Pairwise reduction of the eight accumulators into c0. */
402  c0 = r00 + r01;
403  c1 = r02 + r03;
404  c2 = r04 + r05;
405  c3 = r06 + r07;
406 
407  c0 += c1;
408  c2 += c3;
409  c0 += c2;
410 
411  writeOutSA1(tmp, addr, pOut, c0);
412  }
413 
   /* NOTE(review): SA1 and SA2 are opened above, but this path only closes
    * SE0, SE1 and SA0 (the K <= 1 path below does close SA1) - confirm
    * whether the missing closes are intentional. */
414  __SE0_CLOSE();
415  __SE1_CLOSE();
416  __SA0_CLOSE();
417  }
   /* K <= 1: single-stream path using only SE0, SA0 and SA1. */
418  else {
   /* These literal offsets equal SE0_PARAM_OFFSET, SA0_PARAM_OFFSET and
    * SA1_PARAM_OFFSET respectively. */
419  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock);
420 
421  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
422  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
423 
424  // Input samples
425  __SE0_OPEN(pIn1, se0Params);
426  __SA0_OPEN(sa0Params);
427 
428  // Output samples
429  __SA1_OPEN(sa1Params);
430 
431  vec a;
432  vec b;
433 
434  vec r00, r02, r04, r06;
435 
436  vec c0 = vec(0);
437  vec c1 = vec(0);
438  vec c2 = vec(0);
439  vec c3 = vec(0);
440 
441  __vpred tmp;
442  vec *addr;
443 
444  DSPLIB_DEBUGPRINTFN(1, "\nIn _ci.cpp M = %d, NBlocks = %d, KBlocks = %d: \n", M, NBlocks, KBlocks);
445 
446  __vpred predA;
447 
448  for (int32_t mn = 0; mn < M * NBlocks; mn++) {
449 
450  r00 = (vec) 0;
451  r02 = (vec) 0;
452  r04 = (vec) 0;
453  r06 = (vec) 0;
454 
   /* Four SA0/SE0 product-accumulates per iteration. */
455  for (int32_t k = 0; k < KBlocks; k++) {
456 
457  a = loadAMatSA<double, vec, 0>(predA, pIn0);
458  b = c7x::strm_eng<0, vec>::get_adv();
459  r00 += a * b;
460 
461  a = loadAMatSA<double, vec, 0>(predA, pIn0);
462  b = c7x::strm_eng<0, vec>::get_adv();
463  r02 += a * b;
464 
465  a = loadAMatSA<double, vec, 0>(predA, pIn0);
466  b = c7x::strm_eng<0, vec>::get_adv();
467  r04 += a * b;
468 
469  a = loadAMatSA<double, vec, 0>(predA, pIn0);
470  b = c7x::strm_eng<0, vec>::get_adv();
471  r06 += a * b;
472  }
473 
   /* Reduce the four accumulators into c0. */
474  c0 = r00;
475  c1 = r02;
476  c2 = r04;
477  c3 = r06;
478 
479  c0 += c1;
480  c2 += c3;
481 
482  c0 += c2;
483 
484  writeOutSA1(tmp, addr, pOut, c0);
485  }
486 
487  __SE0_CLOSE();
488  __SA0_CLOSE();
489  __SA1_CLOSE();
490  }
491  return DSPLIB_SUCCESS;
492 }
/**
 * float specialization of the compute kernel.
 *
 * Same structure as the double specialization, with twice as many loads and
 * accumulators per inner iteration (8 from SA0/SE0 and 8 from SA2/SE1 on the
 * K > 1 path, 8 from SA0/SE0 on the K <= 1 path).
 *
 * Reads M, K, KBlocks and NBlocks from the private args behind `handle`,
 * reloads the stream templates saved in bufPblock, and for each of the
 * M * NBlocks output vectors accumulates a * b products over KBlocks
 * inner iterations before storing the sum through SA1.
 *
 * pIn0 is the base for the SA0 loads (and, offset by one element, for the SA2
 * loads); pIn1 is the base for the SE0/SE1 streams; pOut is the base for the
 * SA1 stores. Always returns DSPLIB_SUCCESS.
 *
 * NOTE(review): the listing skips line 494, so the line with the return
 * type, the function name and the `handle` parameter is not shown.
 */
493 template <>
495  void *restrict pIn0,
496  void *restrict pIn1,
497  void *restrict pOut)
498 {
499  DSPLIB_matMul_PrivArgs *pKerPrivArgs = (DSPLIB_matMul_PrivArgs *) handle;
500 
501  int32_t M = pKerPrivArgs->M;
502  int32_t K = pKerPrivArgs->K;
503  int32_t KBlocks = pKerPrivArgs->KBlocks;
504  int32_t NBlocks = pKerPrivArgs->NBlocks;
505 
506  __SE_TEMPLATE_v1 se0Params;
507  __SE_TEMPLATE_v1 se1Params;
508  __SA_TEMPLATE_v1 sa0Params;
509  __SA_TEMPLATE_v1 sa1Params;
510  __SA_TEMPLATE_v1 sa2Params;
511 
512 #if DSPLIB_DEBUGPRINT
513  printf("Enter DSPLIB_matMul_exec_ci\n");
514 #endif
515 
516  typedef typename c7x::make_full_vector<float>::type vec;
517 
518  uint8_t *pBlock = pKerPrivArgs->bufPblock;
   /* K > 1: two-stream path. SE0/SA0 and SE1/SA2 are consumed in pairs and
    * their partial sums are combined at the end. */
519  if (K > 1) {
520  int32_t strideIn1 = pKerPrivArgs->strideIn1Elements;
521 
522  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE0_PARAM_OFFSET);
523  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE1_PARAM_OFFSET);
524 
525  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA0_PARAM_OFFSET);
526  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA1_PARAM_OFFSET);
527  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + SA2_PARAM_OFFSET);
528 
529  // Input samples
   /* SE1 starts strideIn1 elements after SE0. */
530  __SE0_OPEN(pIn1, se0Params);
531  __SE1_OPEN(((float *) pIn1 + strideIn1), se1Params);
532 
533  // Output samples
534  __SA0_OPEN(sa0Params);
535  __SA2_OPEN(sa2Params);
536  __SA1_OPEN(sa1Params);
537 
538  vec a;
539  vec b;
540 
   /* Sixteen independent accumulators: even-numbered ones take SA0/SE0
    * products, odd-numbered ones take SA2/SE1 products. */
541  vec r00, r01, r03, r02, r04, r05, r06, r07;
542  vec r08, r09, r0a, r0b, r0c, r0d, r0e, r0f;
543 
544  vec c0 = vec(0);
545  vec c1 = vec(0);
546  vec c2 = vec(0);
547  vec c3 = vec(0);
548  vec c4, c5, c6, c7;
549 
   /* tmp and addr are passed to writeOutSA1 by value without being set here;
    * writeOutSA1 overwrites its copies before using them. */
550  __vpred tmp;
551  vec *addr;
552 
553  DSPLIB_DEBUGPRINTFN(1, "\nIn _ci.cpp M = %d, NBlocks = %d, KBlocks = %d: \n", M, NBlocks, KBlocks);
554 
555  vec a00, a01, a02, a03;
556  vec a04, a05, a06, a07;
557 
   /* Base for the SA2 loads: one element past pIn0. */
558  float *pIn0Local = ((float *) pIn0 + 1);
559 
   /* The first eight SA2 vectors are loaded ahead of the loop; inside the
    * loop they are refreshed at the end of every k iteration, so each SA2
    * load is issued one iteration before it is used. */
560  __vpred predA;
561  a00 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
562  a01 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
563  a02 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
564  a03 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
565  a04 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
566  a05 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
567  a06 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
568  a07 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
569  /* #pragma COALESCE_LOOP */
570  /* #pragma MUST_ITERATE(2, , 2) */
571  /* for (int32_t m = 0; m < M; m++) { */
572  /* #pragma COALESCE_LOOP */
573  /* #pragma MUST_ITERATE(2, , 2) */
574  /* uint64_t startCycle, endCycle; */
575 
576  /* #pragma COALESCE_LOOP */
577  /* #pragma MUST_ITERATE(2, , 2) */
   /* One pass per output vector: M rows times NBlocks column blocks. */
578  for (int32_t mn = 0; mn < M * NBlocks; mn++) {
579  /* for (int32_t n = 0; n < 2; n++) { */
580 
581  r00 = (vec) 0;
582  r01 = (vec) 0;
583  r02 = (vec) 0;
584  r03 = (vec) 0;
585  r04 = (vec) 0;
586  r05 = (vec) 0;
587  r06 = (vec) 0;
588  r07 = (vec) 0;
589  r08 = (vec) 0;
590  r09 = (vec) 0;
591  r0a = (vec) 0;
592  r0b = (vec) 0;
593  r0c = (vec) 0;
594  r0d = (vec) 0;
595  r0e = (vec) 0;
596  r0f = (vec) 0;
597  /* startCycle = (__get_GTSC(1), __STSC); */
598  /* #pragma MUST_ITERATE(2, , 2) */
   /* Each iteration consumes 8 SA0 + 8 SA2 vectors of A and 8 SE0 + 8 SE1
    * vectors of B. */
599  for (int32_t k = 0; k < KBlocks; k++) {
600 
601  a = loadAMatSA<float, vec, 0>(predA, pIn0);
602  /* printf("\na: \n"); */
603  /* DSPLIB_debugPrintVector(a); */
604  b = c7x::strm_eng<0, vec>::get_adv();
605  /* printf("\nb: \n"); */
606  /* DSPLIB_debugPrintVector(b); */
607  r00 += a * b;
608 
609  /* printf("\na00: \n"); */
610  /* DSPLIB_debugPrintVector(a00); */
611  b = c7x::strm_eng<1, vec>::get_adv();
612  /* printf("\nb: \n"); */
613  /* DSPLIB_debugPrintVector(b); */
614  r01 += a00 * b;
615 
616  a = loadAMatSA<float, vec, 0>(predA, pIn0);
617  /* printf("\na: \n"); */
618  /* DSPLIB_debugPrintVector(a); */
619  b = c7x::strm_eng<0, vec>::get_adv();
620  /* printf("\nb: \n"); */
621  /* DSPLIB_debugPrintVector(b); */
622  r02 += a * b;
623 
624  /* printf("\na01: \n"); */
625  /* DSPLIB_debugPrintVector(a01); */
626  b = c7x::strm_eng<1, vec>::get_adv();
627  /* printf("\nb: \n"); */
628  /* DSPLIB_debugPrintVector(b); */
629  r03 += a01 * b;
630 
631  a = loadAMatSA<float, vec, 0>(predA, pIn0);
632  b = c7x::strm_eng<0, vec>::get_adv();
633  r04 += a * b;
634 
635  b = c7x::strm_eng<1, vec>::get_adv();
636  r05 += a02 * b;
637 
638  a = loadAMatSA<float, vec, 0>(predA, pIn0);
639  b = c7x::strm_eng<0, vec>::get_adv();
640  r06 += a * b;
641 
642  b = c7x::strm_eng<1, vec>::get_adv();
643  r07 += a03 * b;
644 
645  a = loadAMatSA<float, vec, 0>(predA, pIn0);
646  b = c7x::strm_eng<0, vec>::get_adv();
647  r08 += a * b;
648 
649  b = c7x::strm_eng<1, vec>::get_adv();
650  r09 += a04 * b;
651 
652  a = loadAMatSA<float, vec, 0>(predA, pIn0);
653  b = c7x::strm_eng<0, vec>::get_adv();
654  r0a += a * b;
655 
656  b = c7x::strm_eng<1, vec>::get_adv();
657  r0b += a05 * b;
658 
659  a = loadAMatSA<float, vec, 0>(predA, pIn0);
660  b = c7x::strm_eng<0, vec>::get_adv();
661  r0c += a * b;
662 
663  b = c7x::strm_eng<1, vec>::get_adv();
664  r0d += a06 * b;
665 
666  a = loadAMatSA<float, vec, 0>(predA, pIn0);
667  b = c7x::strm_eng<0, vec>::get_adv();
668  r0e += a * b;
669 
670  b = c7x::strm_eng<1, vec>::get_adv();
671  r0f += a07 * b;
672 
   /* Refill the SA2 operands for the next iteration. */
673  a00 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
674  a01 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
675  a02 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
676  a03 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
677 
678  a04 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
679  a05 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
680  a06 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
681  a07 = loadAMatSA<float, vec, 2>(predA, pIn0Local);
682  }
683 
684  /* startCycle = (__get_GTSC(1), __STSC); */
685  /* c = r00 + r01 + r02 + r03 + r04 + r05 + r06 + r07 + r08 + r09 + r0a + r0b + r0c + r0d + r0e + r0f; */
686 
   /* Pairwise reduction of the sixteen accumulators into c0. */
687  c0 = r00 + r01;
688  c1 = r02 + r03;
689  c2 = r04 + r05;
690  c3 = r06 + r07;
691  c4 = r08 + r09;
692  c5 = r0a + r0b;
693  c6 = r0c + r0d;
694  c7 = r0e + r0f;
695 
696  c0 += c1;
697  c2 += c3;
698  c4 += c5;
699  c6 += c7;
700 
701  c0 += c2;
702  c4 += c6;
703  c0 += c4;
704 
705  /* endCycle = (__get_GTSC(1), __STSC); */
706  writeOutSA1(tmp, addr, pOut, c0);
707  }
708  /* } */
709 
710  /* printf("\nTotaly cycles: %ld\n", endCycle - startCycle); */
711 
   /* NOTE(review): SA1 and SA2 are opened above, but this path only closes
    * SE0, SE1 and SA0 (the K <= 1 path below does close SA1) - confirm
    * whether the missing closes are intentional. */
712  __SE0_CLOSE();
713  __SE1_CLOSE();
714  __SA0_CLOSE();
715  }
   /* K <= 1: single-stream path using only SE0, SA0 and SA1. */
716  else {
   /* These literal offsets equal SE0_PARAM_OFFSET, SA0_PARAM_OFFSET and
    * SA1_PARAM_OFFSET respectively. */
717  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock);
718 
719  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
720  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
721 
722  // Input samples
723  __SE0_OPEN(pIn1, se0Params);
724  __SA0_OPEN(sa0Params);
725 
726  // Output samples
727  __SA1_OPEN(sa1Params);
728 
729  vec a;
730  vec b;
731 
732  vec r00, r02, r04, r06;
733  vec r08, r0a, r0c, r0e;
734 
735  vec c0 = vec(0);
736  vec c1 = vec(0);
737  vec c2 = vec(0);
738  vec c3 = vec(0);
739  vec c4, c5, c6, c7;
740 
741  __vpred tmp;
742  vec *addr;
743 
744  DSPLIB_DEBUGPRINTFN(1, "\nIn _ci.cpp M = %d, NBlocks = %d, KBlocks = %d: \n", M, NBlocks, KBlocks);
745 
746  __vpred predA;
747 
748  for (int32_t mn = 0; mn < M * NBlocks; mn++) {
749 
750  r00 = (vec) 0;
751  r02 = (vec) 0;
752  r04 = (vec) 0;
753  r06 = (vec) 0;
754  r08 = (vec) 0;
755  r0a = (vec) 0;
756  r0c = (vec) 0;
757  r0e = (vec) 0;
758 
   /* Eight SA0/SE0 product-accumulates per iteration. */
759  for (int32_t k = 0; k < KBlocks; k++) {
760 
761  a = loadAMatSA<float, vec, 0>(predA, pIn0);
762  b = c7x::strm_eng<0, vec>::get_adv();
763  r00 += a * b;
764 
765  a = loadAMatSA<float, vec, 0>(predA, pIn0);
766  b = c7x::strm_eng<0, vec>::get_adv();
767  r02 += a * b;
768 
769  a = loadAMatSA<float, vec, 0>(predA, pIn0);
770  b = c7x::strm_eng<0, vec>::get_adv();
771  r04 += a * b;
772 
773  a = loadAMatSA<float, vec, 0>(predA, pIn0);
774  b = c7x::strm_eng<0, vec>::get_adv();
775  r06 += a * b;
776 
777  a = loadAMatSA<float, vec, 0>(predA, pIn0);
778  b = c7x::strm_eng<0, vec>::get_adv();
779  r08 += a * b;
780 
781  a = loadAMatSA<float, vec, 0>(predA, pIn0);
782  b = c7x::strm_eng<0, vec>::get_adv();
783  r0a += a * b;
784 
785  a = loadAMatSA<float, vec, 0>(predA, pIn0);
786  b = c7x::strm_eng<0, vec>::get_adv();
787  r0c += a * b;
788 
789  a = loadAMatSA<float, vec, 0>(predA, pIn0);
790  b = c7x::strm_eng<0, vec>::get_adv();
791  r0e += a * b;
792 
793  }
794 
   /* Reduce the eight accumulators into c0. */
795  c0 = r00;
796  c1 = r02;
797  c2 = r04;
798  c3 = r06;
799  c4 = r08;
800  c5 = r0a;
801  c6 = r0c;
802  c7 = r0e;
803 
804  c0 += c1;
805  c2 += c3;
806  c4 += c5;
807  c6 += c7;
808 
809  c0 += c2;
810  c4 += c6;
811  c0 += c4;
812 
813  writeOutSA1(tmp, addr, pOut, c0);
814  }
815 
816  __SE0_CLOSE();
817  __SA0_CLOSE();
818  __SA1_CLOSE();
819  }
820 
821  return DSPLIB_SUCCESS;
822 }
823 
824 template <typename dataType>
826 DSPLIB_matMul_generic_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
827 {
828  DSPLIB_matMul_generic_core_ci<dataType>(handle, pIn0, pIn1, pOut);
829  return DSPLIB_SUCCESS;
830 }
831 
/* Parameter lists of two explicit instantiations of
 * DSPLIB_matMul_generic_exec_ci.
 * NOTE(review): the listing skips lines 832 and 836, so the opening line of
 * each instantiation (template keyword, return type, name and `handle`
 * parameter) is not shown. The index at the end of this page lists one
 * instantiation for float and one for double; which comes first cannot be
 * told from here. */
833  void *restrict pIn0,
834  void *restrict pIn1,
835  void *restrict pOut);
837  void *restrict pIn0,
838  void *restrict pIn1,
839  void *restrict pOut);
#define SE0_PARAM_OFFSET
template DSPLIB_STATUS DSPLIB_matMul_generic_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_matMul_generic_core_ci< float >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
#define DSPLIB_MATMUL_FLOAT_UNROLL_FACTOR
template DSPLIB_STATUS DSPLIB_matMul_generic_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_matMul_generic_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn0, const DSPLIB_bufParams2D_t *bufParamsIn1, const DSPLIB_bufParams2D_t *bufParamsOut, const DSPLIB_matMul_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_matMul_generic_core_ci< double >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
void setUnrollFactors< float >(int32_t *unrollFactor, int32_t *seUnrollFactor)
#define DSPLIB_MATMUL_DOUBLE_UNROLL_FACTOR
template DSPLIB_STATUS DSPLIB_matMul_generic_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
static void writeOutSA1(__vpred tmp, vec *addr, T pOut, vec out)
void setUnrollFactors(int32_t *unrollFactor, int32_t *seUnrollFactor)
#define SA2_PARAM_OFFSET
DSPLIB_STATUS DSPLIB_matMul_generic_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
DSPLIB_STATUS DSPLIB_matMul_generic_core_ci(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
#define DSPLIB_MATMUL_SE_FLOAT_UNROLL_FACTOR
template DSPLIB_STATUS DSPLIB_matMul_generic_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pIn0, void *restrict pIn1, void *restrict pOut)
#define DSPLIB_MATMUL_SE_DOUBLE_UNROLL_FACTOR
#define SA1_PARAM_OFFSET
void setUnrollFactors< double >(int32_t *unrollFactor, int32_t *seUnrollFactor)
#define SA0_PARAM_OFFSET
#define SE1_PARAM_OFFSET
static vec loadAMatSA(__vpred tmp, void *pIn)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_matMul.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MATMUL_IXX_IXX_OXX_PBLOCK_SIZE]