DSPLIB User Guide
DSPLIB_sqrAdd_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "../common/c71/DSPLIB_inlines.h"
48 #include "DSPLIB_sqrAdd_priv.h"
49 #include <float.h>
50 
51 // Generic initialization
52 template <typename dataType>
54  const DSPLIB_bufParams1D_t *bufParamsIn,
55  const DSPLIB_bufParams1D_t *bufParamsOut,
56  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
57 {
59  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
60  uint32_t blockSize = pKerPrivArgs->blockSize;
61 
62  __SE_TEMPLATE_v1 se0Params;
63  __SE_TEMPLATE_v1 se1Params;
64 
65  __SE_ELETYPE SE_ELETYPE;
66  __SE_VECLEN SE_VECLEN;
67 
68  uint8_t *pBlock = pKerPrivArgs->bufPblock;
69 
70  typedef typename c7x::make_full_vector<dataType>::type vec;
71  int32_t eleCount = c7x::element_count_of<vec>::value;
72  SE_VECLEN = c7x::se_veclen<vec>::value;
73  SE_ELETYPE = c7x::se_eletype<vec>::value;
74 
75 #if DSPLIB_DEBUGPRINT
76  printf("Enter eleCount %d\n", eleCount);
77 #endif
78 
79  int32_t blockCount = (blockSize % eleCount == 0) ? (blockSize / eleCount) : (blockSize / eleCount) + 1;
80  int32_t blockCountHalf = (blockCount % 2 == 0) ? blockCount / 2 : blockCount / 2 + 1;
81 
82  /**********************************************************************/
83  /* Prepare streaming engine 0 to fetch the input */
84  /**********************************************************************/
85  se0Params = __gen_SE_TEMPLATE_v1();
86  uint32_t icnt0Param = (blockSize < eleCount) ? blockSize : eleCount;
87  se0Params.ICNT0 = icnt0Param;
88  se0Params.ICNT1 = blockCountHalf;
89  se0Params.DIM1 = eleCount * 2;
90  se0Params.ELETYPE = SE_ELETYPE;
91  se0Params.VECLEN = SE_VECLEN;
92  se0Params.DECDIM1_WIDTH = blockSize;
93  se0Params.DECDIM1 = __SE_DECDIM_DIM1;
94  se0Params.DIMFMT = __SE_DIMFMT_2D;
95 
96  /**********************************************************************/
97  /* Prepare streaming engine 1 to fetch the input */
98  /**********************************************************************/
99  se1Params = __gen_SE_TEMPLATE_v1();
100  icnt0Param = ((blockSize - eleCount) < eleCount) ? (blockSize - eleCount) : eleCount;
101  se1Params.ICNT0 = icnt0Param;
102  se1Params.ICNT1 = blockCountHalf;
103  se1Params.DIM1 = eleCount * 2;
104  se1Params.ELETYPE = SE_ELETYPE;
105  se1Params.VECLEN = SE_VECLEN;
106  se1Params.DECDIM1_WIDTH = blockSize - eleCount;
107  se1Params.DECDIM1 = __SE_DECDIM_DIM1;
108  se1Params.DIMFMT = __SE_DIMFMT_2D;
109 
110  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET) = se0Params;
111  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET) = se1Params;
112 
113  return status;
114 }
115 
116 template <>
118  const DSPLIB_bufParams1D_t *bufParamsIn,
119  const DSPLIB_bufParams1D_t *bufParamsOut,
120  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
121 {
122  DSPLIB_STATUS status = DSPLIB_SUCCESS;
123  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
124  uint32_t blockSize = pKerPrivArgs->blockSize;
125 
126  __SE_TEMPLATE_v1 se0Params;
127  __SE_TEMPLATE_v1 se1Params;
128 
129  __SE_ELETYPE SE_ELETYPE;
130  __SE_VECLEN SE_VECLEN;
131 
132  __SE_PROMOTE SE_PROMOTE;
133 
134  uint8_t *pBlock = pKerPrivArgs->bufPblock;
135 
136  int32_t eleCount = c7x::element_count_of<c7x::short_vec>::value;
137 
138  SE_VECLEN = c7x::se_veclen<c7x::short_vec>::value;
139  SE_ELETYPE = c7x::se_eletype<c7x::char_vec>::value;
140  SE_PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
141 
142 #if DSPLIB_DEBUGPRINT
143  printf("Enter eleCount %d\n", eleCount);
144 #endif
145 
146  // printf("Enter eleCount %d\n", eleCount);
147  int32_t blockCount = (blockSize % eleCount == 0) ? (blockSize / eleCount) : (blockSize / eleCount) + 1;
148  int32_t blockCountHalf = (blockCount % 2 == 0) ? blockCount / 2 : blockCount / 2 + 1;
149 
150  /**********************************************************************/
151  /* Prepare streaming engine 0 to fetch the input */
152  /**********************************************************************/
153  se0Params = __gen_SE_TEMPLATE_v1();
154  uint32_t icnt0Param = (blockSize < eleCount) ? blockSize : eleCount;
155  se0Params.ICNT0 = icnt0Param;
156  se0Params.ICNT1 = blockCountHalf;
157  se0Params.DIM1 = eleCount * 2;
158  se0Params.ELETYPE = SE_ELETYPE;
159  se0Params.VECLEN = SE_VECLEN;
160  se0Params.DECDIM1_WIDTH = blockSize;
161  se0Params.DECDIM1 = __SE_DECDIM_DIM1;
162  se0Params.DIMFMT = __SE_DIMFMT_2D;
163  se0Params.PROMOTE = SE_PROMOTE;
164 
165  /**********************************************************************/
166  /* Prepare streaming engine 1 to fetch the input */
167  /**********************************************************************/
168  se1Params = __gen_SE_TEMPLATE_v1();
169  icnt0Param = ((blockSize - eleCount) < eleCount) ? (blockSize - eleCount) : eleCount;
170  se1Params.ICNT0 = icnt0Param;
171  se1Params.ICNT1 = blockCountHalf;
172  se1Params.DIM1 = eleCount * 2;
173  se1Params.ELETYPE = SE_ELETYPE;
174  se1Params.VECLEN = SE_VECLEN;
175  se1Params.DECDIM1_WIDTH = blockSize - eleCount;
176  se1Params.DECDIM1 = __SE_DECDIM_DIM1;
177  se1Params.DIMFMT = __SE_DIMFMT_2D;
178  se1Params.PROMOTE = SE_PROMOTE;
179 
180  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET) = se0Params;
181  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET) = se1Params;
182 
183  return status;
184 }
185 
187  const DSPLIB_bufParams1D_t *bufParamsIn,
188  const DSPLIB_bufParams1D_t *bufParamsOut,
189  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
190 
192  const DSPLIB_bufParams1D_t *bufParamsIn,
193  const DSPLIB_bufParams1D_t *bufParamsOut,
194  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
195 
197  const DSPLIB_bufParams1D_t *bufParamsIn,
198  const DSPLIB_bufParams1D_t *bufParamsOut,
199  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
200 
202  const DSPLIB_bufParams1D_t *bufParamsIn,
203  const DSPLIB_bufParams1D_t *bufParamsOut,
204  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
205 
207  const DSPLIB_bufParams1D_t *bufParamsIn,
208  const DSPLIB_bufParams1D_t *bufParamsOut,
209  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
210 
212  const DSPLIB_bufParams1D_t *bufParamsIn,
213  const DSPLIB_bufParams1D_t *bufParamsOut,
214  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
215 
217  const DSPLIB_bufParams1D_t *bufParamsIn,
218  const DSPLIB_bufParams1D_t *bufParamsOut,
219  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
220 
222  const DSPLIB_bufParams1D_t *bufParamsIn,
223  const DSPLIB_bufParams1D_t *bufParamsOut,
224  const DSPLIB_sqrAdd_InitArgs *pKerInitArgs);
225 
226 // This function performs horizontal add of the output vector.
227 // It is used for the float and double data types.
228 // The __horizontal_add() intrinsic is used to perform horizontal add on all other datatypes.
229 
230 #pragma FUNC_ALWAYS_INLINE
231 static inline float DSPLIB_horiAdd(c7x::float_vec vector)
232 {
233  float sum = 0;
234 
235  vector.lo() = vector.hi() + vector.lo();
236  vector.lo().lo() = vector.lo().hi() + vector.lo().lo();
237 // #if __C7X_VEC_SIZE_BYTES__ == 64
238  vector.lo().lo().lo() = vector.lo().lo().hi() + vector.lo().lo().lo();
239  sum = (float) vector.s[0] + (float) vector.s[1];
240 
241  return sum;
242 }
243 
244 #pragma FUNC_ALWAYS_INLINE
245 static inline double DSPLIB_horiAdd(c7x::double_vec vector)
246 {
247  double sum = 0;
248 
249  vector.lo() = vector.hi() + vector.lo();
250 
251 // #if __C7X_VEC_SIZE_BYTES__ == 64
252  vector.lo().lo() = vector.lo().hi() + vector.lo().lo();
253 
254  sum = (double) vector.s[0] + (double) vector.s[1];
255 
256  return sum;
257 }
258 
259 /**********************************************************************/
260 /* Execute for datatypes float and double */
261 /**********************************************************************/
262 
263 // This is the generic implementation of exec_ci. It is used for float and double.
264 // Other datatypes have their own explicit implementations.
265 template <typename dataType>
266 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
267 {
268  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
269  uint32_t blockSize = pKerPrivArgs->blockSize;
270 
271  __SE_TEMPLATE_v1 se0Params;
272  __SE_TEMPLATE_v1 se1Params;
273 
274  dataType *restrict pInLocal1 = (dataType *) pIn;
275  dataType *restrict pOutLocal = (dataType *) pOut;
276 
277 #if DSPLIB_DEBUGPRINT
278  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
279 #endif
280 
281  typedef typename c7x::make_full_vector<dataType>::type vec;
282  int32_t eleCount = c7x::element_count_of<vec>::value;
283 
284  dataType *restrict pInLocal2 = pInLocal1 + eleCount;
285 #if DSPLIB_DEBUGPRINT
286  printf("Enter eleCount %d\n", eleCount);
287 #endif
288 
289  uint8_t *pBlock = pKerPrivArgs->bufPblock;
290 
291  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
292  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
293 
294  // Input samples
295  __SE0_OPEN(pInLocal1, se0Params);
296  __SE1_OPEN(pInLocal2, se1Params);
297 
298 #if DSPLIB_DEBUGPRINT
299  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
300 #endif
301 
302  vec outa, outb, outc, outd, oute, outf, outg, outh, outab, outcd, outef, outgh, out;
303 
304  outa = (vec) 0.0;
305  outb = (vec) 0.0;
306  outc = (vec) 0.0;
307  outd = (vec) 0.0;
308  oute = (vec) 0.0;
309  outf = (vec) 0.0;
310  outg = (vec) 0.0;
311  outh = (vec) 0.0;
312  outab = (vec) 0.0;
313  outcd = (vec) 0.0;
314  outef = (vec) 0.0;
315  outgh = (vec) 0.0;
316  out = (vec) 0.0;
317 
318  dataType result;
319  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 8) {
320  vec a = c7x::strm_eng<0, vec>::get_adv();
321  // a.print();
322  vec b = c7x::strm_eng<1, vec>::get_adv();
323  // b.print();
324  outa += a * a;
325  outb += b * b;
326 #if DSPLIB_DEBUGPRINT
327 // DSPLIB_debugPrintVector(a);
328 // DSPLIB_debugPrintVector(outa);
329 #endif
330  vec c = c7x::strm_eng<0, vec>::get_adv();
331  // c.print();
332  vec d = c7x::strm_eng<1, vec>::get_adv();
333  // d.print();
334  outc += c * c;
335  outd += d * d;
336 
337  vec e = c7x::strm_eng<0, vec>::get_adv();
338  // e.print();
339  vec f = c7x::strm_eng<1, vec>::get_adv();
340  // f.print();
341  oute += e * e;
342  outf += f * f;
343 
344  vec g = c7x::strm_eng<0, vec>::get_adv();
345  // g.print();
346  vec h = c7x::strm_eng<1, vec>::get_adv();
347  // h.print();
348 
349  outg += g * g;
350  outh += h * h;
351  }
352 
353  outab = outa + outb;
354  outcd = outc + outd;
355  outef = oute + outf;
356  outgh = outg + outh;
357  out = outab + outcd + outef + outgh;
358 
359  // out.print();
360 #if DSPLIB_DEBUGPRINT
361  DSPLIB_debugPrintVector(out);
362 #endif
363  result = DSPLIB_horiAdd(out);
364  // printf("\nresult: %10g\n", result);
365 
366  *pOutLocal = result;
367 #if DSPLIB_DEBUGPRINT
368  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
369 #endif
370 
371  __SE0_CLOSE();
372  __SE1_CLOSE();
373 
374  return DSPLIB_SUCCESS;
375 }
376 
377 /**********************************************************************/
378 /* Execute for datatype int8_t */
379 /**********************************************************************/
380 // The input datatype of int8_t is promoted to int16_t using the streaming engine.
381 // The dotprod is then implemented as int16_t.
382 // When completed, the int64_t output is cast down to int32_t.
383 template <>
384 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci<int8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
385 {
386  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
387  uint32_t blockSize = pKerPrivArgs->blockSize;
388 
389  __SE_TEMPLATE_v1 se0Params;
390  __SE_TEMPLATE_v1 se1Params;
391 
392  int8_t *restrict pInLocal1 = (int8_t *) pIn;
393  int32_t *restrict pOutLocal = (int32_t *) pOut;
394 
395 #if DSPLIB_DEBUGPRINT
396  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
397 #endif
398 
399  typedef typename c7x::make_full_vector<int16_t>::type vec;
400  int32_t eleCount = c7x::element_count_of<vec>::value;
401  // eleCount = eleCount * 2;
402 
403  int8_t *restrict pInLocal2 = pInLocal1 + eleCount;
404 
405 #if DSPLIB_DEBUGPRINT
406  printf("Enter eleCount %d\n", eleCount);
407 #endif
408  // printf("Enter eleCount %d\n", eleCount);
409 
410  uint8_t *pBlock = pKerPrivArgs->bufPblock;
411 
412  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
413  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
414 
415  // Input samples
416  __SE0_OPEN(pInLocal1, se0Params);
417  __SE1_OPEN(pInLocal2, se1Params);
418 
419 #if DSPLIB_DEBUGPRINT
420  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
421 #endif
422 
423  typedef typename c7x::make_full_vector<int64_t>::type vec_out;
424 
425  vec_out outa;
426  vec_out outb;
427  outa = (vec_out) 0;
428  outb = (vec_out) 0;
429 
430  int64_t result = 0;
431  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 2) {
432  vec a = c7x::strm_eng<0, vec>::get_adv();
433  // a.print();
434  vec b = c7x::strm_eng<1, vec>::get_adv();
435  // b.print();
436 
437  outa += __vdotp4hd_vvv(a, a);
438  outb += __vdotp4hd_vvv(b, b);
439 
440  // printf(" | a = ");
441  // a.print();
442 
443  // printf(" | out a = ");
444  // outa.print();
445 
446  // printf(" | b = ");
447  // b.print();
448 
449  // printf(" | out b = ");
450  // outb.print();
451 
452  // printf("\n");
453 
454 #if DSPLIB_DEBUGPRINT
455 // DSPLIB_debugPrintVector(a);
456 // DSPLIB_debugPrintVector(outa);
457 #endif
458  }
459 
460  outa = outa + outb;
461  // out.print();
462 #if DSPLIB_DEBUGPRINT
463  // DSPLIB_debugPrintVector(out);
464 #endif
465  // use intrensic for horizontal add
466  // cast the int64_t output of __horizontal_add to int32_t.
467  result = (int32_t) __horizontal_add(outa);
468 
469  *pOutLocal = result;
470 #if DSPLIB_DEBUGPRINT
471  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
472 #endif
473 
474  __SE0_CLOSE();
475  __SE1_CLOSE();
476 
477  return DSPLIB_SUCCESS;
478 }
479 
480 /**********************************************************************/
481 /* Execute for datatype uint8_t */
482 /**********************************************************************/
483 template <>
484 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci<uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
485 {
486  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
487  uint32_t blockSize = pKerPrivArgs->blockSize;
488 
489  __SE_TEMPLATE_v1 se0Params;
490  __SE_TEMPLATE_v1 se1Params;
491 
492  uint8_t *restrict pInLocal1 = (uint8_t *) pIn;
493  uint32_t *restrict pOutLocal = (uint32_t *) pOut;
494 
495 #if DSPLIB_DEBUGPRINT
496  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
497 #endif
498 
499  typedef typename c7x::make_full_vector<uint8_t>::type vec;
500  int32_t eleCount = c7x::element_count_of<vec>::value;
501 
502  uint8_t *restrict pInLocal2 = pInLocal1 + eleCount;
503 #if DSPLIB_DEBUGPRINT
504  printf("Enter eleCount %d\n", eleCount);
505 #endif
506 
507  uint8_t *pBlock = pKerPrivArgs->bufPblock;
508 
509  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
510  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
511 
512  // Input samples
513  __SE0_OPEN(pInLocal1, se0Params);
514  __SE1_OPEN(pInLocal2, se1Params);
515 
516 #if DSPLIB_DEBUGPRINT
517  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
518 #endif
519 
520  typedef typename c7x::make_full_vector<uint32_t>::type vec_out;
521 
522  vec_out outa;
523  vec_out outb;
524 
525  outa = (vec_out) 0;
526  outb = (vec_out) 0;
527  uint64_t result = 0;
528  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 2) {
529  vec a = c7x::strm_eng<0, vec>::get_adv();
530  // a.print();
531  vec b = c7x::strm_eng<1, vec>::get_adv();
532  // b.print();
533 
534  outa += __vdotp4ubw_vvv(a, a);
535  outb += __vdotp4ubw_vvv(b, b);
536 
537  // printf(" | a = ");
538  // a.print();
539 
540  // printf(" | out a = ");
541  // outa.print();
542 
543  // printf(" | b = ");
544  // b.print();
545 
546  // printf(" | out b = ");
547  // outb.print();
548 
549  // printf("\n");
550 
551 #if DSPLIB_DEBUGPRINT
552 // DSPLIB_debugPrintVector(a);
553 // DSPLIB_debugPrintVector(outa);
554 #endif
555  }
556 
557  outa = outa + outb;
558 
559  // out.print();
560 #if DSPLIB_DEBUGPRINT
561  // DSPLIB_debugPrintVector(out);
562 #endif
563  // use intrensic for horizontal add
564  // cast the uint64_t output of __horizontal_add to uint32_t.
565  result = (uint32_t) __horizontal_add(outa);
566 
567  *pOutLocal = result;
568 #if DSPLIB_DEBUGPRINT
569  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
570 #endif
571 
572  __SE0_CLOSE();
573  __SE1_CLOSE();
574 
575  return DSPLIB_SUCCESS;
576 }
577 
578 /**********************************************************************/
579 /* Execute for datatype int16_t */
580 /**********************************************************************/
581 template <>
582 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci<int16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
583 {
584  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
585  uint32_t blockSize = pKerPrivArgs->blockSize;
586 
587  __SE_TEMPLATE_v1 se0Params;
588  __SE_TEMPLATE_v1 se1Params;
589 
590  int16_t *restrict pInLocal1 = (int16_t *) pIn;
591  int64_t *restrict pOutLocal = (int64_t *) pOut;
592 
593 #if DSPLIB_DEBUGPRINT
594  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
595 #endif
596 
597  typedef typename c7x::make_full_vector<int16_t>::type vec;
598  int32_t eleCount = c7x::element_count_of<vec>::value;
599 
600  int16_t *restrict pInLocal2 = pInLocal1 + eleCount;
601 #if DSPLIB_DEBUGPRINT
602  printf("Enter eleCount %d\n", eleCount);
603 #endif
604 
605  uint8_t *pBlock = pKerPrivArgs->bufPblock;
606 
607  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
608  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
609 
610  // Input samples
611  __SE0_OPEN(pInLocal1, se0Params);
612  __SE1_OPEN(pInLocal2, se1Params);
613 
614 #if DSPLIB_DEBUGPRINT
615  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
616 #endif
617 
618  typedef typename c7x::make_full_vector<int64_t>::type vec_out;
619 
620  vec_out outa;
621  vec_out outb;
622  outa = (vec_out) 0;
623  outb = (vec_out) 0;
624 
625  int64_t result = 0;
626  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 2) {
627  vec a = c7x::strm_eng<0, vec>::get_adv();
628  // a.print();
629  vec b = c7x::strm_eng<1, vec>::get_adv();
630  // b.print();
631 
632  outa += __vdotp4hd_vvv(a, a);
633  outb += __vdotp4hd_vvv(b, b);
634 
635 #if DSPLIB_DEBUGPRINT
636 // DSPLIB_debugPrintVector(a);
637 // DSPLIB_debugPrintVector(outa);
638 #endif
639  }
640 
641  outa = outa + outb;
642  // out.print();
643 #if DSPLIB_DEBUGPRINT
644  // DSPLIB_debugPrintVector(out);
645 #endif
646  // use intrensic for horizontal add
647  result = __horizontal_add(outa);
648 
649  *pOutLocal = result;
650 #if DSPLIB_DEBUGPRINT
651  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
652 #endif
653 
654  __SE0_CLOSE();
655  __SE1_CLOSE();
656 
657  return DSPLIB_SUCCESS;
658 }
659 
660 /**********************************************************************/
661 /* Execute for datatype uint16_t */
662 /**********************************************************************/
663 template <>
664 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci<uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
665 {
666  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
667  uint32_t blockSize = pKerPrivArgs->blockSize;
668 
669  __SE_TEMPLATE_v1 se0Params;
670  __SE_TEMPLATE_v1 se1Params;
671 
672  uint16_t *restrict pInLocal1 = (uint16_t *) pIn;
673  uint64_t *restrict pOutLocal = (uint64_t *) pOut;
674 
675 #if DSPLIB_DEBUGPRINT
676  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
677 #endif
678 
679  typedef typename c7x::make_full_vector<uint16_t>::type vec;
680  int32_t eleCount = c7x::element_count_of<vec>::value;
681 
682  uint16_t *restrict pInLocal2 = pInLocal1 + eleCount;
683 #if DSPLIB_DEBUGPRINT
684  printf("Enter eleCount %d\n", eleCount);
685 #endif
686 
687  uint8_t *pBlock = pKerPrivArgs->bufPblock;
688 
689  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
690  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
691 
692  // Input samples
693  __SE0_OPEN(pInLocal1, se0Params);
694  __SE1_OPEN(pInLocal2, se1Params);
695 
696 #if DSPLIB_DEBUGPRINT
697  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
698 #endif
699 
700  typedef typename c7x::make_full_vector<uint64_t>::type vec_out;
701 
702  vec_out outa;
703  vec_out outb;
704 
705  outa = (vec_out) 0;
706  outb = (vec_out) 0;
707  uint64_t result = 0;
708  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 2) {
709  vec a = c7x::strm_eng<0, vec>::get_adv();
710  // a.print();
711  vec b = c7x::strm_eng<1, vec>::get_adv();
712  // b.print();
713 
714  outa += __vdotp4uhd_vvv(a, a);
715  outb += __vdotp4uhd_vvv(b, b);
716 
717 #if DSPLIB_DEBUGPRINT
718 // DSPLIB_debugPrintVector(a);
719 // DSPLIB_debugPrintVector(outa);
720 #endif
721  }
722 
723  outa = outa + outb;
724 
725  // out.print();
726 #if DSPLIB_DEBUGPRINT
727  // DSPLIB_debugPrintVector(out);
728 #endif
729  // use intrensic for horizontal add
730  result = __horizontal_add(outa);
731 
732  *pOutLocal = result;
733 #if DSPLIB_DEBUGPRINT
734  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
735 #endif
736 
737  __SE0_CLOSE();
738  __SE1_CLOSE();
739 
740  return DSPLIB_SUCCESS;
741 }
742 
743 /**********************************************************************/
744 /* Execute for datatype int32_t */
745 /**********************************************************************/
746 template <>
747 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci<int32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
748 {
749  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
750  uint32_t blockSize = pKerPrivArgs->blockSize;
751 
752  __SE_TEMPLATE_v1 se0Params;
753  __SE_TEMPLATE_v1 se1Params;
754 
755  int32_t *restrict pInLocal1 = (int32_t *) pIn;
756  int64_t *restrict pOutLocal = (int64_t *) pOut;
757 
758 #if DSPLIB_DEBUGPRINT
759  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
760 #endif
761 
762  typedef typename c7x::make_full_vector<int32_t>::type vec;
763  int32_t eleCount = c7x::element_count_of<vec>::value;
764 
765  int32_t *restrict pInLocal2 = pInLocal1 + eleCount;
766 #if DSPLIB_DEBUGPRINT
767  printf("Enter eleCount %d\n", eleCount);
768 #endif
769 
770  uint8_t *pBlock = pKerPrivArgs->bufPblock;
771 
772  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
773  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
774 
775  // Input samples
776  __SE0_OPEN(pInLocal1, se0Params);
777  __SE1_OPEN(pInLocal2, se1Params);
778 
779 #if DSPLIB_DEBUGPRINT
780  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
781 #endif
782 
783  typedef typename c7x::make_full_vector<int64_t>::type vec_out;
784 
785  vec_out out;
786  vec_out outa0;
787  vec_out outa1;
788  vec_out outb0;
789  vec_out outb1;
790 
791  out = (vec_out) 0;
792  int64_t result = 0;
793  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 2) {
794  vec a = c7x::strm_eng<0, vec>::get_adv();
795  // a.print();
796  vec b = c7x::strm_eng<1, vec>::get_adv();
797  // b.print();
798 
799  __vmpywd_vvw(a, a, outa0, outa1);
800  __vmpywd_vvw(b, b, outb0, outb1);
801 
802  out += (outa0 + outa1 + outb0 + outb1);
803 
804 #if DSPLIB_DEBUGPRINT
805 // DSPLIB_debugPrintVector(a);
806 // DSPLIB_debugPrintVector(outa);
807 #endif
808  }
809 
810  // out.print();
811 #if DSPLIB_DEBUGPRINT
812  // DSPLIB_debugPrintVector(out);
813 #endif
814  // use intrensic for horizontal add
815  result = __horizontal_add(out);
816 
817  *pOutLocal = result;
818 #if DSPLIB_DEBUGPRINT
819  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
820 #endif
821 
822  __SE0_CLOSE();
823  __SE1_CLOSE();
824 
825  return DSPLIB_SUCCESS;
826 }
827 
828 /**********************************************************************/
829 /* Execute for datatype uint32_t */
830 /**********************************************************************/
831 template <>
832 DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci<uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
833 {
834  DSPLIB_sqrAdd_PrivArgs *pKerPrivArgs = (DSPLIB_sqrAdd_PrivArgs *) handle;
835  uint32_t blockSize = pKerPrivArgs->blockSize;
836 
837  __SE_TEMPLATE_v1 se0Params;
838  __SE_TEMPLATE_v1 se1Params;
839 
840  uint32_t *restrict pInLocal1 = (uint32_t *) pIn;
841  uint64_t *restrict pOutLocal = (uint64_t *) pOut;
842 
843 #if DSPLIB_DEBUGPRINT
844  printf("Enter DSPLIB_sqrAdd_exec_ci\n");
845 #endif
846 
847  typedef typename c7x::make_full_vector<uint32_t>::type vec;
848  int32_t eleCount = c7x::element_count_of<vec>::value;
849 
850  uint32_t *restrict pInLocal2 = pInLocal1 + eleCount;
851 #if DSPLIB_DEBUGPRINT
852  printf("Enter eleCount %d\n", eleCount);
853 #endif
854 
855  uint8_t *pBlock = pKerPrivArgs->bufPblock;
856 
857  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
858  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
859 
860  // Input samples
861  __SE0_OPEN(pInLocal1, se0Params);
862  __SE1_OPEN(pInLocal2, se1Params);
863 
864 #if DSPLIB_DEBUGPRINT
865  printf("DSPLIB_DEBUGPRINT blockSize %d pInLocal1 %p pInLocal2 %p\n", blockSize, pInLocal1, pInLocal2);
866 #endif
867 
868  typedef typename c7x::make_full_vector<uint64_t>::type vec_out;
869 
870  vec_out out;
871  vec_out outa0;
872  vec_out outa1;
873  vec_out outb0;
874  vec_out outb1;
875 
876  out = (vec_out) 0;
877  uint64_t result = 0;
878  for (int32_t counter = 0; counter < blockSize; counter += eleCount * 2) {
879  vec a = c7x::strm_eng<0, vec>::get_adv();
880  // a.print();
881  vec b = c7x::strm_eng<1, vec>::get_adv();
882  // b.print();
883 
884  __vmpyuwd_vvw(a, a, outa0, outa1);
885  __vmpyuwd_vvw(b, b, outb0, outb1);
886 
887  out += (outa0 + outa1 + outb0 + outb1);
888 
889 #if DSPLIB_DEBUGPRINT
890 // DSPLIB_debugPrintVector(a);
891 // DSPLIB_debugPrintVector(outa);
892 #endif
893  }
894 
895  // out.print();
896 #if DSPLIB_DEBUGPRINT
897  // DSPLIB_debugPrintVector(out);
898 #endif
899  // use intrensic for horizontal add
900  result = __horizontal_add(out);
901 
902  *pOutLocal = result;
903 #if DSPLIB_DEBUGPRINT
904  printf("DSPLIB_DEBUGPRINT DSPLIB_sqrAdd_exec_ci result %lf\n", result);
905 #endif
906 
907  __SE0_CLOSE();
908  __SE1_CLOSE();
909 
910  return DSPLIB_SUCCESS;
911 }
912 
914 DSPLIB_sqrAdd_exec_ci<float>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
915 
917 DSPLIB_sqrAdd_exec_ci<double>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
918 
919 // template DSPLIB_STATUS
920 // DSPLIB_sqrAdd_exec_ci<int8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
921 
922 // template DSPLIB_STATUS
923 // DSPLIB_sqrAdd_exec_ci<uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
924 
925 template DSPLIB_STATUS
926 DSPLIB_sqrAdd_exec_ci<int16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
927 
928 template DSPLIB_STATUS
929 DSPLIB_sqrAdd_exec_ci<uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
930 
931 template DSPLIB_STATUS
932 DSPLIB_sqrAdd_exec_ci<int32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
933 
934 template DSPLIB_STATUS
935 DSPLIB_sqrAdd_exec_ci<uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
#define SE_SE0_PARAM_OFFSET
#define SE_SE1_PARAM_OFFSET
template DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< int8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_sqrAdd_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< int16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
static float DSPLIB_horiAdd(c7x::float_vec vector)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< int32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_sqrAdd_exec_ci< uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_sqrAdd_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_sqrAdd_InitArgs *pKerInitArgs)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_sqrAdd.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 1 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_SQRADD_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t blockSize
Size of input buffer for different batches DSPLIB_sqrAdd_init that will be retrieved and used by DSPL...