DSPLIB User Guide
DSPLIB_minerror_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "../common/DSPLIB_inlines.h"
48 #include "DSPLIB_minerror_priv.h"
49 #include <c7x_scalable.h>
50 #include <float.h>
51 
52 /*******************************************************************************
53  *
54  * DEFINES
55  *
56  ******************************************************************************/
57 #define SE_PARAM_BASE (0x0000)
58 #define SE_SE0_PARAM_OFFSET (SE_PARAM_BASE)
59 #define SE_SE1_PARAM_OFFSET (SE_SE0_PARAM_OFFSET + SE_PARAM_SIZE)
60 #define CURR_IDX_VEC_OFFSET (SE_SE1_PARAM_OFFSET + SE_PARAM_SIZE)
61 
// Kernel initialisation (optimized-C variant), templated on the input element type.
//
// NOTE(review): this listing omits the line that carries the function name and its
// first parameter (between the `template` line and `bufParamsIn`) and the line that
// declares `status`; both `handle` and `status` are used below. They are not
// reconstructed here.
//
// What the visible body does:
//   * reads vecInSize, errCoefsSize, strideIn and the parameter-block pointer from the
//     private kernel arguments reached through `handle`;
//   * per input data type, stores mainLoopCount in the private arguments and writes a
//     starting index vector (0, 1, 2, ...) at pBlock + CURR_IDX_VEC_OFFSET;
//   * fills one __SE_TEMPLATE_v1 and stores it at pBlock + SE_SE0_PARAM_OFFSET;
//   * stores bufParamsIn->data_type as a uint32_t at pBlock + SE_SE1_PARAM_OFFSET.
//
// bufParamsErrCoefs, bufParamsOutIndex, bufParamsOutVal and pKerInitArgs are not
// referenced anywhere in the visible body.
62 template <typename dataType>
64  const DSPLIB_bufParams2D_t *bufParamsIn,
65  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
66  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
67  const DSPLIB_bufParams1D_t *bufParamsOutVal,
68  const DSPLIB_minerror_InitArgs *pKerInitArgs)
69 {
70  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
72  __SE_TEMPLATE_v1 se0Params;
73 
74  __SE_ELETYPE SE_ELETYPE;
75  __SE_VECLEN SE_VECLEN;
76 
77  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
78 
79  uint8_t *pBlock = pKerPrivArgs->bufPblock;
80  uint32_t vecInSize = pKerPrivArgs->vecInSize;
81  uint32_t errCoefsSize = pKerPrivArgs->errCoefsSize;
82  uint32_t strideIn = pKerPrivArgs->strideIn;
83 
// Full-width vector of the input element type; its lane count and streaming-engine
// encodings drive everything below.
84  typedef typename c7x::make_full_vector<dataType>::type vec;
85  int32_t eleCount = c7x::element_count_of<vec>::value;
86  SE_VECLEN = c7x::se_veclen<vec>::value;
87  SE_ELETYPE = c7x::se_eletype<vec>::value;
88 
// Element size in bytes used to turn strideIn into an element stride. It starts at 8
// and is only lowered further down, so the debug print below always reports 8.
89  int32_t dataSize = 8;
90 
91  DSPLIB_DEBUGPRINTFN(0, "Enter eleCount %d datasize %d\n", eleCount, dataSize);
92 
93  // Initialize current index with promoted/conversion datatype
// 64-bit integers: loop count is ceil(vecInSize / (2 * eleCount)); indices are stored
// as doubles.
94  if (bufParamsIn->data_type == DSPLIB_INT64 || bufParamsIn->data_type == DSPLIB_UINT64) {
95  pKerPrivArgs->mainLoopCount = (int32_t) (((vecInSize + (eleCount * 2) - 1) / (eleCount * 2)));
96  double *vCurrIdx = (double *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET);
97 
98  for (int j = 0; j < eleCount; j++) {
99  vCurrIdx[j] = j;
100  }
101  }
// 32-bit integers: twice ceil(vecInSize / (2 * eleCount)) iterations; eleCount / 2
// indices are stored as int64_t.
102  else if (bufParamsIn->data_type == DSPLIB_INT32 || bufParamsIn->data_type == DSPLIB_UINT32) {
103  pKerPrivArgs->mainLoopCount = (int32_t) (((vecInSize + (eleCount * 2) - 1) / (eleCount * 2)) * 2);
104  int64_t *vCurrIdx = (int64_t *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET);
105 
106  for (int j = 0; j < eleCount / 2; j++) {
107  vCurrIdx[j] = j;
108  }
109  }
// 16-bit integers: twice ceil(vecInSize / eleCount) iterations; eleCount / 2 indices
// are stored as int32_t.
110  else if (bufParamsIn->data_type == DSPLIB_INT16 || bufParamsIn->data_type == DSPLIB_UINT16) {
111  pKerPrivArgs->mainLoopCount = (int32_t) (((vecInSize + eleCount - 1) / eleCount) * 2);
112  int32_t *vCurrIdx = (int32_t *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET);
113 
114  for (int j = 0; j < eleCount / 2; j++) {
115  vCurrIdx[j] = j;
116  }
117  }
// 8-bit integers: ceil(vecInSize / outEleCount) iterations, where outEleCount is half
// the lane count on 512-bit builds; indices are stored in the input element type.
118  else if (bufParamsIn->data_type == DSPLIB_INT8 || bufParamsIn->data_type == DSPLIB_UINT8) {
119 
120 #if __C7X_VEC_SIZE_BITS__ == 512
121  int32_t outEleCount = eleCount / 2;
122 #else
123  int32_t outEleCount = eleCount;
124 #endif
125  pKerPrivArgs->mainLoopCount = (int32_t) (((vecInSize + outEleCount - 1) / outEleCount));
126  dataType *vCurrIdx = (dataType *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET);
127 
128  for (int j = 0; j < eleCount; j++) {
129  vCurrIdx[j] = j;
130  }
131  }
// Every other data type (the float cases tested further down land here):
// ceil(vecInSize / (2 * eleCount)) iterations; indices are stored in the input type.
132  else {
133  dataType *vCurrIdx = (dataType *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET);
134  pKerPrivArgs->mainLoopCount = (int32_t) ((vecInSize + (eleCount * 2) - 1) / (eleCount * 2));
135 
136  for (int j = 0; j < eleCount; j++) {
137  vCurrIdx[j] = j;
138  }
139  }
140 
141  se0Params = __gen_SE_TEMPLATE_v1();
142  /**********************************************************************/
143  /* Prepare streaming engine 1 to fetch the input */
144  /**********************************************************************/
// NOTE(review): the banner above says "streaming engine 1", but the only template
// built here is se0Params and it is stored at SE_SE0_PARAM_OFFSET.
145  /* SET TRANSPOSE BOUNDARY */
// 8/16/32-bit integers and FLOAT32 transpose on a 32-bit boundary; everything else
// uses a 64-bit boundary.
146  if (bufParamsIn->data_type == DSPLIB_INT8 || bufParamsIn->data_type == DSPLIB_UINT8 ||
147  bufParamsIn->data_type == DSPLIB_INT16 || bufParamsIn->data_type == DSPLIB_UINT16 ||
148  bufParamsIn->data_type == DSPLIB_INT32 || bufParamsIn->data_type == DSPLIB_UINT32 ||
149  bufParamsIn->data_type == DSPLIB_FLOAT32) {
150  se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
151  }
152  else {
153  se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
154  }
155 
156 
157  /* INITIALIZE SE PARAMETERS */
// 64-bit integers and both float types: no promotion. dataSize stays 8 except for
// FLOAT32, where it becomes 4.
158  if (bufParamsIn->data_type == DSPLIB_INT64 || bufParamsIn->data_type == DSPLIB_UINT64 ||
159  bufParamsIn->data_type == DSPLIB_FLOAT32 || bufParamsIn->data_type == DSPLIB_FLOAT64) {
160  if (bufParamsIn->data_type == DSPLIB_FLOAT32) {
161  dataSize = 4;
162  }
163 
// temp = min(vecInSize, eleCount); it is reused for DIM2 below.
164  int32_t temp;
165  se0Params.ICNT0 = errCoefsSize; // col; // reach till end of columns
166  se0Params.ICNT1 = temp =
167  (vecInSize > (uint32_t) eleCount)
168  ? eleCount
169  : vecInSize; //(dataSize > eleCount) ? eleCount : dataSize; // total elements that can be read in columns
170  se0Params.DIM1 = strideIn / dataSize; // col; // upto col
171 
172  se0Params.ICNT2 = pKerPrivArgs->mainLoopCount; // need row/eleCount complete sweep of columns.
173 
// DIM2 advances by 2 * temp rows per outer step.
// NOTE(review): presumably because a second stream covers the rows in between -
// confirm against the code that opens the streaming engines.
174  se0Params.DIM2 = (strideIn / dataSize * temp * 2); // total elements in one sweep of columns
175  se0Params.ELETYPE = SE_ELETYPE;
176  se0Params.VECLEN = SE_VECLEN;
177  se0Params.DIMFMT = __SE_DIMFMT_3D;
178  }
// 32-bit integers: 2x promotion (sign- or zero-extended) with the vector length of a
// full int64_t vector.
179  else if (bufParamsIn->data_type == DSPLIB_INT32 || bufParamsIn->data_type == DSPLIB_UINT32) {
180  dataSize = 4;
// temp = min(vecInSize, eleCount / 2).
181  int32_t temp;
182  se0Params.ICNT0 = errCoefsSize; // col; // reach till end of columns
183  se0Params.ICNT1 = temp =
184  (vecInSize > (uint32_t) eleCount / 2)
185  ? eleCount / 2
186  : vecInSize; //(dataSize > eleCount) ? eleCount : dataSize; // total elements that can be read in columns
187  se0Params.DIM1 = strideIn / dataSize; // col; // upto col
188 
189  se0Params.ICNT2 = pKerPrivArgs->mainLoopCount; // need row/eleCount complete sweep of columns.
190 
191  se0Params.DIM2 = (strideIn / dataSize * temp * 2); // total elements in one sweep of columns
192  if (bufParamsIn->data_type == DSPLIB_INT32){
193  se0Params.PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
194  }
195  else{
196  se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
197  }
// ELETYPE is assigned twice in this branch (here and three lines below) with the same
// value; the repetition has no effect.
198  se0Params.ELETYPE = SE_ELETYPE;
199  typedef typename c7x::make_full_vector<int64_t>::type vec64;
200  SE_VECLEN = c7x::se_veclen<vec64>::value;
201  se0Params.ELETYPE = SE_ELETYPE;
202  se0Params.VECLEN = SE_VECLEN;
203  se0Params.DIMFMT = __SE_DIMFMT_3D;
204  }
// 16-bit integers: 2x promotion with the vector length of a full int32_t vector.
// ICNT1 is min(vecInSize, eleCount / 4), while DIM2 is computed from eleCount / 2 and
// not from the clamped ICNT1.
205  else if (bufParamsIn->data_type == DSPLIB_INT16 || bufParamsIn->data_type == DSPLIB_UINT16) {
206  dataSize = 2;
207  se0Params.ICNT0 = errCoefsSize; // col; // reach till end of columns
208  se0Params.ICNT1 = (vecInSize > (uint32_t) (eleCount / 4))
209  ? (eleCount / 4)
210  : vecInSize; // total elements that can be read in columns
211 
212  se0Params.DIM1 = strideIn / dataSize; // col; // upto col
213  se0Params.ICNT2 = pKerPrivArgs->mainLoopCount; // need row/eleCount complete sweep of columns.
214  se0Params.DIM2 = (strideIn / dataSize * eleCount / 2); // total elements in one sweep of columns
215 
216  if (bufParamsIn->data_type == DSPLIB_INT16){
217  se0Params.PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
218  }
219  else{
220  se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
221  }
222  se0Params.ELETYPE = SE_ELETYPE;
223  typedef typename c7x::make_full_vector<int32_t>::type vec32;
224  SE_VECLEN = c7x::se_veclen<vec32>::value;
225  se0Params.VECLEN = SE_VECLEN;
226  se0Params.DIMFMT = __SE_DIMFMT_3D;
227  }
// Remaining types (the 8-bit integers): no promotion, and strideIn is used directly as
// the element stride. ICNT1 is the literal 16 and is not clamped to vecInSize.
// NOTE(review): confirm 16 is intended for every supported vector width.
228  else {
229  dataSize = 1;
230  se0Params.VECLEN = SE_VECLEN;
231  se0Params.ICNT0 = errCoefsSize; // col; // reach till end of columns
232  se0Params.ICNT1 = 16; // total elements that can be read in columns
233  se0Params.DIM1 = strideIn; // col; // upto col
234  se0Params.ICNT2 = pKerPrivArgs->mainLoopCount; // need row/eleCount complete sweep of columns.
235  se0Params.DIM2 = strideIn * eleCount; // total elements in one sweep of columns
236  se0Params.ELETYPE = SE_ELETYPE;
237  se0Params.DIMFMT = __SE_DIMFMT_3D;
238 
// On 512-bit builds the DIM2 value set above is replaced by half of it.
239 #if __C7X_VEC_SIZE_BITS__ == 512
240  se0Params.DIM2 = strideIn * eleCount / 2; // total elements in one sweep of columns
241 #endif
242  }
243 
244 
// Persist the template in the SE0 slot. The SE1 slot does not receive a template here:
// its first four bytes are set to the input data type.
245  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET) = se0Params;
246  *(uint32_t *) ((uint8_t *) pBlock + (SE_SE1_PARAM_OFFSET)) = bufParamsIn->data_type;
247 
248  DSPLIB_DEBUGPRINTFN(0, "Exit function with status: %d\n", status);
249  return status;
250 }
251 
253  const DSPLIB_bufParams2D_t *bufParamsIn,
254  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
255  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
256  const DSPLIB_bufParams1D_t *bufParamsOutVal,
257  const DSPLIB_minerror_InitArgs *pKerInitArgs);
258 
260  const DSPLIB_bufParams2D_t *bufParamsIn,
261  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
262  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
263  const DSPLIB_bufParams1D_t *bufParamsOutVal,
264  const DSPLIB_minerror_InitArgs *pKerInitArgs);
265 
267  const DSPLIB_bufParams2D_t *bufParamsIn,
268  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
269  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
270  const DSPLIB_bufParams1D_t *bufParamsOutVal,
271  const DSPLIB_minerror_InitArgs *pKerInitArgs);
272 
274  const DSPLIB_bufParams2D_t *bufParamsIn,
275  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
276  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
277  const DSPLIB_bufParams1D_t *bufParamsOutVal,
278  const DSPLIB_minerror_InitArgs *pKerInitArgs);
279 
281  const DSPLIB_bufParams2D_t *bufParamsIn,
282  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
283  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
284  const DSPLIB_bufParams1D_t *bufParamsOutVal,
285  const DSPLIB_minerror_InitArgs *pKerInitArgs);
286 
288  const DSPLIB_bufParams2D_t *bufParamsIn,
289  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
290  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
291  const DSPLIB_bufParams1D_t *bufParamsOutVal,
292  const DSPLIB_minerror_InitArgs *pKerInitArgs);
293 
295  const DSPLIB_bufParams2D_t *bufParamsIn,
296  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
297  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
298  const DSPLIB_bufParams1D_t *bufParamsOutVal,
299  const DSPLIB_minerror_InitArgs *pKerInitArgs);
300 
302  const DSPLIB_bufParams2D_t *bufParamsIn,
303  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
304  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
305  const DSPLIB_bufParams1D_t *bufParamsOutVal,
306  const DSPLIB_minerror_InitArgs *pKerInitArgs);
307 
309  const DSPLIB_bufParams2D_t *bufParamsIn,
310  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
311  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
312  const DSPLIB_bufParams1D_t *bufParamsOutVal,
313  const DSPLIB_minerror_InitArgs *pKerInitArgs);
314 
316  const DSPLIB_bufParams2D_t *bufParamsIn,
317  const DSPLIB_bufParams1D_t *bufParamsErrCoefs,
318  const DSPLIB_bufParams1D_t *bufParamsOutIndex,
319  const DSPLIB_bufParams1D_t *bufParamsOutVal,
320  const DSPLIB_minerror_InitArgs *pKerInitArgs);
321 
322 template <typename FloatingPointDataType>
323 inline void minerror_exec_ci_float_inputs(void *restrict pErrCoefs,
324  const int *restrict pMaxIndex,
325  const void *restrict pMaxVal,
326  uint8_t *restrict pBlock,
327  uint32_t vecInSize,
328  uint32_t errCoefsSize,
329  int32_t mainLoopCount)
330 {
331  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
332  FloatingPointDataType *restrict pErrCoefsLocal = (FloatingPointDataType *) pErrCoefs;
333  int *restrict pMaxIndexLocal = (int *) pMaxIndex;
334  FloatingPointDataType *restrict pMaxValLocal = (FloatingPointDataType *) pMaxVal;
335 
336  typedef typename c7x::make_full_vector<FloatingPointDataType>::type vec;
337  int32_t eleCount = c7x::element_count_of<vec>::value;
338 
339  int32_t i;
340  *pMaxValLocal = (FloatingPointDataType) (std::numeric_limits<FloatingPointDataType>::min());
341 
342  vec errCoefs1 = vec(pErrCoefsLocal[0]);
343  vec errCoefs2 = vec(pErrCoefsLocal[1]);
344  vec errCoefs3 = vec(pErrCoefsLocal[2]);
345  vec errCoefs4 = vec(pErrCoefsLocal[3]);
346  vec errCoefs5 = vec(pErrCoefsLocal[4]);
347  vec errCoefs6 = vec(pErrCoefsLocal[5]);
348  vec errCoefs7 = vec(pErrCoefsLocal[6]);
349  vec errCoefs8 = vec(pErrCoefsLocal[7]);
350  vec errCoefs9 = vec(pErrCoefsLocal[8]);
351 
352  vec dotProduct = vec(0);
353  vec maxValVec = vec(*pMaxValLocal);
354 
355  vec vIdx = vec(0);
356  vec vCurrIdx = *((vec *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET));
357 
358  __vpred vpMask;
359 
360  DSPLIB_DEBUGPRINTFN(0, "mainLoopCount %d errCoefsSize %d vecInSize %d\n", mainLoopCount, errCoefsSize, vecInSize);
361 
362  for (i = 0; i < mainLoopCount; i++) {
363  dotProduct = vec(0);
364 
365  // 2SE fetch implementation
366  vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
367  vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
368  vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
369  vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
370  vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
371  vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
372  vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
373  vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
374  vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
375 
376  vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
377  vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
378  vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
379  vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
380  vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
381  vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
382  vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
383  vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
384  vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
385 
386  vec dotProduct1 = (tmp1_0 * errCoefs1);
387  vec dotProduct2 = (tmp2_0 * errCoefs2);
388  vec dotProduct3 = (tmp3_0 * errCoefs3);
389  vec dotProduct4 = (tmp4_0 * errCoefs4);
390  vec dotProduct5 = (tmp5_0 * errCoefs5);
391  vec dotProduct6 = (tmp6_0 * errCoefs6);
392  vec dotProduct7 = (tmp7_0 * errCoefs7);
393  vec dotProduct8 = (tmp8_0 * errCoefs8);
394  vec dotProduct9 = (tmp9_0 * errCoefs9);
395 
396  dotProduct = dotProduct + dotProduct1;
397  dotProduct = dotProduct + dotProduct2;
398  dotProduct = dotProduct + dotProduct3;
399  dotProduct = dotProduct + dotProduct4;
400  dotProduct = dotProduct + dotProduct5;
401  dotProduct = dotProduct + dotProduct6;
402  dotProduct = dotProduct + dotProduct7;
403  dotProduct = dotProduct + dotProduct8;
404  dotProduct = dotProduct + dotProduct9;
405 
406  // Vertical max comparison once the vector dot products are computed for se0 fetched data
407  vpMask = __cmp_lt_pred(dotProduct, maxValVec);
408  maxValVec = __select(vpMask, maxValVec, dotProduct);
409  vIdx = __select(vpMask, vIdx, vCurrIdx);
410  vCurrIdx = vCurrIdx + (eleCount);
411 
412  dotProduct = vec(0);
413  dotProduct1 = (tmp1_1 * errCoefs1);
414  dotProduct2 = (tmp2_1 * errCoefs2);
415  dotProduct3 = (tmp3_1 * errCoefs3);
416  dotProduct4 = (tmp4_1 * errCoefs4);
417  dotProduct5 = (tmp5_1 * errCoefs5);
418  dotProduct6 = (tmp6_1 * errCoefs6);
419  dotProduct7 = (tmp7_1 * errCoefs7);
420  dotProduct8 = (tmp8_1 * errCoefs8);
421  dotProduct9 = (tmp9_1 * errCoefs9);
422 
423  dotProduct = dotProduct + dotProduct1;
424  dotProduct = dotProduct + dotProduct2;
425  dotProduct = dotProduct + dotProduct3;
426  dotProduct = dotProduct + dotProduct4;
427  dotProduct = dotProduct + dotProduct5;
428  dotProduct = dotProduct + dotProduct6;
429  dotProduct = dotProduct + dotProduct7;
430  dotProduct = dotProduct + dotProduct8;
431  dotProduct = dotProduct + dotProduct9;
432 
433  // Vertical max comparison once the vector dot products are computed for se1 fetched data
434  vpMask = __cmp_lt_pred(dotProduct, maxValVec);
435  maxValVec = __select(vpMask, maxValVec, dotProduct);
436  vIdx = __select(vpMask, vIdx, vCurrIdx);
437  vCurrIdx = vCurrIdx + (eleCount);
438  }
439 
440  // Horizontal max computation
441  c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
442 
443  DSPLIB_DEBUGPRINTFN(0, "%s", "Exit function\n");
444  return;
445 }
446 
447 template <typename Integer64BitDataType, typename Integer64BitConversionDataType>
448 inline void minerror_exec_ci_signed_integer64_inputs(void *restrict pErrCoefs,
449  const int *restrict pMaxIndex,
450  const void *restrict pMaxVal,
451  uint8_t *restrict pBlock,
452  uint32_t vecInSize,
453  uint32_t errCoefsSize,
454  int32_t mainLoopCount)
455 {
456 
457  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
458 
459  // convert int64 to double to handle overflow and precision accuracy during multiplication and accumulation
460  // eg: for 64bits, multiply and accumulate in double precision
461  Integer64BitDataType *restrict pErrCoefsLocal = (Integer64BitDataType *) pErrCoefs;
462  int *restrict pMaxIndexLocal = (int *) pMaxIndex;
463  Integer64BitConversionDataType *restrict pMaxValLocal = (Integer64BitConversionDataType *) pMaxVal;
464 
465  typedef typename c7x::make_full_vector<Integer64BitDataType>::type vec;
466  int32_t eleCount = c7x::element_count_of<vec>::value;
467 
468  typedef typename c7x::make_full_vector<Integer64BitConversionDataType>::type vecConverted;
469  typedef typename c7x::make_full_vector<int32_t>::type vecIntermediate;
470 
471  int32_t i;
472  *pMaxValLocal = (Integer64BitConversionDataType) (std::numeric_limits<Integer64BitConversionDataType>::min());
473 
474  vecConverted errCoefs1 = vecConverted(pErrCoefsLocal[0]);
475  vecConverted errCoefs2 = vecConverted(pErrCoefsLocal[1]);
476  vecConverted errCoefs3 = vecConverted(pErrCoefsLocal[2]);
477  vecConverted errCoefs4 = vecConverted(pErrCoefsLocal[3]);
478  vecConverted errCoefs5 = vecConverted(pErrCoefsLocal[4]);
479  vecConverted errCoefs6 = vecConverted(pErrCoefsLocal[5]);
480  vecConverted errCoefs7 = vecConverted(pErrCoefsLocal[6]);
481  vecConverted errCoefs8 = vecConverted(pErrCoefsLocal[7]);
482  vecConverted errCoefs9 = vecConverted(pErrCoefsLocal[8]);
483 
484  vecConverted dotProduct = vecConverted(0);
485  vecConverted maxValVec = vecConverted(*pMaxValLocal);
486 
487  vecConverted vIdx = vecConverted(0);
488  vecConverted vCurrIdx = *((vecConverted *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET));
489 
490  vecConverted mulFactor = vecConverted(4294967296.00);
491 
492  __vpred vpMask;
493 
494  DSPLIB_DEBUGPRINTFN(0, "mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
495 
496  for (i = 0; i < mainLoopCount; i++) {
497  dotProduct = vecConverted(0);
498 
499  // 2SE fetch implementation
500 
501  vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
502  vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
503  vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
504  vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
505  vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
506  vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
507  vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
508  vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
509  vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
510 
511  vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
512  vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
513  vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
514  vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
515  vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
516  vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
517  vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
518  vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
519  vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
520 
521  vecIntermediate v16bits1_0 = convert_long_to_int<vecIntermediate, vec>(tmp1_0);
522  vecIntermediate v16bits2_0 = convert_long_to_int<vecIntermediate, vec>(tmp2_0);
523  vecIntermediate v16bits3_0 = convert_long_to_int<vecIntermediate, vec>(tmp3_0);
524  vecIntermediate v16bits4_0 = convert_long_to_int<vecIntermediate, vec>(tmp4_0);
525  vecIntermediate v16bits5_0 = convert_long_to_int<vecIntermediate, vec>(tmp5_0);
526  vecIntermediate v16bits6_0 = convert_long_to_int<vecIntermediate, vec>(tmp6_0);
527  vecIntermediate v16bits7_0 = convert_long_to_int<vecIntermediate, vec>(tmp7_0);
528  vecIntermediate v16bits8_0 = convert_long_to_int<vecIntermediate, vec>(tmp8_0);
529  vecIntermediate v16bits9_0 = convert_long_to_int<vecIntermediate, vec>(tmp9_0);
530 
531  vecConverted vhigh16bits1_0 = __high_int_to_double(v16bits1_0);
532  vecConverted vlow16bits1_0 = __low_int_to_double(v16bits1_0);
533  vecConverted vSum1_0 = vhigh16bits1_0 * mulFactor;
534  vSum1_0 = vSum1_0 + vlow16bits1_0;
535 
536  vecConverted vhigh16bits2_0 = __high_int_to_double(v16bits2_0);
537  vecConverted vlow16bits2_0 = __low_int_to_double(v16bits2_0);
538  vecConverted vSum2_0 = vhigh16bits2_0 * mulFactor;
539  vSum2_0 = vSum2_0 + vlow16bits2_0;
540 
541  vecConverted vhigh16bits3_0 = __high_int_to_double(v16bits3_0);
542  vecConverted vlow16bits3_0 = __low_int_to_double(v16bits3_0);
543  vecConverted vSum3_0 = vhigh16bits3_0 * mulFactor;
544  vSum3_0 = vSum3_0 + vlow16bits3_0;
545 
546  vecConverted vhigh16bits4_0 = __high_int_to_double(v16bits4_0);
547  vecConverted vlow16bits4_0 = __low_int_to_double(v16bits4_0);
548  vecConverted vSum4_0 = vhigh16bits4_0 * mulFactor;
549  vSum4_0 = vSum4_0 + vlow16bits4_0;
550 
551  vecConverted vhigh16bits5_0 = __high_int_to_double(v16bits5_0);
552  vecConverted vlow16bits5_0 = __low_int_to_double(v16bits5_0);
553  vecConverted vSum5_0 = vhigh16bits5_0 * mulFactor;
554  vSum5_0 = vSum5_0 + vlow16bits5_0;
555 
556  vecConverted vhigh16bits6_0 = __high_int_to_double(v16bits6_0);
557  vecConverted vlow16bits6_0 = __low_int_to_double(v16bits6_0);
558  vecConverted vSum6_0 = vhigh16bits6_0 * mulFactor;
559  vSum6_0 = vSum6_0 + vlow16bits6_0;
560 
561  vecConverted vhigh16bits7_0 = __high_int_to_double(v16bits7_0);
562  vecConverted vlow16bits7_0 = __low_int_to_double(v16bits7_0);
563  vecConverted vSum7_0 = vhigh16bits7_0 * mulFactor;
564  vSum7_0 = vSum7_0 + vlow16bits7_0;
565 
566  vecConverted vhigh16bits8_0 = __high_int_to_double(v16bits8_0);
567  vecConverted vlow16bits8_0 = __low_int_to_double(v16bits8_0);
568  vecConverted vSum8_0 = vhigh16bits8_0 * mulFactor;
569  vSum8_0 = vSum8_0 + vlow16bits8_0;
570 
571  vecConverted vhigh16bits9_0 = __high_int_to_double(v16bits9_0);
572  vecConverted vlow16bits9_0 = __low_int_to_double(v16bits9_0);
573  vecConverted vSum9_0 = vhigh16bits9_0 * mulFactor;
574  vSum9_0 = vSum9_0 + vlow16bits9_0;
575 
576  vecConverted dotProduct1 = (vSum1_0 * errCoefs1);
577  vecConverted dotProduct2 = (vSum2_0 * errCoefs2);
578  vecConverted dotProduct3 = (vSum3_0 * errCoefs3);
579  vecConverted dotProduct4 = (vSum4_0 * errCoefs4);
580  vecConverted dotProduct5 = (vSum5_0 * errCoefs5);
581  vecConverted dotProduct6 = (vSum6_0 * errCoefs6);
582  vecConverted dotProduct7 = (vSum7_0 * errCoefs7);
583  vecConverted dotProduct8 = (vSum8_0 * errCoefs8);
584  vecConverted dotProduct9 = (vSum9_0 * errCoefs9);
585 
586  dotProduct = dotProduct + dotProduct1;
587  dotProduct = dotProduct + dotProduct2;
588  dotProduct = dotProduct + dotProduct3;
589  dotProduct = dotProduct + dotProduct4;
590  dotProduct = dotProduct + dotProduct5;
591  dotProduct = dotProduct + dotProduct6;
592  dotProduct = dotProduct + dotProduct7;
593  dotProduct = dotProduct + dotProduct8;
594  dotProduct = dotProduct + dotProduct9;
595 
596  // Vertical max comparison once the vector dot products are computed for se0 feteched data
597  vpMask = __cmp_lt_pred(dotProduct, maxValVec);
598  maxValVec = __select(vpMask, maxValVec, dotProduct);
599  vIdx = __select(vpMask, vIdx, vCurrIdx);
600  vCurrIdx = vCurrIdx + (eleCount);
601 
602  dotProduct = vecConverted(0);
603 
604  vecIntermediate v16bits1_1 = convert_long_to_int<vecIntermediate, vec>(tmp1_1);
605  vecIntermediate v16bits2_1 = convert_long_to_int<vecIntermediate, vec>(tmp2_1);
606  vecIntermediate v16bits3_1 = convert_long_to_int<vecIntermediate, vec>(tmp3_1);
607  vecIntermediate v16bits4_1 = convert_long_to_int<vecIntermediate, vec>(tmp4_1);
608  vecIntermediate v16bits5_1 = convert_long_to_int<vecIntermediate, vec>(tmp5_1);
609  vecIntermediate v16bits6_1 = convert_long_to_int<vecIntermediate, vec>(tmp6_1);
610  vecIntermediate v16bits7_1 = convert_long_to_int<vecIntermediate, vec>(tmp7_1);
611  vecIntermediate v16bits8_1 = convert_long_to_int<vecIntermediate, vec>(tmp8_1);
612  vecIntermediate v16bits9_1 = convert_long_to_int<vecIntermediate, vec>(tmp9_1);
613 
614  vecConverted vhigh16bits1_1 = __high_int_to_double(v16bits1_1);
615  vecConverted vlow16bits1_1 = __low_int_to_double(v16bits1_1);
616  vecConverted vSum1_1 = vhigh16bits1_1 * mulFactor;
617  vSum1_1 = vSum1_1 + vlow16bits1_1;
618 
619  vecConverted vhigh16bits2_1 = __high_int_to_double(v16bits2_1);
620  vecConverted vlow16bits2_1 = __low_int_to_double(v16bits2_1);
621  vecConverted vSum2_1 = vhigh16bits2_1 * mulFactor;
622  vSum2_1 = vSum2_1 + vlow16bits2_1;
623 
624  vecConverted vhigh16bits3_1 = __high_int_to_double(v16bits3_1);
625  vecConverted vlow16bits3_1 = __low_int_to_double(v16bits3_1);
626  vecConverted vSum3_1 = vhigh16bits3_1 * mulFactor;
627  vSum3_1 = vSum3_1 + vlow16bits3_1;
628 
629  vecConverted vhigh16bits4_1 = __high_int_to_double(v16bits4_1);
630  vecConverted vlow16bits4_1 = __low_int_to_double(v16bits4_1);
631  vecConverted vSum4_1 = vhigh16bits4_1 * mulFactor;
632  vSum4_1 = vSum4_1 + vlow16bits4_1;
633 
634  vecConverted vhigh16bits5_1 = __high_int_to_double(v16bits5_1);
635  vecConverted vlow16bits5_1 = __low_int_to_double(v16bits5_1);
636  vecConverted vSum5_1 = vhigh16bits5_1 * mulFactor;
637  vSum5_1 = vSum5_1 + vlow16bits5_1;
638 
639  vecConverted vhigh16bits6_1 = __high_int_to_double(v16bits6_1);
640  vecConverted vlow16bits6_1 = __low_int_to_double(v16bits6_1);
641  vecConverted vSum6_1 = vhigh16bits6_1 * mulFactor;
642  vSum6_1 = vSum6_1 + vlow16bits6_1;
643 
644  vecConverted vhigh16bits7_1 = __high_int_to_double(v16bits7_1);
645  vecConverted vlow16bits7_1 = __low_int_to_double(v16bits7_1);
646  vecConverted vSum7_1 = vhigh16bits7_1 * mulFactor;
647  vSum7_1 = vSum7_1 + vlow16bits7_1;
648 
649  vecConverted vhigh16bits8_1 = __high_int_to_double(v16bits8_1);
650  vecConverted vlow16bits8_1 = __low_int_to_double(v16bits8_1);
651  vecConverted vSum8_1 = vhigh16bits8_1 * mulFactor;
652  vSum8_1 = vSum8_1 + vlow16bits8_1;
653 
654  vecConverted vhigh16bits9_1 = __high_int_to_double(v16bits9_1);
655  vecConverted vlow16bits9_1 = __low_int_to_double(v16bits9_1);
656  vecConverted vSum9_1 = vhigh16bits9_1 * mulFactor;
657  vSum9_1 = vSum9_1 + vlow16bits9_1;
658 
659  dotProduct1 = (vSum1_1 * errCoefs1);
660  dotProduct2 = (vSum2_1 * errCoefs2);
661  dotProduct3 = (vSum3_1 * errCoefs3);
662  dotProduct4 = (vSum4_1 * errCoefs4);
663  dotProduct5 = (vSum5_1 * errCoefs5);
664  dotProduct6 = (vSum6_1 * errCoefs6);
665  dotProduct7 = (vSum7_1 * errCoefs7);
666  dotProduct8 = (vSum8_1 * errCoefs8);
667  dotProduct9 = (vSum9_1 * errCoefs9);
668 
669  dotProduct = dotProduct + dotProduct1;
670  dotProduct = dotProduct + dotProduct2;
671  dotProduct = dotProduct + dotProduct3;
672  dotProduct = dotProduct + dotProduct4;
673  dotProduct = dotProduct + dotProduct5;
674  dotProduct = dotProduct + dotProduct6;
675  dotProduct = dotProduct + dotProduct7;
676  dotProduct = dotProduct + dotProduct8;
677  dotProduct = dotProduct + dotProduct9;
678 
679  // Vertical max comparison once the vector dot products are computed for se1 feteched data
680  vpMask = __cmp_lt_pred(dotProduct, maxValVec);
681  maxValVec = __select(vpMask, maxValVec, dotProduct);
682  vIdx = __select(vpMask, vIdx, vCurrIdx);
683  vCurrIdx = vCurrIdx + (eleCount);
684  }
685 
686  // Horizontal max computation
687  c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
688  double maxval = *pMaxValLocal;
689  *pMaxValLocal = ((Integer64BitDataType) maxval);
690 
691  DSPLIB_DEBUGPRINTFN(0, "%s", "Exit function\n");
692  return;
693 }
694 
// Vector (C7x) kernel for unsigned 64-bit inputs.
//
// In every loop iteration the kernel pulls nine vectors from streaming engine 0
// and nine from streaming engine 1, rebuilds each fetched vector in the
// conversion type as (high 32-bit half * 2^32 + low 32-bit half), multiplies the
// nine rebuilt vectors by the nine broadcast coefficients and adds the products.
// Each of the two sums is compared lane by lane against a running maximum and a
// running index vector is updated alongside. After the loop the per-lane
// results are reduced with c7x_horizontal_max_with_index().
//
// Template parameters:
//   Integer64BitDataType           - element type of the coefficients and of the
//                                    vectors read from the streaming engines.
//   Integer64BitConversionDataType - type used for all arithmetic and for *pMaxVal.
//                                    The comments and the *_int_to_double intrinsics
//                                    suggest double - NOTE(review): confirm at the
//                                    call site.
//
// Parameters:
//   pErrCoefs     - coefficient array; elements [0..8] are read.
//   pMaxIndex     - passed (with const cast away) to c7x_horizontal_max_with_index(),
//                   presumably as the destination of the winning index.
//   pMaxVal       - written despite its const qualifier: seeded before the loop and
//                   overwritten with the final maximum.
//   pBlock        - byte buffer; the initial index vector is loaded from
//                   pBlock + CURR_IDX_VEC_OFFSET.
//   vecInSize     - only printed in the debug trace in this function.
//   errCoefsSize  - only printed in the debug trace in this function.
//   mainLoopCount - number of loop iterations.
//
// NOTE(review): nothing in this function opens or configures the streaming
// engines, so they are assumed to be set up by the caller.
695 template <typename Integer64BitDataType, typename Integer64BitConversionDataType>
696 inline void minerror_exec_ci_unsigned_integer64_inputs(void *restrict pErrCoefs,
697  const int *restrict pMaxIndex,
698  const void *restrict pMaxVal,
699  uint8_t *restrict pBlock,
700  uint32_t vecInSize,
701  uint32_t errCoefsSize,
702  int32_t mainLoopCount)
703 {
704  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
705 
706  // convert uint64 to double to handle overflow and precision accuracy during multiplication and accumulation
707  // eg: for 64bits, multiply and accumulate in double precision
708 
     // Typed views of the untyped arguments; the casts also remove const from the
     // two output pointers.
709  Integer64BitDataType *restrict pErrCoefsLocal = (Integer64BitDataType *) pErrCoefs;
710  int *restrict pMaxIndexLocal = (int *) pMaxIndex;
711  Integer64BitConversionDataType *restrict pMaxValLocal = (Integer64BitConversionDataType *) pMaxVal;
712 
     // vec: full-width vector of the raw input type. Its lane count (eleCount) is
     // the amount by which the index vector advances after every comparison.
713  typedef typename c7x::make_full_vector<Integer64BitDataType>::type vec;
714  int32_t eleCount = c7x::element_count_of<vec>::value;
715 
     // vecConverted: full-width vector of the conversion type; all products, sums,
     // maxima and indices are held in this type.
716  typedef typename c7x::make_full_vector<Integer64BitConversionDataType>::type vecConverted;
717 
     // vecIntermediate: full-width vector of uint32_t that carries the 32-bit halves
     // of the fetched 64-bit lanes.
718  typedef typename c7x::make_full_vector<uint32_t>::type vecIntermediate;
719 
720 
721  int32_t i;
     // Seed for the running maximum.
     // NOTE(review): for a floating-point type std::numeric_limits<T>::min() is the
     // smallest positive normal value, not the most negative value
     // (that is std::numeric_limits<T>::lowest()). Confirm this seed is intended.
722  *pMaxValLocal = (Integer64BitConversionDataType) (std::numeric_limits<Integer64BitConversionDataType>::min());
723 
     // Broadcast each of the nine coefficients across a full vector.
724  vecConverted errCoefs1 = vecConverted(pErrCoefsLocal[0]);
725  vecConverted errCoefs2 = vecConverted(pErrCoefsLocal[1]);
726  vecConverted errCoefs3 = vecConverted(pErrCoefsLocal[2]);
727  vecConverted errCoefs4 = vecConverted(pErrCoefsLocal[3]);
728  vecConverted errCoefs5 = vecConverted(pErrCoefsLocal[4]);
729  vecConverted errCoefs6 = vecConverted(pErrCoefsLocal[5]);
730  vecConverted errCoefs7 = vecConverted(pErrCoefsLocal[6]);
731  vecConverted errCoefs8 = vecConverted(pErrCoefsLocal[7]);
732  vecConverted errCoefs9 = vecConverted(pErrCoefsLocal[8]);
733 
734  vecConverted dotProduct = vecConverted(0);
735  vecConverted maxValVec = vecConverted(*pMaxValLocal);
736 
     // vIdx holds, per lane, the index recorded for the current maximum.
     // vCurrIdx holds the indices of the candidates being compared; its initial
     // value is read from pBlock.
737  vecConverted vIdx = vecConverted(0);
738  vecConverted vCurrIdx = *((vecConverted *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET));
739 
     // 4294967296 = 2^32, the weight of the high 32-bit half of a 64-bit value.
740  vecConverted mulFactor = vecConverted(4294967296.00);
741 
742  __vpred vpMask;
743 
744  DSPLIB_DEBUGPRINTFN(0, "mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
745 
746  for (i = 0; i < mainLoopCount; i++) {
747  dotProduct = vecConverted(0);
748 
749  // 2SE fetch implementation
750 
     // Nine vectors from streaming engine 0. Every get_adv() call advances the
     // engine, so the order of these calls decides which data each name receives.
751  vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
752  vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
753  vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
754  vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
755  vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
756  vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
757  vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
758  vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
759  vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
760 
     // Nine vectors from streaming engine 1; they are processed after the
     // streaming engine 0 data below.
761  vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
762  vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
763  vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
764  vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
765  vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
766  vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
767  vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
768  vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
769  vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
770 
     // Turn each fetched vector into a uint32_t vector (helper defined outside this
     // block). Despite the "16bits" in the names below, these vectors and the
     // high/low parts derived from them hold 32-bit halves.
771  vecIntermediate v16bits1_0 = convert_long_to_int<vecIntermediate, vec>(tmp1_0);
772  vecIntermediate v16bits2_0 = convert_long_to_int<vecIntermediate, vec>(tmp2_0);
773  vecIntermediate v16bits3_0 = convert_long_to_int<vecIntermediate, vec>(tmp3_0);
774  vecIntermediate v16bits4_0 = convert_long_to_int<vecIntermediate, vec>(tmp4_0);
775  vecIntermediate v16bits5_0 = convert_long_to_int<vecIntermediate, vec>(tmp5_0);
776  vecIntermediate v16bits6_0 = convert_long_to_int<vecIntermediate, vec>(tmp6_0);
777  vecIntermediate v16bits7_0 = convert_long_to_int<vecIntermediate, vec>(tmp7_0);
778  vecIntermediate v16bits8_0 = convert_long_to_int<vecIntermediate, vec>(tmp8_0);
779  vecIntermediate v16bits9_0 = convert_long_to_int<vecIntermediate, vec>(tmp9_0);
780 
     // Rebuild each value in the conversion type: vSum = high * 2^32 + low.
781  vecConverted vhigh16bits1_0 = __high_int_to_double(v16bits1_0);
782  vecConverted vlow16bits1_0 = __low_int_to_double(v16bits1_0);
783  vecConverted vSum1_0 = vhigh16bits1_0 * mulFactor;
784  vSum1_0 = vSum1_0 + vlow16bits1_0;
785 
786  vecConverted vhigh16bits2_0 = __high_int_to_double(v16bits2_0);
787  vecConverted vlow16bits2_0 = __low_int_to_double(v16bits2_0);
788  vecConverted vSum2_0 = vhigh16bits2_0 * mulFactor;
789  vSum2_0 = vSum2_0 + vlow16bits2_0;
790 
791  vecConverted vhigh16bits3_0 = __high_int_to_double(v16bits3_0);
792  vecConverted vlow16bits3_0 = __low_int_to_double(v16bits3_0);
793  vecConverted vSum3_0 = vhigh16bits3_0 * mulFactor;
794  vSum3_0 = vSum3_0 + vlow16bits3_0;
795 
796  vecConverted vhigh16bits4_0 = __high_int_to_double(v16bits4_0);
797  vecConverted vlow16bits4_0 = __low_int_to_double(v16bits4_0);
798  vecConverted vSum4_0 = vhigh16bits4_0 * mulFactor;
799  vSum4_0 = vSum4_0 + vlow16bits4_0;
800 
801  vecConverted vhigh16bits5_0 = __high_int_to_double(v16bits5_0);
802  vecConverted vlow16bits5_0 = __low_int_to_double(v16bits5_0);
803  vecConverted vSum5_0 = vhigh16bits5_0 * mulFactor;
804  vSum5_0 = vSum5_0 + vlow16bits5_0;
805 
806  vecConverted vhigh16bits6_0 = __high_int_to_double(v16bits6_0);
807  vecConverted vlow16bits6_0 = __low_int_to_double(v16bits6_0);
808  vecConverted vSum6_0 = vhigh16bits6_0 * mulFactor;
809  vSum6_0 = vSum6_0 + vlow16bits6_0;
810 
811  vecConverted vhigh16bits7_0 = __high_int_to_double(v16bits7_0);
812  vecConverted vlow16bits7_0 = __low_int_to_double(v16bits7_0);
813  vecConverted vSum7_0 = vhigh16bits7_0 * mulFactor;
814  vSum7_0 = vSum7_0 + vlow16bits7_0;
815 
816  vecConverted vhigh16bits8_0 = __high_int_to_double(v16bits8_0);
817  vecConverted vlow16bits8_0 = __low_int_to_double(v16bits8_0);
818  vecConverted vSum8_0 = vhigh16bits8_0 * mulFactor;
819  vSum8_0 = vSum8_0 + vlow16bits8_0;
820 
821  vecConverted vhigh16bits9_0 = __high_int_to_double(v16bits9_0);
822  vecConverted vlow16bits9_0 = __low_int_to_double(v16bits9_0);
823  vecConverted vSum9_0 = vhigh16bits9_0 * mulFactor;
824  vSum9_0 = vSum9_0 + vlow16bits9_0;
825 
     // Lane-wise products of the nine rebuilt vectors with the nine coefficients.
826  vecConverted dotProduct1 = (vSum1_0 * errCoefs1);
827  vecConverted dotProduct2 = (vSum2_0 * errCoefs2);
828  vecConverted dotProduct3 = (vSum3_0 * errCoefs3);
829  vecConverted dotProduct4 = (vSum4_0 * errCoefs4);
830  vecConverted dotProduct5 = (vSum5_0 * errCoefs5);
831  vecConverted dotProduct6 = (vSum6_0 * errCoefs6);
832  vecConverted dotProduct7 = (vSum7_0 * errCoefs7);
833  vecConverted dotProduct8 = (vSum8_0 * errCoefs8);
834  vecConverted dotProduct9 = (vSum9_0 * errCoefs9);
835 
     // Accumulate in a fixed order, product 1 first and product 9 last.
836  dotProduct = dotProduct + dotProduct1;
837  dotProduct = dotProduct + dotProduct2;
838  dotProduct = dotProduct + dotProduct3;
839  dotProduct = dotProduct + dotProduct4;
840  dotProduct = dotProduct + dotProduct5;
841  dotProduct = dotProduct + dotProduct6;
842  dotProduct = dotProduct + dotProduct7;
843  dotProduct = dotProduct + dotProduct8;
844  dotProduct = dotProduct + dotProduct9;
845 
846  // Vertical max comparison once the vector dot products are computed for se0 fetched data
     // Assuming __select(p, a, b) yields a where p is set: lanes whose new sum is
     // below the running maximum keep the old maximum and index; all other lanes,
     // ties included, take the new sum and the current index.
847  vpMask = __cmp_lt_pred(dotProduct, maxValVec);
848  maxValVec = __select(vpMask, maxValVec, dotProduct);
849  vIdx = __select(vpMask, vIdx, vCurrIdx);
     // Advance the candidate indices by one vector's worth of lanes.
850  vCurrIdx = vCurrIdx + (eleCount);
851 
     // Restart the accumulator for the streaming engine 1 data.
852  dotProduct = vecConverted(0);
853 
     // Same split / rebuild / multiply / accumulate sequence for the nine vectors
     // fetched from streaming engine 1.
854  vecIntermediate v16bits1_1 = convert_long_to_int<vecIntermediate, vec>(tmp1_1);
855  vecIntermediate v16bits2_1 = convert_long_to_int<vecIntermediate, vec>(tmp2_1);
856  vecIntermediate v16bits3_1 = convert_long_to_int<vecIntermediate, vec>(tmp3_1);
857  vecIntermediate v16bits4_1 = convert_long_to_int<vecIntermediate, vec>(tmp4_1);
858  vecIntermediate v16bits5_1 = convert_long_to_int<vecIntermediate, vec>(tmp5_1);
859  vecIntermediate v16bits6_1 = convert_long_to_int<vecIntermediate, vec>(tmp6_1);
860  vecIntermediate v16bits7_1 = convert_long_to_int<vecIntermediate, vec>(tmp7_1);
861  vecIntermediate v16bits8_1 = convert_long_to_int<vecIntermediate, vec>(tmp8_1);
862  vecIntermediate v16bits9_1 = convert_long_to_int<vecIntermediate, vec>(tmp9_1);
863 
864  vecConverted vhigh16bits1_1 = __high_int_to_double(v16bits1_1);
865  vecConverted vlow16bits1_1 = __low_int_to_double(v16bits1_1);
866  vecConverted vSum1_1 = vhigh16bits1_1 * mulFactor;
867  vSum1_1 = vSum1_1 + vlow16bits1_1;
868 
869  vecConverted vhigh16bits2_1 = __high_int_to_double(v16bits2_1);
870  vecConverted vlow16bits2_1 = __low_int_to_double(v16bits2_1);
871  vecConverted vSum2_1 = vhigh16bits2_1 * mulFactor;
872  vSum2_1 = vSum2_1 + vlow16bits2_1;
873 
874  vecConverted vhigh16bits3_1 = __high_int_to_double(v16bits3_1);
875  vecConverted vlow16bits3_1 = __low_int_to_double(v16bits3_1);
876  vecConverted vSum3_1 = vhigh16bits3_1 * mulFactor;
877  vSum3_1 = vSum3_1 + vlow16bits3_1;
878 
879  vecConverted vhigh16bits4_1 = __high_int_to_double(v16bits4_1);
880  vecConverted vlow16bits4_1 = __low_int_to_double(v16bits4_1);
881  vecConverted vSum4_1 = vhigh16bits4_1 * mulFactor;
882  vSum4_1 = vSum4_1 + vlow16bits4_1;
883 
884  vecConverted vhigh16bits5_1 = __high_int_to_double(v16bits5_1);
885  vecConverted vlow16bits5_1 = __low_int_to_double(v16bits5_1);
886  vecConverted vSum5_1 = vhigh16bits5_1 * mulFactor;
887  vSum5_1 = vSum5_1 + vlow16bits5_1;
888 
889  vecConverted vhigh16bits6_1 = __high_int_to_double(v16bits6_1);
890  vecConverted vlow16bits6_1 = __low_int_to_double(v16bits6_1);
891  vecConverted vSum6_1 = vhigh16bits6_1 * mulFactor;
892  vSum6_1 = vSum6_1 + vlow16bits6_1;
893 
894  vecConverted vhigh16bits7_1 = __high_int_to_double(v16bits7_1);
895  vecConverted vlow16bits7_1 = __low_int_to_double(v16bits7_1);
896  vecConverted vSum7_1 = vhigh16bits7_1 * mulFactor;
897  vSum7_1 = vSum7_1 + vlow16bits7_1;
898 
899  vecConverted vhigh16bits8_1 = __high_int_to_double(v16bits8_1);
900  vecConverted vlow16bits8_1 = __low_int_to_double(v16bits8_1);
901  vecConverted vSum8_1 = vhigh16bits8_1 * mulFactor;
902  vSum8_1 = vSum8_1 + vlow16bits8_1;
903 
904  vecConverted vhigh16bits9_1 = __high_int_to_double(v16bits9_1);
905  vecConverted vlow16bits9_1 = __low_int_to_double(v16bits9_1);
906  vecConverted vSum9_1 = vhigh16bits9_1 * mulFactor;
907  vSum9_1 = vSum9_1 + vlow16bits9_1;
908 
     // The dotProduct1..9 temporaries are reused for the streaming engine 1 products.
909  dotProduct1 = (vSum1_1 * errCoefs1);
910  dotProduct2 = (vSum2_1 * errCoefs2);
911  dotProduct3 = (vSum3_1 * errCoefs3);
912  dotProduct4 = (vSum4_1 * errCoefs4);
913  dotProduct5 = (vSum5_1 * errCoefs5);
914  dotProduct6 = (vSum6_1 * errCoefs6);
915  dotProduct7 = (vSum7_1 * errCoefs7);
916  dotProduct8 = (vSum8_1 * errCoefs8);
917  dotProduct9 = (vSum9_1 * errCoefs9);
918 
919  dotProduct = dotProduct + dotProduct1;
920  dotProduct = dotProduct + dotProduct2;
921  dotProduct = dotProduct + dotProduct3;
922  dotProduct = dotProduct + dotProduct4;
923  dotProduct = dotProduct + dotProduct5;
924  dotProduct = dotProduct + dotProduct6;
925  dotProduct = dotProduct + dotProduct7;
926  dotProduct = dotProduct + dotProduct8;
927  dotProduct = dotProduct + dotProduct9;
928 
929  // Vertical max comparison once the vector dot products are computed for se1 fetched data
930  vpMask = __cmp_lt_pred(dotProduct, maxValVec);
931  maxValVec = __select(vpMask, maxValVec, dotProduct);
932  vIdx = __select(vpMask, vIdx, vCurrIdx);
933  vCurrIdx = vCurrIdx + (eleCount);
934  }
935 
936  // Horizontal max computation
     // Reduces the per-lane maxima and indices (helper defined outside this block);
     // it receives both output pointers.
937  c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
     // Round-trip the result through the integer input type before storing it back
     // in the conversion type; the cast drops any fractional part.
938  double maxval = *pMaxValLocal;
939  *pMaxValLocal = ((Integer64BitDataType) maxval);
940  DSPLIB_DEBUGPRINTFN(0, "%s", "Exit function\n");
941  return;
942 }
943 
944 // supports both signed and unsigned datatypes
//
// Vector (C7x) kernel for 32-bit inputs.
//
// In every loop iteration the kernel pulls nine vectors from streaming engine 0
// and nine from streaming engine 1, already in the promoted element type.
// Each group of nine is multiplied lane by lane with the nine broadcast
// coefficients and the products are combined by a shift-and-add tree. Each of
// the two results is compared against a running per-lane maximum with
// __max_index() and the index vector is updated. After the loop the per-lane
// results are reduced with c7x_horizontal_max_with_index().
//
// Template parameters:
//   Integer32BitDataType         - element type of the coefficient array.
//   Integer32BitPromotedDataType - element type of the vectors, of the arithmetic
//                                  and of *pMaxVal.
//
// Parameters:
//   pErrCoefs     - coefficient array; elements [0..8] are read.
//   pMaxIndex     - passed (with const cast away) to c7x_horizontal_max_with_index(),
//                   presumably as the destination of the winning index.
//   pMaxVal       - written despite its const qualifier: seeded before the loop and
//                   passed to c7x_horizontal_max_with_index() afterwards.
//   pBlock        - byte buffer; the initial index vector is loaded from
//                   pBlock + CURR_IDX_VEC_OFFSET.
//   vecInSize     - only printed in the debug trace in this function.
//   errCoefsSize  - only printed in the debug trace in this function.
//   mainLoopCount - number of loop iterations.
//
// NOTE(review): nothing in this function opens or configures the streaming
// engines, so they are assumed to be set up by the caller.
945 template <typename Integer32BitDataType, typename Integer32BitPromotedDataType>
946 inline void minerror_exec_ci_integer32_inputs(void *restrict pErrCoefs,
947  const int *restrict pMaxIndex,
948  const void *restrict pMaxVal,
949  uint8_t *restrict pBlock,
950  uint32_t vecInSize,
951  uint32_t errCoefsSize,
952  int32_t mainLoopCount)
953 {
954 
955  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
956  // SE promoted to 64 bits in transpose mode to handle overflow
957  // eg: for 32bits, multiply and accumulate with right shift in 64bits
958 
     // Typed views of the untyped arguments; the casts also remove const from the
     // two output pointers.
959  Integer32BitDataType *restrict pErrCoefsLocal = (Integer32BitDataType *) pErrCoefs;
960  int *restrict pMaxIndexLocal = (int *) pMaxIndex;
961  Integer32BitPromotedDataType *restrict pMaxValLocal = (Integer32BitPromotedDataType *) pMaxVal;
962 
     // vec: full-width vector of the promoted type. Its lane count (eleCount) is
     // the amount by which the index vector advances after every comparison.
963  typedef typename c7x::make_full_vector<Integer32BitPromotedDataType>::type vec;
964  int32_t eleCount = c7x::element_count_of<vec>::value;
965 
966 
967  int32_t i;
     // Seed the running maximum with the promoted type's numeric_limits min().
968  *pMaxValLocal = (Integer32BitPromotedDataType) (std::numeric_limits<Integer32BitPromotedDataType>::min());
969 
     // Broadcast each of the nine coefficients across a full promoted vector.
970  vec errCoefs1 = vec(pErrCoefsLocal[0]);
971  vec errCoefs2 = vec(pErrCoefsLocal[1]);
972  vec errCoefs3 = vec(pErrCoefsLocal[2]);
973  vec errCoefs4 = vec(pErrCoefsLocal[3]);
974  vec errCoefs5 = vec(pErrCoefsLocal[4]);
975  vec errCoefs6 = vec(pErrCoefsLocal[5]);
976  vec errCoefs7 = vec(pErrCoefsLocal[6]);
977  vec errCoefs8 = vec(pErrCoefsLocal[7]);
978  vec errCoefs9 = vec(pErrCoefsLocal[8]);
979 
980  vec dotProduct = vec(0);
981  vec maxValVec = vec(*pMaxValLocal);
982 
     // vIdx holds, per lane, the index recorded for the current maximum.
     // vCurrIdx holds the indices of the candidates being compared; its initial
     // value is read from pBlock.
983  vec vIdx = vec(0);
984  vec vCurrIdx = *((vec *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET));
985  __vpred vpMask;
986 
987  DSPLIB_DEBUGPRINTFN(0, "mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
988 
989  for (i = 0; i < mainLoopCount; i++) {
990  dotProduct = vec(0);
991 
992  // 2SE fetch implementation
993 
     // Nine vectors from streaming engine 0. Every get_adv() call advances the
     // engine, so the order of these calls decides which data each name receives.
994  vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
995  vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
996  vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
997  vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
998  vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
999  vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
1000  vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
1001  vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
1002  vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
1003 
     // Nine vectors from streaming engine 1; they are processed after the
     // streaming engine 0 data below.
1004  vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
1005  vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
1006  vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
1007  vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
1008  vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
1009  vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
1010  vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
1011  vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
1012  vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
1013 
     // Lane-wise products of the streaming engine 0 vectors with the coefficients.
1014  vec dotProduct1 = (tmp1_0 * errCoefs1);
1015  vec dotProduct2 = (tmp2_0 * errCoefs2);
1016  vec dotProduct3 = (tmp3_0 * errCoefs3);
1017  vec dotProduct4 = (tmp4_0 * errCoefs4);
1018  vec dotProduct5 = (tmp5_0 * errCoefs5);
1019  vec dotProduct6 = (tmp6_0 * errCoefs6);
1020  vec dotProduct7 = (tmp7_0 * errCoefs7);
1021  vec dotProduct8 = (tmp8_0 * errCoefs8);
1022  vec dotProduct9 = (tmp9_0 * errCoefs9);
1023 
     // Shift-and-add tree: every operand is shifted right by one before each
     // addition. Products 1..8 pass through four such shifts in total, product 9
     // through only one, so product 9 carries more weight in the result.
     // NOTE(review): confirm this uneven scaling matches the reference implementation.
1024  vec acc1 = (__shift_right(dotProduct1, vec(1))) + (__shift_right(dotProduct2, vec(1)));
1025  vec acc2 = (__shift_right(dotProduct3, vec(1))) + (__shift_right(dotProduct4, vec(1)));
1026  vec acc3 = (__shift_right(dotProduct5, vec(1))) + (__shift_right(dotProduct6, vec(1)));
1027  vec acc4 = (__shift_right(dotProduct7, vec(1))) + (__shift_right(dotProduct8, vec(1)));
1028  vec acc5 = (__shift_right(acc1, vec(1))) + (__shift_right(acc2, vec(1)));
1029  vec acc6 = (__shift_right(acc3, vec(1))) + (__shift_right(acc4, vec(1)));
1030  vec acc7 = (__shift_right(acc5, vec(1))) + (__shift_right(acc6, vec(1)));
1031  dotProduct = (__shift_right(acc7, vec(1))) + (__shift_right(dotProduct9, vec(1)));
1032 
1033  // Vertical max comparison once the vector dot products are computed for se0 fetched data
     // maxValVec is not assigned anywhere else in the loop, so __max_index()
     // presumably updates it in place and sets vpMask for the lanes where the new
     // value won; those lanes then take the current index.
1034  __max_index(dotProduct, maxValVec, vpMask);
1035  vIdx = __select(vpMask, vCurrIdx, vIdx);
     // Advance the candidate indices by one vector's worth of lanes.
1036  vCurrIdx = vCurrIdx + (eleCount);
1037 
1038  dotProduct = vec(0);
1039 
     // Same multiply / shift-and-add / compare sequence for the streaming engine 1
     // vectors, reusing the temporaries declared above.
1040  dotProduct1 = (tmp1_1 * errCoefs1);
1041  dotProduct2 = (tmp2_1 * errCoefs2);
1042  dotProduct3 = (tmp3_1 * errCoefs3);
1043  dotProduct4 = (tmp4_1 * errCoefs4);
1044  dotProduct5 = (tmp5_1 * errCoefs5);
1045  dotProduct6 = (tmp6_1 * errCoefs6);
1046  dotProduct7 = (tmp7_1 * errCoefs7);
1047  dotProduct8 = (tmp8_1 * errCoefs8);
1048  dotProduct9 = (tmp9_1 * errCoefs9);
1049 
1050  acc1 = (__shift_right(dotProduct1, vec(1))) + (__shift_right(dotProduct2, vec(1)));
1051  acc2 = (__shift_right(dotProduct3, vec(1))) + (__shift_right(dotProduct4, vec(1)));
1052  acc3 = (__shift_right(dotProduct5, vec(1))) + (__shift_right(dotProduct6, vec(1)));
1053  acc4 = (__shift_right(dotProduct7, vec(1))) + (__shift_right(dotProduct8, vec(1)));
1054  acc5 = (__shift_right(acc1, vec(1))) + (__shift_right(acc2, vec(1)));
1055  acc6 = (__shift_right(acc3, vec(1))) + (__shift_right(acc4, vec(1)));
1056  acc7 = (__shift_right(acc5, vec(1))) + (__shift_right(acc6, vec(1)));
1057  dotProduct = (__shift_right(acc7, vec(1))) + (__shift_right(dotProduct9, vec(1)));
1058 
1059  // Vertical max comparison once the vector dot products are computed for se1 fetched data
1060  __max_index(dotProduct, maxValVec, vpMask);
1061  vIdx = __select(vpMask, vCurrIdx, vIdx);
1062  vCurrIdx = vCurrIdx + (eleCount);
1063  }
1064 
1065  // Horizontal max computation
     // Reduces the per-lane maxima and indices (helper defined outside this block);
     // it receives both output pointers.
1066  c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
1067 
1068  DSPLIB_DEBUGPRINTFN(0, "%s", "Exit function\n");
1069  return;
1070 }
1071 
1072 // supports both signed and unsigned datatypes
//
// Vector (C7x) kernel for 16-bit inputs.
//
// In every loop iteration the kernel pulls five vectors from streaming engine 0
// and five from streaming engine 1, alternating between the two engines. Each
// SE0/SE1 pair is combined with __pack_consec_low() / __pack_consec_high() into
// an "even" and an "odd" vector; nine of the ten possible packed vectors are
// used. They are multiplied lane by lane with the nine broadcast coefficients
// and the products are combined by a shift-and-add tree. Unlike the wider
// kernels there is a single comparison against the running maximum per
// iteration. After the loop the per-lane results are reduced with
// c7x_horizontal_max_with_index().
//
// Template parameters:
//   Integer16BitDataType         - element type of the coefficient array.
//   Integer16BitPromotedDataType - element type of the vectors, of the arithmetic
//                                  and of *pMaxVal.
//
// Parameters:
//   pErrCoefs     - coefficient array; elements [0..8] are read.
//   pMaxIndex     - passed (with const cast away) to c7x_horizontal_max_with_index(),
//                   presumably as the destination of the winning index.
//   pMaxVal       - written despite its const qualifier: seeded before the loop and
//                   passed to c7x_horizontal_max_with_index() afterwards.
//   pBlock        - byte buffer; the initial index vector is loaded from
//                   pBlock + CURR_IDX_VEC_OFFSET.
//   vecInSize     - only printed in the debug trace in this function.
//   errCoefsSize  - only printed in the debug trace in this function.
//   mainLoopCount - number of loop iterations.
//
// NOTE(review): nothing in this function opens or configures the streaming
// engines, so they are assumed to be set up by the caller.
1073 template <typename Integer16BitDataType, typename Integer16BitPromotedDataType>
1074 inline void minerror_exec_ci_integer16_inputs(void *restrict pErrCoefs,
1075  const int *restrict pMaxIndex,
1076  const void *restrict pMaxVal,
1077  uint8_t *restrict pBlock,
1078  uint32_t vecInSize,
1079  uint32_t errCoefsSize,
1080  int32_t mainLoopCount)
1081 {
1082 
1083  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1084  // SE promoted to 32 bits in transpose mode to handle overflow
1085  // eg: for 16bits, multiply and accumulate with right shift in 32bits
1086 
      // Typed views of the untyped arguments; the casts also remove const from the
      // two output pointers.
1087  Integer16BitDataType *restrict pErrCoefsLocal = (Integer16BitDataType *) pErrCoefs;
1088  int *restrict pMaxIndexLocal = (int *) pMaxIndex;
1089  Integer16BitPromotedDataType *restrict pMaxValLocal = (Integer16BitPromotedDataType *) pMaxVal;
1090 
      // vec: full-width vector of the promoted type. Its lane count (eleCount) is
      // the amount by which the index vector advances once per iteration.
1091  typedef typename c7x::make_full_vector<Integer16BitPromotedDataType>::type vec;
1092  int16_t eleCount = (c7x::element_count_of<vec>::value);
1093 
1094  int32_t i;
      // Seed the running maximum with the promoted type's numeric_limits min().
1095  *pMaxValLocal = (Integer16BitPromotedDataType) (std::numeric_limits<Integer16BitPromotedDataType>::min());
1096 
      // Broadcast each of the nine coefficients across a full promoted vector.
1097  vec errCoefs1 = vec(pErrCoefsLocal[0]);
1098  vec errCoefs2 = vec(pErrCoefsLocal[1]);
1099  vec errCoefs3 = vec(pErrCoefsLocal[2]);
1100  vec errCoefs4 = vec(pErrCoefsLocal[3]);
1101  vec errCoefs5 = vec(pErrCoefsLocal[4]);
1102  vec errCoefs6 = vec(pErrCoefsLocal[5]);
1103  vec errCoefs7 = vec(pErrCoefsLocal[6]);
1104  vec errCoefs8 = vec(pErrCoefsLocal[7]);
1105  vec errCoefs9 = vec(pErrCoefsLocal[8]);
1106 
1107  vec dotProduct = vec(0);
1108  vec maxValVec = vec(*pMaxValLocal);
1109 
      // vIdx holds, per lane, the index recorded for the current maximum.
      // vCurrIdx holds the indices of the candidates being compared; its initial
      // value is read from pBlock.
1110  vec vIdx = vec(0);
1111  vec vCurrIdx = *((vec *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET));
1112  __vpred vpMask;
1113 
1114  DSPLIB_DEBUGPRINTFN(0, " mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
1115 
1116  for (i = 0; i < mainLoopCount; i++) {
1117  dotProduct = vec(0);
1118 
      // Fetch one vector from each streaming engine and pack the pair. Judging by
      // the variable names the packs gather the even- and odd-positioned elements
      // of the two inputs - NOTE(review): confirm against the C7x intrinsic
      // reference. Every get_adv() call advances its engine, so the order of the
      // calls on each engine matters.
1119  vec loadVec1 = c7x::strm_eng<0, vec>::get_adv();
1120  vec loadVec2 = c7x::strm_eng<1, vec>::get_adv();
1121 
1122  vec checkEven1 = __pack_consec_low(loadVec2, loadVec1);
1123  vec checkOdd1 = __pack_consec_high(loadVec2, loadVec1);
1124 
1125  vec loadVec3 = c7x::strm_eng<0, vec>::get_adv();
1126  vec loadVec4 = c7x::strm_eng<1, vec>::get_adv();
1127 
1128  vec checkEven2 = __pack_consec_low(loadVec4, loadVec3);
1129  vec checkOdd2 = __pack_consec_high(loadVec4, loadVec3);
1130 
1131  vec loadVec5 = c7x::strm_eng<0, vec>::get_adv();
1132  vec loadVec6 = c7x::strm_eng<1, vec>::get_adv();
1133 
1134  vec checkEven3 = __pack_consec_low(loadVec6, loadVec5);
1135  vec checkOdd3 = __pack_consec_high(loadVec6, loadVec5);
1136 
1137  vec loadVec7 = c7x::strm_eng<0, vec>::get_adv();
1138  vec loadVec8 = c7x::strm_eng<1, vec>::get_adv();
1139 
1140  vec checkEven4 = __pack_consec_low(loadVec8, loadVec7);
1141  vec checkOdd4 = __pack_consec_high(loadVec8, loadVec7);
1142 
1143  vec loadVec9 = c7x::strm_eng<0, vec>::get_adv();
1144  vec loadVec10 = c7x::strm_eng<1, vec>::get_adv();
1145 
      // Only the low pack of the fifth pair is needed: it supplies the ninth vector.
1146  vec checkEven5 = __pack_consec_low(loadVec10, loadVec9);
1147 
      // Lane-wise products of the nine packed vectors with the nine coefficients.
1148  vec dotProduct1 = (checkEven1 * errCoefs1);
1149  vec dotProduct2 = (checkOdd1 * errCoefs2);
1150  vec dotProduct3 = (checkEven2 * errCoefs3);
1151  vec dotProduct4 = (checkOdd2 * errCoefs4);
1152  vec dotProduct5 = (checkEven3 * errCoefs5);
1153  vec dotProduct6 = (checkOdd3 * errCoefs6);
1154  vec dotProduct7 = (checkEven4 * errCoefs7);
1155  vec dotProduct8 = (checkOdd4 * errCoefs8);
1156  vec dotProduct9 = (checkEven5 * errCoefs9);
1157 
      // Shift-and-add tree: every operand is shifted right by one before each
      // addition. Products 1..8 pass through four such shifts in total, product 9
      // through only one, so product 9 carries more weight in the result.
      // NOTE(review): confirm this uneven scaling matches the reference implementation.
1158  vec acc1 = (__shift_right(dotProduct1, vec(1))) + (__shift_right(dotProduct2, vec(1)));
1159  vec acc2 = (__shift_right(dotProduct3, vec(1))) + (__shift_right(dotProduct4, vec(1)));
1160  vec acc3 = (__shift_right(dotProduct5, vec(1))) + (__shift_right(dotProduct6, vec(1)));
1161  vec acc4 = (__shift_right(dotProduct7, vec(1))) + (__shift_right(dotProduct8, vec(1)));
1162  vec acc5 = (__shift_right(acc1, vec(1))) + (__shift_right(acc2, vec(1)));
1163  vec acc6 = (__shift_right(acc3, vec(1))) + (__shift_right(acc4, vec(1)));
1164  vec acc7 = (__shift_right(acc5, vec(1))) + (__shift_right(acc6, vec(1)));
1165  dotProduct = (__shift_right(acc7, vec(1))) + (__shift_right(dotProduct9, vec(1)));
1166 
1167  // Vertical max comparison once the vector dot product is computed from the packed se0/se1 data
      // maxValVec is not assigned anywhere else in the loop, so __max_index()
      // presumably updates it in place and sets vpMask for the lanes where the new
      // value won; those lanes then take the current index.
1168  __max_index(dotProduct, maxValVec, vpMask);
1169  vIdx = __select(vpMask, vCurrIdx, vIdx);
      // Advance the candidate indices by one vector's worth of lanes.
1170  vCurrIdx = vCurrIdx + (eleCount);
1171  }
1172 
1173  // Horizontal max computation
      // Reduces the per-lane maxima and indices (helper defined outside this block);
      // it receives both output pointers.
1174  c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
1175 
1176  DSPLIB_DEBUGPRINTFN(0, "%s", "Exit function\n");
1177  return;
1178 }
1179 
1180 // supports both signed and unsigned datatypes
1181 template <typename Integer8BitDataType, typename Integer8BitPromotedDataType>
1182 inline void minerror_exec_ci_integer8_inputs(void *restrict pErrCoefs,
1183  const int *restrict pMaxIndex,
1184  const void *restrict pMaxVal,
1185  uint8_t *restrict pBlock,
1186  uint32_t vecInSize,
1187  uint32_t errCoefsSize,
1188  int32_t mainLoopCount)
1189 {
1190 
1191  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1192  // explicit promotion to 16 bit to handle overflow
1193  // eg: for 8bits, multiply and accumulate with right shift in 16bits
1194 
1195  Integer8BitDataType *restrict pErrCoefsLocal = (Integer8BitDataType *) pErrCoefs;
1196  int *restrict pMaxIndexLocal = (int *) pMaxIndex;
1197  Integer8BitPromotedDataType *restrict pMaxValLocal = (Integer8BitPromotedDataType *) pMaxVal;
1198 
1199  typedef typename c7x::make_full_vector<Integer8BitDataType>::type vec;
1200  uint8_t eleCount = c7x::element_count_of<vec>::value;
1201 
1202  typedef typename c7x::make_full_vector<Integer8BitPromotedDataType>::type vecPromoted;
1203 
1204  typedef typename c7x::make_vector<Integer8BitDataType, __C7X_VEC_SIZE_BYTES__ / 2>::type vecPartial;
1205 
1206 
1207  *pMaxValLocal = (Integer8BitPromotedDataType) (std::numeric_limits<Integer8BitPromotedDataType>::min());
1208 
1209  vecPromoted dotProduct = vecPromoted(0);
1210  vecPromoted dotProductEven = vecPromoted(0);
1211  vecPromoted dotProductOdd = vecPromoted(0);
1212  vecPromoted maxValVec = vecPromoted(*pMaxValLocal);
1213 
1214  vecPromoted vIdx = vecPromoted(0);
1215  vec vCurrIdx = *((vec *) ((uint8_t *) pBlock + CURR_IDX_VEC_OFFSET));
1216 
1217  __vpred vpMask;
1218 
1219  DSPLIB_DEBUGPRINTFN(0, "mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
1220 
1221 #if __C7X_VEC_SIZE_BITS__ == 256
1222 
1223 
1224  vecPromoted errCoefs1 = vecPromoted(pErrCoefsLocal[0]);
1225  vecPromoted errCoefs2 = vecPromoted(pErrCoefsLocal[1]);
1226  vecPromoted errCoefs3 = vecPromoted(pErrCoefsLocal[2]);
1227  vec errCoefs4 = vec(pErrCoefsLocal[3]);
1228  vec errCoefs5 = vec(pErrCoefsLocal[4]);
1229  vec errCoefs6 = vec(pErrCoefsLocal[5]);
1230  vec errCoefs7 = vec(pErrCoefsLocal[6]);
1231  vec errCoefs8 = vec(pErrCoefsLocal[7]);
1232  vec errCoefs9 = vec(pErrCoefsLocal[8]);
1233 
1234  for (int i = 0; i < mainLoopCount; i++) {
1235  // 2SE fetch implementation
1236  vec loadVec1 = c7x::strm_eng<0, vec>::get_adv();
1237  vec loadVec2 = c7x::strm_eng<0, vec>::get_adv();
1238 
1239  vec loadVec3 = c7x::strm_eng<1, vec>::get_adv();
1240  vec loadVec4 = c7x::strm_eng<1, vec>::get_adv();
1241 
1242  vec checkEven1 = __pack_consec_low(loadVec2, loadVec1);
1243  vec checkOdd1 = __pack_consec_high(loadVec2, loadVec1);
1244  vec checkEven2 = __pack_consec_low(loadVec4, loadVec3);
1245  vec checkOdd2 = __pack_consec_high(loadVec4, loadVec3);
1246 
1247  vec iEven1 = __pack_consec_low(checkEven2, checkEven1);
1248  vec iOdd1 = __pack_consec_low(checkOdd2, checkOdd1);
1249  vec iEven2 = __pack_consec_high(checkEven2, checkEven1);
1250  vec iOdd2 = __pack_consec_high(checkOdd2, checkOdd1);
1251 
1252  vec loadVec5 = c7x::strm_eng<0, vec>::get_adv();
1253  vec loadVec6 = c7x::strm_eng<0, vec>::get_adv();
1254 
1255  vec loadVec7 = c7x::strm_eng<1, vec>::get_adv();
1256  vec loadVec8 = c7x::strm_eng<1, vec>::get_adv();
1257 
1258  vec checkEven3 = __pack_consec_low(loadVec6, loadVec5);
1259  vec checkOdd3 = __pack_consec_high(loadVec6, loadVec5);
1260  vec checkEven4 = __pack_consec_low(loadVec8, loadVec7);
1261  vec checkOdd4 = __pack_consec_high(loadVec8, loadVec7);
1262 
1263  vec iEven3 = __pack_consec_low(checkEven4, checkEven3);
1264  vec iOdd3 = __pack_consec_low(checkOdd4, checkOdd3);
1265  vec iEven4 = __pack_consec_high(checkEven4, checkEven3);
1266  vec iOdd4 = __pack_consec_high(checkOdd4, checkOdd3);
1267 
1268  vec loadVec9 = c7x::strm_eng<0, vec>::get_adv();
1269  vec loadVec10 = c7x::strm_eng<0, vec>::get_adv();
1270 
1271  vec loadVec11 = c7x::strm_eng<1, vec>::get_adv();
1272  vec loadVec12 = c7x::strm_eng<1, vec>::get_adv();
1273 
1274  vec checkEven5 = __pack_consec_low(loadVec10, loadVec9);
1275  vec checkEven6 = __pack_consec_low(loadVec12, loadVec11);
1276 
1277  vec iEven5 = __pack_consec_low(checkEven6, checkEven5);
1278 
1279  vecPromoted dotProductEven1 = vecPromoted(0);
1280  vecPromoted dotProductEven2 = vecPromoted(0);
1281  vecPromoted dotProductEven3 = vecPromoted(0);
1282  vecPromoted dotProductEven4 = vecPromoted(0);
1283  vecPromoted dotProductEven5 = vecPromoted(0);
1284  vecPromoted dotProductEven6 = vecPromoted(0);
1285  vecPromoted dotProductEven7 = vecPromoted(0);
1286  vecPromoted dotProductEven8 = vecPromoted(0);
1287  vecPromoted dotProductEven9 = vecPromoted(0);
1288 
1289  vecPromoted dotProductOdd1 = vecPromoted(0);
1290  vecPromoted dotProductOdd2 = vecPromoted(0);
1291  vecPromoted dotProductOdd3 = vecPromoted(0);
1292  vecPromoted dotProductOdd4 = vecPromoted(0);
1293  vecPromoted dotProductOdd5 = vecPromoted(0);
1294  vecPromoted dotProductOdd6 = vecPromoted(0);
1295  vecPromoted dotProductOdd7 = vecPromoted(0);
1296  vecPromoted dotProductOdd8 = vecPromoted(0);
1297  vecPromoted dotProductOdd9 = vecPromoted(0);
1298 
1299  vecPromoted iEvenEvenShort1 = vecPromoted(0);
1300  vecPromoted iEvenOddShort1 = vecPromoted(0);
1301  vecPromoted iOddEvenShort1 = vecPromoted(0);
1302  vecPromoted iOddOddShort1 = vecPromoted(0);
1303  vecPromoted iEvenEvenShort2 = vecPromoted(0);
1304  vecPromoted iEvenOddShort2 = vecPromoted(0);
1305 
1306  iEvenEvenShort1 = convert_char_to_short<vecPromoted, vecPartial>(iEven1.even(), false);
1307 
1308  dotProductEven1 = (iEvenEvenShort1 * errCoefs1);
1309  iEvenOddShort1 = convert_char_to_short<vecPromoted, vecPartial>(iEven1.odd(), false);
1310  dotProductOdd1 = (iEvenOddShort1 * errCoefs1);
1311 
1312  iOddEvenShort1 = convert_char_to_short<vecPromoted, vecPartial>(iOdd1.even(), false);
1313  dotProductEven2 = (iOddEvenShort1 * errCoefs2);
1314  iOddOddShort1 = convert_char_to_short<vecPromoted, vecPartial>(iOdd1.odd(), false);
1315  dotProductOdd2 = (iOddOddShort1 * errCoefs2);
1316 
1317  iEvenEvenShort2 = convert_char_to_short<vecPromoted, vecPartial>(iEven2.even(), false);
1318  dotProductEven3 = (iEvenEvenShort2 * errCoefs3);
1319  iEvenOddShort2 = convert_char_to_short<vecPromoted, vecPartial>(iEven2.odd(), false);
1320  dotProductOdd3 = (iEvenOddShort2 * errCoefs3);
1321 
1322  mul_char_to_short<vecPromoted &, vec>(iOdd2, errCoefs4, dotProductEven4, dotProductOdd4);
1323  mul_char_to_short<vecPromoted &, vec>(iEven3, errCoefs5, dotProductEven5, dotProductOdd5);
1324  mul_char_to_short<vecPromoted &, vec>(iOdd3, errCoefs6, dotProductEven6, dotProductOdd6);
1325  mul_char_to_short<vecPromoted &, vec>(iEven4, errCoefs7, dotProductEven7, dotProductOdd7);
1326  mul_char_to_short<vecPromoted &, vec>(iOdd4, errCoefs8, dotProductEven8, dotProductOdd8);
1327  mul_char_to_short<vecPromoted &, vec>(iEven5, errCoefs9, dotProductEven9, dotProductOdd9);
1328 
1329  vecPromoted accEven1 =
1330  (__shift_right(dotProductEven1, vecPromoted(1))) + (__shift_right(dotProductEven2, vecPromoted(1)));
1331  vecPromoted accEven2 =
1332  (__shift_right(dotProductEven3, vecPromoted(1))) + (__shift_right(dotProductEven4, vecPromoted(1)));
1333  vecPromoted accEven3 =
1334  (__shift_right(dotProductEven5, vecPromoted(1))) + (__shift_right(dotProductEven6, vecPromoted(1)));
1335  vecPromoted accEven4 =
1336  (__shift_right(dotProductEven7, vecPromoted(1))) + (__shift_right(dotProductEven8, vecPromoted(1)));
1337  vecPromoted accEven5 = (__shift_right(accEven1, vecPromoted(1))) + (__shift_right(accEven2, vecPromoted(1)));
1338  vecPromoted accEven6 = (__shift_right(accEven3, vecPromoted(1))) + (__shift_right(accEven4, vecPromoted(1)));
1339  vecPromoted accEven7 = (__shift_right(accEven5, vecPromoted(1))) + (__shift_right(accEven6, vecPromoted(1)));
1340  dotProductEven = (__shift_right(accEven7, vecPromoted(1))) + (__shift_right(dotProductEven9, vecPromoted(1)));
1341 
1342  // Vertical max comparison once the vector dot products are computed for se0 feteched data
1343  __max_index(dotProductEven, maxValVec, vpMask);
1344  vecPromoted vCurrIdxPrmt;
1345  vCurrIdxPrmt = convert_char_to_short<vecPromoted, vecPartial>(vCurrIdx.even(), true);
1346  vIdx = __select(vpMask, vCurrIdxPrmt, vIdx);
1347 
1348  vecPromoted accOdd1 =
1349  (__shift_right(dotProductOdd1, vecPromoted(1))) + (__shift_right(dotProductOdd2, vecPromoted(1)));
1350  vecPromoted accOdd2 =
1351  (__shift_right(dotProductOdd3, vecPromoted(1))) + (__shift_right(dotProductOdd4, vecPromoted(1)));
1352  vecPromoted accOdd3 =
1353  (__shift_right(dotProductOdd5, vecPromoted(1))) + (__shift_right(dotProductOdd6, vecPromoted(1)));
1354  vecPromoted accOdd4 =
1355  (__shift_right(dotProductOdd7, vecPromoted(1))) + (__shift_right(dotProductOdd8, vecPromoted(1)));
1356  vecPromoted accOdd5 = (__shift_right(accOdd1, vecPromoted(1))) + (__shift_right(accOdd2, vecPromoted(1)));
1357  vecPromoted accOdd6 = (__shift_right(accOdd3, vecPromoted(1))) + (__shift_right(accOdd4, vecPromoted(1)));
1358  vecPromoted accOdd7 = (__shift_right(accOdd5, vecPromoted(1))) + (__shift_right(accOdd6, vecPromoted(1)));
1359  dotProductOdd = (__shift_right(accOdd7, vecPromoted(1))) + (__shift_right(dotProductOdd9, vecPromoted(1)));
1360 
1361  // Vertical max comparison once the vector dot products are computed for se1 feteched data
1362  __max_index(dotProductOdd, maxValVec, vpMask);
1363  vCurrIdxPrmt = convert_char_to_short<vecPromoted, vecPartial>(vCurrIdx.odd(), true);
1364  vIdx = __select(vpMask, vCurrIdxPrmt, vIdx);
1365  vCurrIdx = vCurrIdx + vec(eleCount);
1366  }
1367 
1368 
1369 #else
1370  vecPromoted errCoefs1 = vecPromoted(pErrCoefsLocal[0]);
1371  vecPromoted errCoefs2 = vecPromoted(pErrCoefsLocal[1]);
1372  vecPromoted errCoefs3 = vecPromoted(pErrCoefsLocal[2]);
1373  vecPartial errCoefs4 = vecPartial(pErrCoefsLocal[3]);
1374  vecPartial errCoefs5 = vecPartial(pErrCoefsLocal[4]);
1375  vecPartial errCoefs6 = vecPartial(pErrCoefsLocal[5]);
1376  vecPartial errCoefs7 = vecPartial(pErrCoefsLocal[6]);
1377  vecPartial errCoefs8 = vecPartial(pErrCoefsLocal[7]);
1378  vecPartial errCoefs9 = vecPartial(pErrCoefsLocal[8]);
1379 
1380  for (int i = 0; i < mainLoopCount; i++) {
1381  dotProductEven = vecPromoted(0);
1382  dotProductOdd = vecPromoted(0);
1383 
1384  // 2SE fetch implementation
1385 
1386  vec loadVec1 = c7x::strm_eng<0, vec>::get_adv();
1387  vec loadVec2 = c7x::strm_eng<1, vec>::get_adv();
1388 
1389  vec checkEven1 = __pack_consec_low(loadVec2, loadVec1);
1390  vec checkOdd1 = __pack_consec_high(loadVec2, loadVec1);
1391 
1392  vecPartial iEven1 = checkEven1.even();
1393  vecPartial iOdd1 = checkOdd1.even();
1394 
1395  vecPartial iEven2 = checkEven1.odd();
1396  vecPartial iOdd2 = checkOdd1.odd();
1397 
1398  vec loadVec3 = c7x::strm_eng<0, vec>::get_adv();
1399  vec loadVec4 = c7x::strm_eng<1, vec>::get_adv();
1400 
1401  vec checkEven2 = __pack_consec_low(loadVec4, loadVec3);
1402  vec checkOdd2 = __pack_consec_high(loadVec4, loadVec3);
1403 
1404  vecPartial iEven3 = checkEven2.even();
1405  vecPartial iOdd3 = checkOdd2.even();
1406  vecPartial iEven4 = checkEven2.odd();
1407  vecPartial iOdd4 = checkOdd2.odd();
1408 
1409  vec loadVec5 = c7x::strm_eng<0, vec>::get_adv();
1410  vec loadVec6 = c7x::strm_eng<1, vec>::get_adv();
1411 
1412  vec checkEven3 = __pack_consec_low(loadVec6, loadVec5);
1413 
1414  vecPartial iEven5 = checkEven3.even();
1415 
1416  vecPromoted dotProduct1 = vecPromoted(0);
1417  vecPromoted dotProduct2 = vecPromoted(0);
1418  vecPromoted dotProduct3 = vecPromoted(0);
1419  vecPromoted dotProduct4 = vecPromoted(0);
1420  vecPromoted dotProduct5 = vecPromoted(0);
1421  vecPromoted dotProduct6 = vecPromoted(0);
1422  vecPromoted dotProduct7 = vecPromoted(0);
1423  vecPromoted dotProduct8 = vecPromoted(0);
1424  vecPromoted dotProduct9 = vecPromoted(0);
1425 
1426  vecPromoted iColShort1 = vecPromoted(0);
1427  vecPromoted iColShort2 = vecPromoted(0);
1428  vecPromoted iColShort3 = vecPromoted(0);
1429 
1430  iColShort1 = convert_char_to_short<vecPromoted, vecPartial>(iEven1, false);
1431  dotProduct1 = (iColShort1 * errCoefs1);
1432 
1433  iColShort2 = convert_char_to_short<vecPromoted, vecPartial>(iOdd1, false);
1434  dotProduct2 = (iColShort2 * errCoefs2);
1435 
1436  iColShort3 = convert_char_to_short<vecPromoted, vecPartial>(iEven2, false);
1437  dotProduct3 = (iColShort3 * errCoefs3);
1438 
1439  mul_char_to_short<vecPromoted &, vecPartial>(iOdd2, errCoefs4, dotProduct4, dotProduct);
1440  mul_char_to_short<vecPromoted &, vecPartial>(iEven3, errCoefs5, dotProduct5, dotProduct);
1441  mul_char_to_short<vecPromoted &, vecPartial>(iOdd3, errCoefs6, dotProduct6, dotProduct);
1442  mul_char_to_short<vecPromoted &, vecPartial>(iEven4, errCoefs7, dotProduct7, dotProduct);
1443  mul_char_to_short<vecPromoted &, vecPartial>(iOdd4, errCoefs8, dotProduct8, dotProduct);
1444  mul_char_to_short<vecPromoted &, vecPartial>(iEven5, errCoefs9, dotProduct9, dotProduct);
1445 
1446  vecPromoted acc1 = (__shift_right(dotProduct1, vecPromoted(1))) + (__shift_right(dotProduct2, vecPromoted(1)));
1447  vecPromoted acc2 = (__shift_right(dotProduct3, vecPromoted(1))) + (__shift_right(dotProduct4, vecPromoted(1)));
1448  vecPromoted acc3 = (__shift_right(dotProduct5, vecPromoted(1))) + (__shift_right(dotProduct6, vecPromoted(1)));
1449  vecPromoted acc4 = (__shift_right(dotProduct7, vecPromoted(1))) + (__shift_right(dotProduct8, vecPromoted(1)));
1450  vecPromoted acc5 = (__shift_right(acc1, vecPromoted(1))) + (__shift_right(acc2, vecPromoted(1)));
1451  vecPromoted acc6 = (__shift_right(acc3, vecPromoted(1))) + (__shift_right(acc4, vecPromoted(1)));
1452  vecPromoted acc7 = (__shift_right(acc5, vecPromoted(1))) + (__shift_right(acc6, vecPromoted(1)));
1453  dotProduct = (__shift_right(acc7, vecPromoted(1))) + (__shift_right(dotProduct9, vecPromoted(1)));
1454 
1455  vecPromoted vCurrIdxPrmt;
1456 
1457  // Vertical max comparison once the vector dot products are computed for se fetched data
1458  __max_index(dotProduct, maxValVec, vpMask);
1459  vCurrIdxPrmt = convert_char_to_short<vecPromoted, vecPartial>(vCurrIdx.lo(), true);
1460  vIdx = __select(vpMask, vCurrIdxPrmt, vIdx);
1461  vCurrIdx = vCurrIdx + vec(eleCount / 2);
1462  }
1463 #endif
1464 
1465  // Horizontal max computation
1466  c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
1467 
1468  DSPLIB_DEBUGPRINTFN(0, "%s", "Exit function\n");
1469  return;
1470 }
1471 
// Execute entry point specialized for int32_t input data.
// NOTE(review): the listing jumps from line 1472 to 1474, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< int32_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_integer32_inputs<int32_t, int64_t>, closes both engines and
// returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as int32_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1472 template <>
1474  void *restrict pIn,
1475  void *restrict pErrCoefs,
1476  const int *restrict pMaxIndex,
1477  const void *restrict pMaxVal)
1478 {
1479  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1480  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1481 
1482  typedef typename c7x::make_full_vector<int32_t>::type vec;
1483  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1484  uint32_t dataSize = 4;
1485  int32_t strideIn = pKerPrivArgs->strideIn;
1486 
1487  __SE_TEMPLATE_v1 se0Params;
1488 
1489  int32_t *restrict pInLocal = (int32_t *) pIn;
1490 
1491 
1492  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1493 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1494  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1495 
1496 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * eleCount / 2
      // elements past the start of the input.
1497  __SE0_OPEN(pInLocal, se0Params);
1498  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount / 2), se0Params);
1499 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1500  minerror_exec_ci_integer32_inputs<int32_t, int64_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1501  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1502 
1503  __SE1_CLOSE();
1504  __SE0_CLOSE();
1505 
1506 
1507  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1508  return DSPLIB_SUCCESS;
1509 }
1510 
// Execute entry point specialized for uint32_t input data.
// NOTE(review): the listing jumps from line 1511 to 1513, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint32_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_integer32_inputs<uint32_t, uint64_t>, closes both engines and
// returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as uint32_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1511 template <>
1513  void *restrict pIn,
1514  void *restrict pErrCoefs,
1515  const int *restrict pMaxIndex,
1516  const void *restrict pMaxVal)
1517 {
1518  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1519  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1520 
1521  typedef typename c7x::make_full_vector<uint32_t>::type vec;
1522  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1523  uint32_t dataSize = 4;
1524  int32_t strideIn = pKerPrivArgs->strideIn;
1525  __SE_TEMPLATE_v1 se0Params;
1526 
1527  uint32_t *restrict pInLocal = (uint32_t *) pIn;
1528 
1529 
1530  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1531 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1532  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1533 
1534 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * eleCount / 2
      // elements past the start of the input.
1535  __SE0_OPEN(pInLocal, se0Params);
1536  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount / 2), se0Params);
1537 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1538  minerror_exec_ci_integer32_inputs<uint32_t, uint64_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1539  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1540 
1541  __SE1_CLOSE();
1542  __SE0_CLOSE();
1543 
1544 
1545  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1546  return DSPLIB_SUCCESS;
1547 }
1548 
// Execute entry point specialized for int64_t input data.
// NOTE(review): the listing jumps from line 1549 to 1551, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< int64_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_signed_integer64_inputs<int64_t, double>, closes both engines
// and returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as int64_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1549 template <>
1551  void *restrict pIn,
1552  void *restrict pErrCoefs,
1553  const int *restrict pMaxIndex,
1554  const void *restrict pMaxVal)
1555 {
1556  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1557  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1558  typedef typename c7x::make_full_vector<int64_t>::type vec;
1559  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1560  uint32_t dataSize = 8;
1561  int32_t strideIn = pKerPrivArgs->strideIn;
1562 
1563  __SE_TEMPLATE_v1 se0Params;
1564 
1565  int64_t *restrict pInLocal = (int64_t *) pIn;
1566 
1567 
1568  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1569 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1570  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1571 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * eleCount
      // elements past the start of the input.
1572  __SE0_OPEN(pInLocal, se0Params);
1573  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1574 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1575  minerror_exec_ci_signed_integer64_inputs<int64_t, double>(pErrCoefs, pMaxIndex, pMaxVal, pBlock,
1576  pKerPrivArgs->vecInSize, pKerPrivArgs->errCoefsSize,
1577  pKerPrivArgs->mainLoopCount);
1578 
1579  __SE1_CLOSE();
1580  __SE0_CLOSE();
1581 
1582 
1583  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1584  return DSPLIB_SUCCESS;
1585 }
1586 
// Execute entry point specialized for uint64_t input data.
// NOTE(review): the listing jumps from line 1587 to 1589, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint64_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_unsigned_integer64_inputs<uint64_t, double>, closes both
// engines and returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as uint64_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1587 template <>
1589  void *restrict pIn,
1590  void *restrict pErrCoefs,
1591  const int *restrict pMaxIndex,
1592  const void *restrict pMaxVal)
1593 {
1594  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1595  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1596  typedef typename c7x::make_full_vector<uint64_t>::type vec;
1597  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1598  uint32_t dataSize = 8;
1599  int32_t strideIn = pKerPrivArgs->strideIn;
1600 
1601  __SE_TEMPLATE_v1 se0Params;
1602 
1603  uint64_t *restrict pInLocal = (uint64_t *) pIn;
1604 
1605 
1606  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1607 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1608  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1609 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * eleCount
      // elements past the start of the input.
1610  __SE0_OPEN(pInLocal, se0Params);
1611  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1612 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1613  minerror_exec_ci_unsigned_integer64_inputs<uint64_t, double>(pErrCoefs, pMaxIndex, pMaxVal, pBlock,
1614  pKerPrivArgs->vecInSize, pKerPrivArgs->errCoefsSize,
1615  pKerPrivArgs->mainLoopCount);
1616 
1617  __SE1_CLOSE();
1618  __SE0_CLOSE();
1619 
1620  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1621  return DSPLIB_SUCCESS;
1622 }
1623 
// Execute entry point specialized for float input data.
// NOTE(review): the listing jumps from line 1624 to 1626, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< float >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls minerror_exec_ci_float_inputs<float>,
// closes both engines and returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as float
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1624 template <>
1626  void *restrict pIn,
1627  void *restrict pErrCoefs,
1628  const int *restrict pMaxIndex,
1629  const void *restrict pMaxVal)
1630 {
1631  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1632  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1633 
1634  typedef typename c7x::make_full_vector<float>::type vec;
1635  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1636  uint32_t dataSize = 4;
1637  int32_t strideIn = pKerPrivArgs->strideIn;
1638 
1639  __SE_TEMPLATE_v1 se0Params;
1640 
1641  float *restrict pInLocal = (float *) pIn;
1642 
1643 
1644  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1645 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1646  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1647 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * eleCount
      // elements past the start of the input.
1648  __SE0_OPEN(pInLocal, se0Params);
1649  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1650 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1651  minerror_exec_ci_float_inputs<float>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1652  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1653 
1654  __SE1_CLOSE();
1655  __SE0_CLOSE();
1656 
1657 
1658  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1659  return DSPLIB_SUCCESS;
1660 }
1661 
// Execute entry point specialized for double input data.
// NOTE(review): the listing jumps from line 1662 to 1664, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< double >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls minerror_exec_ci_float_inputs<double>,
// closes both engines and returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as double
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1662 template <>
1664  void *restrict pIn,
1665  void *restrict pErrCoefs,
1666  const int *restrict pMaxIndex,
1667  const void *restrict pMaxVal)
1668 {
1669  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1670  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1671 
1672  typedef typename c7x::make_full_vector<double>::type vec;
1673  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1674  uint32_t dataSize = 8;
1675  int32_t strideIn = pKerPrivArgs->strideIn;
1676 
1677  __SE_TEMPLATE_v1 se0Params;
1678 
1679  double *restrict pInLocal = (double *) pIn;
1680 
1681  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1682 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1683  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1684 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * eleCount
      // elements past the start of the input.
1685  __SE0_OPEN(pInLocal, se0Params);
1686  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1687 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1688  minerror_exec_ci_float_inputs<double>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1689  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1690 
1691  __SE1_CLOSE();
1692  __SE0_CLOSE();
1693 
1694  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1695  return DSPLIB_SUCCESS;
1696 }
1697 
// Execute entry point specialized for int8_t input data.
// NOTE(review): the listing jumps from line 1698 to 1700, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< int8_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_integer8_inputs<int8_t, int16_t>, closes both engines and
// returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as int8_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1698 template <>
1700  void *restrict pIn,
1701  void *restrict pErrCoefs,
1702  const int *restrict pMaxIndex,
1703  const void *restrict pMaxVal)
1704 {
1705  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1706  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1707 
1708  __SE_TEMPLATE_v1 se0Params;
1709 
1710  int8_t *restrict pInLocal = (int8_t *) pIn;
1711 
1712  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1713  typedef typename c7x::make_full_vector<int8_t>::type vec;
1714  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1715  uint32_t dataSize = 1;
1716  int32_t strideIn = pKerPrivArgs->strideIn;
1717 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1718  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1719 
1720  __SE0_OPEN(pInLocal, se0Params);
1721 
1722 
      // SE1 reuses the SE0 template; its start offset is
      // (strideIn / dataSize) * (eleCount / 2) elements when the vector width is
      // 256 bits and (strideIn / dataSize) * (eleCount / 4) elements otherwise.
1723 #if __C7X_VEC_SIZE_BITS__ == 256
1724  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 2)), se0Params);
1725 #else
1726  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1727 #endif
1728 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1729  minerror_exec_ci_integer8_inputs<int8_t, int16_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1730  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1731 
1732  __SE1_CLOSE();
1733  __SE0_CLOSE();
1734 
1735 
1736  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1737  return DSPLIB_SUCCESS;
1738 }
1739 
// Execute entry point specialized for uint8_t input data.
// NOTE(review): the listing jumps from line 1740 to 1742, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint8_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_integer8_inputs<uint8_t, uint16_t>, closes both engines and
// returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as uint8_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1740 template <>
1742  void *restrict pIn,
1743  void *restrict pErrCoefs,
1744  const int *restrict pMaxIndex,
1745  const void *restrict pMaxVal)
1746 {
1747  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1748  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1749 
1750  __SE_TEMPLATE_v1 se0Params;
1751 
1752  uint8_t *restrict pInLocal = (uint8_t *) pIn;
1753 
1754  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1755  typedef typename c7x::make_full_vector<uint8_t>::type vec;
1756  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1757  uint32_t dataSize = 1;
1758  int32_t strideIn = pKerPrivArgs->strideIn;
1759 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1760  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1761 
1762  __SE0_OPEN(pInLocal, se0Params);
1763 
      // SE1 reuses the SE0 template; its start offset is
      // (strideIn / dataSize) * (eleCount / 2) elements when the vector width is
      // 256 bits and (strideIn / dataSize) * (eleCount / 4) elements otherwise.
1764 #if __C7X_VEC_SIZE_BITS__ == 256
1765  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 2)), se0Params);
1766 #else
1767  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1768 #endif
1769 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1770  minerror_exec_ci_integer8_inputs<uint8_t, uint16_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1771  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1772 
1773  __SE1_CLOSE();
1774  __SE0_CLOSE();
1775 
1776 
1777  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1778  return DSPLIB_SUCCESS;
1779 }
1780 
// Execute entry point specialized for int16_t input data.
// NOTE(review): the listing jumps from line 1781 to 1783, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< int16_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_integer16_inputs<int16_t, int32_t>, closes both engines and
// returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as int16_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1781 template <>
1783  void *restrict pIn,
1784  void *restrict pErrCoefs,
1785  const int *restrict pMaxIndex,
1786  const void *restrict pMaxVal)
1787 {
1788  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1789  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1790 
1791  __SE_TEMPLATE_v1 se0Params;
1792 
1793  int16_t *restrict pInLocal = (int16_t *) pIn;
1794 
1795  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1796  typedef typename c7x::make_full_vector<int16_t>::type vec;
1797  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1798  uint32_t dataSize = 2;
1799  int32_t strideIn = pKerPrivArgs->strideIn;
1800 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1801  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1802 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * (eleCount / 4)
      // elements past the start of the input.
1803  __SE0_OPEN(pInLocal, se0Params);
1804  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1805 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1806  minerror_exec_ci_integer16_inputs<int16_t, int32_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1807  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1808 
1809  __SE1_CLOSE();
1810  __SE0_CLOSE();
1811 
1812  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1813  return DSPLIB_SUCCESS;
1814 }
1815 
// Execute entry point specialized for uint16_t input data.
// NOTE(review): the listing jumps from line 1816 to 1818, so the line holding the
// return type, the function name and the `handle` parameter is not shown here;
// presumably it is DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint16_t >(DSPLIB_kernelHandle handle, ...)
// -- confirm against the original source file.
//
// Visible behavior: loads the SE template stored in the kernel's parameter block,
// opens SE0 and SE1 on the input buffer, calls
// minerror_exec_ci_integer16_inputs<uint16_t, uint32_t>, closes both engines and
// returns DSPLIB_SUCCESS.
//
//   pIn                            input buffer, reinterpreted as uint16_t
//   pErrCoefs, pMaxIndex, pMaxVal  forwarded unchanged to the compute routine
1816 template <>
1818  void *restrict pIn,
1819  void *restrict pErrCoefs,
1820  const int *restrict pMaxIndex,
1821  const void *restrict pMaxVal)
1822 {
1823  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering Function");
1824  DSPLIB_minerror_PrivArgs *pKerPrivArgs = (DSPLIB_minerror_PrivArgs *) handle;
1825 
1826  __SE_TEMPLATE_v1 se0Params;
1827 
1828  uint16_t *restrict pInLocal = (uint16_t *) pIn;
1829 
1830  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1831  typedef typename c7x::make_full_vector<uint16_t>::type vec;
1832  int32_t eleCount = c7x::element_count_of<vec>::value;
      // dataSize divides strideIn below; presumably strideIn is in bytes and
      // dataSize is the element size in bytes -- TODO confirm.
1833  uint32_t dataSize = 2;
1834  int32_t strideIn = pKerPrivArgs->strideIn;
1835 
      // Read the SE template stored at SE_SE0_PARAM_OFFSET inside the parameter block.
1836  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1837 
      // SE1 reuses the SE0 template but starts (strideIn / dataSize) * (eleCount / 4)
      // elements past the start of the input.
1838  __SE0_OPEN(pInLocal, se0Params);
1839  __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1840 
      // No input pointer is passed; presumably the routine reads its data through
      // the SE0/SE1 streams opened above -- confirm in the routine's definition.
1841  minerror_exec_ci_integer16_inputs<uint16_t, uint32_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->vecInSize,
1842  pKerPrivArgs->errCoefsSize, pKerPrivArgs->mainLoopCount);
1843 
1844  __SE1_CLOSE();
1845  __SE0_CLOSE();
1846 
1847  DSPLIB_DEBUGPRINTFN(0, "Exit function with %d\n", DSPLIB_SUCCESS);
1848  return DSPLIB_SUCCESS;
1849 }
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
#define CURR_IDX_VEC_OFFSET
void minerror_exec_ci_signed_integer64_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
#define SE_SE0_PARAM_OFFSET
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int64_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
void minerror_exec_ci_integer16_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
DSPLIB_STATUS DSPLIB_minerror_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
void minerror_exec_ci_unsigned_integer64_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
void minerror_exec_ci_integer32_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint64_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
#define SE_SE1_PARAM_OFFSET
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
void minerror_exec_ci_float_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
void minerror_exec_ci_integer8_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_minerror.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_UINT64
@ DSPLIB_UINT8
@ DSPLIB_UINT16
@ DSPLIB_INT32
@ DSPLIB_INT16
@ DSPLIB_FLOAT32
@ DSPLIB_INT64
@ DSPLIB_FLOAT64
@ DSPLIB_UINT32
@ DSPLIB_INT8
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
uint32_t data_type
Values are of type DSPLIB_data_type_e.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint32_t errCoefsSize
Size of error coefficients vector
uint32_t vecInSize
Size of input data DSPLIB_minerror_init that will be retrieved and used by DSPLIB_minerror_exec
uint8_t bufPblock[DSPLIB_MINERROR_IXX_IXX_OXX_PBLOCK_SIZE]