47 #include "../common/DSPLIB_inlines.h"
49 #include <c7x_scalable.h>
// Layout of a parameter block, expressed as offsets from SE_PARAM_BASE:
// the SE0 template comes first, the SE1 template follows one SE_PARAM_SIZE
// later, and the current-index vector follows the SE1 template.
// SE_PARAM_SIZE is defined outside this excerpt.
// NOTE(review): presumably these are byte offsets into pBlock — confirm at the use sites.
57 #define SE_PARAM_BASE (0x0000)
58 #define SE_SE0_PARAM_OFFSET (SE_PARAM_BASE)
59 #define SE_SE1_PARAM_OFFSET (SE_SE0_PARAM_OFFSET + SE_PARAM_SIZE)
60 #define CURR_IDX_VEC_OFFSET (SE_SE1_PARAM_OFFSET + SE_PARAM_SIZE)
// Setup-time routine templated on the element type. Its name, signature and
// the conditions that select between the alternative configurations below are
// not visible in this excerpt. The visible statements derive
// pKerPrivArgs->mainLoopCount from vecInSize and the vector lane count, and
// fill the streaming-engine template se0Params.
// NOTE(review): the alternatives presumably dispatch on dataType — confirm in
// the full file. errCoefsSize, dataSize and temp are declared in elided lines.
62 template <
typename dataType>
72 __SE_TEMPLATE_v1 se0Params;
74 __SE_ELETYPE SE_ELETYPE;
75 __SE_VECLEN SE_VECLEN;
// Cache the pKerPrivArgs fields used below.
79 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
80 uint32_t vecInSize = pKerPrivArgs->
vecInSize;
82 uint32_t strideIn = pKerPrivArgs->
strideIn;
// vec is the full-width vector of dataType; eleCount is its lane count.
84 typedef typename c7x::make_full_vector<dataType>::type vec;
85 int32_t eleCount = c7x::element_count_of<vec>::value;
86 SE_VECLEN = c7x::se_veclen<vec>::value;
87 SE_ELETYPE = c7x::se_eletype<vec>::value;
// Alternative 1: mainLoopCount = ceil(vecInSize / (2 * eleCount)).
95 pKerPrivArgs->
mainLoopCount = (int32_t) (((vecInSize + (eleCount * 2) - 1) / (eleCount * 2)));
98 for (
int j = 0; j < eleCount; j++) {
// Alternative 2: mainLoopCount = ceil(vecInSize / (2 * eleCount)) * 2.
103 pKerPrivArgs->
mainLoopCount = (int32_t) (((vecInSize + (eleCount * 2) - 1) / (eleCount * 2)) * 2);
106 for (
int j = 0; j < eleCount / 2; j++) {
// Alternative 3: mainLoopCount = ceil(vecInSize / eleCount) * 2.
111 pKerPrivArgs->
mainLoopCount = (int32_t) (((vecInSize + eleCount - 1) / eleCount) * 2);
114 for (
int j = 0; j < eleCount / 2; j++) {
// Alternative 4: on 512-bit vector builds outEleCount is half of eleCount.
// The second declaration presumably sits in a preprocessor branch whose
// directive is elided from this excerpt.
120 #if __C7X_VEC_SIZE_BITS__ == 512
121 int32_t outEleCount = eleCount / 2;
123 int32_t outEleCount = eleCount;
// mainLoopCount = ceil(vecInSize / outEleCount).
125 pKerPrivArgs->
mainLoopCount = (int32_t) (((vecInSize + outEleCount - 1) / outEleCount));
128 for (
int j = 0; j < eleCount; j++) {
// Alternative 5: mainLoopCount = ceil(vecInSize / (2 * eleCount)).
134 pKerPrivArgs->
mainLoopCount = (int32_t) ((vecInSize + (eleCount * 2) - 1) / (eleCount * 2));
136 for (
int j = 0; j < eleCount; j++) {
// Start from the template returned by __gen_SE_TEMPLATE_v1() and override fields.
141 se0Params = __gen_SE_TEMPLATE_v1();
// Two alternative transpose settings; the selecting condition is elided.
150 se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
153 se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
// 3-D stream configuration: ICNT0 = errCoefsSize, DIM1 = strideIn / dataSize,
// DIM2 = 2 * temp * (strideIn / dataSize), where temp is the value stored in ICNT1
// (the rest of the ICNT1 expression is elided).
165 se0Params.ICNT0 = errCoefsSize;
166 se0Params.ICNT1 = temp =
167 (vecInSize > (uint32_t) eleCount)
170 se0Params.DIM1 = strideIn / dataSize;
174 se0Params.DIM2 = (strideIn / dataSize * temp * 2);
175 se0Params.ELETYPE = SE_ELETYPE;
176 se0Params.VECLEN = SE_VECLEN;
177 se0Params.DIMFMT = __SE_DIMFMT_3D;
// 3-D stream configuration with 2x promotion. ICNT1 is compared against
// eleCount / 2 here, and VECLEN is taken from a full int64_t vector.
182 se0Params.ICNT0 = errCoefsSize;
183 se0Params.ICNT1 = temp =
184 (vecInSize > (uint32_t) eleCount / 2)
187 se0Params.DIM1 = strideIn / dataSize;
191 se0Params.DIM2 = (strideIn / dataSize * temp * 2);
// Sign- or zero-extending promotion; presumably chosen by signedness — condition elided.
193 se0Params.PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
196 se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
// NOTE(review): ELETYPE is stored again three statements below with the same value.
198 se0Params.ELETYPE = SE_ELETYPE;
199 typedef typename c7x::make_full_vector<int64_t>::type vec64;
200 SE_VECLEN = c7x::se_veclen<vec64>::value;
201 se0Params.ELETYPE = SE_ELETYPE;
202 se0Params.VECLEN = SE_VECLEN;
203 se0Params.DIMFMT = __SE_DIMFMT_3D;
// 3-D stream configuration with 2x promotion. ICNT1 is compared against
// eleCount / 4, DIM2 = strideIn / dataSize * eleCount / 2, and VECLEN is
// taken from a full int32_t vector.
207 se0Params.ICNT0 = errCoefsSize;
208 se0Params.ICNT1 = (vecInSize > (uint32_t) (eleCount / 4))
212 se0Params.DIM1 = strideIn / dataSize;
214 se0Params.DIM2 = (strideIn / dataSize * eleCount / 2);
217 se0Params.PROMOTE = __SE_PROMOTE_2X_SIGNEXT;
220 se0Params.PROMOTE = __SE_PROMOTE_2X_ZEROEXT;
222 se0Params.ELETYPE = SE_ELETYPE;
223 typedef typename c7x::make_full_vector<int32_t>::type vec32;
224 SE_VECLEN = c7x::se_veclen<vec32>::value;
225 se0Params.VECLEN = SE_VECLEN;
226 se0Params.DIMFMT = __SE_DIMFMT_3D;
// 3-D stream configuration without promotion.
// NOTE(review): ICNT1 is a hard-coded 16 here, whereas the other configurations
// derive it from vecInSize and eleCount; DIM1 and DIM2 use strideIn without
// dividing by dataSize. Presumably dataSize is 1 for this case — confirm.
230 se0Params.VECLEN = SE_VECLEN;
231 se0Params.ICNT0 = errCoefsSize;
232 se0Params.ICNT1 = 16;
233 se0Params.DIM1 = strideIn;
235 se0Params.DIM2 = strideIn * eleCount;
236 se0Params.ELETYPE = SE_ELETYPE;
237 se0Params.DIMFMT = __SE_DIMFMT_3D;
// On 512-bit vector builds DIM2 is overridden with strideIn * eleCount / 2.
239 #if __C7X_VEC_SIZE_BITS__ == 512
240 se0Params.DIM2 = strideIn * eleCount / 2;
// Execution kernel for a floating-point element type. The function name, part
// of the signature, and the declarations of i, vpMask, vIdx and vCurrIdx are
// not visible in this excerpt.
//
// Each loop iteration reads nine vectors from streaming engine 0 and nine from
// streaming engine 1, weights each with one of the nine broadcast coefficients
// pErrCoefsLocal[0..8], accumulates the products into dotProduct, and keeps a
// per-lane running maximum (maxValVec) together with the index vector (vIdx)
// recorded when that maximum was taken. After the loop,
// c7x_horizontal_max_with_index is called with both vectors and the output
// pointers pMaxValLocal / pMaxIndexLocal.
//
// Parameters visible here:
//   pMaxIndex     - output index; declared const but cast to a writable int*.
//   pMaxVal       - output maximum, accessed as FloatingPointDataType.
//   pBlock        - parameter block (its use is in elided lines).
//   errCoefsSize  - only used by the debug print in the visible lines.
//   mainLoopCount - number of loop iterations; each handles two vectors of results.
322 template <typename FloatingPointDataType>
324 const int *restrict pMaxIndex,
325 const void *restrict pMaxVal,
326 uint8_t *restrict pBlock,
328 uint32_t errCoefsSize,
329 int32_t mainLoopCount)
332 FloatingPointDataType *restrict pErrCoefsLocal = (FloatingPointDataType *) pErrCoefs;
333 int *restrict pMaxIndexLocal = (int *) pMaxIndex;
334 FloatingPointDataType *restrict pMaxValLocal = (FloatingPointDataType *) pMaxVal;
336 typedef typename c7x::make_full_vector<FloatingPointDataType>::type vec;
337 int32_t eleCount = c7x::element_count_of<vec>::value;
// Seed the maximum with the most negative finite value. For floating-point
// types numeric_limits::min() is the smallest POSITIVE normalized value, so it
// would survive (and be reported) whenever every dot product is <= 0.
340 *pMaxValLocal = (FloatingPointDataType) (std::numeric_limits<FloatingPointDataType>::lowest());
// Broadcast each of the nine coefficients across a full vector.
342 vec errCoefs1 = vec(pErrCoefsLocal[0]);
343 vec errCoefs2 = vec(pErrCoefsLocal[1]);
344 vec errCoefs3 = vec(pErrCoefsLocal[2]);
345 vec errCoefs4 = vec(pErrCoefsLocal[3]);
346 vec errCoefs5 = vec(pErrCoefsLocal[4]);
347 vec errCoefs6 = vec(pErrCoefsLocal[5]);
348 vec errCoefs7 = vec(pErrCoefsLocal[6]);
349 vec errCoefs8 = vec(pErrCoefsLocal[7]);
350 vec errCoefs9 = vec(pErrCoefsLocal[8]);
352 vec dotProduct = vec(0);
353 vec maxValVec = vec(*pMaxValLocal);
360 DSPLIB_DEBUGPRINTFN(0,
"mainLoopCount %d errCoefsSize %d vecInSize %d\n", mainLoopCount, errCoefsSize, vecInSize);
// NOTE(review): no reset of dotProduct is visible inside this loop before
// either accumulation; confirm one exists in the lines elided from this
// excerpt, otherwise sums carry over between result vectors.
362 for (i = 0; i < mainLoopCount; i++) {
// Nine input vectors from streaming engine 0 ...
366 vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
367 vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
368 vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
369 vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
370 vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
371 vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
372 vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
373 vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
374 vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
// ... and nine from streaming engine 1.
376 vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
377 vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
378 vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
379 vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
380 vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
381 vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
382 vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
383 vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
384 vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
// First result vector: weight the SE0 inputs and accumulate.
386 vec dotProduct1 = (tmp1_0 * errCoefs1);
387 vec dotProduct2 = (tmp2_0 * errCoefs2);
388 vec dotProduct3 = (tmp3_0 * errCoefs3);
389 vec dotProduct4 = (tmp4_0 * errCoefs4);
390 vec dotProduct5 = (tmp5_0 * errCoefs5);
391 vec dotProduct6 = (tmp6_0 * errCoefs6);
392 vec dotProduct7 = (tmp7_0 * errCoefs7);
393 vec dotProduct8 = (tmp8_0 * errCoefs8);
394 vec dotProduct9 = (tmp9_0 * errCoefs9);
396 dotProduct = dotProduct + dotProduct1;
397 dotProduct = dotProduct + dotProduct2;
398 dotProduct = dotProduct + dotProduct3;
399 dotProduct = dotProduct + dotProduct4;
400 dotProduct = dotProduct + dotProduct5;
401 dotProduct = dotProduct + dotProduct6;
402 dotProduct = dotProduct + dotProduct7;
403 dotProduct = dotProduct + dotProduct8;
404 dotProduct = dotProduct + dotProduct9;
// Lanes where dotProduct < maxValVec keep the old maximum and index; the
// remaining lanes take the new value and the current index. The index vector
// then advances by one vector's worth of lanes.
407 vpMask = __cmp_lt_pred(dotProduct, maxValVec);
408 maxValVec = __select(vpMask, maxValVec, dotProduct);
409 vIdx = __select(vpMask, vIdx, vCurrIdx);
410 vCurrIdx = vCurrIdx + (eleCount);
// Second result vector: same computation on the SE1 inputs.
413 dotProduct1 = (tmp1_1 * errCoefs1);
414 dotProduct2 = (tmp2_1 * errCoefs2);
415 dotProduct3 = (tmp3_1 * errCoefs3);
416 dotProduct4 = (tmp4_1 * errCoefs4);
417 dotProduct5 = (tmp5_1 * errCoefs5);
418 dotProduct6 = (tmp6_1 * errCoefs6);
419 dotProduct7 = (tmp7_1 * errCoefs7);
420 dotProduct8 = (tmp8_1 * errCoefs8);
421 dotProduct9 = (tmp9_1 * errCoefs9);
423 dotProduct = dotProduct + dotProduct1;
424 dotProduct = dotProduct + dotProduct2;
425 dotProduct = dotProduct + dotProduct3;
426 dotProduct = dotProduct + dotProduct4;
427 dotProduct = dotProduct + dotProduct5;
428 dotProduct = dotProduct + dotProduct6;
429 dotProduct = dotProduct + dotProduct7;
430 dotProduct = dotProduct + dotProduct8;
431 dotProduct = dotProduct + dotProduct9;
434 vpMask = __cmp_lt_pred(dotProduct, maxValVec);
435 maxValVec = __select(vpMask, maxValVec, dotProduct);
436 vIdx = __select(vpMask, vIdx, vCurrIdx);
437 vCurrIdx = vCurrIdx + (eleCount);
// Reduce the per-lane maxima and indices into the scalar outputs
// (presumably; the helper is defined outside this excerpt).
441 c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
// Execution kernel for a 64-bit integer element type, using an int32_t
// intermediate vector. The function name, part of the signature, and the
// declarations of i, vpMask and vCurrIdx are not visible in this excerpt.
//
// Inputs are read as vectors of Integer64BitDataType from streaming engines 0
// and 1, converted lane-wise to Integer64BitConversionDataType, multiplied by
// the nine broadcast coefficients pErrCoefsLocal[0..8] and summed. A per-lane
// running maximum (maxValVec) and its index vector (vIdx) are tracked and
// handed to c7x_horizontal_max_with_index after the loop.
447 template <
typename Integer64BitDataType,
typename Integer64BitConversionDataType>
449 const int *restrict pMaxIndex,
450 const void *restrict pMaxVal,
451 uint8_t *restrict pBlock,
453 uint32_t errCoefsSize,
454 int32_t mainLoopCount)
// pMaxIndex is declared const but is cast to a writable int* for the result.
461 Integer64BitDataType *restrict pErrCoefsLocal = (Integer64BitDataType *) pErrCoefs;
462 int *restrict pMaxIndexLocal = (
int *) pMaxIndex;
463 Integer64BitConversionDataType *restrict pMaxValLocal = (Integer64BitConversionDataType *) pMaxVal;
465 typedef typename c7x::make_full_vector<Integer64BitDataType>::type vec;
466 int32_t eleCount = c7x::element_count_of<vec>::value;
468 typedef typename c7x::make_full_vector<Integer64BitConversionDataType>::type vecConverted;
469 typedef typename c7x::make_full_vector<int32_t>::type vecIntermediate;
// NOTE(review): the code below treats Integer64BitConversionDataType as double
// (see the __*_int_to_double calls and "double maxval"). For a floating-point
// type numeric_limits::min() is the smallest positive value, not the most
// negative one, so a search whose dot products are all <= 0 would keep this
// seed. lowest() looks like the intended seed, but the final cast back to
// Integer64BitDataType would then have to cope with the seed surviving
// (e.g. mainLoopCount == 0) — confirm before changing.
472 *pMaxValLocal = (Integer64BitConversionDataType) (std::numeric_limits<Integer64BitConversionDataType>::min());
// Broadcast each of the nine coefficients, converted to the working type.
474 vecConverted errCoefs1 = vecConverted(pErrCoefsLocal[0]);
475 vecConverted errCoefs2 = vecConverted(pErrCoefsLocal[1]);
476 vecConverted errCoefs3 = vecConverted(pErrCoefsLocal[2]);
477 vecConverted errCoefs4 = vecConverted(pErrCoefsLocal[3]);
478 vecConverted errCoefs5 = vecConverted(pErrCoefsLocal[4]);
479 vecConverted errCoefs6 = vecConverted(pErrCoefsLocal[5]);
480 vecConverted errCoefs7 = vecConverted(pErrCoefsLocal[6]);
481 vecConverted errCoefs8 = vecConverted(pErrCoefsLocal[7]);
482 vecConverted errCoefs9 = vecConverted(pErrCoefsLocal[8]);
484 vecConverted dotProduct = vecConverted(0);
485 vecConverted maxValVec = vecConverted(*pMaxValLocal);
// The index vector is held in the working (converted) type as well.
487 vecConverted vIdx = vecConverted(0);
// 4294967296.0 == 2^32: weight applied to the high half when rebuilding a
// 64-bit value as high * 2^32 + low.
490 vecConverted mulFactor = vecConverted(4294967296.00);
494 DSPLIB_DEBUGPRINTFN(0,
"mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
496 for (i = 0; i < mainLoopCount; i++) {
497 dotProduct = vecConverted(0);
// Nine input vectors from each streaming engine.
501 vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
502 vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
503 vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
504 vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
505 vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
506 vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
507 vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
508 vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
509 vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
511 vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
512 vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
513 vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
514 vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
515 vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
516 vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
517 vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
518 vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
519 vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
// SE0 inputs: pass each 64-bit vector through convert_long_to_int to obtain a
// 32-bit-lane vector. Despite the "16bits" names the lanes are 32-bit
// (vecIntermediate is an int32_t vector).
// NOTE(review): vecIntermediate is signed here; confirm the low 32-bit half is
// not sign-interpreted by __low_int_to_double.
521 vecIntermediate v16bits1_0 = convert_long_to_int<vecIntermediate, vec>(tmp1_0);
522 vecIntermediate v16bits2_0 = convert_long_to_int<vecIntermediate, vec>(tmp2_0);
523 vecIntermediate v16bits3_0 = convert_long_to_int<vecIntermediate, vec>(tmp3_0);
524 vecIntermediate v16bits4_0 = convert_long_to_int<vecIntermediate, vec>(tmp4_0);
525 vecIntermediate v16bits5_0 = convert_long_to_int<vecIntermediate, vec>(tmp5_0);
526 vecIntermediate v16bits6_0 = convert_long_to_int<vecIntermediate, vec>(tmp6_0);
527 vecIntermediate v16bits7_0 = convert_long_to_int<vecIntermediate, vec>(tmp7_0);
528 vecIntermediate v16bits8_0 = convert_long_to_int<vecIntermediate, vec>(tmp8_0);
529 vecIntermediate v16bits9_0 = convert_long_to_int<vecIntermediate, vec>(tmp9_0);
// For each input: vSum = high * 2^32 + low, computed in the working type.
531 vecConverted vhigh16bits1_0 = __high_int_to_double(v16bits1_0);
532 vecConverted vlow16bits1_0 = __low_int_to_double(v16bits1_0);
533 vecConverted vSum1_0 = vhigh16bits1_0 * mulFactor;
534 vSum1_0 = vSum1_0 + vlow16bits1_0;
536 vecConverted vhigh16bits2_0 = __high_int_to_double(v16bits2_0);
537 vecConverted vlow16bits2_0 = __low_int_to_double(v16bits2_0);
538 vecConverted vSum2_0 = vhigh16bits2_0 * mulFactor;
539 vSum2_0 = vSum2_0 + vlow16bits2_0;
541 vecConverted vhigh16bits3_0 = __high_int_to_double(v16bits3_0);
542 vecConverted vlow16bits3_0 = __low_int_to_double(v16bits3_0);
543 vecConverted vSum3_0 = vhigh16bits3_0 * mulFactor;
544 vSum3_0 = vSum3_0 + vlow16bits3_0;
546 vecConverted vhigh16bits4_0 = __high_int_to_double(v16bits4_0);
547 vecConverted vlow16bits4_0 = __low_int_to_double(v16bits4_0);
548 vecConverted vSum4_0 = vhigh16bits4_0 * mulFactor;
549 vSum4_0 = vSum4_0 + vlow16bits4_0;
551 vecConverted vhigh16bits5_0 = __high_int_to_double(v16bits5_0);
552 vecConverted vlow16bits5_0 = __low_int_to_double(v16bits5_0);
553 vecConverted vSum5_0 = vhigh16bits5_0 * mulFactor;
554 vSum5_0 = vSum5_0 + vlow16bits5_0;
556 vecConverted vhigh16bits6_0 = __high_int_to_double(v16bits6_0);
557 vecConverted vlow16bits6_0 = __low_int_to_double(v16bits6_0);
558 vecConverted vSum6_0 = vhigh16bits6_0 * mulFactor;
559 vSum6_0 = vSum6_0 + vlow16bits6_0;
561 vecConverted vhigh16bits7_0 = __high_int_to_double(v16bits7_0);
562 vecConverted vlow16bits7_0 = __low_int_to_double(v16bits7_0);
563 vecConverted vSum7_0 = vhigh16bits7_0 * mulFactor;
564 vSum7_0 = vSum7_0 + vlow16bits7_0;
566 vecConverted vhigh16bits8_0 = __high_int_to_double(v16bits8_0);
567 vecConverted vlow16bits8_0 = __low_int_to_double(v16bits8_0);
568 vecConverted vSum8_0 = vhigh16bits8_0 * mulFactor;
569 vSum8_0 = vSum8_0 + vlow16bits8_0;
571 vecConverted vhigh16bits9_0 = __high_int_to_double(v16bits9_0);
572 vecConverted vlow16bits9_0 = __low_int_to_double(v16bits9_0);
573 vecConverted vSum9_0 = vhigh16bits9_0 * mulFactor;
574 vSum9_0 = vSum9_0 + vlow16bits9_0;
// Weight the converted SE0 inputs and accumulate the first result vector.
576 vecConverted dotProduct1 = (vSum1_0 * errCoefs1);
577 vecConverted dotProduct2 = (vSum2_0 * errCoefs2);
578 vecConverted dotProduct3 = (vSum3_0 * errCoefs3);
579 vecConverted dotProduct4 = (vSum4_0 * errCoefs4);
580 vecConverted dotProduct5 = (vSum5_0 * errCoefs5);
581 vecConverted dotProduct6 = (vSum6_0 * errCoefs6);
582 vecConverted dotProduct7 = (vSum7_0 * errCoefs7);
583 vecConverted dotProduct8 = (vSum8_0 * errCoefs8);
584 vecConverted dotProduct9 = (vSum9_0 * errCoefs9);
586 dotProduct = dotProduct + dotProduct1;
587 dotProduct = dotProduct + dotProduct2;
588 dotProduct = dotProduct + dotProduct3;
589 dotProduct = dotProduct + dotProduct4;
590 dotProduct = dotProduct + dotProduct5;
591 dotProduct = dotProduct + dotProduct6;
592 dotProduct = dotProduct + dotProduct7;
593 dotProduct = dotProduct + dotProduct8;
594 dotProduct = dotProduct + dotProduct9;
// Lanes where dotProduct < maxValVec keep the old maximum and index; the
// remaining lanes take the new value and the current index.
597 vpMask = __cmp_lt_pred(dotProduct, maxValVec);
598 maxValVec = __select(vpMask, maxValVec, dotProduct);
599 vIdx = __select(vpMask, vIdx, vCurrIdx);
600 vCurrIdx = vCurrIdx + (eleCount);
// Second result vector: repeat the conversion and accumulation on the SE1 inputs.
602 dotProduct = vecConverted(0);
604 vecIntermediate v16bits1_1 = convert_long_to_int<vecIntermediate, vec>(tmp1_1);
605 vecIntermediate v16bits2_1 = convert_long_to_int<vecIntermediate, vec>(tmp2_1);
606 vecIntermediate v16bits3_1 = convert_long_to_int<vecIntermediate, vec>(tmp3_1);
607 vecIntermediate v16bits4_1 = convert_long_to_int<vecIntermediate, vec>(tmp4_1);
608 vecIntermediate v16bits5_1 = convert_long_to_int<vecIntermediate, vec>(tmp5_1);
609 vecIntermediate v16bits6_1 = convert_long_to_int<vecIntermediate, vec>(tmp6_1);
610 vecIntermediate v16bits7_1 = convert_long_to_int<vecIntermediate, vec>(tmp7_1);
611 vecIntermediate v16bits8_1 = convert_long_to_int<vecIntermediate, vec>(tmp8_1);
612 vecIntermediate v16bits9_1 = convert_long_to_int<vecIntermediate, vec>(tmp9_1);
614 vecConverted vhigh16bits1_1 = __high_int_to_double(v16bits1_1);
615 vecConverted vlow16bits1_1 = __low_int_to_double(v16bits1_1);
616 vecConverted vSum1_1 = vhigh16bits1_1 * mulFactor;
617 vSum1_1 = vSum1_1 + vlow16bits1_1;
619 vecConverted vhigh16bits2_1 = __high_int_to_double(v16bits2_1);
620 vecConverted vlow16bits2_1 = __low_int_to_double(v16bits2_1);
621 vecConverted vSum2_1 = vhigh16bits2_1 * mulFactor;
622 vSum2_1 = vSum2_1 + vlow16bits2_1;
624 vecConverted vhigh16bits3_1 = __high_int_to_double(v16bits3_1);
625 vecConverted vlow16bits3_1 = __low_int_to_double(v16bits3_1);
626 vecConverted vSum3_1 = vhigh16bits3_1 * mulFactor;
627 vSum3_1 = vSum3_1 + vlow16bits3_1;
629 vecConverted vhigh16bits4_1 = __high_int_to_double(v16bits4_1);
630 vecConverted vlow16bits4_1 = __low_int_to_double(v16bits4_1);
631 vecConverted vSum4_1 = vhigh16bits4_1 * mulFactor;
632 vSum4_1 = vSum4_1 + vlow16bits4_1;
634 vecConverted vhigh16bits5_1 = __high_int_to_double(v16bits5_1);
635 vecConverted vlow16bits5_1 = __low_int_to_double(v16bits5_1);
636 vecConverted vSum5_1 = vhigh16bits5_1 * mulFactor;
637 vSum5_1 = vSum5_1 + vlow16bits5_1;
639 vecConverted vhigh16bits6_1 = __high_int_to_double(v16bits6_1);
640 vecConverted vlow16bits6_1 = __low_int_to_double(v16bits6_1);
641 vecConverted vSum6_1 = vhigh16bits6_1 * mulFactor;
642 vSum6_1 = vSum6_1 + vlow16bits6_1;
644 vecConverted vhigh16bits7_1 = __high_int_to_double(v16bits7_1);
645 vecConverted vlow16bits7_1 = __low_int_to_double(v16bits7_1);
646 vecConverted vSum7_1 = vhigh16bits7_1 * mulFactor;
647 vSum7_1 = vSum7_1 + vlow16bits7_1;
649 vecConverted vhigh16bits8_1 = __high_int_to_double(v16bits8_1);
650 vecConverted vlow16bits8_1 = __low_int_to_double(v16bits8_1);
651 vecConverted vSum8_1 = vhigh16bits8_1 * mulFactor;
652 vSum8_1 = vSum8_1 + vlow16bits8_1;
654 vecConverted vhigh16bits9_1 = __high_int_to_double(v16bits9_1);
655 vecConverted vlow16bits9_1 = __low_int_to_double(v16bits9_1);
656 vecConverted vSum9_1 = vhigh16bits9_1 * mulFactor;
657 vSum9_1 = vSum9_1 + vlow16bits9_1;
659 dotProduct1 = (vSum1_1 * errCoefs1);
660 dotProduct2 = (vSum2_1 * errCoefs2);
661 dotProduct3 = (vSum3_1 * errCoefs3);
662 dotProduct4 = (vSum4_1 * errCoefs4);
663 dotProduct5 = (vSum5_1 * errCoefs5);
664 dotProduct6 = (vSum6_1 * errCoefs6);
665 dotProduct7 = (vSum7_1 * errCoefs7);
666 dotProduct8 = (vSum8_1 * errCoefs8);
667 dotProduct9 = (vSum9_1 * errCoefs9);
669 dotProduct = dotProduct + dotProduct1;
670 dotProduct = dotProduct + dotProduct2;
671 dotProduct = dotProduct + dotProduct3;
672 dotProduct = dotProduct + dotProduct4;
673 dotProduct = dotProduct + dotProduct5;
674 dotProduct = dotProduct + dotProduct6;
675 dotProduct = dotProduct + dotProduct7;
676 dotProduct = dotProduct + dotProduct8;
677 dotProduct = dotProduct + dotProduct9;
680 vpMask = __cmp_lt_pred(dotProduct, maxValVec);
681 maxValVec = __select(vpMask, maxValVec, dotProduct);
682 vIdx = __select(vpMask, vIdx, vCurrIdx);
683 vCurrIdx = vCurrIdx + (eleCount);
// Reduce the per-lane maxima and indices into the scalar outputs
// (presumably; the helper is defined outside this excerpt).
687 c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
// Truncate the maximum to the integer type.
// NOTE(review): the store goes through a pointer to
// Integer64BitConversionDataType, so the integer value is converted back to
// that type on assignment; confirm the caller reads pMaxVal as that type
// rather than as Integer64BitDataType.
688 double maxval = *pMaxValLocal;
689 *pMaxValLocal = ((Integer64BitDataType) maxval);
// Execution kernel for a 64-bit integer element type, using a uint32_t
// intermediate vector. The function name, part of the signature, and the
// declarations of i, vpMask and vCurrIdx are not visible in this excerpt.
//
// Inputs are read as vectors of Integer64BitDataType from streaming engines 0
// and 1, converted lane-wise to Integer64BitConversionDataType, multiplied by
// the nine broadcast coefficients pErrCoefsLocal[0..8] and summed. A per-lane
// running maximum (maxValVec) and its index vector (vIdx) are tracked and
// handed to c7x_horizontal_max_with_index after the loop.
695 template <
typename Integer64BitDataType,
typename Integer64BitConversionDataType>
697 const int *restrict pMaxIndex,
698 const void *restrict pMaxVal,
699 uint8_t *restrict pBlock,
701 uint32_t errCoefsSize,
702 int32_t mainLoopCount)
// pMaxIndex is declared const but is cast to a writable int* for the result.
709 Integer64BitDataType *restrict pErrCoefsLocal = (Integer64BitDataType *) pErrCoefs;
710 int *restrict pMaxIndexLocal = (
int *) pMaxIndex;
711 Integer64BitConversionDataType *restrict pMaxValLocal = (Integer64BitConversionDataType *) pMaxVal;
713 typedef typename c7x::make_full_vector<Integer64BitDataType>::type vec;
714 int32_t eleCount = c7x::element_count_of<vec>::value;
716 typedef typename c7x::make_full_vector<Integer64BitConversionDataType>::type vecConverted;
718 typedef typename c7x::make_full_vector<uint32_t>::type vecIntermediate;
// NOTE(review): the code below treats Integer64BitConversionDataType as double
// (see the __*_int_to_double calls and "double maxval"). For a floating-point
// type numeric_limits::min() is the smallest positive value rather than the
// lowest representable one, so any dot product <= 0 can never replace this
// seed. Confirm that is acceptable for the inputs this variant serves; note
// that switching to lowest() would require the final cast back to
// Integer64BitDataType to cope with the seed surviving (e.g. mainLoopCount == 0).
722 *pMaxValLocal = (Integer64BitConversionDataType) (std::numeric_limits<Integer64BitConversionDataType>::min());
// Broadcast each of the nine coefficients, converted to the working type.
724 vecConverted errCoefs1 = vecConverted(pErrCoefsLocal[0]);
725 vecConverted errCoefs2 = vecConverted(pErrCoefsLocal[1]);
726 vecConverted errCoefs3 = vecConverted(pErrCoefsLocal[2]);
727 vecConverted errCoefs4 = vecConverted(pErrCoefsLocal[3]);
728 vecConverted errCoefs5 = vecConverted(pErrCoefsLocal[4]);
729 vecConverted errCoefs6 = vecConverted(pErrCoefsLocal[5]);
730 vecConverted errCoefs7 = vecConverted(pErrCoefsLocal[6]);
731 vecConverted errCoefs8 = vecConverted(pErrCoefsLocal[7]);
732 vecConverted errCoefs9 = vecConverted(pErrCoefsLocal[8]);
734 vecConverted dotProduct = vecConverted(0);
735 vecConverted maxValVec = vecConverted(*pMaxValLocal);
// The index vector is held in the working (converted) type as well.
737 vecConverted vIdx = vecConverted(0);
// 4294967296.0 == 2^32: weight applied to the high half when rebuilding a
// 64-bit value as high * 2^32 + low.
740 vecConverted mulFactor = vecConverted(4294967296.00);
744 DSPLIB_DEBUGPRINTFN(0,
"mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
746 for (i = 0; i < mainLoopCount; i++) {
747 dotProduct = vecConverted(0);
// Nine input vectors from each streaming engine.
751 vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
752 vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
753 vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
754 vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
755 vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
756 vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
757 vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
758 vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
759 vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
761 vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
762 vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
763 vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
764 vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
765 vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
766 vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
767 vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
768 vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
769 vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
// SE0 inputs: pass each 64-bit vector through convert_long_to_int to obtain a
// 32-bit-lane vector. Despite the "16bits" names the lanes are 32-bit
// (vecIntermediate is a uint32_t vector).
771 vecIntermediate v16bits1_0 = convert_long_to_int<vecIntermediate, vec>(tmp1_0);
772 vecIntermediate v16bits2_0 = convert_long_to_int<vecIntermediate, vec>(tmp2_0);
773 vecIntermediate v16bits3_0 = convert_long_to_int<vecIntermediate, vec>(tmp3_0);
774 vecIntermediate v16bits4_0 = convert_long_to_int<vecIntermediate, vec>(tmp4_0);
775 vecIntermediate v16bits5_0 = convert_long_to_int<vecIntermediate, vec>(tmp5_0);
776 vecIntermediate v16bits6_0 = convert_long_to_int<vecIntermediate, vec>(tmp6_0);
777 vecIntermediate v16bits7_0 = convert_long_to_int<vecIntermediate, vec>(tmp7_0);
778 vecIntermediate v16bits8_0 = convert_long_to_int<vecIntermediate, vec>(tmp8_0);
779 vecIntermediate v16bits9_0 = convert_long_to_int<vecIntermediate, vec>(tmp9_0);
// For each input: vSum = high * 2^32 + low, computed in the working type.
781 vecConverted vhigh16bits1_0 = __high_int_to_double(v16bits1_0);
782 vecConverted vlow16bits1_0 = __low_int_to_double(v16bits1_0);
783 vecConverted vSum1_0 = vhigh16bits1_0 * mulFactor;
784 vSum1_0 = vSum1_0 + vlow16bits1_0;
786 vecConverted vhigh16bits2_0 = __high_int_to_double(v16bits2_0);
787 vecConverted vlow16bits2_0 = __low_int_to_double(v16bits2_0);
788 vecConverted vSum2_0 = vhigh16bits2_0 * mulFactor;
789 vSum2_0 = vSum2_0 + vlow16bits2_0;
791 vecConverted vhigh16bits3_0 = __high_int_to_double(v16bits3_0);
792 vecConverted vlow16bits3_0 = __low_int_to_double(v16bits3_0);
793 vecConverted vSum3_0 = vhigh16bits3_0 * mulFactor;
794 vSum3_0 = vSum3_0 + vlow16bits3_0;
796 vecConverted vhigh16bits4_0 = __high_int_to_double(v16bits4_0);
797 vecConverted vlow16bits4_0 = __low_int_to_double(v16bits4_0);
798 vecConverted vSum4_0 = vhigh16bits4_0 * mulFactor;
799 vSum4_0 = vSum4_0 + vlow16bits4_0;
801 vecConverted vhigh16bits5_0 = __high_int_to_double(v16bits5_0);
802 vecConverted vlow16bits5_0 = __low_int_to_double(v16bits5_0);
803 vecConverted vSum5_0 = vhigh16bits5_0 * mulFactor;
804 vSum5_0 = vSum5_0 + vlow16bits5_0;
806 vecConverted vhigh16bits6_0 = __high_int_to_double(v16bits6_0);
807 vecConverted vlow16bits6_0 = __low_int_to_double(v16bits6_0);
808 vecConverted vSum6_0 = vhigh16bits6_0 * mulFactor;
809 vSum6_0 = vSum6_0 + vlow16bits6_0;
811 vecConverted vhigh16bits7_0 = __high_int_to_double(v16bits7_0);
812 vecConverted vlow16bits7_0 = __low_int_to_double(v16bits7_0);
813 vecConverted vSum7_0 = vhigh16bits7_0 * mulFactor;
814 vSum7_0 = vSum7_0 + vlow16bits7_0;
816 vecConverted vhigh16bits8_0 = __high_int_to_double(v16bits8_0);
817 vecConverted vlow16bits8_0 = __low_int_to_double(v16bits8_0);
818 vecConverted vSum8_0 = vhigh16bits8_0 * mulFactor;
819 vSum8_0 = vSum8_0 + vlow16bits8_0;
821 vecConverted vhigh16bits9_0 = __high_int_to_double(v16bits9_0);
822 vecConverted vlow16bits9_0 = __low_int_to_double(v16bits9_0);
823 vecConverted vSum9_0 = vhigh16bits9_0 * mulFactor;
824 vSum9_0 = vSum9_0 + vlow16bits9_0;
// Weight the converted SE0 inputs and accumulate the first result vector.
826 vecConverted dotProduct1 = (vSum1_0 * errCoefs1);
827 vecConverted dotProduct2 = (vSum2_0 * errCoefs2);
828 vecConverted dotProduct3 = (vSum3_0 * errCoefs3);
829 vecConverted dotProduct4 = (vSum4_0 * errCoefs4);
830 vecConverted dotProduct5 = (vSum5_0 * errCoefs5);
831 vecConverted dotProduct6 = (vSum6_0 * errCoefs6);
832 vecConverted dotProduct7 = (vSum7_0 * errCoefs7);
833 vecConverted dotProduct8 = (vSum8_0 * errCoefs8);
834 vecConverted dotProduct9 = (vSum9_0 * errCoefs9);
836 dotProduct = dotProduct + dotProduct1;
837 dotProduct = dotProduct + dotProduct2;
838 dotProduct = dotProduct + dotProduct3;
839 dotProduct = dotProduct + dotProduct4;
840 dotProduct = dotProduct + dotProduct5;
841 dotProduct = dotProduct + dotProduct6;
842 dotProduct = dotProduct + dotProduct7;
843 dotProduct = dotProduct + dotProduct8;
844 dotProduct = dotProduct + dotProduct9;
// Lanes where dotProduct < maxValVec keep the old maximum and index; the
// remaining lanes take the new value and the current index.
847 vpMask = __cmp_lt_pred(dotProduct, maxValVec);
848 maxValVec = __select(vpMask, maxValVec, dotProduct);
849 vIdx = __select(vpMask, vIdx, vCurrIdx);
850 vCurrIdx = vCurrIdx + (eleCount);
// Second result vector: repeat the conversion and accumulation on the SE1 inputs.
852 dotProduct = vecConverted(0);
854 vecIntermediate v16bits1_1 = convert_long_to_int<vecIntermediate, vec>(tmp1_1);
855 vecIntermediate v16bits2_1 = convert_long_to_int<vecIntermediate, vec>(tmp2_1);
856 vecIntermediate v16bits3_1 = convert_long_to_int<vecIntermediate, vec>(tmp3_1);
857 vecIntermediate v16bits4_1 = convert_long_to_int<vecIntermediate, vec>(tmp4_1);
858 vecIntermediate v16bits5_1 = convert_long_to_int<vecIntermediate, vec>(tmp5_1);
859 vecIntermediate v16bits6_1 = convert_long_to_int<vecIntermediate, vec>(tmp6_1);
860 vecIntermediate v16bits7_1 = convert_long_to_int<vecIntermediate, vec>(tmp7_1);
861 vecIntermediate v16bits8_1 = convert_long_to_int<vecIntermediate, vec>(tmp8_1);
862 vecIntermediate v16bits9_1 = convert_long_to_int<vecIntermediate, vec>(tmp9_1);
864 vecConverted vhigh16bits1_1 = __high_int_to_double(v16bits1_1);
865 vecConverted vlow16bits1_1 = __low_int_to_double(v16bits1_1);
866 vecConverted vSum1_1 = vhigh16bits1_1 * mulFactor;
867 vSum1_1 = vSum1_1 + vlow16bits1_1;
869 vecConverted vhigh16bits2_1 = __high_int_to_double(v16bits2_1);
870 vecConverted vlow16bits2_1 = __low_int_to_double(v16bits2_1);
871 vecConverted vSum2_1 = vhigh16bits2_1 * mulFactor;
872 vSum2_1 = vSum2_1 + vlow16bits2_1;
874 vecConverted vhigh16bits3_1 = __high_int_to_double(v16bits3_1);
875 vecConverted vlow16bits3_1 = __low_int_to_double(v16bits3_1);
876 vecConverted vSum3_1 = vhigh16bits3_1 * mulFactor;
877 vSum3_1 = vSum3_1 + vlow16bits3_1;
879 vecConverted vhigh16bits4_1 = __high_int_to_double(v16bits4_1);
880 vecConverted vlow16bits4_1 = __low_int_to_double(v16bits4_1);
881 vecConverted vSum4_1 = vhigh16bits4_1 * mulFactor;
882 vSum4_1 = vSum4_1 + vlow16bits4_1;
884 vecConverted vhigh16bits5_1 = __high_int_to_double(v16bits5_1);
885 vecConverted vlow16bits5_1 = __low_int_to_double(v16bits5_1);
886 vecConverted vSum5_1 = vhigh16bits5_1 * mulFactor;
887 vSum5_1 = vSum5_1 + vlow16bits5_1;
889 vecConverted vhigh16bits6_1 = __high_int_to_double(v16bits6_1);
890 vecConverted vlow16bits6_1 = __low_int_to_double(v16bits6_1);
891 vecConverted vSum6_1 = vhigh16bits6_1 * mulFactor;
892 vSum6_1 = vSum6_1 + vlow16bits6_1;
894 vecConverted vhigh16bits7_1 = __high_int_to_double(v16bits7_1);
895 vecConverted vlow16bits7_1 = __low_int_to_double(v16bits7_1);
896 vecConverted vSum7_1 = vhigh16bits7_1 * mulFactor;
897 vSum7_1 = vSum7_1 + vlow16bits7_1;
899 vecConverted vhigh16bits8_1 = __high_int_to_double(v16bits8_1);
900 vecConverted vlow16bits8_1 = __low_int_to_double(v16bits8_1);
901 vecConverted vSum8_1 = vhigh16bits8_1 * mulFactor;
902 vSum8_1 = vSum8_1 + vlow16bits8_1;
904 vecConverted vhigh16bits9_1 = __high_int_to_double(v16bits9_1);
905 vecConverted vlow16bits9_1 = __low_int_to_double(v16bits9_1);
906 vecConverted vSum9_1 = vhigh16bits9_1 * mulFactor;
907 vSum9_1 = vSum9_1 + vlow16bits9_1;
909 dotProduct1 = (vSum1_1 * errCoefs1);
910 dotProduct2 = (vSum2_1 * errCoefs2);
911 dotProduct3 = (vSum3_1 * errCoefs3);
912 dotProduct4 = (vSum4_1 * errCoefs4);
913 dotProduct5 = (vSum5_1 * errCoefs5);
914 dotProduct6 = (vSum6_1 * errCoefs6);
915 dotProduct7 = (vSum7_1 * errCoefs7);
916 dotProduct8 = (vSum8_1 * errCoefs8);
917 dotProduct9 = (vSum9_1 * errCoefs9);
919 dotProduct = dotProduct + dotProduct1;
920 dotProduct = dotProduct + dotProduct2;
921 dotProduct = dotProduct + dotProduct3;
922 dotProduct = dotProduct + dotProduct4;
923 dotProduct = dotProduct + dotProduct5;
924 dotProduct = dotProduct + dotProduct6;
925 dotProduct = dotProduct + dotProduct7;
926 dotProduct = dotProduct + dotProduct8;
927 dotProduct = dotProduct + dotProduct9;
930 vpMask = __cmp_lt_pred(dotProduct, maxValVec);
931 maxValVec = __select(vpMask, maxValVec, dotProduct);
932 vIdx = __select(vpMask, vIdx, vCurrIdx);
933 vCurrIdx = vCurrIdx + (eleCount);
// Reduce the per-lane maxima and indices into the scalar outputs
// (presumably; the helper is defined outside this excerpt).
937 c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
// Truncate the maximum to the integer type.
// NOTE(review): the store goes through a pointer to
// Integer64BitConversionDataType, so the integer value is converted back to
// that type on assignment; confirm the caller reads pMaxVal as that type
// rather than as Integer64BitDataType.
938 double maxval = *pMaxValLocal;
939 *pMaxValLocal = ((Integer64BitDataType) maxval);
// Execution kernel for a 32-bit integer element type whose arithmetic is done
// in Integer32BitPromotedDataType. The function name, part of the signature,
// and the declarations of i, vpMask, vIdx and vCurrIdx are not visible in this
// excerpt.
//
// Each loop iteration reads nine promoted-type vectors from streaming engine 0
// and nine from streaming engine 1, multiplies them by the nine broadcast
// coefficients pErrCoefsLocal[0..8], combines the products with a
// shift-and-add tree, and tracks the per-lane maximum and its index.
945 template <
typename Integer32BitDataType,
typename Integer32BitPromotedDataType>
947 const int *restrict pMaxIndex,
948 const void *restrict pMaxVal,
949 uint8_t *restrict pBlock,
951 uint32_t errCoefsSize,
952 int32_t mainLoopCount)
// pMaxIndex is declared const but is cast to a writable int* for the result.
959 Integer32BitDataType *restrict pErrCoefsLocal = (Integer32BitDataType *) pErrCoefs;
960 int *restrict pMaxIndexLocal = (
int *) pMaxIndex;
961 Integer32BitPromotedDataType *restrict pMaxValLocal = (Integer32BitPromotedDataType *) pMaxVal;
// vec holds lanes of the promoted type; eleCount is its lane count.
963 typedef typename c7x::make_full_vector<Integer32BitPromotedDataType>::type vec;
964 int32_t eleCount = c7x::element_count_of<vec>::value;
// Seed the running maximum with the promoted type's numeric_limits::min().
968 *pMaxValLocal = (Integer32BitPromotedDataType) (std::numeric_limits<Integer32BitPromotedDataType>::min());
// Broadcast each of the nine coefficients across a full promoted-type vector.
970 vec errCoefs1 = vec(pErrCoefsLocal[0]);
971 vec errCoefs2 = vec(pErrCoefsLocal[1]);
972 vec errCoefs3 = vec(pErrCoefsLocal[2]);
973 vec errCoefs4 = vec(pErrCoefsLocal[3]);
974 vec errCoefs5 = vec(pErrCoefsLocal[4]);
975 vec errCoefs6 = vec(pErrCoefsLocal[5]);
976 vec errCoefs7 = vec(pErrCoefsLocal[6]);
977 vec errCoefs8 = vec(pErrCoefsLocal[7]);
978 vec errCoefs9 = vec(pErrCoefsLocal[8]);
980 vec dotProduct = vec(0);
981 vec maxValVec = vec(*pMaxValLocal);
987 DSPLIB_DEBUGPRINTFN(0,
"mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
989 for (i = 0; i < mainLoopCount; i++) {
// Nine input vectors from each streaming engine.
994 vec tmp1_0 = c7x::strm_eng<0, vec>::get_adv();
995 vec tmp2_0 = c7x::strm_eng<0, vec>::get_adv();
996 vec tmp3_0 = c7x::strm_eng<0, vec>::get_adv();
997 vec tmp4_0 = c7x::strm_eng<0, vec>::get_adv();
998 vec tmp5_0 = c7x::strm_eng<0, vec>::get_adv();
999 vec tmp6_0 = c7x::strm_eng<0, vec>::get_adv();
1000 vec tmp7_0 = c7x::strm_eng<0, vec>::get_adv();
1001 vec tmp8_0 = c7x::strm_eng<0, vec>::get_adv();
1002 vec tmp9_0 = c7x::strm_eng<0, vec>::get_adv();
1004 vec tmp1_1 = c7x::strm_eng<1, vec>::get_adv();
1005 vec tmp2_1 = c7x::strm_eng<1, vec>::get_adv();
1006 vec tmp3_1 = c7x::strm_eng<1, vec>::get_adv();
1007 vec tmp4_1 = c7x::strm_eng<1, vec>::get_adv();
1008 vec tmp5_1 = c7x::strm_eng<1, vec>::get_adv();
1009 vec tmp6_1 = c7x::strm_eng<1, vec>::get_adv();
1010 vec tmp7_1 = c7x::strm_eng<1, vec>::get_adv();
1011 vec tmp8_1 = c7x::strm_eng<1, vec>::get_adv();
1012 vec tmp9_1 = c7x::strm_eng<1, vec>::get_adv();
// First result vector: weight the SE0 inputs.
1014 vec dotProduct1 = (tmp1_0 * errCoefs1);
1015 vec dotProduct2 = (tmp2_0 * errCoefs2);
1016 vec dotProduct3 = (tmp3_0 * errCoefs3);
1017 vec dotProduct4 = (tmp4_0 * errCoefs4);
1018 vec dotProduct5 = (tmp5_0 * errCoefs5);
1019 vec dotProduct6 = (tmp6_0 * errCoefs6);
1020 vec dotProduct7 = (tmp7_0 * errCoefs7);
1021 vec dotProduct8 = (tmp8_0 * errCoefs8);
1022 vec dotProduct9 = (tmp9_0 * errCoefs9);
// Pairwise tree sum in which every operand is shifted right by one before each
// addition (presumably to keep intermediate sums from overflowing).
// NOTE(review): products 1-8 pass through four shifts (scaled by 1/16) while
// product 9 passes through only one (scaled by 1/2), so the nine terms are not
// weighted equally in dotProduct — confirm this is intended.
1024 vec acc1 = (__shift_right(dotProduct1, vec(1))) + (__shift_right(dotProduct2, vec(1)));
1025 vec acc2 = (__shift_right(dotProduct3, vec(1))) + (__shift_right(dotProduct4, vec(1)));
1026 vec acc3 = (__shift_right(dotProduct5, vec(1))) + (__shift_right(dotProduct6, vec(1)));
1027 vec acc4 = (__shift_right(dotProduct7, vec(1))) + (__shift_right(dotProduct8, vec(1)));
1028 vec acc5 = (__shift_right(acc1, vec(1))) + (__shift_right(acc2, vec(1)));
1029 vec acc6 = (__shift_right(acc3, vec(1))) + (__shift_right(acc4, vec(1)));
1030 vec acc7 = (__shift_right(acc5, vec(1))) + (__shift_right(acc6, vec(1)));
1031 dotProduct = (__shift_right(acc7, vec(1))) + (__shift_right(dotProduct9, vec(1)));
// __max_index presumably updates maxValVec in place and sets vpMask on lanes
// where dotProduct won (TODO confirm against the intrinsic's documentation);
// those lanes then take the current index.
1034 __max_index(dotProduct, maxValVec, vpMask);
1035 vIdx = __select(vpMask, vCurrIdx, vIdx);
1036 vCurrIdx = vCurrIdx + (eleCount);
// Second result vector: same computation on the SE1 inputs.
1038 dotProduct = vec(0);
1040 dotProduct1 = (tmp1_1 * errCoefs1);
1041 dotProduct2 = (tmp2_1 * errCoefs2);
1042 dotProduct3 = (tmp3_1 * errCoefs3);
1043 dotProduct4 = (tmp4_1 * errCoefs4);
1044 dotProduct5 = (tmp5_1 * errCoefs5);
1045 dotProduct6 = (tmp6_1 * errCoefs6);
1046 dotProduct7 = (tmp7_1 * errCoefs7);
1047 dotProduct8 = (tmp8_1 * errCoefs8);
1048 dotProduct9 = (tmp9_1 * errCoefs9);
1050 acc1 = (__shift_right(dotProduct1, vec(1))) + (__shift_right(dotProduct2, vec(1)));
1051 acc2 = (__shift_right(dotProduct3, vec(1))) + (__shift_right(dotProduct4, vec(1)));
1052 acc3 = (__shift_right(dotProduct5, vec(1))) + (__shift_right(dotProduct6, vec(1)));
1053 acc4 = (__shift_right(dotProduct7, vec(1))) + (__shift_right(dotProduct8, vec(1)));
1054 acc5 = (__shift_right(acc1, vec(1))) + (__shift_right(acc2, vec(1)));
1055 acc6 = (__shift_right(acc3, vec(1))) + (__shift_right(acc4, vec(1)));
1056 acc7 = (__shift_right(acc5, vec(1))) + (__shift_right(acc6, vec(1)));
1057 dotProduct = (__shift_right(acc7, vec(1))) + (__shift_right(dotProduct9, vec(1)));
1060 __max_index(dotProduct, maxValVec, vpMask);
1061 vIdx = __select(vpMask, vCurrIdx, vIdx);
1062 vCurrIdx = vCurrIdx + (eleCount);
// Reduce the per-lane maxima and indices into the scalar outputs
// (presumably; the helper is defined outside this excerpt).
1066 c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
// Kernel body for 16-bit integer inputs.
//
// For every loop iteration it pulls input vectors from the two stream engines
// (SE0 and SE1), multiplies nine rearranged input vectors by nine replicated
// error coefficients, combines the products through a tree of halved sums, and
// keeps a running lane-wise maximum together with the index vector that
// produced it. After the loop the per-lane results are reduced into the
// locations addressed by pMaxVal / pMaxIndex.
//
// Template parameters:
//   Integer16BitDataType         - element type used to read the coefficients.
//   Integer16BitPromotedDataType - lane type of the working vectors and type of
//                                  the value stored through pMaxVal.
//
// NOTE(review): this listing omits some original lines. The parameters
// pErrCoefs and vecInSize, and the locals i, vpMask, vIdx and vCurrIdx, are
// used below but their declarations are not visible here.
1073 template <
typename Integer16BitDataType,
typename Integer16BitPromotedDataType>
1075 const int *restrict pMaxIndex,
1076 const void *restrict pMaxVal,
1077 uint8_t *restrict pBlock,
1079 uint32_t errCoefsSize,
1080 int32_t mainLoopCount)
// Typed local views of the arguments. pMaxIndex and pMaxVal are declared const
// in the signature, but the casts drop the qualifier because both are written.
1087 Integer16BitDataType *restrict pErrCoefsLocal = (Integer16BitDataType *) pErrCoefs;
1088 int *restrict pMaxIndexLocal = (
int *) pMaxIndex;
1089 Integer16BitPromotedDataType *restrict pMaxValLocal = (Integer16BitPromotedDataType *) pMaxVal;
// All arithmetic is done on full-width vectors of the promoted lane type.
1091 typedef typename c7x::make_full_vector<Integer16BitPromotedDataType>::type vec;
1092 int16_t eleCount = (c7x::element_count_of<vec>::value);
// Seed the output with the smallest value of the promoted type; maxValVec is
// initialised from this value further down.
1095 *pMaxValLocal = (Integer16BitPromotedDataType) (std::numeric_limits<Integer16BitPromotedDataType>::min());
// One vector per coefficient, built from the scalar coefficient value.
// Exactly nine coefficients (indices 0..8) are read; in the lines visible here
// errCoefsSize is only used by the debug print.
1097 vec errCoefs1 = vec(pErrCoefsLocal[0]);
1098 vec errCoefs2 = vec(pErrCoefsLocal[1]);
1099 vec errCoefs3 = vec(pErrCoefsLocal[2]);
1100 vec errCoefs4 = vec(pErrCoefsLocal[3]);
1101 vec errCoefs5 = vec(pErrCoefsLocal[4]);
1102 vec errCoefs6 = vec(pErrCoefsLocal[5]);
1103 vec errCoefs7 = vec(pErrCoefsLocal[6]);
1104 vec errCoefs8 = vec(pErrCoefsLocal[7]);
1105 vec errCoefs9 = vec(pErrCoefsLocal[8]);
1107 vec dotProduct = vec(0);
1108 vec maxValVec = vec(*pMaxValLocal);
1114 DSPLIB_DEBUGPRINTFN(0,
" mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
1116 for (i = 0; i < mainLoopCount; i++) {
1117 dotProduct = vec(0);
// Five fetches from each stream engine per iteration. Each SE0/SE1 pair is
// recombined with __pack_consec_low / __pack_consec_high into a "checkEven"
// and a "checkOdd" vector (presumably a de-interleave of the two fetched
// vectors, as the names suggest - confirm against the intrinsic definition).
1119 vec loadVec1 = c7x::strm_eng<0, vec>::get_adv();
1120 vec loadVec2 = c7x::strm_eng<1, vec>::get_adv();
1122 vec checkEven1 = __pack_consec_low(loadVec2, loadVec1);
1123 vec checkOdd1 = __pack_consec_high(loadVec2, loadVec1);
1125 vec loadVec3 = c7x::strm_eng<0, vec>::get_adv();
1126 vec loadVec4 = c7x::strm_eng<1, vec>::get_adv();
1128 vec checkEven2 = __pack_consec_low(loadVec4, loadVec3);
1129 vec checkOdd2 = __pack_consec_high(loadVec4, loadVec3);
1131 vec loadVec5 = c7x::strm_eng<0, vec>::get_adv();
1132 vec loadVec6 = c7x::strm_eng<1, vec>::get_adv();
1134 vec checkEven3 = __pack_consec_low(loadVec6, loadVec5);
1135 vec checkOdd3 = __pack_consec_high(loadVec6, loadVec5);
1137 vec loadVec7 = c7x::strm_eng<0, vec>::get_adv();
1138 vec loadVec8 = c7x::strm_eng<1, vec>::get_adv();
1140 vec checkEven4 = __pack_consec_low(loadVec8, loadVec7);
1141 vec checkOdd4 = __pack_consec_high(loadVec8, loadVec7);
// The fifth pair only contributes its "low" pack (the ninth term).
1143 vec loadVec9 = c7x::strm_eng<0, vec>::get_adv();
1144 vec loadVec10 = c7x::strm_eng<1, vec>::get_adv();
1146 vec checkEven5 = __pack_consec_low(loadVec10, loadVec9);
// Nine lane-wise products, one per coefficient.
1148 vec dotProduct1 = (checkEven1 * errCoefs1);
1149 vec dotProduct2 = (checkOdd1 * errCoefs2);
1150 vec dotProduct3 = (checkEven2 * errCoefs3);
1151 vec dotProduct4 = (checkOdd2 * errCoefs4);
1152 vec dotProduct5 = (checkEven3 * errCoefs5);
1153 vec dotProduct6 = (checkOdd3 * errCoefs6);
1154 vec dotProduct7 = (checkEven4 * errCoefs7);
1155 vec dotProduct8 = (checkOdd4 * errCoefs8);
1156 vec dotProduct9 = (checkEven5 * errCoefs9);
// Pairwise reduction tree: both operands of every addition are first shifted
// right by one, presumably to keep the intermediate sums within range.
// NOTE(review): products 1..8 pass through four such halvings, whereas product
// 9 is halved only once, so the ninth term carries a larger weight than the
// other eight in the final sum - confirm this is intended.
1158 vec acc1 = (__shift_right(dotProduct1, vec(1))) + (__shift_right(dotProduct2, vec(1)));
1159 vec acc2 = (__shift_right(dotProduct3, vec(1))) + (__shift_right(dotProduct4, vec(1)));
1160 vec acc3 = (__shift_right(dotProduct5, vec(1))) + (__shift_right(dotProduct6, vec(1)));
1161 vec acc4 = (__shift_right(dotProduct7, vec(1))) + (__shift_right(dotProduct8, vec(1)));
1162 vec acc5 = (__shift_right(acc1, vec(1))) + (__shift_right(acc2, vec(1)));
1163 vec acc6 = (__shift_right(acc3, vec(1))) + (__shift_right(acc4, vec(1)));
1164 vec acc7 = (__shift_right(acc5, vec(1))) + (__shift_right(acc6, vec(1)));
1165 dotProduct = (__shift_right(acc7, vec(1))) + (__shift_right(dotProduct9, vec(1)));
// Running maximum: __max_index compares the new sums with maxValVec and
// produces the predicate vpMask (presumably also updating maxValVec in place -
// confirm). The predicate then selects, per lane, between the current index
// vector and the previously stored one; afterwards the current index advances
// by eleCount.
1168 __max_index(dotProduct, maxValVec, vpMask);
1169 vIdx = __select(vpMask, vCurrIdx, vIdx);
1170 vCurrIdx = vCurrIdx + (eleCount);
// Cross-lane reduction of the per-lane maxima and indices into the two output
// locations (helper defined outside this listing).
1174 c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
// Kernel body for 8-bit integer inputs.
//
// Input vectors are fetched from the two stream engines as full vectors of the
// 8-bit type, rearranged, widened to the promoted lane type, multiplied by the
// nine error coefficients and combined through a tree of halved sums. A running
// lane-wise maximum and the matching index vector are maintained and reduced
// across lanes after the loop. Two code paths exist, selected on
// __C7X_VEC_SIZE_BITS__.
//
// Template parameters:
//   Integer8BitDataType         - element type of the streamed input vectors
//                                 and of the coefficients.
//   Integer8BitPromotedDataType - lane type of the products / sums and type of
//                                 the value stored through pMaxVal.
//
// NOTE(review): this listing omits some original lines. The parameters
// pErrCoefs and vecInSize, the locals vpMask and vCurrIdx, and the #else /
// #endif of the vector-size conditional are not visible here.
1181 template <
typename Integer8BitDataType,
typename Integer8BitPromotedDataType>
1183 const int *restrict pMaxIndex,
1184 const void *restrict pMaxVal,
1185 uint8_t *restrict pBlock,
1187 uint32_t errCoefsSize,
1188 int32_t mainLoopCount)
// Typed local views of the arguments. pMaxIndex and pMaxVal are declared const
// in the signature, but the casts drop the qualifier because both are written.
1195 Integer8BitDataType *restrict pErrCoefsLocal = (Integer8BitDataType *) pErrCoefs;
1196 int *restrict pMaxIndexLocal = (
int *) pMaxIndex;
1197 Integer8BitPromotedDataType *restrict pMaxValLocal = (Integer8BitPromotedDataType *) pMaxVal;
// vec         : full vector of the 8-bit type (what the stream engines deliver).
// vecPromoted : full vector of the promoted type (products, sums, indices).
// vecPartial  : 8-bit vector with __C7X_VEC_SIZE_BYTES__ / 2 elements.
1199 typedef typename c7x::make_full_vector<Integer8BitDataType>::type vec;
1200 uint8_t eleCount = c7x::element_count_of<vec>::value;
1202 typedef typename c7x::make_full_vector<Integer8BitPromotedDataType>::type vecPromoted;
1204 typedef typename c7x::make_vector<Integer8BitDataType, __C7X_VEC_SIZE_BYTES__ / 2>::type vecPartial;
// Seed the output with the smallest value of the promoted type; maxValVec is
// initialised from it below.
1207 *pMaxValLocal = (Integer8BitPromotedDataType) (std::numeric_limits<Integer8BitPromotedDataType>::min());
1209 vecPromoted dotProduct = vecPromoted(0);
1210 vecPromoted dotProductEven = vecPromoted(0);
1211 vecPromoted dotProductOdd = vecPromoted(0);
1212 vecPromoted maxValVec = vecPromoted(*pMaxValLocal);
1214 vecPromoted vIdx = vecPromoted(0);
1219 DSPLIB_DEBUGPRINTFN(0,
"mainLoopCount %d errCoefsSize %d vecInSize %d \n", mainLoopCount, errCoefsSize, vecInSize);
// ---- Path compiled when the vector width is 256 bits ----
1221 #if __C7X_VEC_SIZE_BITS__ == 256
// Coefficients 1..3 are held in promoted vectors (multiplied with inputs that
// are widened first); coefficients 4..9 stay 8-bit and go through
// mul_char_to_short.
1224 vecPromoted errCoefs1 = vecPromoted(pErrCoefsLocal[0]);
1225 vecPromoted errCoefs2 = vecPromoted(pErrCoefsLocal[1]);
1226 vecPromoted errCoefs3 = vecPromoted(pErrCoefsLocal[2]);
1227 vec errCoefs4 = vec(pErrCoefsLocal[3]);
1228 vec errCoefs5 = vec(pErrCoefsLocal[4]);
1229 vec errCoefs6 = vec(pErrCoefsLocal[5]);
1230 vec errCoefs7 = vec(pErrCoefsLocal[6]);
1231 vec errCoefs8 = vec(pErrCoefsLocal[7]);
1232 vec errCoefs9 = vec(pErrCoefsLocal[8]);
1234 for (
int i = 0; i < mainLoopCount; i++) {
// Each group fetches two vectors from SE0 and two from SE1, then applies two
// levels of __pack_consec_low/high to obtain the iEven*/iOdd* vectors.
1236 vec loadVec1 = c7x::strm_eng<0, vec>::get_adv();
1237 vec loadVec2 = c7x::strm_eng<0, vec>::get_adv();
1239 vec loadVec3 = c7x::strm_eng<1, vec>::get_adv();
1240 vec loadVec4 = c7x::strm_eng<1, vec>::get_adv();
1242 vec checkEven1 = __pack_consec_low(loadVec2, loadVec1);
1243 vec checkOdd1 = __pack_consec_high(loadVec2, loadVec1);
1244 vec checkEven2 = __pack_consec_low(loadVec4, loadVec3);
1245 vec checkOdd2 = __pack_consec_high(loadVec4, loadVec3);
1247 vec iEven1 = __pack_consec_low(checkEven2, checkEven1);
1248 vec iOdd1 = __pack_consec_low(checkOdd2, checkOdd1);
1249 vec iEven2 = __pack_consec_high(checkEven2, checkEven1);
1250 vec iOdd2 = __pack_consec_high(checkOdd2, checkOdd1);
1252 vec loadVec5 = c7x::strm_eng<0, vec>::get_adv();
1253 vec loadVec6 = c7x::strm_eng<0, vec>::get_adv();
1255 vec loadVec7 = c7x::strm_eng<1, vec>::get_adv();
1256 vec loadVec8 = c7x::strm_eng<1, vec>::get_adv();
1258 vec checkEven3 = __pack_consec_low(loadVec6, loadVec5);
1259 vec checkOdd3 = __pack_consec_high(loadVec6, loadVec5);
1260 vec checkEven4 = __pack_consec_low(loadVec8, loadVec7);
1261 vec checkOdd4 = __pack_consec_high(loadVec8, loadVec7);
1263 vec iEven3 = __pack_consec_low(checkEven4, checkEven3);
1264 vec iOdd3 = __pack_consec_low(checkOdd4, checkOdd3);
1265 vec iEven4 = __pack_consec_high(checkEven4, checkEven3);
1266 vec iOdd4 = __pack_consec_high(checkOdd4, checkOdd3);
// Third group: only the "low" packs are needed, yielding the ninth input.
1268 vec loadVec9 = c7x::strm_eng<0, vec>::get_adv();
1269 vec loadVec10 = c7x::strm_eng<0, vec>::get_adv();
1271 vec loadVec11 = c7x::strm_eng<1, vec>::get_adv();
1272 vec loadVec12 = c7x::strm_eng<1, vec>::get_adv();
1274 vec checkEven5 = __pack_consec_low(loadVec10, loadVec9);
1275 vec checkEven6 = __pack_consec_low(loadVec12, loadVec11);
1277 vec iEven5 = __pack_consec_low(checkEven6, checkEven5);
// Per-coefficient products, kept separately for the even and the odd lanes of
// the 8-bit input vectors.
1279 vecPromoted dotProductEven1 = vecPromoted(0);
1280 vecPromoted dotProductEven2 = vecPromoted(0);
1281 vecPromoted dotProductEven3 = vecPromoted(0);
1282 vecPromoted dotProductEven4 = vecPromoted(0);
1283 vecPromoted dotProductEven5 = vecPromoted(0);
1284 vecPromoted dotProductEven6 = vecPromoted(0);
1285 vecPromoted dotProductEven7 = vecPromoted(0);
1286 vecPromoted dotProductEven8 = vecPromoted(0);
1287 vecPromoted dotProductEven9 = vecPromoted(0);
1289 vecPromoted dotProductOdd1 = vecPromoted(0);
1290 vecPromoted dotProductOdd2 = vecPromoted(0);
1291 vecPromoted dotProductOdd3 = vecPromoted(0);
1292 vecPromoted dotProductOdd4 = vecPromoted(0);
1293 vecPromoted dotProductOdd5 = vecPromoted(0);
1294 vecPromoted dotProductOdd6 = vecPromoted(0);
1295 vecPromoted dotProductOdd7 = vecPromoted(0);
1296 vecPromoted dotProductOdd8 = vecPromoted(0);
1297 vecPromoted dotProductOdd9 = vecPromoted(0);
1299 vecPromoted iEvenEvenShort1 = vecPromoted(0);
1300 vecPromoted iEvenOddShort1 = vecPromoted(0);
1301 vecPromoted iOddEvenShort1 = vecPromoted(0);
1302 vecPromoted iOddOddShort1 = vecPromoted(0);
1303 vecPromoted iEvenEvenShort2 = vecPromoted(0);
1304 vecPromoted iEvenOddShort2 = vecPromoted(0);
// Terms 1..3: widen the even / odd halves of the input with
// convert_char_to_short and multiply in the promoted type. The boolean flag is
// interpreted by that helper (defined outside this listing); data uses false.
1306 iEvenEvenShort1 = convert_char_to_short<vecPromoted, vecPartial>(iEven1.even(),
false);
1308 dotProductEven1 = (iEvenEvenShort1 * errCoefs1);
1309 iEvenOddShort1 = convert_char_to_short<vecPromoted, vecPartial>(iEven1.odd(),
false);
1310 dotProductOdd1 = (iEvenOddShort1 * errCoefs1);
1312 iOddEvenShort1 = convert_char_to_short<vecPromoted, vecPartial>(iOdd1.even(),
false);
1313 dotProductEven2 = (iOddEvenShort1 * errCoefs2);
1314 iOddOddShort1 = convert_char_to_short<vecPromoted, vecPartial>(iOdd1.odd(),
false);
1315 dotProductOdd2 = (iOddOddShort1 * errCoefs2);
1317 iEvenEvenShort2 = convert_char_to_short<vecPromoted, vecPartial>(iEven2.even(),
false);
1318 dotProductEven3 = (iEvenEvenShort2 * errCoefs3);
1319 iEvenOddShort2 = convert_char_to_short<vecPromoted, vecPartial>(iEven2.odd(),
false);
1320 dotProductOdd3 = (iEvenOddShort2 * errCoefs3);
// Terms 4..9: mul_char_to_short takes the 8-bit input and coefficient vectors
// and fills the two promoted outputs passed by reference (apparently the even-
// and odd-lane products, judging by the argument names).
1322 mul_char_to_short<vecPromoted &, vec>(iOdd2, errCoefs4, dotProductEven4, dotProductOdd4);
1323 mul_char_to_short<vecPromoted &, vec>(iEven3, errCoefs5, dotProductEven5, dotProductOdd5);
1324 mul_char_to_short<vecPromoted &, vec>(iOdd3, errCoefs6, dotProductEven6, dotProductOdd6);
1325 mul_char_to_short<vecPromoted &, vec>(iEven4, errCoefs7, dotProductEven7, dotProductOdd7);
1326 mul_char_to_short<vecPromoted &, vec>(iOdd4, errCoefs8, dotProductEven8, dotProductOdd8);
1327 mul_char_to_short<vecPromoted &, vec>(iEven5, errCoefs9, dotProductEven9, dotProductOdd9);
// Even lanes: pairwise reduction tree in which both operands of each addition
// are shifted right by one first.
// NOTE(review): products 1..8 are halved four times but product 9 only once,
// so the ninth term is weighted more heavily - confirm this is intended.
1329 vecPromoted accEven1 =
1330 (__shift_right(dotProductEven1, vecPromoted(1))) + (__shift_right(dotProductEven2, vecPromoted(1)));
1331 vecPromoted accEven2 =
1332 (__shift_right(dotProductEven3, vecPromoted(1))) + (__shift_right(dotProductEven4, vecPromoted(1)));
1333 vecPromoted accEven3 =
1334 (__shift_right(dotProductEven5, vecPromoted(1))) + (__shift_right(dotProductEven6, vecPromoted(1)));
1335 vecPromoted accEven4 =
1336 (__shift_right(dotProductEven7, vecPromoted(1))) + (__shift_right(dotProductEven8, vecPromoted(1)));
1337 vecPromoted accEven5 = (__shift_right(accEven1, vecPromoted(1))) + (__shift_right(accEven2, vecPromoted(1)));
1338 vecPromoted accEven6 = (__shift_right(accEven3, vecPromoted(1))) + (__shift_right(accEven4, vecPromoted(1)));
1339 vecPromoted accEven7 = (__shift_right(accEven5, vecPromoted(1))) + (__shift_right(accEven6, vecPromoted(1)));
1340 dotProductEven = (__shift_right(accEven7, vecPromoted(1))) + (__shift_right(dotProductEven9, vecPromoted(1)));
// Update the running maximum with the even-lane sums; the matching indices are
// the even lanes of vCurrIdx widened to the promoted type (flag = true here).
1343 __max_index(dotProductEven, maxValVec, vpMask);
1344 vecPromoted vCurrIdxPrmt;
1345 vCurrIdxPrmt = convert_char_to_short<vecPromoted, vecPartial>(vCurrIdx.even(),
true);
1346 vIdx = __select(vpMask, vCurrIdxPrmt, vIdx);
// Odd lanes: same reduction tree and the same weighting as the even lanes.
1348 vecPromoted accOdd1 =
1349 (__shift_right(dotProductOdd1, vecPromoted(1))) + (__shift_right(dotProductOdd2, vecPromoted(1)));
1350 vecPromoted accOdd2 =
1351 (__shift_right(dotProductOdd3, vecPromoted(1))) + (__shift_right(dotProductOdd4, vecPromoted(1)));
1352 vecPromoted accOdd3 =
1353 (__shift_right(dotProductOdd5, vecPromoted(1))) + (__shift_right(dotProductOdd6, vecPromoted(1)));
1354 vecPromoted accOdd4 =
1355 (__shift_right(dotProductOdd7, vecPromoted(1))) + (__shift_right(dotProductOdd8, vecPromoted(1)));
1356 vecPromoted accOdd5 = (__shift_right(accOdd1, vecPromoted(1))) + (__shift_right(accOdd2, vecPromoted(1)));
1357 vecPromoted accOdd6 = (__shift_right(accOdd3, vecPromoted(1))) + (__shift_right(accOdd4, vecPromoted(1)));
1358 vecPromoted accOdd7 = (__shift_right(accOdd5, vecPromoted(1))) + (__shift_right(accOdd6, vecPromoted(1)));
1359 dotProductOdd = (__shift_right(accOdd7, vecPromoted(1))) + (__shift_right(dotProductOdd9, vecPromoted(1)));
// Second update of the same running maximum, now with the odd-lane sums, then
// advance the index vector by one full 8-bit vector's worth of elements.
// NOTE(review): vCurrIdx is added to vec(eleCount) and its halves are widened
// from vecPartial, so it appears to hold 8-bit lanes; such lanes would wrap
// once indices exceed the 8-bit range - confirm against its declaration.
1362 __max_index(dotProductOdd, maxValVec, vpMask);
1363 vCurrIdxPrmt = convert_char_to_short<vecPromoted, vecPartial>(vCurrIdx.odd(),
true);
1364 vIdx = __select(vpMask, vCurrIdxPrmt, vIdx);
1365 vCurrIdx = vCurrIdx + vec(eleCount);
// ---- Alternative path (presumably the #else branch for other vector widths;
// the directive itself is not visible in this listing) ----
// Coefficients 4..9 are held in half-width 8-bit vectors here.
1370 vecPromoted errCoefs1 = vecPromoted(pErrCoefsLocal[0]);
1371 vecPromoted errCoefs2 = vecPromoted(pErrCoefsLocal[1]);
1372 vecPromoted errCoefs3 = vecPromoted(pErrCoefsLocal[2]);
1373 vecPartial errCoefs4 = vecPartial(pErrCoefsLocal[3]);
1374 vecPartial errCoefs5 = vecPartial(pErrCoefsLocal[4]);
1375 vecPartial errCoefs6 = vecPartial(pErrCoefsLocal[5]);
1376 vecPartial errCoefs7 = vecPartial(pErrCoefsLocal[6]);
1377 vecPartial errCoefs8 = vecPartial(pErrCoefsLocal[7]);
1378 vecPartial errCoefs9 = vecPartial(pErrCoefsLocal[8]);
1380 for (
int i = 0; i < mainLoopCount; i++) {
1381 dotProductEven = vecPromoted(0);
1382 dotProductOdd = vecPromoted(0);
// One fetch from each stream engine per group; the packed vectors are then
// split into half-width vectors with .even() / .odd().
1386 vec loadVec1 = c7x::strm_eng<0, vec>::get_adv();
1387 vec loadVec2 = c7x::strm_eng<1, vec>::get_adv();
1389 vec checkEven1 = __pack_consec_low(loadVec2, loadVec1);
1390 vec checkOdd1 = __pack_consec_high(loadVec2, loadVec1);
1392 vecPartial iEven1 = checkEven1.even();
1393 vecPartial iOdd1 = checkOdd1.even();
1395 vecPartial iEven2 = checkEven1.odd();
1396 vecPartial iOdd2 = checkOdd1.odd();
1398 vec loadVec3 = c7x::strm_eng<0, vec>::get_adv();
1399 vec loadVec4 = c7x::strm_eng<1, vec>::get_adv();
1401 vec checkEven2 = __pack_consec_low(loadVec4, loadVec3);
1402 vec checkOdd2 = __pack_consec_high(loadVec4, loadVec3);
1404 vecPartial iEven3 = checkEven2.even();
1405 vecPartial iOdd3 = checkOdd2.even();
1406 vecPartial iEven4 = checkEven2.odd();
1407 vecPartial iOdd4 = checkOdd2.odd();
// Third group supplies only the ninth input.
1409 vec loadVec5 = c7x::strm_eng<0, vec>::get_adv();
1410 vec loadVec6 = c7x::strm_eng<1, vec>::get_adv();
1412 vec checkEven3 = __pack_consec_low(loadVec6, loadVec5);
1414 vecPartial iEven5 = checkEven3.even();
1416 vecPromoted dotProduct1 = vecPromoted(0);
1417 vecPromoted dotProduct2 = vecPromoted(0);
1418 vecPromoted dotProduct3 = vecPromoted(0);
1419 vecPromoted dotProduct4 = vecPromoted(0);
1420 vecPromoted dotProduct5 = vecPromoted(0);
1421 vecPromoted dotProduct6 = vecPromoted(0);
1422 vecPromoted dotProduct7 = vecPromoted(0);
1423 vecPromoted dotProduct8 = vecPromoted(0);
1424 vecPromoted dotProduct9 = vecPromoted(0);
1426 vecPromoted iColShort1 = vecPromoted(0);
1427 vecPromoted iColShort2 = vecPromoted(0);
1428 vecPromoted iColShort3 = vecPromoted(0);
// Terms 1..3: widen the half-width input, then multiply in the promoted type.
1430 iColShort1 = convert_char_to_short<vecPromoted, vecPartial>(iEven1,
false);
1431 dotProduct1 = (iColShort1 * errCoefs1);
1433 iColShort2 = convert_char_to_short<vecPromoted, vecPartial>(iOdd1,
false);
1434 dotProduct2 = (iColShort2 * errCoefs2);
1436 iColShort3 = convert_char_to_short<vecPromoted, vecPartial>(iEven2,
false);
1437 dotProduct3 = (iColShort3 * errCoefs3);
// Terms 4..9 via mul_char_to_short. The second output is always directed at
// dotProduct, which is overwritten by the final sum below, so only the first
// output of each call is used.
1439 mul_char_to_short<vecPromoted &, vecPartial>(iOdd2, errCoefs4, dotProduct4, dotProduct);
1440 mul_char_to_short<vecPromoted &, vecPartial>(iEven3, errCoefs5, dotProduct5, dotProduct);
1441 mul_char_to_short<vecPromoted &, vecPartial>(iOdd3, errCoefs6, dotProduct6, dotProduct);
1442 mul_char_to_short<vecPromoted &, vecPartial>(iEven4, errCoefs7, dotProduct7, dotProduct);
1443 mul_char_to_short<vecPromoted &, vecPartial>(iOdd4, errCoefs8, dotProduct8, dotProduct);
1444 mul_char_to_short<vecPromoted &, vecPartial>(iEven5, errCoefs9, dotProduct9, dotProduct);
// Same halved-sum reduction tree as the other path (products 1..8 halved four
// times, product 9 once - see the review note above the even-lane tree).
1446 vecPromoted acc1 = (__shift_right(dotProduct1, vecPromoted(1))) + (__shift_right(dotProduct2, vecPromoted(1)));
1447 vecPromoted acc2 = (__shift_right(dotProduct3, vecPromoted(1))) + (__shift_right(dotProduct4, vecPromoted(1)));
1448 vecPromoted acc3 = (__shift_right(dotProduct5, vecPromoted(1))) + (__shift_right(dotProduct6, vecPromoted(1)));
1449 vecPromoted acc4 = (__shift_right(dotProduct7, vecPromoted(1))) + (__shift_right(dotProduct8, vecPromoted(1)));
1450 vecPromoted acc5 = (__shift_right(acc1, vecPromoted(1))) + (__shift_right(acc2, vecPromoted(1)));
1451 vecPromoted acc6 = (__shift_right(acc3, vecPromoted(1))) + (__shift_right(acc4, vecPromoted(1)));
1452 vecPromoted acc7 = (__shift_right(acc5, vecPromoted(1))) + (__shift_right(acc6, vecPromoted(1)));
1453 dotProduct = (__shift_right(acc7, vecPromoted(1))) + (__shift_right(dotProduct9, vecPromoted(1)));
1455 vecPromoted vCurrIdxPrmt;
// Running-maximum update; indices come from the low half of vCurrIdx, which
// then advances by half a full 8-bit vector's element count.
1458 __max_index(dotProduct, maxValVec, vpMask);
1459 vCurrIdxPrmt = convert_char_to_short<vecPromoted, vecPartial>(vCurrIdx.lo(),
true);
1460 vIdx = __select(vpMask, vCurrIdxPrmt, vIdx);
1461 vCurrIdx = vCurrIdx + vec(eleCount / 2);
// Cross-lane reduction of the per-lane maxima and indices into the two output
// locations (helper defined outside this listing).
1466 c7x_horizontal_max_with_index(maxValVec, vIdx, pMaxValLocal, pMaxIndexLocal);
// Execution entry point for int32_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the 32-bit kernel
// with int64_t as the promoted type.
1475 void *restrict pErrCoefs,
1476 const int *restrict pMaxIndex,
1477 const void *restrict pMaxVal)
1482 typedef typename c7x::make_full_vector<int32_t>::type vec;
1483 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 4 matches sizeof(int32_t), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1484 uint32_t dataSize = 4;
1485 int32_t strideIn = pKerPrivArgs->
strideIn;
// NOTE(review): the statements that fill se0Params are not visible here.
1487 __SE_TEMPLATE_v1 se0Params;
1489 int32_t *restrict pInLocal = (int32_t *) pIn;
1492 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Both engines use the same template. SE1 starts
// ((strideIn / dataSize) * eleCount) / 2 int32_t elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1497 __SE0_OPEN(pInLocal, se0Params);
1498 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount / 2), se0Params);
1500 minerror_exec_ci_integer32_inputs<int32_t, int64_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for uint32_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the 32-bit kernel
// with uint64_t as the promoted type.
1514 void *restrict pErrCoefs,
1515 const int *restrict pMaxIndex,
1516 const void *restrict pMaxVal)
1521 typedef typename c7x::make_full_vector<uint32_t>::type vec;
1522 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 4 matches sizeof(uint32_t), so strideIn
// is presumably a byte stride being converted to elements - confirm.
1523 uint32_t dataSize = 4;
1524 int32_t strideIn = pKerPrivArgs->
strideIn;
// NOTE(review): the statements that fill se0Params are not visible here.
1525 __SE_TEMPLATE_v1 se0Params;
1527 uint32_t *restrict pInLocal = (uint32_t *) pIn;
1530 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Both engines use the same template. SE1 starts
// ((strideIn / dataSize) * eleCount) / 2 uint32_t elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1535 __SE0_OPEN(pInLocal, se0Params);
1536 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount / 2), se0Params);
1538 minerror_exec_ci_integer32_inputs<uint32_t, uint64_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for int64_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the signed 64-bit
// kernel, instantiated with <int64_t, double>.
1552 void *restrict pErrCoefs,
1553 const int *restrict pMaxIndex,
1554 const void *restrict pMaxVal)
1558 typedef typename c7x::make_full_vector<int64_t>::type vec;
1559 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 8 matches sizeof(int64_t), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1560 uint32_t dataSize = 8;
1561 int32_t strideIn = pKerPrivArgs->
strideIn;
// NOTE(review): the statements that fill se0Params are not visible here.
1563 __SE_TEMPLATE_v1 se0Params;
1565 int64_t *restrict pInLocal = (int64_t *) pIn;
1568 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Both engines use the same template. SE1 starts
// (strideIn / dataSize) * eleCount int64_t elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1572 __SE0_OPEN(pInLocal, se0Params);
1573 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1575 minerror_exec_ci_signed_integer64_inputs<int64_t, double>(pErrCoefs, pMaxIndex, pMaxVal, pBlock,
// Execution entry point for uint64_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the unsigned 64-bit
// kernel, instantiated with <uint64_t, double>.
1590 void *restrict pErrCoefs,
1591 const int *restrict pMaxIndex,
1592 const void *restrict pMaxVal)
1596 typedef typename c7x::make_full_vector<uint64_t>::type vec;
1597 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 8 matches sizeof(uint64_t), so strideIn
// is presumably a byte stride being converted to elements - confirm.
1598 uint32_t dataSize = 8;
1599 int32_t strideIn = pKerPrivArgs->
strideIn;
// NOTE(review): the statements that fill se0Params are not visible here.
1601 __SE_TEMPLATE_v1 se0Params;
1603 uint64_t *restrict pInLocal = (uint64_t *) pIn;
1606 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Both engines use the same template. SE1 starts
// (strideIn / dataSize) * eleCount uint64_t elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1610 __SE0_OPEN(pInLocal, se0Params);
1611 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1613 minerror_exec_ci_unsigned_integer64_inputs<uint64_t, double>(pErrCoefs, pMaxIndex, pMaxVal, pBlock,
// Execution entry point for float input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the floating-point
// kernel instantiated for float.
1627 void *restrict pErrCoefs,
1628 const int *restrict pMaxIndex,
1629 const void *restrict pMaxVal)
1634 typedef typename c7x::make_full_vector<float>::type vec;
1635 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 4 matches sizeof(float), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1636 uint32_t dataSize = 4;
1637 int32_t strideIn = pKerPrivArgs->
strideIn;
// NOTE(review): the statements that fill se0Params are not visible here.
1639 __SE_TEMPLATE_v1 se0Params;
1641 float *restrict pInLocal = (
float *) pIn;
1644 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Both engines use the same template. SE1 starts
// (strideIn / dataSize) * eleCount float elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1648 __SE0_OPEN(pInLocal, se0Params);
1649 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1651 minerror_exec_ci_float_inputs<float>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for double input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the floating-point
// kernel instantiated for double.
1665 void *restrict pErrCoefs,
1666 const int *restrict pMaxIndex,
1667 const void *restrict pMaxVal)
1672 typedef typename c7x::make_full_vector<double>::type vec;
1673 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 8 matches sizeof(double), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1674 uint32_t dataSize = 8;
1675 int32_t strideIn = pKerPrivArgs->
strideIn;
// NOTE(review): the statements that fill se0Params are not visible here.
1677 __SE_TEMPLATE_v1 se0Params;
1679 double *restrict pInLocal = (
double *) pIn;
1681 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Both engines use the same template. SE1 starts
// (strideIn / dataSize) * eleCount double elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1685 __SE0_OPEN(pInLocal, se0Params);
1686 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * eleCount), se0Params);
1688 minerror_exec_ci_float_inputs<double>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for int8_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the 8-bit kernel
// with int16_t as the promoted type.
1701 void *restrict pErrCoefs,
1702 const int *restrict pMaxIndex,
1703 const void *restrict pMaxVal)
// NOTE(review): the statements that fill se0Params are not visible here.
1708 __SE_TEMPLATE_v1 se0Params;
1710 int8_t *restrict pInLocal = (int8_t *) pIn;
1712 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
1713 typedef typename c7x::make_full_vector<int8_t>::type vec;
1714 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 1 matches sizeof(int8_t), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1715 uint32_t dataSize = 1;
1716 int32_t strideIn = pKerPrivArgs->
strideIn;
1720 __SE0_OPEN(pInLocal, se0Params);
// SE1 uses the same template as SE0 and starts a vector-width-dependent number
// of elements later: (strideIn / dataSize) * (eleCount / 2) for 256-bit
// vectors, otherwise (strideIn / dataSize) * (eleCount / 4). The second open
// is presumably the #else branch; the #else / #endif lines are not visible.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1723 #if __C7X_VEC_SIZE_BITS__ == 256
1724 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 2)), se0Params);
1726 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1729 minerror_exec_ci_integer8_inputs<int8_t, int16_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for uint8_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the 8-bit kernel
// with uint16_t as the promoted type.
1743 void *restrict pErrCoefs,
1744 const int *restrict pMaxIndex,
1745 const void *restrict pMaxVal)
// NOTE(review): the statements that fill se0Params are not visible here.
1750 __SE_TEMPLATE_v1 se0Params;
1752 uint8_t *restrict pInLocal = (uint8_t *) pIn;
1754 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
1755 typedef typename c7x::make_full_vector<uint8_t>::type vec;
1756 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 1 matches sizeof(uint8_t), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1757 uint32_t dataSize = 1;
1758 int32_t strideIn = pKerPrivArgs->
strideIn;
1762 __SE0_OPEN(pInLocal, se0Params);
// SE1 uses the same template as SE0 and starts a vector-width-dependent number
// of elements later: (strideIn / dataSize) * (eleCount / 2) for 256-bit
// vectors, otherwise (strideIn / dataSize) * (eleCount / 4). The second open
// is presumably the #else branch; the #else / #endif lines are not visible.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1764 #if __C7X_VEC_SIZE_BITS__ == 256
1765 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 2)), se0Params);
1767 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1770 minerror_exec_ci_integer8_inputs<uint8_t, uint16_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for int16_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the 16-bit kernel
// with int32_t as the promoted type.
1784 void *restrict pErrCoefs,
1785 const int *restrict pMaxIndex,
1786 const void *restrict pMaxVal)
// NOTE(review): the statements that fill se0Params are not visible here.
1791 __SE_TEMPLATE_v1 se0Params;
1793 int16_t *restrict pInLocal = (int16_t *) pIn;
1795 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
1796 typedef typename c7x::make_full_vector<int16_t>::type vec;
1797 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 2 matches sizeof(int16_t), so strideIn is
// presumably a byte stride being converted to elements - confirm.
1798 uint32_t dataSize = 2;
1799 int32_t strideIn = pKerPrivArgs->
strideIn;
// Both engines use the same template. SE1 starts
// (strideIn / dataSize) * (eleCount / 4) int16_t elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1803 __SE0_OPEN(pInLocal, se0Params);
1804 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1806 minerror_exec_ci_integer16_inputs<int16_t, int32_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
// Execution entry point for uint16_t input (the start of the signature and the
// end of the kernel call are not visible in this listing).
// Opens both stream engines on the input and hands off to the 16-bit kernel
// with uint32_t as the promoted type.
1819 void *restrict pErrCoefs,
1820 const int *restrict pMaxIndex,
1821 const void *restrict pMaxVal)
// NOTE(review): the statements that fill se0Params are not visible here.
1826 __SE_TEMPLATE_v1 se0Params;
1828 uint16_t *restrict pInLocal = (uint16_t *) pIn;
1830 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
1831 typedef typename c7x::make_full_vector<uint16_t>::type vec;
1832 int32_t eleCount = c7x::element_count_of<vec>::value;
// Divisor applied to strideIn below; 2 matches sizeof(uint16_t), so strideIn
// is presumably a byte stride being converted to elements - confirm.
1833 uint32_t dataSize = 2;
1834 int32_t strideIn = pKerPrivArgs->
strideIn;
// Both engines use the same template. SE1 starts
// (strideIn / dataSize) * (eleCount / 4) uint16_t elements after SE0.
// NOTE(review): strideIn is signed and dataSize unsigned, so the offset is
// computed in unsigned arithmetic; a negative stride would not act as a
// signed offset.
1838 __SE0_OPEN(pInLocal, se0Params);
1839 __SE1_OPEN(pInLocal + ((strideIn / dataSize) * (eleCount / 4)), se0Params);
1841 minerror_exec_ci_integer16_inputs<uint16_t, uint32_t>(pErrCoefs, pMaxIndex, pMaxVal, pBlock, pKerPrivArgs->
vecInSize,
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
#define CURR_IDX_VEC_OFFSET
void minerror_exec_ci_signed_integer64_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
#define SE_SE0_PARAM_OFFSET
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int64_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
void minerror_exec_ci_integer16_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
DSPLIB_STATUS DSPLIB_minerror_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
void minerror_exec_ci_unsigned_integer64_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
void minerror_exec_ci_integer32_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint64_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
#define SE_SE1_PARAM_OFFSET
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
void minerror_exec_ci_float_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
void minerror_exec_ci_integer8_inputs(void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal, uint8_t *restrict pBlock, uint32_t vecInSize, uint32_t errCoefsSize, int32_t mainLoopCount)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minerror_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsErrCoefs, const DSPLIB_bufParams1D_t *bufParamsOutIndex, const DSPLIB_bufParams1D_t *bufParamsOutVal, const DSPLIB_minerror_InitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
DSPLIB_STATUS DSPLIB_minerror_exec_ci< int32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pErrCoefs, const int *restrict pMaxIndex, const void *restrict pMaxVal)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_minerror.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 1 dimensional buffer descriptor.
A structure for a 2 dimensional buffer descriptor.
uint32_t data_type
Values are of type DSPLIB_data_type_e.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint32_t errCoefsSize
Size of error coefficients vector
uint32_t vecInSize
Size of input data DSPLIB_minerror_init that will be retrieved and used by DSPLIB_minerror_exec
uint8_t bufPblock[DSPLIB_MINERROR_IXX_IXX_OXX_PBLOCK_SIZE]