47 #include "../common/c71/DSPLIB_inlines.h"
50 #include "c7x_scalable.h"
59 #define INDEX_UNROLL_FACTOR 2
// Lane-offset ramps (0, 1, 2, ...) for three unsigned element widths.
// lastRunOffsets (16 x uint) is added to a broadcast base index when the tail
// handling of the loop-logic routines re-reads the last vector(s) of a buffer.
62 const c7x::uint_vec
lastRunOffsets = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
// 32-entry ramp of unsigned shorts; no use is visible in this part of the file.
66 const c7x::ushort_vec
lastRunOffsetsShort = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
67 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
// 64-entry ramp of unsigned chars; no use is visible in this part of the file.
75 const c7x::uchar_vec
lastRunOffsetsChar = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
76 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
77 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
78 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
// Builds two streaming-engine parameter templates (se0Params, se1Params) for a
// buffer of pKerPrivArgs->blockSize elements of type dataType.
// NOTE(review): the function signature, closing braces, the final `else`, the
// #endif and the code that stores the templates are not visible in this listing.
81 template <
typename dataType>
88 __SE_TEMPLATE_v1 se0Params, se1Params;
90 __SE_ELETYPE SE_ELETYPE;
91 __SE_VECLEN SE_VECLEN;
// pBlock is not used in the visible lines; presumably the templates are written
// there later - TODO confirm against the full source.
95 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
96 uint32_t blocksSize = pKerPrivArgs->
blockSize;
// vec is the full-width vector of dataType; eleCount is its lane count.
98 typedef typename c7x::make_full_vector<dataType>::type vec;
99 uint32_t eleCount = c7x::element_count_of<vec>::value;
100 SE_VECLEN = c7x::se_veclen<vec>::value;
101 SE_ELETYPE = c7x::se_eletype<vec>::value;
102 uint32_t length = blocksSize;
103 uint32_t width = eleCount;
105 #if DSPLIB_DEBUGPRINT
106 printf(
"Enter eleCount %d\n", eleCount);
// Default for both engines: a 1D stream of `width` elements.
112 se0Params = __gen_SE_TEMPLATE_v1();
115 se0Params.ICNT0 = width;
116 se0Params.ELETYPE = SE_ELETYPE;
117 se0Params.VECLEN = SE_VECLEN;
118 se0Params.DIMFMT = __SE_DIMFMT_1D;
120 se1Params = __gen_SE_TEMPLATE_v1();
123 se1Params.ICNT0 = width;
124 se1Params.ELETYPE = SE_ELETYPE;
125 se1Params.VECLEN = SE_VECLEN;
126 se1Params.DIMFMT = __SE_DIMFMT_1D;
// numBlocks is not used in the visible lines.
129 uint32_t numBlocks = length / width;
130 uint32_t remBlocksSize = length % width;
// Case 1: the buffer fits in one vector, so SE0 fetches only `length` elements.
139 if (length <= width) {
142 se0Params.ICNT0 = length;
// Case 2: more than one but fewer than two vectors. SE0 fetches one full
// vector and SE1 fetches the remainder (length % width).
148 else if (length < 2 * width) {
151 se0Params.ICNT0 = width;
153 se1Params.ICNT0 = remBlocksSize;
// Presumably the remaining (length >= 2 * width) case: SE0 becomes a 2D stream
// whose rows start 2 * width elements apart, and SE1 copies the same template.
// NOTE(review): the ICNT1 assignment, if any, is not visible here.
162 se0Params.DIMFMT = __SE_DIMFMT_2D;
164 se0Params.DIM1 = 2 * width;
168 se0Params.ICNT0 = width;
171 se1Params = se0Params;
// Finds the index of the largest of `blockSize` input elements of type T and
// writes it to pOut as a uint32_t.
// The input is handled in chunks of at most 2^min(8*sizeof(T), 32) elements.
// Each chunk goes through DSPLIB_maxIndex_loopLogic, its lanes are reduced in
// scalar code, and the best chunk result is selected at the end.
// NOTE(review): the signature, closing braces, #endif lines, several local
// declarations and the code that fills se0Params/se1Params are not visible.
220 template <
typename T,
typename TIndex>
224 uint32_t blockSize = pKerPrivArgs->
blockSize;
225 uint32_t length = blockSize;
228 __SE_TEMPLATE_v1 se0Params, se1Params;
231 T *restrict pInLocal = (T *) pIn;
232 uint32_t *restrict pOutLocal = (uint32_t *) pOut;
234 #if DSPLIB_DEBUGPRINT
235 printf(
"Enter DSPLIB_maxIndex_exec_ci\n");
238 typedef typename c7x::make_full_vector<T>::type vec;
239 uint32_t eleCount = c7x::element_count_of<vec>::value;
240 uint32_t width = eleCount;
241 #if DSPLIB_DEBUGPRINT
242 printf(
"Enter eleCount %d\n", eleCount);
245 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// SE0 starts at the first element. SE1 is opened one vector further on, and
// only when the input is longer than one vector.
250 __SE0_OPEN(pInLocal, se0Params);
251 if (length > width) {
252 __SE1_OPEN(pInLocal + eleCount, se1Params);
255 #if DSPLIB_DEBUGPRINT
256 printf(
"DSPLIB_DEBUGPRINT blockSize %d\n", blockSize);
// Chunk size is 2^bits(T), with the bit count capped at 32.
// Presumably this keeps every per-chunk index representable in TIndex - TODO confirm.
259 size_t bitsInType =
sizeof(T) * 8;
260 bitsInType = (bitsInType > 32) ? 32 : bitsInType;
// NOTE(review): pow() is a floating-point routine whose result is converted to
// size_t here; an integer shift would state the intent exactly.
262 size_t maxSingleBufferSize = pow(2, bitsInType);
263 uint32_t numBufferIterations = DSPLIB_ceilingDiv(length, maxSingleBufferSize);
// One (value, absolute index) result per chunk.
// NOTE(review): std::vector allocates on every call; confirm that is acceptable here.
265 std::vector<T> maxVals(numBufferIterations);
266 std::vector<uint32_t> maxIndices(numBufferIterations);
272 size_t currentIterationSize;
274 TIndex *currentIndexPtr;
277 for (uint32_t buffer = 0; buffer < numBufferIterations; buffer++) {
// Every chunk is full-sized except possibly the last, which takes what remains.
279 currentIterationSize = std::min((
size_t) maxSingleBufferSize, (
size_t) (length - (maxSingleBufferSize * buffer)));
// NOTE(review): pInLocal is passed unchanged and no advance between iterations
// is visible in this listing; confirm against the full source.
281 loopOutput = DSPLIB_maxIndex_loopLogic<T, TIndex>(currentIterationSize, pInLocal);
// Scalar reduction over the lanes of the returned value/index vectors, seeded from lane 0.
284 currentValuePtr = (T *) &loopOutput.
maxVals;
285 largest = *currentValuePtr++;
286 currentIndexPtr = (TIndex *) &loopOutput.
maxIndices;
287 maxIndex = *currentIndexPtr++;
// NOTE(review): no pointer increment is visible inside this loop; it may be on
// lines missing from the listing.
288 for (i = 1; i < c7x::element_count_of<vec>::value; i++) {
289 currentValue = *currentValuePtr;
290 currentIndex = *currentIndexPtr;
291 if (currentValue > largest) {
292 largest = currentValue;
293 maxIndex = currentIndex;
// On equal values the smaller index wins.
297 else if (currentValue == largest) {
298 if (currentIndex < maxIndex) {
299 maxIndex = currentIndex;
// Convert the chunk-relative index to an absolute one before storing it.
308 maxVals[buffer] = largest;
309 maxIndices[buffer] = ((uint32_t) maxIndex) + (buffer * maxSingleBufferSize);
// Pick the best chunk. The comparison is strict, so on equal values the
// earlier chunk's index is kept.
312 T largestVal = maxVals[0];
313 uint32_t largestIndex = maxIndices[0];
314 for (i = 1; i < maxVals.size(); i++) {
315 if (maxVals[i] > largestVal) {
316 largestVal = maxVals[i];
317 largestIndex = maxIndices[i];
321 *pOutLocal = largestIndex;
// The body of this branch is not visible; presumably it closes SE1, mirroring
// the conditional open above - TODO confirm.
324 if (length > width) {
// Generic per-lane max/arg-max body over data delivered by streaming engines 0
// and 1. The per-lane results end up in maxVals / maxIndices.
// NOTE(review): the signature, closing braces and the declarations of maxVals,
// maxVals0/1, maskOfMaxs, inVec0/1, numIterations, remBlockSize and pSrc are
// not visible in this listing.
// Index ramps are hard-coded for 16 lanes of unsigned int.
357 c7x::uint_vec maxIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
358 c7x::uint_vec maxIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
359 c7x::uint_vec maxIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
361 c7x::uint_vec maxIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
362 c7x::uint_vec maxIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
363 c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
364 c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
366 typedef typename c7x::make_full_vector<T>::type vec;
367 typedef typename c7x::make_full_vector<TIndex>::type index_vec;
376 size_t width = c7x::element_count_of<vec>::value;
// Case 1: a single, possibly partial, vector. Lanes at or beyond `length` are
// overwritten with lowest() so padding never compares greater than real data.
378 if (length <= width) {
379 maxVals = c7x::strm_eng<0, vec>::get_adv();
381 for (
size_t i = length; i < width; i++) {
382 maxVals.s[i] = std::numeric_limits<T>::lowest();
// Case 2: one full vector from SE0 and a partial one from SE1, whose unused
// lanes are padded with lowest().
386 else if (length < 2 * width) {
387 maxVals0 = c7x::strm_eng<0, vec>::get_adv();
388 maxVals1 = c7x::strm_eng<1, vec>::get_adv();
390 size_t remElements = length % width;
391 for (
size_t i = remElements; i < width; i++) {
392 maxVals1.s[i] = std::numeric_limits<T>::lowest();
// NOTE(review): a plain select (next three lines) and a tie-break-aware select
// (the lines after) appear back to back; the lines that choose between them
// are not visible in this listing.
394 maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
395 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
396 maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
// Tie-break-aware combine: where the two values are equal the smaller index is
// taken, otherwise the index of the larger value. The two partial results are
// zeroed in the lanes they do not own and then added.
400 maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
401 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
402 index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
403 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
404 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
405 index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
406 index_vec zeroVec = c7x::uint_vec(0);
407 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
408 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
410 maxIndices = nonTiebreakerVec + tiebreakerVec;
// General case: two running maxima, A fed by SE0 and B fed by SE1, both seeded
// with lowest(). The comparison is strict, so an equal later value does not
// replace an earlier one.
415 __vpred mask0, mask1;
418 vec maxValsA = T(std::numeric_limits<T>::lowest());
419 vec maxValsB = maxValsA;
422 vec maxValsLarge = T(std::numeric_limits<T>::lowest());
426 for (
size_t i = 0; i < numIterations; i += 1) {
427 inVec0 = c7x::strm_eng<0, vec>::get_adv();
428 mask0 = __cmp_gt_pred(inVec0, maxValsA);
// NOTE(review): the assignment targets of the next two __select calls are not
// visible in this listing (presumably maxValsA and maxIndicesA, mirroring the B
// stream below). Any advance of firstHalfIndices/secondHalfIndices is likewise
// not visible.
430 __select(mask0, inVec0, maxValsA);
433 __select(mask0, firstHalfIndices, maxIndicesA);
436 inVec1 = c7x::strm_eng<1, vec>::get_adv();
437 mask1 = __cmp_gt_pred(inVec1, maxValsB);
438 maxValsB = __select(mask1, inVec1, maxValsB);
439 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Tail handling: remVecLen is the number of vectors needed for the leftover
// elements. remStart points `width` elements before the end of the buffer, so
// the tail is read as full vectors ending exactly at element `length`. The
// index vectors are rebuilt from that position using lastRunOffsets.
449 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
450 T *remStart = (T *) pSrc + length - width;
// One leftover vector: fold the last `width` elements into stream A.
452 if (remBlockSize != 0 && remVecLen == 1) {
454 inVec0 = *(vec *) remStart;
455 firstHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsets;
456 mask0 = __cmp_gt_pred(inVec0, maxValsA);
458 __select(mask0, inVec0, maxValsA);
460 maxIndicesA = __select(mask0, firstHalfIndices,
// Two leftover vectors: the second-to-last vector goes to stream A and the last
// vector to stream B.
465 else if (remBlockSize != 0 && remVecLen == 2) {
466 inVec0 = *(vec *) (remStart - width);
467 firstHalfIndices = c7x::uint_vec(length - (2 * c7x::element_count_of<index_vec>::value)) +
lastRunOffsets;
468 mask0 = __cmp_gt_pred(inVec0, maxValsA);
470 __select(mask0, inVec0, maxValsA);
472 maxIndicesA = __select(mask0, firstHalfIndices,
477 inVec1 = *(vec *) remStart;
478 secondHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsets;
479 mask1 = __cmp_gt_pred(inVec1, maxValsB);
480 maxValsB = __select(mask1, inVec1, maxValsB);
481 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Merge streams A and B lane by lane: the larger value wins, and on equal
// values the smaller index wins.
489 __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
490 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
491 index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
492 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
493 maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
494 index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
495 index_vec zeroVec = c7x::uint_vec(0);
496 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
497 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
499 index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
501 maxVals = maxValsLarge;
502 maxIndices = maxIndicesLarge;
// int8_t variant of the per-lane max/arg-max body: 64 signed 8-bit values per
// vector, with indices kept as 64 unsigned 8-bit lanes.
// NOTE(review): the signature, closing braces and several declarations are not
// visible in this listing.
// First-vector index ramps cover 0..63 and second-vector ramps cover 64..127.
514 c7x::uchar_vec maxIndices =
515 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
516 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
517 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
518 c7x::uchar_vec maxIndices0 =
519 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
520 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
521 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
522 c7x::uchar_vec maxIndices1 =
523 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
524 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
525 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
527 c7x::uchar_vec maxIndicesA =
528 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
529 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
530 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
531 c7x::uchar_vec maxIndicesB =
532 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
533 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
534 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
535 c7x::uchar_vec firstHalfIndices =
536 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
537 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
538 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
539 c7x::uchar_vec secondHalfIndices =
540 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
541 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
542 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
544 typedef typename c7x::make_full_vector<int8_t>::type vec;
545 typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
554 size_t width = c7x::element_count_of<vec>::value;
// Case 1: a single partial vector; unused lanes are padded with lowest().
557 if (length <= width) {
558 maxVals = c7x::strm_eng<0, vec>::get_adv();
560 for (
size_t i = length; i < width; i++) {
561 maxVals.s[i] = std::numeric_limits<int8_t>::lowest();
// Case 2: one full vector from SE0 plus a padded partial vector from SE1.
565 else if (length < 2 * width) {
566 maxVals0 = c7x::strm_eng<0, vec>::get_adv();
567 maxVals1 = c7x::strm_eng<1, vec>::get_adv();
569 size_t remElements = length % width;
570 for (
size_t i = remElements; i < width; i++) {
571 maxVals1.s[i] = std::numeric_limits<int8_t>::lowest();
// Combine the two vectors: the larger value wins, and on equal values the
// smaller index wins.
575 maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
576 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
577 index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
578 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
579 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
580 index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
581 index_vec zeroVec = c7x::uchar_vec(0);
582 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
583 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
585 maxIndices = nonTiebreakerVec + tiebreakerVec;
// General case: running maxima A (SE0) and B (SE1), seeded with lowest().
590 __vpred mask0, mask1;
593 vec maxValsA = int8_t(std::numeric_limits<int8_t>::lowest());
594 vec maxValsB = maxValsA;
597 vec maxValsLarge = int8_t(std::numeric_limits<int8_t>::lowest());
601 for (
size_t i = 0; i < numIterations; i += 1) {
602 inVec0 = c7x::strm_eng<0, vec>::get_adv();
603 mask0 = __cmp_gt_pred(inVec0, maxValsA);
// NOTE(review): the assignment targets of the next two __select calls are not
// visible in this listing (presumably maxValsA and maxIndicesA).
605 __select(mask0, inVec0, maxValsA);
608 __select(mask0, firstHalfIndices, maxIndicesA);
611 inVec1 = c7x::strm_eng<1, vec>::get_adv();
612 mask1 = __cmp_gt_pred(inVec1, maxValsB);
613 maxValsB = __select(mask1, inVec1, maxValsB);
614 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Tail handling: the leftover elements are re-read as full vectors ending at
// element `length`.
// NOTE(review): unlike the generic version, no recomputation of
// firstHalfIndices/secondHalfIndices is visible in this tail code; confirm
// against the full source.
623 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
624 int8_t *remStart = (int8_t *) pSrc + length - width;
626 if (remBlockSize != 0 && remVecLen == 1) {
627 inVec0 = *(vec *) remStart;
630 mask0 = __cmp_gt_pred(inVec0, maxValsA);
632 __select(mask0, inVec0, maxValsA);
634 maxIndicesA = __select(mask0, firstHalfIndices,
639 else if (remBlockSize != 0 && remVecLen == 2) {
640 inVec0 = *(vec *) (remStart - width);
642 mask0 = __cmp_gt_pred(inVec0, maxValsA);
644 __select(mask0, inVec0, maxValsA);
646 maxIndicesA = __select(mask0, firstHalfIndices,
650 inVec1 = *(vec *) remStart;
652 mask1 = __cmp_gt_pred(inVec1, maxValsB);
653 maxValsB = __select(mask1, inVec1, maxValsB);
654 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Merge streams A and B: the larger value wins, and on equal values the smaller
// index wins.
662 __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
663 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
664 index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
665 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
666 maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
667 index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
668 index_vec zeroVec = c7x::uchar_vec(0);
669 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
670 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
672 index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
674 maxVals = maxValsLarge;
675 maxIndices = maxIndicesLarge;
// uint8_t variant of the per-lane max/arg-max body: 64 unsigned 8-bit values
// per vector, with indices kept as 64 unsigned 8-bit lanes.
// NOTE(review): the signature, closing braces and several declarations are not
// visible in this listing.
// First-vector index ramps cover 0..63 and second-vector ramps cover 64..127.
686 c7x::uchar_vec maxIndices =
687 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
688 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
689 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
690 c7x::uchar_vec maxIndices0 =
691 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
692 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
693 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
694 c7x::uchar_vec maxIndices1 =
695 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
696 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
697 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
699 c7x::uchar_vec maxIndicesA =
700 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
701 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
702 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
703 c7x::uchar_vec maxIndicesB =
704 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
705 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
706 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
707 c7x::uchar_vec firstHalfIndices =
708 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
709 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
710 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
711 c7x::uchar_vec secondHalfIndices =
712 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
713 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
714 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
716 typedef typename c7x::make_full_vector<uint8_t>::type vec;
717 typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
726 size_t width = c7x::element_count_of<vec>::value;
// Case 1: a single partial vector; unused lanes are padded with lowest().
729 if (length <= width) {
730 maxVals = c7x::strm_eng<0, vec>::get_adv();
732 for (
size_t i = length; i < width; i++) {
733 maxVals.s[i] = std::numeric_limits<uint8_t>::lowest();
// Case 2: one full vector from SE0 plus a padded partial vector from SE1.
737 else if (length < 2 * width) {
738 maxVals0 = c7x::strm_eng<0, vec>::get_adv();
739 maxVals1 = c7x::strm_eng<1, vec>::get_adv();
741 size_t remElements = length % width;
742 for (
size_t i = remElements; i < width; i++) {
743 maxVals1.s[i] = std::numeric_limits<uint8_t>::lowest();
// Combine the two vectors: the larger value wins, and on equal values the
// smaller index wins.
747 maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
748 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
749 index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
750 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
751 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
752 index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
753 index_vec zeroVec = c7x::uchar_vec(0);
754 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
755 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
757 maxIndices = nonTiebreakerVec + tiebreakerVec;
// General case: running maxima A (SE0) and B (SE1), seeded with lowest().
762 __vpred mask0, mask1;
765 vec maxValsA = uint8_t(std::numeric_limits<uint8_t>::lowest());
766 vec maxValsB = maxValsA;
769 vec maxValsLarge = uint8_t(std::numeric_limits<uint8_t>::lowest());
773 for (
size_t i = 0; i < numIterations; i += 1) {
774 inVec0 = c7x::strm_eng<0, vec>::get_adv();
775 mask0 = __cmp_gt_pred(inVec0, maxValsA);
// NOTE(review): the assignment targets of the next two __select calls are not
// visible in this listing (presumably maxValsA and maxIndicesA).
777 __select(mask0, inVec0, maxValsA);
780 __select(mask0, firstHalfIndices, maxIndicesA);
783 inVec1 = c7x::strm_eng<1, vec>::get_adv();
784 mask1 = __cmp_gt_pred(inVec1, maxValsB);
785 maxValsB = __select(mask1, inVec1, maxValsB);
786 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Tail handling: the leftover elements are re-read as full vectors ending at
// element `length`.
// NOTE(review): remStart is declared int8_t* although this variant handles
// uint8_t data. It is only dereferenced through a (vec *) cast, so the element
// type of the pointer does not affect the loads; uint8_t* would be consistent.
// NOTE(review): no recomputation of firstHalfIndices/secondHalfIndices is
// visible in this tail code; confirm against the full source.
795 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
796 int8_t *remStart = (int8_t *) pSrc + length - width;
798 if (remBlockSize != 0 && remVecLen == 1) {
799 inVec0 = *(vec *) remStart;
802 mask0 = __cmp_gt_pred(inVec0, maxValsA);
804 __select(mask0, inVec0, maxValsA);
806 maxIndicesA = __select(mask0, firstHalfIndices,
811 else if (remBlockSize != 0 && remVecLen == 2) {
812 inVec0 = *(vec *) (remStart - width);
814 mask0 = __cmp_gt_pred(inVec0, maxValsA);
816 __select(mask0, inVec0, maxValsA);
818 maxIndicesA = __select(mask0, firstHalfIndices,
822 inVec1 = *(vec *) remStart;
824 mask1 = __cmp_gt_pred(inVec1, maxValsB);
825 maxValsB = __select(mask1, inVec1, maxValsB);
826 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Merge streams A and B: the larger value wins, and on equal values the smaller
// index wins.
833 __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
834 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
835 index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
836 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
837 maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
838 index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
839 index_vec zeroVec = c7x::uchar_vec(0);
840 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
841 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
843 index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
845 maxVals = maxValsLarge;
846 maxIndices = maxIndicesLarge;
// int16_t variant of the per-lane max/arg-max body: 32 signed 16-bit values per
// vector, with indices kept as 32 unsigned 16-bit lanes.
// NOTE(review): the signature, closing braces and several declarations are not
// visible in this listing.
// First-vector index ramps cover 0..31 and second-vector ramps cover 32..63.
858 c7x::ushort_vec maxIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
859 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
860 c7x::ushort_vec maxIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
861 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
862 c7x::ushort_vec maxIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
863 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
865 c7x::ushort_vec maxIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
866 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
867 c7x::ushort_vec maxIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
868 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
869 c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
870 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
871 c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
872 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
874 typedef typename c7x::make_full_vector<int16_t>::type vec;
875 typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
884 size_t width = c7x::element_count_of<vec>::value;
// Case 1: a single partial vector; unused lanes are padded with lowest().
887 if (length <= width) {
888 maxVals = c7x::strm_eng<0, vec>::get_adv();
891 for (
size_t i = length; i < width; i++) {
892 maxVals.s[i] = std::numeric_limits<int16_t>::lowest();
// Case 2: one full vector from SE0 plus a padded partial vector from SE1.
896 else if (length < 2 * width) {
897 maxVals0 = c7x::strm_eng<0, vec>::get_adv();
898 maxVals1 = c7x::strm_eng<1, vec>::get_adv();
901 size_t remElements = length % width;
903 for (
size_t i = remElements; i < width; i++) {
904 maxVals1.s[i] = std::numeric_limits<int16_t>::lowest();
// Combine the two vectors: the larger value wins, and on equal values the
// smaller index wins.
908 maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
909 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
910 c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
911 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
912 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
913 c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
914 c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
915 c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
916 c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
918 maxIndices = nonTiebreakerVec + tiebreakerVec;
// General case: running maxima A (SE0) and B (SE1), seeded with lowest().
922 c7x::short_vec inVec0, inVec1;
923 __vpred mask0, mask1;
926 c7x::short_vec maxValsA = int16_t(std::numeric_limits<int16_t>::lowest());
927 c7x::short_vec maxValsB = maxValsA;
930 c7x::short_vec maxValsLarge = int16_t(std::numeric_limits<int16_t>::lowest());
934 for (
size_t i = 0; i < numIterations; i += 1) {
935 inVec0 = c7x::strm_eng<0, vec>::get_adv();
936 mask0 = __cmp_gt_pred(inVec0, maxValsA);
// NOTE(review): the assignment targets of the next two __select calls are not
// visible in this listing (presumably maxValsA and maxIndicesA).
938 __select(mask0, inVec0, maxValsA);
941 __select(mask0, firstHalfIndices, maxIndicesA);
944 inVec1 = c7x::strm_eng<1, vec>::get_adv();
945 mask1 = __cmp_gt_pred(inVec1, maxValsB);
946 maxValsB = __select(mask1, inVec1, maxValsB);
947 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Tail handling: the leftover elements are re-read as full vectors ending at
// element `length`.
// NOTE(review): no recomputation of firstHalfIndices/secondHalfIndices is
// visible in this tail code; confirm against the full source.
958 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
959 int16_t *remStart = (int16_t *) pSrc + length - width;
961 if (remBlockSize != 0 && remVecLen == 1) {
962 inVec0 = *(vec *) remStart;
965 mask0 = __cmp_gt_pred(inVec0, maxValsA);
967 __select(mask0, inVec0, maxValsA);
969 maxIndicesA = __select(mask0, firstHalfIndices,
974 else if (remBlockSize != 0 && remVecLen == 2) {
975 inVec0 = *(vec *) (remStart - width);
977 mask0 = __cmp_gt_pred(inVec0, maxValsA);
979 __select(mask0, inVec0, maxValsA);
981 maxIndicesA = __select(mask0, firstHalfIndices,
984 inVec1 = *(vec *) remStart;
986 mask1 = __cmp_gt_pred(inVec1, maxValsB);
987 maxValsB = __select(mask1, inVec1, maxValsB);
988 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Merge streams A and B: the larger value wins, and on equal values the smaller
// index wins.
993 __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
994 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
995 c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
996 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
997 maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
998 c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
999 c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1000 c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1001 c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1003 c7x::ushort_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
1005 maxVals = maxValsLarge;
1006 maxIndices = maxIndicesLarge;
// uint16_t variant of the per-lane max/arg-max body: 32 unsigned 16-bit values
// per vector, with indices kept as 32 unsigned 16-bit lanes.
// NOTE(review): the signature, closing braces and several declarations are not
// visible in this listing.
// First-vector index ramps cover 0..31 and second-vector ramps cover 32..63.
1018 c7x::ushort_vec maxIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1019 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1020 c7x::ushort_vec maxIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1021 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1022 c7x::ushort_vec maxIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
1023 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
1025 c7x::ushort_vec maxIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1026 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1027 c7x::ushort_vec maxIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
1028 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
1029 c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1030 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1031 c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1032 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
1034 typedef typename c7x::make_full_vector<uint16_t>::type vec;
1035 typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
1044 size_t width = c7x::element_count_of<vec>::value;
// Case 1: a single partial vector; unused lanes are padded with lowest().
1047 if (length <= width) {
1048 maxVals = c7x::strm_eng<0, vec>::get_adv();
1051 for (
size_t i = length; i < width; i++) {
1052 maxVals.s[i] = std::numeric_limits<uint16_t>::lowest();
// Case 2: one full vector from SE0 plus a padded partial vector from SE1.
1056 else if (length < 2 * width) {
1057 maxVals0 = c7x::strm_eng<0, vec>::get_adv();
1058 maxVals1 = c7x::strm_eng<1, vec>::get_adv();
1061 size_t remElements = length % width;
1063 for (
size_t i = remElements; i < width; i++) {
1064 maxVals1.s[i] = std::numeric_limits<uint16_t>::lowest();
// Combine the two vectors: the larger value wins, and on equal values the
// smaller index wins.
1068 maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
1069 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
1070 c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
1071 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
1072 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1073 c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
1074 c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1075 c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1076 c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1078 maxIndices = nonTiebreakerVec + tiebreakerVec;
// General case: running maxima A (SE0) and B (SE1), seeded with lowest().
1082 c7x::ushort_vec inVec0, inVec1;
1083 __vpred mask0, mask1;
1086 c7x::ushort_vec maxValsA = uint16_t(std::numeric_limits<uint16_t>::lowest());
1087 c7x::ushort_vec maxValsB = maxValsA;
1090 c7x::ushort_vec maxValsLarge = uint16_t(std::numeric_limits<uint16_t>::lowest());
1094 for (
size_t i = 0; i < numIterations; i += 1) {
1095 inVec0 = c7x::strm_eng<0, c7x::ushort_vec>::get_adv();
1096 mask0 = __cmp_gt_pred(inVec0, maxValsA);
// NOTE(review): the assignment targets of the next two __select calls are not
// visible in this listing (presumably maxValsA and maxIndicesA).
1098 __select(mask0, inVec0, maxValsA);
1101 __select(mask0, firstHalfIndices, maxIndicesA);
1104 inVec1 = c7x::strm_eng<1, c7x::ushort_vec>::get_adv();
1105 mask1 = __cmp_gt_pred(inVec1, maxValsB);
1106 maxValsB = __select(mask1, inVec1, maxValsB);
1107 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Tail handling: the leftover elements are re-read as full vectors ending at
// element `length`.
// NOTE(review): no recomputation of firstHalfIndices/secondHalfIndices is
// visible in this tail code; confirm against the full source.
1118 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1119 uint16_t *remStart = (uint16_t *) pSrc + length - width;
1121 if (remBlockSize != 0 && remVecLen == 1) {
1122 inVec0 = *(vec *) remStart;
1125 mask0 = __cmp_gt_pred(inVec0, maxValsA);
1127 __select(mask0, inVec0, maxValsA);
1129 maxIndicesA = __select(mask0, firstHalfIndices,
1133 else if (remBlockSize != 0 && remVecLen == 2) {
1134 inVec0 = *(vec *) (remStart - width);
1136 mask0 = __cmp_gt_pred(inVec0, maxValsA);
1138 __select(mask0, inVec0, maxValsA);
1140 maxIndicesA = __select(mask0, firstHalfIndices,
1143 inVec1 = *(vec *) remStart;
1145 mask1 = __cmp_gt_pred(inVec1, maxValsB);
1146 maxValsB = __select(mask1, inVec1, maxValsB);
1147 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Merge streams A and B: the larger value wins, and on equal values the smaller
// index wins.
1152 __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
1153 __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
1154 c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
1155 __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
1156 maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
1157 c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
1158 c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1159 c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1160 c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1162 c7x::ushort_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
1164 maxVals = maxValsLarge;
1165 maxIndices = maxIndicesLarge;
// float variant of the per-lane max/arg-max body: 16 floats per vector, with
// indices kept as 16 unsigned int lanes.
// Comparisons use __cmp_lt_pred with the operands swapped where the integer
// variants use __cmp_gt_pred.
// NOTE(review): the signature, closing braces and several declarations
// (maskOfMaxs, numIterations, remBlockSize, pSrc) are not visible in this listing.
1177 c7x::uint_vec maxIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1178 c7x::uint_vec maxIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1179 c7x::uint_vec maxIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1181 c7x::uint_vec maxIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1182 c7x::uint_vec maxIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1183 c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1184 c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1186 c7x::float_vec maxVals0;
1187 c7x::float_vec maxVals1;
1190 size_t width = c7x::element_count_of<c7x::float_vec>::value;
1193 c7x::float_vec maxVals;
// Case 1: a single partial vector; unused lanes are padded with lowest().
1195 if (length <= width) {
1196 maxVals = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1198 for (
size_t i = length; i < width; i++) {
1199 maxVals.s[i] = std::numeric_limits<float>::lowest();
// Case 2: one full vector from SE0 plus a padded partial vector from SE1.
1203 else if (length < 2 * width) {
1204 maxVals0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1205 maxVals1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1207 size_t remElements = length % width;
1208 for (
size_t i = remElements; i < width; i++) {
1209 maxVals1.s[i] = std::numeric_limits<float>::lowest();
// Plain select with no tie-break: a lane takes vector 0 only when it is
// strictly greater, so on equal values the index from vector 1 is kept.
// NOTE(review): the integer variants prefer the smaller index on ties; confirm
// this difference is intended.
1211 maskOfMaxs = __cmp_lt_pred(maxVals1, maxVals0);
1212 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1213 maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
// General case: running maxima A (SE0) and B (SE1), seeded with lowest().
1217 c7x::float_vec inVec0, inVec1;
1218 __vpred mask0, mask1, maskOfMaxsLarge;
1221 c7x::float_vec maxValsA = std::numeric_limits<float>::lowest();
1222 c7x::float_vec maxValsB = maxValsA;
1225 c7x::float_vec maxValsLarge = std::numeric_limits<float>::lowest();
1230 for (
size_t i = 0; i < numIterations; i += 1) {
1231 inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1232 mask0 = __cmp_lt_pred(maxValsA, inVec0);
// NOTE(review): the assignment targets of the next two __select calls are not
// visible in this listing (presumably maxValsA and maxIndicesA).
1234 __select(mask0, inVec0, maxValsA);
1237 __select(mask0, firstHalfIndices, maxIndicesA);
1240 inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1241 mask1 = __cmp_lt_pred(maxValsB, inVec1);
1242 maxValsB = __select(mask1, inVec1, maxValsB);
1243 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Tail handling: the leftover elements are re-read as full vectors ending at
// element `length`. The index vectors are rebuilt from that position using
// lastRunOffsets.
1255 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1256 float *remStart = (
float *) pSrc + length - width;
// One leftover vector: fold the last `width` elements into stream A.
1259 if (remBlockSize != 0 && remVecLen == 1) {
1262 inVec0 = *(c7x::float_vec *) remStart;
1263 firstHalfIndices = c7x::uint_vec(length - (width)) +
lastRunOffsets;
1264 mask0 = __cmp_lt_pred(maxValsA, inVec0);
1266 __select(mask0, inVec0, maxValsA);
1268 maxIndicesA = __select(mask0, firstHalfIndices,
// Two leftover vectors: the second-to-last vector goes to stream A and the last
// vector to stream B.
1274 else if (remBlockSize != 0 && remVecLen == 2) {
1278 inVec0 = *(c7x::float_vec *) (remStart - width);
1279 firstHalfIndices = c7x::uint_vec(length - (2 * width)) +
lastRunOffsets;
1280 mask0 = __cmp_lt_pred(maxValsA, inVec0);
1282 __select(mask0, inVec0, maxValsA);
1284 maxIndicesA = __select(mask0, firstHalfIndices,
1289 inVec1 = *(c7x::float_vec *) remStart;
1290 secondHalfIndices = c7x::uint_vec(length - (width)) +
lastRunOffsets;
1291 mask1 = __cmp_lt_pred(maxValsB, inVec1);
1292 maxValsB = __select(mask1, inVec1, maxValsB);
1293 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
// Merge streams A and B with a plain select: a lane takes A only when A is
// strictly greater, so on equal values the index from B is kept.
1298 maskOfMaxsLarge = __cmp_lt_pred(maxValsB, maxValsA);
1299 maxValsLarge = __select(maskOfMaxsLarge, maxValsA, maxValsB);
1300 c7x::uint_vec maxIndicesLarge = __select(maskOfMaxsLarge, maxIndicesA, maxIndicesB);
1305 maxVals = maxValsLarge;
1306 maxIndices = maxIndicesLarge;
1317 c7x::ulong_vec maxIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1318 c7x::ulong_vec maxIndices0 = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1319 c7x::ulong_vec maxIndices1 = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1321 c7x::ulong_vec maxIndicesA = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1322 c7x::ulong_vec maxIndicesB = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1323 c7x::ulong_vec firstHalfIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1324 c7x::ulong_vec secondHalfIndices = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1326 c7x::double_vec maxVals0;
1327 c7x::double_vec maxVals1;
1330 size_t width = c7x::element_count_of<c7x::double_vec>::value;
1333 c7x::double_vec maxVals;
1335 if (length <= width) {
1336 maxVals = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1338 for (
size_t i = length; i < width; i++) {
1339 maxVals.s[i] = std::numeric_limits<double>::lowest();
1343 else if (length < 2 * width) {
1344 maxVals0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1345 maxVals1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1347 size_t remainingElement = length % width;
1348 for (
size_t i = remainingElement; i < width; i++) {
1349 maxVals1.s[i] = std::numeric_limits<double>::lowest();
1351 maskOfMaxs = __cmp_lt_pred(maxVals1, maxVals0);
1352 maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1353 maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
1357 c7x::double_vec inVec0, inVec1;
1358 __vpred mask0, mask1, maskOfMaxsLarge;
1361 c7x::double_vec maxValsA = std::numeric_limits<double>::lowest();
1362 c7x::double_vec maxValsB = maxValsA;
1365 c7x::double_vec maxValsLarge = std::numeric_limits<double>::lowest();
1370 for (
size_t i = 0; i < numIterations; i += 1) {
1371 inVec0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1372 mask0 = __cmp_lt_pred(maxValsA, inVec0);
1374 __select(mask0, inVec0, maxValsA);
1377 __select(mask0, firstHalfIndices, maxIndicesA);
1380 inVec1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1381 mask1 = __cmp_lt_pred(maxValsB, inVec1);
1382 maxValsB = __select(mask1, inVec1, maxValsB);
1383 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1395 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1396 double *remStart = (
double *) pSrc + length - width;
1399 if (remBlockSize != 0 && remVecLen == 1) {
1402 inVec0 = *(c7x::double_vec *) remStart;
1404 mask0 = __cmp_lt_pred(maxValsA, inVec0);
1406 __select(mask0, inVec0, maxValsA);
1408 maxIndicesA = __select(mask0, firstHalfIndices,
1414 else if (remBlockSize != 0 && remVecLen == 2) {
1418 inVec0 = *(c7x::double_vec *) (remStart - width);
1419 firstHalfIndices = c7x::ulong_vec(length - (2 * width)) +
lastRunOffsetsDp;
1420 mask0 = __cmp_lt_pred(maxValsA, inVec0);
1422 __select(mask0, inVec0, maxValsA);
1424 maxIndicesA = __select(mask0, firstHalfIndices,
1429 inVec1 = *(c7x::double_vec *) remStart;
1431 mask1 = __cmp_lt_pred(maxValsB, inVec1);
1432 maxValsB = __select(mask1, inVec1, maxValsB);
1433 maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1439 maskOfMaxsLarge = __cmp_lt_pred(maxValsB, maxValsA);
1440 maxValsLarge = __select(maskOfMaxsLarge, maxValsA, maxValsB);
1441 c7x::ulong_vec maxIndicesLarge = __select(maskOfMaxsLarge, maxIndicesA, maxIndicesB);
1443 maxVals = maxValsLarge;
1444 maxIndices = maxIndicesLarge;
#define SE_SE0_PARAM_OFFSET
#define SE_SE1_PARAM_OFFSET
metadata< uint16_t, uint16_t > DSPLIB_maxIndex_loopLogic< uint16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< float, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< int16_t, uint16_t > DSPLIB_maxIndex_loopLogic< int16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
metadata< double, uint64_t > DSPLIB_maxIndex_loopLogic< double, uint64_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_maxIndex_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< double, uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
const c7x::uint_vec jumpFactor
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< float, uint32_t > DSPLIB_maxIndex_loopLogic< float, uint32_t >(size_t length, void *pSrc)
const c7x::ushort_vec jumpFactorShort
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
const c7x::uchar_vec jumpFactorChar
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
const c7x::uint_vec lastRunOffsets
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< uint8_t, uint8_t > DSPLIB_maxIndex_loopLogic< uint8_t, uint8_t >(size_t length, void *pSrc)
const c7x::uchar_vec lastRunOffsetsChar
#define INDEX_UNROLL_FACTOR
const c7x::ushort_vec lastRunOffsetsShort
metadata< T, TIndex > DSPLIB_maxIndex_loopLogic(size_t length, void *pSrc)
This function is the kernel loop helper function for the optimized implementation of the kernel....
const c7x::ulong_vec jumpFactorDp
metadata< int8_t, uint8_t > DSPLIB_maxIndex_loopLogic< int8_t, uint8_t >(size_t length, void *pSrc)
DSPLIB_STATUS DSPLIB_maxIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
const c7x::ulong_vec lastRunOffsetsDp
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_maxIndex.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 1 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MAXINDEX_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t blockSize
Size of input buffer for different batches DSPLIB_maxIndex_init that will be retrieved and used by DS...