47 #include "../common/c71/DSPLIB_inlines.h"
50 #include "c7x_scalable.h"
// Not referenced in the visible part of this file.
// NOTE(review): presumably the unroll factor of the index search loops - confirm against the full file.
58 #define INDEX_UNROLL_FACTOR 2
// Lane offsets 0..15 (16 values) for index vectors with 32-bit lanes. Below, this table is added
// to a splatted base index to produce the per-lane indices of the trailing vector(s).
61 const c7x::uint_vec
lastRunOffsets = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
// Same table for index vectors with 16-bit lanes: offsets 0..31 (32 values).
65 const c7x::ushort_vec
lastRunOffsetsShort = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
66 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
// Same table for index vectors with 8-bit lanes: offsets 0..63 (64 values).
74 const c7x::uchar_vec
lastRunOffsetsChar = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
75 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
76 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
77 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
// Streaming-engine setup, templated on the element type `dataType`.
// Fills two streaming-engine templates (se0Params, se1Params) whose shape depends on how the block
// size compares with the number of `dataType` elements in one full vector.
// NOTE(review): the function signature, the closing braces and the code that stores the templates
// are not visible in this excerpt.
80 template <
typename dataType>
87 __SE_TEMPLATE_v1 se0Params, se1Params;
89 __SE_ELETYPE SE_ELETYPE;
90 __SE_VECLEN SE_VECLEN;
// Values read from the kernel's private arguments.
94 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
95 uint32_t blocksSize = pKerPrivArgs->
blockSize;
// Derive the full vector type for dataType and, from it, the lane count and the
// streaming-engine element-type / vector-length settings.
97 typedef typename c7x::make_full_vector<dataType>::type vec;
98 uint32_t eleCount = c7x::element_count_of<vec>::value;
99 SE_VECLEN = c7x::se_veclen<vec>::value;
100 SE_ELETYPE = c7x::se_eletype<vec>::value;
// length = number of input elements, width = elements per vector.
101 uint32_t length = blocksSize;
102 uint32_t width = eleCount;
104 #if DSPLIB_DEBUGPRINT
105 printf(
"Enter eleCount %d\n", eleCount);
// Default for stream 0: one-dimensional, one full vector (ICNT0 = width).
111 se0Params = __gen_SE_TEMPLATE_v1();
114 se0Params.ICNT0 = width;
115 se0Params.ELETYPE = SE_ELETYPE;
116 se0Params.VECLEN = SE_VECLEN;
117 se0Params.DIMFMT = __SE_DIMFMT_1D;
// Default for stream 1: same settings as stream 0.
119 se1Params = __gen_SE_TEMPLATE_v1();
122 se1Params.ICNT0 = width;
123 se1Params.ELETYPE = SE_ELETYPE;
124 se1Params.VECLEN = SE_VECLEN;
125 se1Params.DIMFMT = __SE_DIMFMT_1D;
// Whole vectors in the input and the leftover element count.
128 uint32_t numBlocks = length / width;
129 uint32_t remBlocksSize = length % width;
// Case 1: the input fits in one vector - stream 0 covers only `length` elements.
138 if (length <= width) {
141 se0Params.ICNT0 = length;
// Case 2: more than one but fewer than two vectors - stream 0 covers one full vector and
// stream 1 covers the leftover elements.
147 else if (length < 2 * width) {
151 se0Params.ICNT0 = width;
153 se1Params.ICNT0 = remBlocksSize;
// NOTE(review): presumably the final `else` branch (two or more vectors); the `else` itself is
// not visible. Stream 0 becomes two-dimensional with a second-dimension stride of 2 * width
// elements and a full vector per row, and stream 1 starts as a copy of stream 0. The outer
// iteration count and any per-stream start offset are not visible here.
161 se0Params.DIMFMT = __SE_DIMFMT_2D;
163 se0Params.DIM1 = 2 * width;
167 se0Params.ICNT0 = width;
170 se1Params = se0Params;
// Generic (T, TIndex) per-lane minimum search that tracks, for every vector lane, the smallest
// value seen and the index it came from.
// NOTE(review): the function signature, several declarations (minVals, minVals0/1, maskOfmins,
// inVec0/1, numIterations, remBlockSize, jumpFactor, minValsLarge), closing braces and a few
// assignment targets are not visible in this excerpt.
//
// Lane index tables: minIndices / minIndices0 / minIndicesA / firstHalfIndices start as 0..15
// (lanes of the first vector); minIndices1 / minIndicesB / secondHalfIndices start as 16..31
// (lanes of the second vector).
221 c7x::uint_vec minIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
222 c7x::uint_vec minIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
223 c7x::uint_vec minIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
225 c7x::uint_vec minIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
226 c7x::uint_vec minIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
227 c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
228 c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
230 typedef typename c7x::make_full_vector<T>::type vec;
231 typedef typename c7x::make_full_vector<TIndex>::type index_vec;
// width = number of T elements in one full vector.
240 size_t width = c7x::element_count_of<vec>::value;
// Case 1: at most one vector of data. Read it from stream 0 and overwrite the unused lanes
// (length .. width-1) with the largest T value so they cannot be picked as the minimum.
242 if (length <= width) {
243 minVals = c7x::strm_eng<0, vec>::get_adv();
245 for (
size_t i = length; i < width; i++) {
246 minVals.s[i] = std::numeric_limits<T>::max();
// Case 2: more than one but fewer than two vectors. The first vector comes from stream 0, the
// partial second vector from stream 1; its unused lanes are padded with the largest T value.
250 else if (length < 2 * width) {
251 minVals0 = c7x::strm_eng<0, vec>::get_adv();
252 minVals1 = c7x::strm_eng<1, vec>::get_adv();
254 size_t remElements = length % width;
255 for (
size_t i = remElements; i < width; i++) {
256 minVals1.s[i] = std::numeric_limits<T>::max();
// NOTE(review): the next three statements compute minVals / minIndices with a strict
// greater-than compare, and the statements after them compute both again with the tiebreak
// logic, overwriting this result. Whether this first version is disabled in the full file
// cannot be determined from this excerpt.
258 maskOfmins = __cmp_gt_pred(minVals0, minVals1);
259 minVals = __select(maskOfmins, minVals1, minVals0);
260 minIndices = __select(maskOfmins, minIndices1, minIndices0);
// Merge with tiebreak. maskOfmins marks lanes where minVals0 >= minVals1; the value and index
// of the second vector are taken there. maskOfTiebreakerValues marks lanes where both values
// are equal; in those lanes the smaller of the two indices is used instead.
264 maskOfmins = __cmp_ge_pred(minVals0, minVals1);
265 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
266 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
267 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
268 minVals = __select(maskOfmins, minVals1, minVals0);
269 index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
// Each partial result holds zero in the lanes it does not own, so adding them merges them.
270 index_vec zeroVec = c7x::uint_vec(0);
271 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
272 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
274 minIndices = nonTiebreakerVec + tiebreakerVec;
// Case 3 (NOTE(review): presumably the final `else`): two running accumulators, A fed by
// stream 0 and B fed by stream 1, both seeded with the largest T value.
279 __vpred mask0, mask1;
282 vec minValsA = T(std::numeric_limits<T>::max());
283 vec minValsB = minValsA;
// mask0 / mask1 mark lanes where the incoming value is >= the running minimum; those lanes keep
// the running value and index, the remaining lanes take the incoming value and its index.
// NOTE(review): the assignment targets of the two bare __select calls and the per-iteration
// update of firstHalfIndices / secondHalfIndices are not visible in this excerpt.
290 for (
size_t i = 0; i < numIterations; i += 1) {
291 inVec0 = c7x::strm_eng<0, vec>::get_adv();
292 mask0 = __cmp_ge_pred(inVec0, minValsA);
294 __select(mask0, minValsA, inVec0);
297 __select(mask0, minIndicesA, firstHalfIndices);
300 inVec1 = c7x::strm_eng<1, vec>::get_adv();
301 mask1 = __cmp_ge_pred(inVec1, minValsB);
302 minValsB = __select(mask1, minValsB, inVec1);
303 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Tail handling. remVecLen is the number of vectors needed for the leftover elements, and
// remStart points at the last `width` elements of the input, so tail vectors are loaded
// directly from memory and end exactly at the last input element.
314 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
315 T *remStart = (T *) pSrc + length - width;
// One tail vector: its lane indices are (length - lane count) + 0..15.
317 if (remBlockSize != 0 && remVecLen == 1) {
318 inVec0 = *(vec *) remStart;
319 firstHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsets;
320 mask0 = __cmp_ge_pred(inVec0, minValsA);
322 __select(mask0, minValsA, inVec0);
324 minIndicesA = __select(mask0, minIndicesA,
// Two tail vectors: the one before remStart updates accumulator A, the one at remStart
// updates accumulator B.
329 else if (remBlockSize != 0 && remVecLen == 2) {
330 inVec0 = *(vec *) (remStart - width);
331 firstHalfIndices = c7x::uint_vec(length - (2 * c7x::element_count_of<index_vec>::value)) +
lastRunOffsets;
332 mask0 = __cmp_ge_pred(inVec0, minValsA);
334 __select(mask0, minValsA, inVec0);
336 minIndicesA = __select(mask0, minIndicesA,
340 inVec1 = *(vec *) remStart;
// NOTE(review): jumpFactor is defined outside this excerpt; the typed variants in this file
// compute the second tail's indices as (length - lane count) + offsets instead.
341 secondHalfIndices = firstHalfIndices +
jumpFactor;
342 mask1 = __cmp_ge_pred(inVec1, minValsB);
343 minValsB = __select(mask1, minValsB, inVec1);
344 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Merge accumulators A and B lane by lane, using the same tiebreak as in case 2: equal values
// resolve to the smaller index.
351 __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
352 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
353 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
354 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
355 minValsLarge = __select(maskOfminValues, minValsB, minValsA);
356 index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
357 index_vec zeroVec = c7x::uint_vec(0);
358 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
359 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
361 index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
// Publish the merged per-lane result.
363 minVals = minValsLarge;
364 minIndices = minIndicesLarge;
// int8_t variant of the per-lane minimum search: 64 value lanes, indices kept in 8-bit lanes.
// NOTE(review): the function signature, several declarations, closing braces and a few
// assignment targets are not visible in this excerpt. The index lanes are uint8_t, so they can
// only hold 0..255; how longer inputs are handled is not visible here.
//
// Lane index tables: 0..63 for the first vector, 64..127 for the second vector.
382 c7x::uchar_vec minIndices =
383 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
384 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
385 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
386 c7x::uchar_vec minIndices0 =
387 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
388 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
389 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
390 c7x::uchar_vec minIndices1 =
391 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
392 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
393 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
395 c7x::uchar_vec minIndicesA =
396 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
397 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
398 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
399 c7x::uchar_vec minIndicesB =
400 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
401 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
402 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
403 c7x::uchar_vec firstHalfIndices =
404 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
405 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
406 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
407 c7x::uchar_vec secondHalfIndices =
408 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
409 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
410 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
412 typedef typename c7x::make_full_vector<int8_t>::type vec;
413 typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
// width = number of int8_t elements in one full vector.
422 size_t width = c7x::element_count_of<vec>::value;
// Case 1: at most one vector of data. Unused lanes are overwritten with the largest int8_t value
// so they cannot be picked as the minimum.
425 if (length <= width) {
426 minVals = c7x::strm_eng<0, vec>::get_adv();
429 for (
size_t i = length; i < width; i++) {
430 minVals.s[i] = std::numeric_limits<int8_t>::max();
// Case 2: more than one but fewer than two vectors. Stream 0 supplies the first vector, stream 1
// the partial second vector, whose unused lanes are padded the same way.
434 else if (length < 2 * width) {
435 minVals0 = c7x::strm_eng<0, vec>::get_adv();
436 minVals1 = c7x::strm_eng<1, vec>::get_adv();
438 size_t remElements = length % width;
440 for (
size_t i = remElements; i < width; i++) {
441 minVals1.s[i] = std::numeric_limits<int8_t>::max();
// Lane-wise merge: take the second vector's value/index where minVals0 >= minVals1, but where
// the two values are equal use the smaller of the two indices.
445 maskOfmins = __cmp_ge_pred(minVals0, minVals1);
446 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
447 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
448 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
449 minVals = __select(maskOfmins, minVals1, minVals0);
450 index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
// Each partial result holds zero in the lanes it does not own, so adding them merges them.
451 index_vec zeroVec = c7x::uchar_vec(0);
452 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
453 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
455 minIndices = nonTiebreakerVec + tiebreakerVec;
// Case 3 (NOTE(review): presumably the final `else`): accumulator A is fed by stream 0 and
// accumulator B by stream 1; both are seeded with the largest int8_t value.
460 __vpred mask0, mask1;
463 vec minValsA = int8_t(std::numeric_limits<int8_t>::max());
464 vec minValsB = minValsA;
// Lanes where the incoming value is >= the running minimum keep the running value and index;
// the other lanes take the incoming value and its index.
// NOTE(review): the assignment targets of the two bare __select calls and the per-iteration
// index updates are not visible in this excerpt.
471 for (
size_t i = 0; i < numIterations; i += 1) {
472 inVec0 = c7x::strm_eng<0, vec>::get_adv();
473 mask0 = __cmp_ge_pred(inVec0, minValsA);
475 __select(mask0, minValsA, inVec0);
478 __select(mask0, minIndicesA, firstHalfIndices);
481 inVec1 = c7x::strm_eng<1, vec>::get_adv();
482 mask1 = __cmp_ge_pred(inVec1, minValsB);
483 minValsB = __select(mask1, minValsB, inVec1);
484 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Tail handling: remStart points at the last `width` input elements, so tail vectors are read
// directly from memory and end exactly at the last input element.
495 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
496 int8_t *remStart = (int8_t *) pSrc + length - width;
// One tail vector: lane indices are (length - lane count) + 0..63.
498 if (remBlockSize != 0 && remVecLen == 1) {
500 inVec0 = *(vec *) remStart;
501 firstHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsChar;
502 mask0 = __cmp_ge_pred(inVec0, minValsA);
504 __select(mask0, minValsA, inVec0);
506 minIndicesA = __select(mask0, minIndicesA,
// Two tail vectors: the one before remStart updates accumulator A, the one at remStart
// updates accumulator B.
511 else if (remBlockSize != 0 && remVecLen == 2) {
512 inVec0 = *(vec *) (remStart - width);
513 firstHalfIndices = c7x::uchar_vec(length - (2 * c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsChar;
514 mask0 = __cmp_ge_pred(inVec0, minValsA);
516 __select(mask0, minValsA, inVec0);
518 minIndicesA = __select(mask0, minIndicesA,
522 inVec1 = *(vec *) remStart;
523 secondHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsChar;
524 mask1 = __cmp_ge_pred(inVec1, minValsB);
525 minValsB = __select(mask1, minValsB, inVec1);
526 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Merge accumulators A and B lane by lane; equal values resolve to the smaller index.
533 __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
534 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
535 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
536 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
537 minValsLarge = __select(maskOfminValues, minValsB, minValsA);
538 index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
539 index_vec zeroVec = c7x::uchar_vec(0);
540 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
541 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
543 index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
// Publish the merged per-lane result.
545 minVals = minValsLarge;
546 minIndices = minIndicesLarge;
// uint8_t variant of the per-lane minimum search: 64 value lanes, indices kept in 8-bit lanes.
// NOTE(review): the function signature, several declarations, closing braces and a few
// assignment targets are not visible in this excerpt. The index lanes are uint8_t, so they can
// only hold 0..255; how longer inputs are handled is not visible here.
//
// Lane index tables: 0..63 for the first vector, 64..127 for the second vector.
558 c7x::uchar_vec minIndices =
559 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
560 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
561 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
562 c7x::uchar_vec minIndices0 =
563 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
564 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
565 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
566 c7x::uchar_vec minIndices1 =
567 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
568 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
569 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
571 c7x::uchar_vec minIndicesA =
572 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
573 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
574 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
575 c7x::uchar_vec minIndicesB =
576 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
577 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
578 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
579 c7x::uchar_vec firstHalfIndices =
580 c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
581 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
582 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
583 c7x::uchar_vec secondHalfIndices =
584 c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
585 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
586 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
588 typedef typename c7x::make_full_vector<uint8_t>::type vec;
589 typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
// width = number of uint8_t elements in one full vector.
598 size_t width = c7x::element_count_of<vec>::value;
// Case 1: at most one vector of data. Unused lanes are overwritten with the largest uint8_t
// value so they cannot be picked as the minimum.
601 if (length <= width) {
602 minVals = c7x::strm_eng<0, vec>::get_adv();
605 for (
size_t i = length; i < width; i++) {
606 minVals.s[i] = std::numeric_limits<uint8_t>::max();
// Case 2: more than one but fewer than two vectors. Stream 0 supplies the first vector, stream 1
// the partial second vector, whose unused lanes are padded the same way.
610 else if (length < 2 * width) {
611 minVals0 = c7x::strm_eng<0, vec>::get_adv();
612 minVals1 = c7x::strm_eng<1, vec>::get_adv();
614 size_t remElements = length % width;
616 for (
size_t i = remElements; i < width; i++) {
617 minVals1.s[i] = std::numeric_limits<uint8_t>::max();
// Lane-wise merge: take the second vector's value/index where minVals0 >= minVals1, but where
// the two values are equal use the smaller of the two indices.
621 maskOfmins = __cmp_ge_pred(minVals0, minVals1);
622 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
623 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
624 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
625 minVals = __select(maskOfmins, minVals1, minVals0);
626 index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
// Each partial result holds zero in the lanes it does not own, so adding them merges them.
627 index_vec zeroVec = c7x::uchar_vec(0);
628 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
629 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
631 minIndices = nonTiebreakerVec + tiebreakerVec;
// Case 3 (NOTE(review): presumably the final `else`): accumulator A is fed by stream 0 and
// accumulator B by stream 1; both are seeded with the largest uint8_t value.
636 __vpred mask0, mask1;
639 vec minValsA = uint8_t(std::numeric_limits<uint8_t>::max());
640 vec minValsB = minValsA;
// Lanes where the incoming value is >= the running minimum keep the running value and index;
// the other lanes take the incoming value and its index.
// NOTE(review): the assignment targets of the two bare __select calls and the per-iteration
// index updates are not visible in this excerpt.
647 for (
size_t i = 0; i < numIterations; i += 1) {
648 inVec0 = c7x::strm_eng<0, vec>::get_adv();
649 mask0 = __cmp_ge_pred(inVec0, minValsA);
651 __select(mask0, minValsA, inVec0);
654 __select(mask0, minIndicesA, firstHalfIndices);
657 inVec1 = c7x::strm_eng<1, vec>::get_adv();
658 mask1 = __cmp_ge_pred(inVec1, minValsB);
659 minValsB = __select(mask1, minValsB, inVec1);
660 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Tail handling: remStart points at the last `width` input elements, so tail vectors are read
// directly from memory and end exactly at the last input element.
671 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
672 uint8_t *remStart = (uint8_t *) pSrc + length - width;
// One tail vector: lane indices are (length - lane count) + 0..63.
674 if (remBlockSize != 0 && remVecLen == 1) {
676 inVec0 = *(vec *) remStart;
677 firstHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsChar;
678 mask0 = __cmp_ge_pred(inVec0, minValsA);
680 __select(mask0, minValsA, inVec0);
682 minIndicesA = __select(mask0, minIndicesA,
// Two tail vectors: the one before remStart updates accumulator A, the one at remStart
// updates accumulator B.
687 else if (remBlockSize != 0 && remVecLen == 2) {
688 inVec0 = *(vec *) (remStart - width);
689 firstHalfIndices = c7x::uchar_vec(length - (2 * c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsChar;
690 mask0 = __cmp_ge_pred(inVec0, minValsA);
692 __select(mask0, minValsA, inVec0);
694 minIndicesA = __select(mask0, minIndicesA,
698 inVec1 = *(vec *) remStart;
699 secondHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsChar;
700 mask1 = __cmp_ge_pred(inVec1, minValsB);
701 minValsB = __select(mask1, minValsB, inVec1);
702 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Merge accumulators A and B lane by lane; equal values resolve to the smaller index.
709 __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
710 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
711 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
712 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
713 minValsLarge = __select(maskOfminValues, minValsB, minValsA);
714 index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
715 index_vec zeroVec = c7x::uchar_vec(0);
716 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
717 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
719 index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
// Publish the merged per-lane result.
721 minVals = minValsLarge;
722 minIndices = minIndicesLarge;
// int16_t variant of the per-lane minimum search: 32 value lanes, indices kept in 16-bit lanes.
// NOTE(review): the function signature, several declarations, closing braces and a few
// assignment targets are not visible in this excerpt.
//
// Lane index tables: 0..31 for the first vector, 32..63 for the second vector.
734 c7x::ushort_vec minIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
735 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
736 c7x::ushort_vec minIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
737 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
738 c7x::ushort_vec minIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
739 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
741 c7x::ushort_vec minIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
742 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
743 c7x::ushort_vec minIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
744 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
745 c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
746 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
747 c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
748 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
750 typedef typename c7x::make_full_vector<int16_t>::type vec;
751 typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
// width = number of int16_t elements in one full vector.
760 size_t width = c7x::element_count_of<vec>::value;
// Case 1: at most one vector of data. Unused lanes are overwritten with the largest int16_t
// value so they cannot be picked as the minimum.
763 if (length <= width) {
764 minVals = c7x::strm_eng<0, vec>::get_adv();
767 for (
size_t i = length; i < width; i++) {
768 minVals.s[i] = std::numeric_limits<int16_t>::max();
// Case 2: more than one but fewer than two vectors. Stream 0 supplies the first vector, stream 1
// the partial second vector, whose unused lanes are padded the same way.
772 else if (length < 2 * width) {
773 minVals0 = c7x::strm_eng<0, vec>::get_adv();
774 minVals1 = c7x::strm_eng<1, vec>::get_adv();
776 size_t remElements = length % width;
778 for (
size_t i = remElements; i < width; i++) {
779 minVals1.s[i] = std::numeric_limits<int16_t>::max();
// Lane-wise merge: take the second vector's value/index where minVals0 >= minVals1, but where
// the two values are equal use the smaller of the two indices.
783 maskOfmins = __cmp_ge_pred(minVals0, minVals1);
784 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
785 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
786 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
787 minVals = __select(maskOfmins, minVals1, minVals0);
788 index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
// Each partial result holds zero in the lanes it does not own, so adding them merges them.
789 index_vec zeroVec = c7x::ushort_vec(0);
790 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
791 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
793 minIndices = nonTiebreakerVec + tiebreakerVec;
// Case 3 (NOTE(review): presumably the final `else`): accumulator A is fed by stream 0 and
// accumulator B by stream 1; both are seeded with the largest int16_t value.
798 __vpred mask0, mask1;
801 vec minValsA = int16_t(std::numeric_limits<int16_t>::max());
802 vec minValsB = minValsA;
// Lanes where the incoming value is >= the running minimum keep the running value and index;
// the other lanes take the incoming value and its index.
// NOTE(review): the assignment targets of the two bare __select calls and the per-iteration
// index updates are not visible in this excerpt.
809 for (
size_t i = 0; i < numIterations; i += 1) {
810 inVec0 = c7x::strm_eng<0, vec>::get_adv();
811 mask0 = __cmp_ge_pred(inVec0, minValsA);
813 __select(mask0, minValsA, inVec0);
816 __select(mask0, minIndicesA, firstHalfIndices);
819 inVec1 = c7x::strm_eng<1, vec>::get_adv();
820 mask1 = __cmp_ge_pred(inVec1, minValsB);
821 minValsB = __select(mask1, minValsB, inVec1);
822 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Tail handling: remStart points at the last `width` input elements, so tail vectors are read
// directly from memory and end exactly at the last input element.
833 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
834 int16_t *remStart = (int16_t *) pSrc + length - width;
// One tail vector: lane indices are (length - lane count) + 0..31.
836 if (remBlockSize != 0 && remVecLen == 1) {
838 inVec0 = *(vec *) remStart;
839 firstHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsShort;
840 mask0 = __cmp_ge_pred(inVec0, minValsA);
842 __select(mask0, minValsA, inVec0);
844 minIndicesA = __select(mask0, minIndicesA,
// Two tail vectors: the one before remStart updates accumulator A, the one at remStart
// updates accumulator B.
849 else if (remBlockSize != 0 && remVecLen == 2) {
850 inVec0 = *(vec *) (remStart - width);
// NOTE(review): the assignment target of the next expression is not visible in this excerpt.
852 c7x::ushort_vec(length - (2 * c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsShort;
853 mask0 = __cmp_ge_pred(inVec0, minValsA);
855 __select(mask0, minValsA, inVec0);
857 minIndicesA = __select(mask0, minIndicesA,
861 inVec1 = *(vec *) remStart;
862 secondHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsShort;
863 mask1 = __cmp_ge_pred(inVec1, minValsB);
864 minValsB = __select(mask1, minValsB, inVec1);
865 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Merge accumulators A and B lane by lane; equal values resolve to the smaller index.
872 __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
873 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
874 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
875 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
876 minValsLarge = __select(maskOfminValues, minValsB, minValsA);
877 index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
878 index_vec zeroVec = c7x::ushort_vec(0);
879 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
880 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
882 index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
// Publish the merged per-lane result.
884 minVals = minValsLarge;
885 minIndices = minIndicesLarge;
// uint16_t variant of the per-lane minimum search: 32 value lanes, indices kept in 16-bit lanes.
// NOTE(review): the function signature, several declarations, closing braces and a few
// assignment targets are not visible in this excerpt.
//
// Lane index tables: 0..31 for the first vector, 32..63 for the second vector.
897 c7x::ushort_vec minIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
898 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
899 c7x::ushort_vec minIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
900 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
901 c7x::ushort_vec minIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
902 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
904 c7x::ushort_vec minIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
905 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
906 c7x::ushort_vec minIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
907 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
908 c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
909 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
910 c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
911 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
913 typedef typename c7x::make_full_vector<uint16_t>::type vec;
914 typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
// width = number of uint16_t elements in one full vector.
923 size_t width = c7x::element_count_of<vec>::value;
// Case 1: at most one vector of data. Unused lanes are overwritten with the largest uint16_t
// value so they cannot be picked as the minimum. The sentinel must come from uint16_t: the
// int16_t maximum (32767) is smaller than valid unsigned inputs above 32767 and would let a
// padding lane win.
926 if (length <= width) {
927 minVals = c7x::strm_eng<0, vec>::get_adv();
930 for (
size_t i = length; i < width; i++) {
931 minVals.s[i] = std::numeric_limits<uint16_t>::max();
// Case 2: more than one but fewer than two vectors. Stream 0 supplies the first vector, stream 1
// the partial second vector, whose unused lanes are padded with the same uint16_t sentinel.
935 else if (length < 2 * width) {
936 minVals0 = c7x::strm_eng<0, vec>::get_adv();
937 minVals1 = c7x::strm_eng<1, vec>::get_adv();
939 size_t remElements = length % width;
941 for (
size_t i = remElements; i < width; i++) {
942 minVals1.s[i] = std::numeric_limits<uint16_t>::max();
// Lane-wise merge: take the second vector's value/index where minVals0 >= minVals1, but where
// the two values are equal use the smaller of the two indices.
946 maskOfmins = __cmp_ge_pred(minVals0, minVals1);
947 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
948 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
949 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
950 minVals = __select(maskOfmins, minVals1, minVals0);
951 index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
// Each partial result holds zero in the lanes it does not own, so adding them merges them.
952 index_vec zeroVec = c7x::ushort_vec(0);
953 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
954 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
956 minIndices = nonTiebreakerVec + tiebreakerVec;
// Case 3 (NOTE(review): presumably the final `else`): accumulator A is fed by stream 0 and
// accumulator B by stream 1; both are seeded with the largest uint16_t value.
961 __vpred mask0, mask1;
964 vec minValsA = uint16_t(std::numeric_limits<uint16_t>::max());
965 vec minValsB = minValsA;
// Lanes where the incoming value is >= the running minimum keep the running value and index;
// the other lanes take the incoming value and its index.
// NOTE(review): the assignment targets of the two bare __select calls and the per-iteration
// index updates are not visible in this excerpt.
972 for (
size_t i = 0; i < numIterations; i += 1) {
973 inVec0 = c7x::strm_eng<0, vec>::get_adv();
974 mask0 = __cmp_ge_pred(inVec0, minValsA);
976 __select(mask0, minValsA, inVec0);
979 __select(mask0, minIndicesA, firstHalfIndices);
982 inVec1 = c7x::strm_eng<1, vec>::get_adv();
983 mask1 = __cmp_ge_pred(inVec1, minValsB);
984 minValsB = __select(mask1, minValsB, inVec1);
985 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Tail handling: remStart points at the last `width` input elements, so tail vectors are read
// directly from memory and end exactly at the last input element.
996 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
997 uint16_t *remStart = (uint16_t *) pSrc + length - width;
// One tail vector: lane indices are (length - lane count) + 0..31.
999 if (remBlockSize != 0 && remVecLen == 1) {
1001 inVec0 = *(vec *) remStart;
1002 firstHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsShort;
1003 mask0 = __cmp_ge_pred(inVec0, minValsA);
1005 __select(mask0, minValsA, inVec0);
1007 minIndicesA = __select(mask0, minIndicesA,
// Two tail vectors: the one before remStart updates accumulator A, the one at remStart
// updates accumulator B.
1012 else if (remBlockSize != 0 && remVecLen == 2) {
1013 inVec0 = *(vec *) (remStart - width);
// NOTE(review): the assignment target of the next expression is not visible in this excerpt.
1015 c7x::ushort_vec(length - (2 * c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsShort;
1016 mask0 = __cmp_ge_pred(inVec0, minValsA);
1018 __select(mask0, minValsA, inVec0);
1020 minIndicesA = __select(mask0, minIndicesA,
1024 inVec1 = *(vec *) remStart;
1025 secondHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) +
lastRunOffsetsShort;
1026 mask1 = __cmp_ge_pred(inVec1, minValsB);
1027 minValsB = __select(mask1, minValsB, inVec1);
1028 minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
// Merge accumulators A and B lane by lane; equal values resolve to the smaller index.
1035 __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
1036 __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
1037 index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
1038 __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
1039 minValsLarge = __select(maskOfminValues, minValsB, minValsA);
1040 index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
1041 index_vec zeroVec = c7x::ushort_vec(0);
1042 index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
1043 index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1045 index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
// Publish the merged per-lane result.
1047 minVals = minValsLarge;
1048 minIndices = minIndicesLarge;
// float variant of the per-lane minimum search: indices kept in 32-bit lanes.
// NOTE(review): the function signature, some declarations (e.g. maskOfmins, numIterations,
// remBlockSize), closing braces and a few assignment targets are not visible in this excerpt.
// NOTE(review): unlike the integer variants in this file, this variant uses a strict less-than
// compare and has no "smaller index wins" tiebreak. On equal values the merges below take the
// second operand's index (minIndices1 / minIndicesB), which is not guaranteed to be the smaller
// one - confirm whether that is intended.
//
// Lane index tables: 0..15 for the first vector, 16..31 for the second vector.
1060 c7x::uint_vec minIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1061 c7x::uint_vec minIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1062 c7x::uint_vec minIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1064 c7x::uint_vec minIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1065 c7x::uint_vec minIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1066 c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1067 c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1069 c7x::float_vec minVals0;
1070 c7x::float_vec minVals1;
// width = number of float elements in one full vector.
1073 size_t width = c7x::element_count_of<c7x::float_vec>::value;
1076 c7x::float_vec minVals;
// Case 1: at most one vector of data. Unused lanes are overwritten with the largest finite float
// so they cannot be picked as the minimum.
1078 if (length <= width) {
1079 minVals = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1081 for (
size_t i = length; i < width; i++) {
1082 minVals.s[i] = std::numeric_limits<float>::max();
// Case 2: more than one but fewer than two vectors. Stream 0 supplies the first vector, stream 1
// the partial second vector, whose unused lanes are padded the same way.
1086 else if (length < 2 * width) {
1087 minVals0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1088 minVals1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1090 size_t remElements = length % width;
1091 for (
size_t i = remElements; i < width; i++) {
1092 minVals1.s[i] = std::numeric_limits<float>::max();
// Lane-wise merge: the first vector's value/index is taken only where it is strictly smaller.
1094 maskOfmins = __cmp_lt_pred(minVals0, minVals1);
1095 minVals = __select(maskOfmins, minVals0, minVals1);
1096 minIndices = __select(maskOfmins, minIndices0, minIndices1);
// Case 3 (NOTE(review): presumably the final `else`): accumulator A is fed by stream 0 and
// accumulator B by stream 1; both are seeded with the largest finite float.
1100 c7x::float_vec inVec0, inVec1;
1101 __vpred mask0, mask1, maskOfminsLarge;
1104 c7x::float_vec minValsA = std::numeric_limits<float>::max();
1105 c7x::float_vec minValsB = minValsA;
1108 c7x::float_vec minValsLarge;
// Lanes where the incoming value is strictly smaller than the running minimum take the incoming
// value and its index; the other lanes keep the running value and index.
// NOTE(review): the assignment targets of the two bare __select calls and the per-iteration
// index updates are not visible in this excerpt.
1110 for (
size_t i = 0; i < numIterations; i += 1) {
1111 inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1112 mask0 = __cmp_lt_pred(inVec0, minValsA);
1114 __select(mask0, inVec0, minValsA);
1117 __select(mask0, firstHalfIndices, minIndicesA);
1120 inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1121 mask1 = __cmp_lt_pred(inVec1, minValsB);
1122 minValsB = __select(mask1, inVec1, minValsB);
1123 minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
// Tail handling: remStart points at the last `width` input elements, so tail vectors are read
// directly from memory and end exactly at the last input element.
1134 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1135 float *remStart = (
float *) pSrc + length - width;
// One tail vector: lane indices are (length - width) + 0..15.
1138 if (remBlockSize != 0 && remVecLen == 1) {
1140 inVec0 = *(c7x::float_vec *) remStart;
1141 firstHalfIndices = c7x::uint_vec(length - (width)) +
lastRunOffsets;
1142 mask0 = __cmp_lt_pred(inVec0, minValsA);
1144 __select(mask0, inVec0, minValsA);
1146 minIndicesA = __select(mask0, firstHalfIndices,
// Two tail vectors: the one before remStart updates accumulator A, the one at remStart
// updates accumulator B.
1152 else if (remBlockSize != 0 && remVecLen == 2) {
1154 inVec0 = *(c7x::float_vec *) (remStart - width);
1155 firstHalfIndices = c7x::uint_vec(length - (2 * width)) +
lastRunOffsets;
1156 mask0 = __cmp_lt_pred(inVec0, minValsA);
1158 __select(mask0, inVec0, minValsA);
1160 minIndicesA = __select(mask0, firstHalfIndices,
1164 inVec1 = *(c7x::float_vec *) remStart;
1165 secondHalfIndices = c7x::uint_vec(length - (width)) +
lastRunOffsets;
1166 mask1 = __cmp_lt_pred(inVec1, minValsB);
1167 minValsB = __select(mask1, inVec1, minValsB);
1168 minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
// Merge accumulators A and B lane by lane: A's value/index is taken only where it is strictly
// smaller than B's.
1173 maskOfminsLarge = __cmp_lt_pred(minValsA, minValsB);
1174 minValsLarge = __select(maskOfminsLarge, minValsA, minValsB);
1175 c7x::uint_vec minIndicesLarge = __select(maskOfminsLarge, minIndicesA, minIndicesB);
// Publish the merged per-lane result.
1177 minVals = minValsLarge;
1178 minIndices = minIndicesLarge;
// Body of the double-precision min-index search. The function signature and
// all closing braces are absent from this listing; `length`, `pSrc`,
// `numIterations`, `remBlockSize` and `maskOfmins` are used below but are
// declared on lines that are not visible here.
//
// Three paths are selected by `length` relative to the vector width:
//   1. length <= width      : one vector fetched from stream engine 0.
//   2. length <  2 * width  : one vector from each of stream engines 0 and 1.
//   3. otherwise            : a loop over both stream engines plus a tail
//                             that is loaded directly from memory.
//
// Lane index vectors. The "...0"/"...A"/"firstHalf" vectors start at lane
// indices 0..7 and the "...1"/"...B"/"secondHalf" vectors start at 8..15.
1189 c7x::ulong_vec minIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1190 c7x::ulong_vec minIndices0 = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1191 c7x::ulong_vec minIndices1 = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1193 c7x::ulong_vec minIndicesA = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1194 c7x::ulong_vec minIndicesB = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1195 c7x::ulong_vec firstHalfIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1196 c7x::ulong_vec secondHalfIndices = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1198 c7x::double_vec minVals0;
1199 c7x::double_vec minVals1;
// Number of double lanes in one full vector.
1202 size_t width = c7x::element_count_of<c7x::double_vec>::value;
1205 c7x::double_vec minVals;
// Path 1: the whole input fits in a single vector.
1207 if (length <= width) {
1208 minVals = c7x::strm_eng<0, c7x::double_vec>::get_adv();
// Overwrite lanes [length, width) with the largest finite double. The loop
// body does not execute when length == width.
1210 for (
size_t i = length; i < width; i++) {
1211 minVals.s[i] = std::numeric_limits<double>::max();
// Path 2: more than one vector but fewer than two full vectors.
1215 else if (length < 2 * width) {
1216 minVals0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1217 minVals1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
// Lanes of the second vector from (length % width) upward are overwritten
// with the largest finite double before the comparison.
1219 size_t remainingElement = length % width;
1220 for (
size_t i = remainingElement; i < width; i++) {
1221 minVals1.s[i] = std::numeric_limits<double>::max();
// Lane-wise strict less-than of the two vectors; the same predicate picks
// both the value and the matching lane index.
1223 maskOfmins = __cmp_lt_pred(minVals0, minVals1);
1224 minVals = __select(maskOfmins, minVals0, minVals1);
1225 minIndices = __select(maskOfmins, minIndices0, minIndices1);
// Path 3 (the introducing `else` is not visible in this listing).
1229 c7x::double_vec inVec0, inVec1;
1230 __vpred mask0, mask1, maskOfminsLarge;
// Two running-minimum accumulators, A and B, both seeded with the largest
// finite double.
1233 c7x::double_vec minValsA = std::numeric_limits<double>::max();
1234 c7x::double_vec minValsB = minValsA;
1237 c7x::double_vec minValsLarge;
// Main loop: each iteration takes one vector from stream engine 0 (compared
// against accumulator A) and one from stream engine 1 (compared against B).
1239 for (
size_t i = 0; i < numIterations; i += 1) {
1240 inVec0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1241 mask0 = __cmp_lt_pred(inVec0, minValsA);
// NOTE(review): the two __select results below have no visible assignment
// target; listing lines 1242 and 1245 are absent. Presumably they assign to
// minValsA and minIndicesA, mirroring lines 1251-1252 - confirm against the
// original file.
1243 __select(mask0, inVec0, minValsA);
1246 __select(mask0, firstHalfIndices, minIndicesA);
1249 inVec1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1250 mask1 = __cmp_lt_pred(inVec1, minValsB);
1251 minValsB = __select(mask1, inVec1, minValsB);
1252 minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
// NOTE(review): no visible line advances firstHalfIndices/secondHalfIndices
// between iterations; listing lines 1253-1262 are absent - confirm.
//
// Tail handling. remVecLen is the number of vectors needed to cover
// remBlockSize elements; remStart addresses element (length - width), i.e.
// the last `width` doubles of a `length`-element input.
1263 int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1264 double *remStart = (
double *) pSrc + length - width;
// One tail vector: load the final `width` elements straight from memory.
1267 if (remBlockSize != 0 && remVecLen == 1) {
1269 inVec0 = *(c7x::double_vec *) remStart;
// NOTE(review): unlike the remVecLen == 2 branch (line 1284), no visible line
// here re-bases firstHalfIndices before it is used; listing line 1270 is
// absent - confirm.
1271 mask0 = __cmp_lt_pred(inVec0, minValsA);
// NOTE(review): assignment target not visible (line 1272 absent) - confirm.
1273 __select(mask0, inVec0, minValsA);
1275 minIndicesA = __select(mask0, firstHalfIndices,
// Two tail vectors: the vector starting at (length - 2 * width) feeds
// accumulator A and the vector starting at (length - width) feeds B.
1281 else if (remBlockSize != 0 && remVecLen == 2) {
1283 inVec0 = *(c7x::double_vec *) (remStart - width);
// Lane indices for this load: base (length - 2 * width) plus per-lane offsets.
1284 firstHalfIndices = c7x::ulong_vec(length - (2 * width)) +
lastRunOffsetsDp;
1285 mask0 = __cmp_lt_pred(inVec0, minValsA);
// NOTE(review): assignment target not visible (line 1286 absent) - confirm.
1287 __select(mask0, inVec0, minValsA);
1289 minIndicesA = __select(mask0, firstHalfIndices,
1293 inVec1 = *(c7x::double_vec *) remStart;
// NOTE(review): no visible re-base of secondHalfIndices for this load;
// listing line 1294 is absent - confirm.
1295 mask1 = __cmp_lt_pred(inVec1, minValsB);
1296 minValsB = __select(mask1, inVec1, minValsB);
1297 minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
// Merge accumulators A and B lane-wise with a strict less-than, then publish
// the merged values and indices through minVals / minIndices.
1303 maskOfminsLarge = __cmp_lt_pred(minValsA, minValsB);
1304 minValsLarge = __select(maskOfminsLarge, minValsA, minValsB);
1305 c7x::ulong_vec minIndicesLarge = __select(maskOfminsLarge, minIndicesA, minIndicesB);
1307 minVals = minValsLarge;
1308 minIndices = minIndicesLarge;
// Execution routine templated on the element type T and the lane-index type
// TIndex. The function signature (listing lines 1318-1320), the closing
// braces and the end of the function are absent from this listing.
//
// Visible flow:
//   1. Read the block size from the private kernel arguments.
//   2. Open stream engine 0 on the input, and stream engine 1 one vector
//      further on when the input is longer than one vector.
//   3. Split the input into buffers of at most 2^min(bits in T, 32) elements
//      and call DSPLIB_minIndex_loopLogic once per buffer.
//   4. Reduce the lanes returned for each buffer to one value and one index,
//      then reduce across buffers and store the winning index through pOut.
1317 template <
typename T,
typename TIndex>
1321 uint32_t blockSize = pKerPrivArgs->
blockSize;
1322 uint32_t length = blockSize;
// NOTE(review): how se0Params/se1Params are filled is not visible here;
// presumably they are loaded from pBlock (obtained below) - confirm.
1325 __SE_TEMPLATE_v1 se0Params, se1Params;
// The result is written as a single uint32_t regardless of TIndex.
1328 T *restrict pInLocal = (T *) pIn;
1329 uint32_t *restrict pOutLocal = (uint32_t *) pOut;
1331 #if DSPLIB_DEBUGPRINT
1332 printf(
"Enter DSPLIB_minIndex_exec_ci\n");
// Full-width vector type for T and its lane count.
1335 typedef typename c7x::make_full_vector<T>::type vec;
1337 uint32_t eleCount = c7x::element_count_of<vec>::value;
1338 uint32_t width = eleCount;
1339 #if DSPLIB_DEBUGPRINT
1340 printf(
"Enter eleCount %d\n", eleCount);
1343 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
// Stream engine 0 starts at the first element; stream engine 1 starts one
// vector (eleCount elements) later and is opened only when the input is
// longer than one vector.
1348 __SE0_OPEN(pInLocal, se0Params);
1349 if (length > width) {
1350 __SE1_OPEN(pInLocal + eleCount, se1Params);
1356 #if DSPLIB_DEBUGPRINT
1357 printf(
"DSPLIB_DEBUGPRINT blockSize %d\n", blockSize);
// Buffer size is 2^(bits in T), with the exponent capped at 32.
// NOTE(review): presumably this keeps per-buffer lane indices representable
// in TIndex - confirm.
1360 size_t bitsInType =
sizeof(T) * 8;
1361 bitsInType = (bitsInType > 32) ? 32 : bitsInType;
1363 size_t minSingleBufferSize = pow(2, bitsInType);
1364 uint32_t numBufferIterations = DSPLIB_ceilingDiv(length, minSingleBufferSize);
// One (value, global index) candidate per buffer.
1366 std::vector<T> minVals(numBufferIterations);
1367 std::vector<uint32_t> minIndices(numBufferIterations);
1372 size_t currentIterationSize;
1374 TIndex *currentIndexPtr;
1375 TIndex currentIndex;
1377 for (uint32_t buffer = 0; buffer < numBufferIterations; buffer++) {
// Elements handled in this pass: a full buffer, or whatever remains of the
// input for the last one.
1378 currentIterationSize = std::min((
size_t) minSingleBufferSize, (
size_t) (length - (minSingleBufferSize * buffer)));
// NOTE(review): pInLocal is passed unchanged for every buffer and no visible
// line advances it (listing lines 1380-1381 are absent) - confirm that the
// helper receives the correct base address when buffer > 0.
1379 loopOutput = DSPLIB_minIndex_loopLogic<T, TIndex>(currentIterationSize, pInLocal);
// Reinterpret the returned value and index vectors as scalar arrays and seed
// the scalar reduction with lane 0.
1382 currentValuePtr = (T *) &loopOutput.
minVals;
1383 smallest = *currentValuePtr++;
1384 currentIndexPtr = (TIndex *) &loopOutput.
minIndices;
1385 minIndex = *currentIndexPtr++;
// Scan the remaining lanes. A strictly smaller value replaces the current
// candidate; an equal value replaces it only when its index is lower.
// NOTE(review): the per-lane pointer increments are not visible; listing
// lines 1398-1409 are absent - confirm.
1386 for (i = 1; i < c7x::element_count_of<vec>::value; i++) {
1387 currentValue = *currentValuePtr;
1388 currentIndex = *currentIndexPtr;
1389 if (currentValue < smallest) {
1390 smallest = currentValue;
1391 minIndex = currentIndex;
1395 else if (currentValue == smallest) {
1396 if (currentIndex < minIndex) {
1397 minIndex = currentIndex;
// Record the candidate for this buffer, adding the buffer base offset to the
// buffer-relative index.
1410 minVals[buffer] = smallest;
1411 minIndices[buffer] = ((uint32_t) minIndex) + (buffer * minSingleBufferSize);
// Reduce across buffers. The comparison is strict, so on equal values the
// candidate from the earlier buffer is kept.
1413 T smallestVal = minVals[0];
1414 uint32_t smallestIndex = minIndices[0];
1416 for (i = 1; i < minVals.size(); i++) {
1419 if (minVals[i] < smallestVal) {
1420 smallestVal = minVals[i];
1421 smallestIndex = minIndices[i];
// Store the index of the overall minimum.
1429 *pOutLocal = smallestIndex;
// Same condition as the stream engine 1 open at line 1349; the body is not
// visible in this listing.
// NOTE(review): presumably closes stream engine 1 - confirm.
1432 if (length > width) {
#define SE_SE0_PARAM_OFFSET
#define SE_SE1_PARAM_OFFSET
metadata< T, TIndex > DSPLIB_minIndex_loopLogic(size_t length, void *pSrc)
This function is the kernel loop helper function for the optimized implementation of the kernel....
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
metadata< int8_t, uint8_t > DSPLIB_minIndex_loopLogic< int8_t, uint8_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
const c7x::uint_vec jumpFactor
const c7x::ushort_vec jumpFactorShort
const c7x::uchar_vec jumpFactorChar
DSPLIB_STATUS DSPLIB_minIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< float, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< uint16_t, uint16_t > DSPLIB_minIndex_loopLogic< uint16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
metadata< uint8_t, uint8_t > DSPLIB_minIndex_loopLogic< uint8_t, uint8_t >(size_t length, void *pSrc)
metadata< int16_t, uint16_t > DSPLIB_minIndex_loopLogic< int16_t, uint16_t >(size_t length, void *pSrc)
DSPLIB_STATUS DSPLIB_minIndex_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
const c7x::uint_vec lastRunOffsets
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< int16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< double, uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< uint16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
const c7x::uchar_vec lastRunOffsetsChar
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< int32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< double, uint64_t > DSPLIB_minIndex_loopLogic< double, uint64_t >(size_t length, void *pSrc)
#define INDEX_UNROLL_FACTOR
const c7x::ushort_vec lastRunOffsetsShort
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< uint32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
metadata< float, uint32_t > DSPLIB_minIndex_loopLogic< float, uint32_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< int8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
const c7x::ulong_vec jumpFactorDp
const c7x::ulong_vec lastRunOffsetsDp
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< uint8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_minIndex.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
A structure for a 1 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
int32_t blockSize
Size of input buffer for different batches DSPLIB_minIndex_init that will be retrieved and used by DS...
uint8_t bufPblock[DSPLIB_MININDEX_IXX_IXX_OXX_PBLOCK_SIZE]