DSPLIB User Guide
DSPLIB_maxIndex_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "../common/c71/DSPLIB_inlines.h"
48 #include "DSPLIB_maxIndex_priv.h"
49 #include "DSPLIB_types.h"
50 #include "c7x_scalable.h"
51 #include <algorithm> // std::min
52 #include <cmath> // floor
53 #include <cstdint>
54 #include <float.h>
55 #include <limits>
56 #include <numeric> // std::iota
57 #include <vector> // std::vector
58 
59 #define INDEX_UNROLL_FACTOR 2
60 
61 const c7x::uint_vec jumpFactor = c7x::uint_vec(32);
62 const c7x::uint_vec lastRunOffsets = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
63 const c7x::ushort_vec jumpFactorShort = c7x::ushort_vec(64);
64 
65 // clang-format off
66 const c7x::ushort_vec lastRunOffsetsShort = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
67  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
68 // clang-format on
69 
70 const c7x::ulong_vec jumpFactorDp = c7x::ulong_vec(16);
71 const c7x::ulong_vec lastRunOffsetsDp = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
72 const c7x::uchar_vec jumpFactorChar = c7x::uchar_vec(128);
73 
74 // clang-format off
75 const c7x::uchar_vec lastRunOffsetsChar = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
76  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
77  34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
78  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
79 // clang-format on
80 
81 template <typename dataType>
83  const DSPLIB_bufParams1D_t *bufParamsIn,
84  const DSPLIB_bufParams1D_t *bufParamsOut,
85  const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
86 {
88  __SE_TEMPLATE_v1 se0Params, se1Params;
89 
90  __SE_ELETYPE SE_ELETYPE;
91  __SE_VECLEN SE_VECLEN;
92 
93  DSPLIB_maxIndex_PrivArgs *pKerPrivArgs = (DSPLIB_maxIndex_PrivArgs *) handle;
94 
95  uint8_t *pBlock = pKerPrivArgs->bufPblock;
96  uint32_t blocksSize = pKerPrivArgs->blockSize;
97 
98  typedef typename c7x::make_full_vector<dataType>::type vec;
99  uint32_t eleCount = c7x::element_count_of<vec>::value;
100  SE_VECLEN = c7x::se_veclen<vec>::value;
101  SE_ELETYPE = c7x::se_eletype<vec>::value;
102  uint32_t length = blocksSize;
103  uint32_t width = eleCount;
104 
105 #if DSPLIB_DEBUGPRINT
106  printf("Enter eleCount %d\n", eleCount);
107 #endif
108 
109  /**********************************************************************/
110  /* Prepare streaming engine 0,1 to fetch the input */
111  /**********************************************************************/
112  se0Params = __gen_SE_TEMPLATE_v1();
113 
114  // default SE0 parameters
115  se0Params.ICNT0 = width;
116  se0Params.ELETYPE = SE_ELETYPE;
117  se0Params.VECLEN = SE_VECLEN;
118  se0Params.DIMFMT = __SE_DIMFMT_1D;
119 
120  se1Params = __gen_SE_TEMPLATE_v1();
121 
122  // default SE1 parameters
123  se1Params.ICNT0 = width;
124  se1Params.ELETYPE = SE_ELETYPE;
125  se1Params.VECLEN = SE_VECLEN;
126  se1Params.DIMFMT = __SE_DIMFMT_1D;
127 
128  // variables to calculate and store compute loop's iteration counter
129  uint32_t numBlocks = length / width;
130  uint32_t remBlocksSize = length % width;
131  if (remBlocksSize) {
132  numBlocks++;
133  }
134  else {
135  /* Nothing to do here */
136  }
137  // case: length of input <= width
138  // one SE fetch is length elements, rest of vec filled with '0'
139  if (length <= width) {
140 
141  // SE0 fetch length
142  se0Params.ICNT0 = length;
143  // SE1 not used
144  }
145 
146  // case: length of input is > width but < 2*width
147  // SE0 fetch is one full width, SE1 fetch is partial fetch, rest of vec filled with '0'
148  else if (length < 2 * width) {
149 
150  // SE0 full fetch
151  se0Params.ICNT0 = width;
152  // SE1 partial fetch
153  se1Params.ICNT0 = remBlocksSize;
154  }
155 
156  // case: len >= 2*width
157  // SE0 and SE1 fetches are full widths only
158  else {
159 
160  // printf("\ninside len > 4SIMD\n");
161  // SE0 Dim is 2D
162  se0Params.DIMFMT = __SE_DIMFMT_2D;
163  // SE0 jump length each get_adv is 2 widths
164  se0Params.DIM1 = 2 * width;
165  // SE only performs full fetches in multiples of INDEX_UNROLL_FACTOR, i.e. 2
166  se0Params.ICNT1 = length / (INDEX_UNROLL_FACTOR * width);
167  // SE0 fetches full widths
168  se0Params.ICNT0 = width;
169 
170  // SE1 fetches in same manner as SE0, but starts 1 width ahead
171  se1Params = se0Params;
172  }
173 
174  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET) = se0Params;
175  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET) = se1Params;
176 
177  return status;
178 }
179 
181  const DSPLIB_bufParams1D_t *bufParamsIn,
182  const DSPLIB_bufParams1D_t *bufParamsOut,
183  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
184 
186  const DSPLIB_bufParams1D_t *bufParamsIn,
187  const DSPLIB_bufParams1D_t *bufParamsOut,
188  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
189 
191  const DSPLIB_bufParams1D_t *bufParamsIn,
192  const DSPLIB_bufParams1D_t *bufParamsOut,
193  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
194 
196  const DSPLIB_bufParams1D_t *bufParamsIn,
197  const DSPLIB_bufParams1D_t *bufParamsOut,
198  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
199 
201  const DSPLIB_bufParams1D_t *bufParamsIn,
202  const DSPLIB_bufParams1D_t *bufParamsOut,
203  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
204 
206  const DSPLIB_bufParams1D_t *bufParamsIn,
207  const DSPLIB_bufParams1D_t *bufParamsOut,
208  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
209 
211  const DSPLIB_bufParams1D_t *bufParamsIn,
212  const DSPLIB_bufParams1D_t *bufParamsOut,
213  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
214 
216  const DSPLIB_bufParams1D_t *bufParamsIn,
217  const DSPLIB_bufParams1D_t *bufParamsOut,
218  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
219 
220 template <typename T, typename TIndex>
221 DSPLIB_STATUS DSPLIB_maxIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
222 {
223  DSPLIB_maxIndex_PrivArgs *pKerPrivArgs = (DSPLIB_maxIndex_PrivArgs *) handle;
224  uint32_t blockSize = pKerPrivArgs->blockSize;
225  uint32_t length = blockSize;
226  DSPLIB_STATUS status = DSPLIB_SUCCESS;
227 
228  __SE_TEMPLATE_v1 se0Params, se1Params;
229  // __SA_TEMPLATE_v1 sa0Params;
230 
231  T *restrict pInLocal = (T *) pIn;
232  uint32_t *restrict pOutLocal = (uint32_t *) pOut;
233 
234 #if DSPLIB_DEBUGPRINT
235  printf("Enter DSPLIB_maxIndex_exec_ci\n");
236 #endif
237 
238  typedef typename c7x::make_full_vector<T>::type vec;
239  uint32_t eleCount = c7x::element_count_of<vec>::value;
240  uint32_t width = eleCount;
241 #if DSPLIB_DEBUGPRINT
242  printf("Enter eleCount %d\n", eleCount);
243 #endif
244 
245  uint8_t *pBlock = pKerPrivArgs->bufPblock;
246  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
247  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
248 
249  // Input samples
250  __SE0_OPEN(pInLocal, se0Params);
251  if (length > width) {
252  __SE1_OPEN(pInLocal + eleCount, se1Params);
253  }
254 
255 #if DSPLIB_DEBUGPRINT
256  printf("DSPLIB_DEBUGPRINT blockSize %d\n", blockSize);
257 #endif
258 
259  size_t bitsInType = sizeof(T) * 8; // sizeof(T) is measured in bytes
260  bitsInType = (bitsInType > 32) ? 32 : bitsInType;
261 
262  size_t maxSingleBufferSize = pow(2, bitsInType);
263  uint32_t numBufferIterations = DSPLIB_ceilingDiv(length, maxSingleBufferSize);
264 
265  std::vector<T> maxVals(numBufferIterations);
266  std::vector<uint32_t> maxIndices(numBufferIterations);
267 
268  T *currentValuePtr;
269  T currentValue; // using this value so we don't dereference the pointer three different times per iteration
270  T largest;
271  metadata<T, TIndex> loopOutput;
272  size_t currentIterationSize;
273  TIndex maxIndex;
274  TIndex *currentIndexPtr;
275  TIndex currentIndex;
276  size_t i = 1;
277  for (uint32_t buffer = 0; buffer < numBufferIterations; buffer++) {
278 
279  currentIterationSize = std::min((size_t) maxSingleBufferSize, (size_t) (length - (maxSingleBufferSize * buffer)));
280 
281  loopOutput = DSPLIB_maxIndex_loopLogic<T, TIndex>(currentIterationSize, pInLocal);
282  // find the maximum index by looping through the min vector and getting the corresponding min index
283  // use pointer since .s[i] is problematic
284  currentValuePtr = (T *) &loopOutput.maxVals;
285  largest = *currentValuePtr++;
286  currentIndexPtr = (TIndex *) &loopOutput.maxIndices;
287  maxIndex = *currentIndexPtr++;
288  for (i = 1; i < c7x::element_count_of<vec>::value; i++) {
289  currentValue = *currentValuePtr;
290  currentIndex = *currentIndexPtr;
291  if (currentValue > largest) {
292  largest = currentValue;
293  maxIndex = currentIndex;
294  }
295  // need the first instance of the maximum value, so set the maximum index to the lower index if current value
296  // is same as current maximum value
297  else if (currentValue == largest) {
298  if (currentIndex < maxIndex) {
299  maxIndex = currentIndex;
300  }
301  }
302  else {
303  /* Nothing to do here */
304  }
305  currentValuePtr++;
306  currentIndexPtr++;
307  }
308  maxVals[buffer] = largest;
309  maxIndices[buffer] = ((uint32_t) maxIndex) + (buffer * maxSingleBufferSize);
310  }
311 
312  T largestVal = maxVals[0];
313  uint32_t largestIndex = maxIndices[0];
314  for (i = 1; i < maxVals.size(); i++) {
315  if (maxVals[i] > largestVal) {
316  largestVal = maxVals[i];
317  largestIndex = maxIndices[i];
318  }
319  }
320 
321  *pOutLocal = largestIndex;
322  // close SE0 and SE1
323  __SE0_CLOSE();
324  if (length > width) {
325  __SE1_CLOSE();
326  }
327 
328  return status;
329 }
330 
332 DSPLIB_maxIndex_exec_ci<int8_t, uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
333 
335 DSPLIB_maxIndex_exec_ci<uint8_t, uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
336 
338 DSPLIB_maxIndex_exec_ci<int16_t, uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
339 
341 DSPLIB_maxIndex_exec_ci<uint16_t, uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
342 
344 DSPLIB_maxIndex_exec_ci<int32_t, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
345 
347 DSPLIB_maxIndex_exec_ci<uint32_t, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
348 
350 DSPLIB_maxIndex_exec_ci<float, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
351 
353 DSPLIB_maxIndex_exec_ci<double, uint64_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
354 
355 template <typename T, typename TIndex> metadata<T, TIndex> DSPLIB_maxIndex_loopLogic(size_t length, void *pSrc)
356 {
357  c7x::uint_vec maxIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
358  c7x::uint_vec maxIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
359  c7x::uint_vec maxIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
360  // re-defined for large widths so that there's no common vectors used between small and large widths
361  c7x::uint_vec maxIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
362  c7x::uint_vec maxIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
363  c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
364  c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
365  // derive c7x vector type from value template and index template
366  typedef typename c7x::make_full_vector<T>::type vec;
367  typedef typename c7x::make_full_vector<TIndex>::type index_vec;
368 
369  // holds all of the maximum values that have previously been read
370  vec maxVals0; // = T(std::numeric_limits<T>::lowest());
371  vec maxVals1; // = maxVals0;
372  __vpred maskOfMaxs;
373 
374  // holds the overall max vals
375  vec maxVals;
376  size_t width = c7x::element_count_of<vec>::value;
377  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
378  if (length <= width) {
379  maxVals = c7x::strm_eng<0, vec>::get_adv();
380  // fill the uninitialized values with MIN_VAL
381  for (size_t i = length; i < width; i++) {
382  maxVals.s[i] = std::numeric_limits<T>::lowest();
383  }
384  }
385  // can fill one width but only part of a second
386  else if (length < 2 * width) {
387  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
388  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
389  // fill the uninitialized values with MIN_VAL
390  size_t remElements = length % width;
391  for (size_t i = remElements; i < width; i++) {
392  maxVals1.s[i] = std::numeric_limits<T>::lowest();
393  }
394  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
395  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
396  maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
397 
398  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
399  // as well
400  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
401  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
402  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
403  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
404  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
405  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
406  index_vec zeroVec = c7x::uint_vec(0);
407  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
408  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
409 
410  maxIndices = nonTiebreakerVec + tiebreakerVec;
411  }
412  else {
413  // input vectors
414  vec inVec0, inVec1;
415  __vpred mask0, mask1;
416  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
417  // when not being used, which will increase the ii
418  vec maxValsA = T(std::numeric_limits<T>::lowest());
419  vec maxValsB = maxValsA;
420 
421  // holds the overall max vals
422  vec maxValsLarge = T(std::numeric_limits<T>::lowest());
423 
424  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
425 
426  for (size_t i = 0; i < numIterations; i += 1) {
427  inVec0 = c7x::strm_eng<0, vec>::get_adv();
428  mask0 = __cmp_gt_pred(inVec0, maxValsA);
429  maxValsA =
430  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
431  // have been shown to have larger values (contained in mask)
432  maxIndicesA =
433  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
434  // changed from the previous max values
435 
436  inVec1 = c7x::strm_eng<1, vec>::get_adv();
437  mask1 = __cmp_gt_pred(inVec1, maxValsB);
438  maxValsB = __select(mask1, inVec1, maxValsB);
439  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
440 
441  // update the new locations of the indices to be set for the next iteration
442  firstHalfIndices += jumpFactor;
443  secondHalfIndices += jumpFactor;
444  }
445 
446  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
447  // if no remainder block, go to end
448 
449  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
450  T *remStart = (T *) pSrc + length - width;
451 
452  if (remBlockSize != 0 && remVecLen == 1) {
453 
454  inVec0 = *(vec *) remStart;
455  firstHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
456  mask0 = __cmp_gt_pred(inVec0, maxValsA);
457  maxValsA =
458  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
459  // have been shown to have larger values (contained in mask)
460  maxIndicesA = __select(mask0, firstHalfIndices,
461  maxIndicesA); // set the max indices to be the indices whose values have changed from
462  // the previous max values
463  }
464 
465  else if (remBlockSize != 0 && remVecLen == 2) {
466  inVec0 = *(vec *) (remStart - width);
467  firstHalfIndices = c7x::uint_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
468  mask0 = __cmp_gt_pred(inVec0, maxValsA);
469  maxValsA =
470  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
471  // have been shown to have larger values (contained in mask)
472  maxIndicesA = __select(mask0, firstHalfIndices,
473  maxIndicesA); // set the max indices to be the indices whose values have changed from
474  // the previous max values
475 
476  // inVec1 = c7x::strm_eng<1, vec>::get_adv();
477  inVec1 = *(vec *) remStart;
478  secondHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
479  mask1 = __cmp_gt_pred(inVec1, maxValsB);
480  maxValsB = __select(mask1, inVec1, maxValsB);
481  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
482  }
483  else {
484  /* Nothing to do here */
485  }
486 
487  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
488  // as well
489  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
490  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
491  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
492  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
493  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
494  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
495  index_vec zeroVec = c7x::uint_vec(0);
496  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
497  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
498 
499  index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
500 
501  maxVals = maxValsLarge;
502  maxIndices = maxIndicesLarge;
503  }
504  metadata<T, TIndex> output;
505  output.maxVals = maxVals;
506  output.maxIndices = maxIndices;
507  return output;
508 }
509 
510 // explicit templatization for int8_t type
511 template <> metadata<int8_t, uint8_t> DSPLIB_maxIndex_loopLogic<int8_t, uint8_t>(size_t length, void *pSrc)
512 {
513 
514  c7x::uchar_vec maxIndices =
515  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
516  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
517  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
518  c7x::uchar_vec maxIndices0 =
519  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
520  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
521  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
522  c7x::uchar_vec maxIndices1 =
523  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
524  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
525  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
526  // re-defined for large widths so that there's no common vectors used between small and large widths
527  c7x::uchar_vec maxIndicesA =
528  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
529  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
530  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
531  c7x::uchar_vec maxIndicesB =
532  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
533  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
534  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
535  c7x::uchar_vec firstHalfIndices =
536  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
537  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
538  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
539  c7x::uchar_vec secondHalfIndices =
540  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
541  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
542  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
543 
544  typedef typename c7x::make_full_vector<int8_t>::type vec;
545  typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
546 
547  // holds all of the maximum values that have previously been read
548  vec maxVals0; // = int8_t(std::numeric_limits<int8_t>::lowest());
549  vec maxVals1; // = maxVals0;
550  __vpred maskOfMaxs;
551 
552  // holds the overall max vals
553  vec maxVals;
554  size_t width = c7x::element_count_of<vec>::value;
555 
556  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
557  if (length <= width) {
558  maxVals = c7x::strm_eng<0, vec>::get_adv();
559  // fill the uninitialized values with MIN_VAL
560  for (size_t i = length; i < width; i++) {
561  maxVals.s[i] = std::numeric_limits<int8_t>::lowest();
562  }
563  }
564  // can fill one width but only part of a second
565  else if (length < 2 * width) {
566  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
567  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
568  // fill the uninitialized values with MIN_VAL
569  size_t remElements = length % width;
570  for (size_t i = remElements; i < width; i++) {
571  maxVals1.s[i] = std::numeric_limits<int8_t>::lowest();
572  }
573  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
574  // as well
575  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
576  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
577  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
578  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
579  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
580  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
581  index_vec zeroVec = c7x::uchar_vec(0);
582  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
583  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
584 
585  maxIndices = nonTiebreakerVec + tiebreakerVec;
586  }
587  else {
588  // input vectors
589  vec inVec0, inVec1;
590  __vpred mask0, mask1;
591  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
592  // when not being used, which will increase the ii
593  vec maxValsA = int8_t(std::numeric_limits<int8_t>::lowest());
594  vec maxValsB = maxValsA;
595 
596  // holds the overall max vals
597  vec maxValsLarge = int8_t(std::numeric_limits<int8_t>::lowest());
598 
599  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
600 
601  for (size_t i = 0; i < numIterations; i += 1) {
602  inVec0 = c7x::strm_eng<0, vec>::get_adv();
603  mask0 = __cmp_gt_pred(inVec0, maxValsA);
604  maxValsA =
605  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
606  // have been shown to have larger values (contained in mask)
607  maxIndicesA =
608  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
609  // changed from the previous max values
610 
611  inVec1 = c7x::strm_eng<1, vec>::get_adv();
612  mask1 = __cmp_gt_pred(inVec1, maxValsB);
613  maxValsB = __select(mask1, inVec1, maxValsB);
614  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
615 
616  // update the new locations of the indices to be set for the next iteration
617  firstHalfIndices += jumpFactorChar;
618  secondHalfIndices += jumpFactorChar;
619  }
620 
621  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
622 
623  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
624  int8_t *remStart = (int8_t *) pSrc + length - width;
625 
626  if (remBlockSize != 0 && remVecLen == 1) {
627  inVec0 = *(vec *) remStart;
628 
629  firstHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
630  mask0 = __cmp_gt_pred(inVec0, maxValsA);
631  maxValsA =
632  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
633  // have been shown to have larger values (contained in mask)
634  maxIndicesA = __select(mask0, firstHalfIndices,
635  maxIndicesA); // set the max indices to be the indices whose values have changed from
636  // the previous max values
637  }
638 
639  else if (remBlockSize != 0 && remVecLen == 2) {
640  inVec0 = *(vec *) (remStart - width);
641  firstHalfIndices = c7x::uchar_vec(length - 2 * width) + lastRunOffsetsChar;
642  mask0 = __cmp_gt_pred(inVec0, maxValsA);
643  maxValsA =
644  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
645  // have been shown to have larger values (contained in mask)
646  maxIndicesA = __select(mask0, firstHalfIndices,
647  maxIndicesA); // set the max indices to be the indices whose values have changed from
648  // the previous max values
649 
650  inVec1 = *(vec *) remStart;
651  secondHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
652  mask1 = __cmp_gt_pred(inVec1, maxValsB);
653  maxValsB = __select(mask1, inVec1, maxValsB);
654  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
655  }
656  else {
657  /* Nothing to do here */
658  }
659 
660  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
661  // as well
662  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
663  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
664  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
665  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
666  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
667  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
668  index_vec zeroVec = c7x::uchar_vec(0);
669  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
670  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
671 
672  index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
673 
674  maxVals = maxValsLarge;
675  maxIndices = maxIndicesLarge;
676  }
678  output.maxVals = maxVals;
679  output.maxIndices = maxIndices;
680  return output;
681 }
682 
683 // explicit templatization for uint8_t type
684 template <> metadata<uint8_t, uint8_t> DSPLIB_maxIndex_loopLogic<uint8_t, uint8_t>(size_t length, void *pSrc)
685 {
686  c7x::uchar_vec maxIndices =
687  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
688  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
689  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
690  c7x::uchar_vec maxIndices0 =
691  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
692  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
693  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
694  c7x::uchar_vec maxIndices1 =
695  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
696  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
697  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
698  // re-defined for large widths so that there's no common vectors used between small and large widths
699  c7x::uchar_vec maxIndicesA =
700  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
701  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
702  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
703  c7x::uchar_vec maxIndicesB =
704  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
705  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
706  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
707  c7x::uchar_vec firstHalfIndices =
708  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
709  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
710  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
711  c7x::uchar_vec secondHalfIndices =
712  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
713  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
714  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
715 
716  typedef typename c7x::make_full_vector<uint8_t>::type vec;
717  typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
718 
719  // holds all of the maximum values that have previously been read
720  vec maxVals0; // = uint8_t(std::numeric_limits<uint8_t>::lowest());
721  vec maxVals1; // = maxVals0;
722  __vpred maskOfMaxs;
723 
724  // holds the overall max vals
725  vec maxVals;
726  size_t width = c7x::element_count_of<vec>::value;
727 
728  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
729  if (length <= width) {
730  maxVals = c7x::strm_eng<0, vec>::get_adv();
731  // fill the uninitialized values with MIN_VAL
732  for (size_t i = length; i < width; i++) {
733  maxVals.s[i] = std::numeric_limits<uint8_t>::lowest();
734  }
735  }
736  // can fill one width but only part of a second
737  else if (length < 2 * width) {
738  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
739  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
740  // fill the uninitialized values with MIN_VAL
741  size_t remElements = length % width;
742  for (size_t i = remElements; i < width; i++) {
743  maxVals1.s[i] = std::numeric_limits<uint8_t>::lowest();
744  }
745  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
746  // as well
747  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
748  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
749  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
750  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
751  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
752  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
753  index_vec zeroVec = c7x::uchar_vec(0);
754  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
755  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
756 
757  maxIndices = nonTiebreakerVec + tiebreakerVec;
758  }
759  else {
760  // input vectors
761  vec inVec0, inVec1;
762  __vpred mask0, mask1;
763  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
764  // when not being used, which will increase the ii
765  vec maxValsA = uint8_t(std::numeric_limits<uint8_t>::lowest());
766  vec maxValsB = maxValsA;
767 
768  // holds the overall max vals
769  vec maxValsLarge = uint8_t(std::numeric_limits<uint8_t>::lowest());
770 
771  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
772 
773  for (size_t i = 0; i < numIterations; i += 1) {
774  inVec0 = c7x::strm_eng<0, vec>::get_adv();
775  mask0 = __cmp_gt_pred(inVec0, maxValsA);
776  maxValsA =
777  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
778  // have been shown to have larger values (contained in mask)
779  maxIndicesA =
780  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
781  // changed from the previous max values
782 
783  inVec1 = c7x::strm_eng<1, vec>::get_adv();
784  mask1 = __cmp_gt_pred(inVec1, maxValsB);
785  maxValsB = __select(mask1, inVec1, maxValsB);
786  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
787 
788  // update the new locations of the indices to be set for the next iteration
789  firstHalfIndices += jumpFactorChar;
790  secondHalfIndices += jumpFactorChar;
791  }
792 
793  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
794 
795  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
796  int8_t *remStart = (int8_t *) pSrc + length - width;
797 
798  if (remBlockSize != 0 && remVecLen == 1) {
799  inVec0 = *(vec *) remStart;
800 
801  firstHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
802  mask0 = __cmp_gt_pred(inVec0, maxValsA);
803  maxValsA =
804  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
805  // have been shown to have larger values (contained in mask)
806  maxIndicesA = __select(mask0, firstHalfIndices,
807  maxIndicesA); // set the max indices to be the indices whose values have changed from
808  // the previous max values
809  }
810 
811  else if (remBlockSize != 0 && remVecLen == 2) {
812  inVec0 = *(vec *) (remStart - width);
813  firstHalfIndices = c7x::uchar_vec(length - 2 * width) + lastRunOffsetsChar;
814  mask0 = __cmp_gt_pred(inVec0, maxValsA);
815  maxValsA =
816  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
817  // have been shown to have larger values (contained in mask)
818  maxIndicesA = __select(mask0, firstHalfIndices,
819  maxIndicesA); // set the max indices to be the indices whose values have changed from
820  // the previous max values
821 
822  inVec1 = *(vec *) remStart;
823  secondHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
824  mask1 = __cmp_gt_pred(inVec1, maxValsB);
825  maxValsB = __select(mask1, inVec1, maxValsB);
826  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
827  }
828  else {
829  /* Nothing to do here */
830  }
831  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
832  // as well
833  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
834  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
835  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
836  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
837  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
838  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
839  index_vec zeroVec = c7x::uchar_vec(0);
840  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
841  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
842 
843  index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
844 
845  maxVals = maxValsLarge;
846  maxIndices = maxIndicesLarge;
847  }
848 
850  output.maxVals = maxVals;
851  output.maxIndices = maxIndices;
852  return output;
853 }
854 
855 // explicit templatization for int16_t type
857 {
858  c7x::ushort_vec maxIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
859  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
860  c7x::ushort_vec maxIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
861  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
862  c7x::ushort_vec maxIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
863  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
864  // re-defined for large widths so that there's no common vectors used between small and large widths
865  c7x::ushort_vec maxIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
866  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
867  c7x::ushort_vec maxIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
868  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
869  c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
870  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
871  c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
872  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
873 
874  typedef typename c7x::make_full_vector<int16_t>::type vec;
875  typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
876 
877  // holds all of the maximum values that have previously been read
878  vec maxVals0; // = int16_t(std::numeric_limits<int16_t>::lowest());
879  vec maxVals1; // = maxVals0;
880  __vpred maskOfMaxs;
881 
882  // holds the overall max vals
883  vec maxVals;
884  size_t width = c7x::element_count_of<vec>::value;
885 
886  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
887  if (length <= width) {
888  maxVals = c7x::strm_eng<0, vec>::get_adv();
889 
890  // fill the uninitialized values with MIN_VAL
891  for (size_t i = length; i < width; i++) {
892  maxVals.s[i] = std::numeric_limits<int16_t>::lowest();
893  }
894  }
895  // can fill one width but only part of a second
896  else if (length < 2 * width) {
897  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
898  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
899 
900  // fill the uninitialized values with MIN_VAL
901  size_t remElements = length % width;
902 
903  for (size_t i = remElements; i < width; i++) {
904  maxVals1.s[i] = std::numeric_limits<int16_t>::lowest();
905  }
906  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
907  // as well
908  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
909  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
910  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
911  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
912  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
913  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
914  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
915  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
916  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
917 
918  maxIndices = nonTiebreakerVec + tiebreakerVec;
919  }
920  else {
921  // input vectors
922  c7x::short_vec inVec0, inVec1;
923  __vpred mask0, mask1;
924  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
925  // when not being used, which will increase the ii
926  c7x::short_vec maxValsA = int16_t(std::numeric_limits<int16_t>::lowest());
927  c7x::short_vec maxValsB = maxValsA;
928 
929  // holds the overall max vals
930  c7x::short_vec maxValsLarge = int16_t(std::numeric_limits<int16_t>::lowest());
931 
932  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
933 
934  for (size_t i = 0; i < numIterations; i += 1) {
935  inVec0 = c7x::strm_eng<0, vec>::get_adv();
936  mask0 = __cmp_gt_pred(inVec0, maxValsA);
937  maxValsA =
938  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
939  // have been shown to have larger values (contained in mask)
940  maxIndicesA =
941  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
942  // changed from the previous max values
943 
944  inVec1 = c7x::strm_eng<1, vec>::get_adv();
945  mask1 = __cmp_gt_pred(inVec1, maxValsB);
946  maxValsB = __select(mask1, inVec1, maxValsB);
947  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
948 
949  // update the new locations of the indices to be set for the next iteration
950  firstHalfIndices += jumpFactorShort;
951  secondHalfIndices += jumpFactorShort;
952  }
953 
954  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
955 
956  // if no remainder block, go to end
957 
958  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
959  int16_t *remStart = (int16_t *) pSrc + length - width;
960 
961  if (remBlockSize != 0 && remVecLen == 1) {
962  inVec0 = *(vec *) remStart;
963  firstHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
964 
965  mask0 = __cmp_gt_pred(inVec0, maxValsA);
966  maxValsA =
967  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
968  // have been shown to have larger values (contained in mask)
969  maxIndicesA = __select(mask0, firstHalfIndices,
970  maxIndicesA); // set the max indices to be the indices whose values have changed from
971  // the previous max values
972  }
973 
974  else if (remBlockSize != 0 && remVecLen == 2) {
975  inVec0 = *(vec *) (remStart - width);
976  firstHalfIndices = c7x::ushort_vec(length - 2 * width) + lastRunOffsetsShort;
977  mask0 = __cmp_gt_pred(inVec0, maxValsA);
978  maxValsA =
979  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
980  // have been shown to have larger values (contained in mask)
981  maxIndicesA = __select(mask0, firstHalfIndices,
982  maxIndicesA); // set the max indices to be the indices whose values have changed from
983  // the previous max values
984  inVec1 = *(vec *) remStart;
985  secondHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
986  mask1 = __cmp_gt_pred(inVec1, maxValsB);
987  maxValsB = __select(mask1, inVec1, maxValsB);
988  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
989  }
990  else {
991  /* Nothing to do here */
992  }
993  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
994  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
995  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
996  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
997  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
998  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
999  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1000  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1001  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1002 
1003  c7x::ushort_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
1004 
1005  maxVals = maxValsLarge;
1006  maxIndices = maxIndicesLarge;
1007  }
1008 
1010  output.maxVals = maxVals;
1011  output.maxIndices = maxIndices;
1012  return output;
1013 }
1014 
1015 // explicit templatization for uint16_t type
1017 {
1018  c7x::ushort_vec maxIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1019  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1020  c7x::ushort_vec maxIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1021  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1022  c7x::ushort_vec maxIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
1023  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
1024  // re-defined for large widths so that there's no common vectors used between small and large widths
1025  c7x::ushort_vec maxIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
1026  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1027  c7x::ushort_vec maxIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
1028  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
1029  c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1030  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1031  c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1032  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
1033 
1034  typedef typename c7x::make_full_vector<uint16_t>::type vec;
1035  typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
1036 
1037  // holds all of the maximum values that have previously been read
1038  vec maxVals0; // = uint16_t(std::numeric_limits<uint16_t>::lowest());
1039  vec maxVals1; // = maxVals0;
1040  __vpred maskOfMaxs;
1041 
1042  // holds the overall max vals
1043  vec maxVals;
1044  size_t width = c7x::element_count_of<vec>::value;
1045 
1046  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
1047  if (length <= width) {
1048  maxVals = c7x::strm_eng<0, vec>::get_adv();
1049 
1050  // fill the uninitialized values with MIN_VAL
1051  for (size_t i = length; i < width; i++) {
1052  maxVals.s[i] = std::numeric_limits<uint16_t>::lowest();
1053  }
1054  }
1055  // can fill one width but only part of a second
1056  else if (length < 2 * width) {
1057  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
1058  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
1059 
1060  // fill the uninitialized values with MIN_VAL
1061  size_t remElements = length % width;
1062 
1063  for (size_t i = remElements; i < width; i++) {
1064  maxVals1.s[i] = std::numeric_limits<uint16_t>::lowest();
1065  }
1066  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
1067  // as well
1068  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
1069  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
1070  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
1071  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
1072  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1073  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
1074  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1075  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1076  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1077 
1078  maxIndices = nonTiebreakerVec + tiebreakerVec;
1079  }
1080  else {
1081  // input vectors
1082  c7x::ushort_vec inVec0, inVec1;
1083  __vpred mask0, mask1;
1084  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1085  // when not being used, which will increase the ii
1086  c7x::ushort_vec maxValsA = uint16_t(std::numeric_limits<uint16_t>::lowest());
1087  c7x::ushort_vec maxValsB = maxValsA;
1088 
1089  // holds the overall max vals
1090  c7x::ushort_vec maxValsLarge = uint16_t(std::numeric_limits<uint16_t>::lowest());
1091 
1092  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1093 
1094  for (size_t i = 0; i < numIterations; i += 1) {
1095  inVec0 = c7x::strm_eng<0, c7x::ushort_vec>::get_adv();
1096  mask0 = __cmp_gt_pred(inVec0, maxValsA);
1097  maxValsA =
1098  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1099  // have been shown to have larger values (contained in mask)
1100  maxIndicesA =
1101  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
1102  // changed from the previous max values
1103 
1104  inVec1 = c7x::strm_eng<1, c7x::ushort_vec>::get_adv();
1105  mask1 = __cmp_gt_pred(inVec1, maxValsB);
1106  maxValsB = __select(mask1, inVec1, maxValsB);
1107  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1108 
1109  // update the new locations of the indices to be set for the next iteration
1110  firstHalfIndices += jumpFactorShort;
1111  secondHalfIndices += jumpFactorShort;
1112  }
1113 
1114  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
1115 
1116  // if no remainder block, go to end
1117 
1118  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1119  uint16_t *remStart = (uint16_t *) pSrc + length - width;
1120 
1121  if (remBlockSize != 0 && remVecLen == 1) {
1122  inVec0 = *(vec *) remStart;
1123  firstHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
1124 
1125  mask0 = __cmp_gt_pred(inVec0, maxValsA);
1126  maxValsA =
1127  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1128  // have been shown to have larger values (contained in mask)
1129  maxIndicesA = __select(mask0, firstHalfIndices,
1130  maxIndicesA); // set the max indices to be the indices whose values have changed from
1131  // the previous max values
1132  }
1133  else if (remBlockSize != 0 && remVecLen == 2) {
1134  inVec0 = *(vec *) (remStart - width);
1135  firstHalfIndices = c7x::ushort_vec(length - 2 * width) + lastRunOffsetsShort;
1136  mask0 = __cmp_gt_pred(inVec0, maxValsA);
1137  maxValsA =
1138  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1139  // have been shown to have larger values (contained in mask)
1140  maxIndicesA = __select(mask0, firstHalfIndices,
1141  maxIndicesA); // set the max indices to be the indices whose values have changed from
1142  // the previous max values
1143  inVec1 = *(vec *) remStart;
1144  secondHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
1145  mask1 = __cmp_gt_pred(inVec1, maxValsB);
1146  maxValsB = __select(mask1, inVec1, maxValsB);
1147  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1148  }
1149  else {
1150  /* Nothing to do here */
1151  }
1152  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
1153  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
1154  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
1155  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
1156  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
1157  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
1158  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1159  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1160  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1161 
1162  c7x::ushort_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
1163 
1164  maxVals = maxValsLarge;
1165  maxIndices = maxIndicesLarge;
1166  }
1167 
1169  output.maxVals = maxVals;
1170  output.maxIndices = maxIndices;
1171  return output;
1172 }
1173 
1174 // explicit templatization for float type
1175 template <> metadata<float, uint32_t> DSPLIB_maxIndex_loopLogic<float, uint32_t>(size_t length, void *pSrc)
1176 {
1177  c7x::uint_vec maxIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1178  c7x::uint_vec maxIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1179  c7x::uint_vec maxIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1180  // re-defined for large widths so that there's no common vectors used between small and large widths
1181  c7x::uint_vec maxIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1182  c7x::uint_vec maxIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1183  c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1184  c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1185  // holds all of the maximum values that have previously been read
1186  c7x::float_vec maxVals0; // = std::numeric_limits<float>::lowest();
1187  c7x::float_vec maxVals1; // = maxVals0;
1188  __vpred maskOfMaxs;
1189 
1190  size_t width = c7x::element_count_of<c7x::float_vec>::value;
1191 
1192  // holds the overall max vals
1193  c7x::float_vec maxVals;
1194  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
1195  if (length <= width) {
1196  maxVals = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1197  // fill the uninitialized values with MIN_VAL
1198  for (size_t i = length; i < width; i++) {
1199  maxVals.s[i] = std::numeric_limits<float>::lowest();
1200  }
1201  }
1202  // can fill one width but only part of a second
1203  else if (length < 2 * width) {
1204  maxVals0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1205  maxVals1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1206  // fill the uninitialized values with MIN_VAL
1207  size_t remElements = length % width;
1208  for (size_t i = remElements; i < width; i++) {
1209  maxVals1.s[i] = std::numeric_limits<float>::lowest();
1210  }
1211  maskOfMaxs = __cmp_lt_pred(maxVals1, maxVals0);
1212  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1213  maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
1214  }
1215  else {
1216  // input vectors
1217  c7x::float_vec inVec0, inVec1;
1218  __vpred mask0, mask1, maskOfMaxsLarge;
1219  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1220  // when not being used, which will increase the ii
1221  c7x::float_vec maxValsA = std::numeric_limits<float>::lowest();
1222  c7x::float_vec maxValsB = maxValsA;
1223 
1224  // holds the overall max vals
1225  c7x::float_vec maxValsLarge = std::numeric_limits<float>::lowest();
1226  // printf("length: %d, width: %d\n", length, width);
1227  // size_t numIterations = DSPLIB_ceilingDiv(length, width);
1228  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1229  // printf("\nnumIter: %d\n", numIterations);
1230  for (size_t i = 0; i < numIterations; i += 1) {
1231  inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1232  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1233  maxValsA =
1234  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1235  // have been shown to have larger values (contained in mask)
1236  maxIndicesA =
1237  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
1238  // changed from the previous max values
1239 
1240  inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1241  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1242  maxValsB = __select(mask1, inVec1, maxValsB);
1243  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1244 
1245  // update the new locations of the indices to be set for the next iteration
1246  firstHalfIndices += jumpFactor;
1247  secondHalfIndices += jumpFactor;
1248  }
1249 
1250  int32_t remBlockSize = length - (numIterations * INDEX_UNROLL_FACTOR * width);
1251  // printf("\nrem block size: %d\n", remBlockSize);
1252 
1253  // if no remainder block, go to end
1254 
1255  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1256  float *remStart = (float *) pSrc + length - width;
1257 
1258  // if remainder 1
1259  if (remBlockSize != 0 && remVecLen == 1) {
1260 
1261  // printf("\ninside rem1\n");
1262  inVec0 = *(c7x::float_vec *) remStart;
1263  firstHalfIndices = c7x::uint_vec(length - (width)) + lastRunOffsets;
1264  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1265  maxValsA =
1266  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1267  // have been shown to have larger values (contained in mask)
1268  maxIndicesA = __select(mask0, firstHalfIndices,
1269  maxIndicesA); // set the max indices to be the indices whose values have changed from
1270  // the previous max values
1271  }
1272 
1273  // if remainder 2
1274  else if (remBlockSize != 0 && remVecLen == 2) {
1275 
1276  // printf("\ninside rem 2\n");
1277  // inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1278  inVec0 = *(c7x::float_vec *) (remStart - width);
1279  firstHalfIndices = c7x::uint_vec(length - (2 * width)) + lastRunOffsets;
1280  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1281  maxValsA =
1282  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1283  // have been shown to have larger values (contained in mask)
1284  maxIndicesA = __select(mask0, firstHalfIndices,
1285  maxIndicesA); // set the max indices to be the indices whose values have changed from
1286  // the previous max values
1287 
1288  // inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1289  inVec1 = *(c7x::float_vec *) remStart;
1290  secondHalfIndices = c7x::uint_vec(length - (width)) + lastRunOffsets;
1291  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1292  maxValsB = __select(mask1, inVec1, maxValsB);
1293  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1294  }
1295  else {
1296  /* Nothing to do here */
1297  }
1298  maskOfMaxsLarge = __cmp_lt_pred(maxValsB, maxValsA);
1299  maxValsLarge = __select(maskOfMaxsLarge, maxValsA, maxValsB);
1300  c7x::uint_vec maxIndicesLarge = __select(maskOfMaxsLarge, maxIndicesA, maxIndicesB);
1301 
1302  // metadata<float, uint32_t> outputLarge;
1303  // outputLarge.maxVals = maxValsLarge;
1304  // outputLarge.maxIndices = maxIndicesLarge;
1305  maxVals = maxValsLarge;
1306  maxIndices = maxIndicesLarge;
1307  }
1309  output.maxVals = maxVals;
1310  output.maxIndices = maxIndices;
1311  return output;
1312 }
1313 
1314 // explicit templatization for double type
1315 template <> metadata<double, uint64_t> DSPLIB_maxIndex_loopLogic<double, uint64_t>(size_t length, void *pSrc)
1316 {
1317  c7x::ulong_vec maxIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1318  c7x::ulong_vec maxIndices0 = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1319  c7x::ulong_vec maxIndices1 = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1320  // re-defined for large widths so that there's no common vectors used between small and large widths
1321  c7x::ulong_vec maxIndicesA = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1322  c7x::ulong_vec maxIndicesB = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1323  c7x::ulong_vec firstHalfIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1324  c7x::ulong_vec secondHalfIndices = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1325 
1326  c7x::double_vec maxVals0; // = std::numeric_limits<double>::lowest();
1327  c7x::double_vec maxVals1; // = maxVals0;
1328  __vpred maskOfMaxs;
1329 
1330  size_t width = c7x::element_count_of<c7x::double_vec>::value;
1331 
1332  // holds the overall max vals
1333  c7x::double_vec maxVals;
1334  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
1335  if (length <= width) {
1336  maxVals = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1337  // fill the uninitialized values with MIN_VAL
1338  for (size_t i = length; i < width; i++) {
1339  maxVals.s[i] = std::numeric_limits<double>::lowest();
1340  }
1341  }
1342  // can fill one width but only part of a second
1343  else if (length < 2 * width) {
1344  maxVals0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1345  maxVals1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1346  // fill the uninitialized values with MIN_VAL
1347  size_t remainingElement = length % width;
1348  for (size_t i = remainingElement; i < width; i++) {
1349  maxVals1.s[i] = std::numeric_limits<double>::lowest();
1350  }
1351  maskOfMaxs = __cmp_lt_pred(maxVals1, maxVals0);
1352  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1353  maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
1354  }
1355  else {
1356  // input vectors
1357  c7x::double_vec inVec0, inVec1;
1358  __vpred mask0, mask1, maskOfMaxsLarge;
1359  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1360  // when not being used, which will increase the ii
1361  c7x::double_vec maxValsA = std::numeric_limits<double>::lowest();
1362  c7x::double_vec maxValsB = maxValsA;
1363 
1364  // holds the overall max vals
1365  c7x::double_vec maxValsLarge = std::numeric_limits<double>::lowest();
1366  // printf("length: %d, width: %d\n", length, width);
1367  // size_t numIterations = DSPLIB_ceilingDiv(length, width);
1368  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1369  // printf("\nnumIter: %d\n", numIterations);
1370  for (size_t i = 0; i < numIterations; i += 1) {
1371  inVec0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1372  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1373  maxValsA =
1374  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1375  // have been shown to have larger values (contained in mask)
1376  maxIndicesA =
1377  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
1378  // changed from the previous max values
1379 
1380  inVec1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1381  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1382  maxValsB = __select(mask1, inVec1, maxValsB);
1383  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1384 
1385  // update the new locations of the indices to be set for the next iteration
1386  firstHalfIndices += jumpFactorDp;
1387  secondHalfIndices += jumpFactorDp;
1388  }
1389 
1390  int32_t remBlockSize = length - (numIterations * INDEX_UNROLL_FACTOR * width);
1391  // printf("\nrem block size: %d\n", remBlockSize);
1392 
1393  // if no remainder block, go to end
1394 
1395  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1396  double *remStart = (double *) pSrc + length - width;
1397 
1398  // if remainder 1
1399  if (remBlockSize != 0 && remVecLen == 1) {
1400 
1401  // printf("\ninside rem1\n");
1402  inVec0 = *(c7x::double_vec *) remStart;
1403  firstHalfIndices = c7x::ulong_vec(length - (width)) + lastRunOffsetsDp;
1404  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1405  maxValsA =
1406  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1407  // have been shown to have larger values (contained in mask)
1408  maxIndicesA = __select(mask0, firstHalfIndices,
1409  maxIndicesA); // set the max indices to be the indices whose values have changed from
1410  // the previous max values
1411  }
1412 
1413  // if remainder 2
1414  else if (remBlockSize != 0 && remVecLen == 2) {
1415 
1416  // printf("\ninside rem 2\n");
1417  // inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1418  inVec0 = *(c7x::double_vec *) (remStart - width);
1419  firstHalfIndices = c7x::ulong_vec(length - (2 * width)) + lastRunOffsetsDp;
1420  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1421  maxValsA =
1422  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
1423  // have been shown to have larger values (contained in mask)
1424  maxIndicesA = __select(mask0, firstHalfIndices,
1425  maxIndicesA); // set the max indices to be the indices whose values have changed from
1426  // the previous max values
1427 
1428  // inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1429  inVec1 = *(c7x::double_vec *) remStart;
1430  secondHalfIndices = c7x::ulong_vec(length - (width)) + lastRunOffsetsDp;
1431  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1432  maxValsB = __select(mask1, inVec1, maxValsB);
1433  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1434  }
1435  else {
1436  /* Do nothing */
1437  }
1438 
1439  maskOfMaxsLarge = __cmp_lt_pred(maxValsB, maxValsA);
1440  maxValsLarge = __select(maskOfMaxsLarge, maxValsA, maxValsB);
1441  c7x::ulong_vec maxIndicesLarge = __select(maskOfMaxsLarge, maxIndicesA, maxIndicesB);
1442 
1443  maxVals = maxValsLarge;
1444  maxIndices = maxIndicesLarge;
1445  }
1446 
1448  output.maxVals = maxVals;
1449  output.maxIndices = maxIndices;
1450  return output;
1451 }
#define SE_SE0_PARAM_OFFSET
#define SE_SE1_PARAM_OFFSET
metadata< uint16_t, uint16_t > DSPLIB_maxIndex_loopLogic< uint16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< float, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< int16_t, uint16_t > DSPLIB_maxIndex_loopLogic< int16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
metadata< double, uint64_t > DSPLIB_maxIndex_loopLogic< double, uint64_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_maxIndex_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< double, uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
const c7x::uint_vec jumpFactor
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< float, uint32_t > DSPLIB_maxIndex_loopLogic< float, uint32_t >(size_t length, void *pSrc)
const c7x::ushort_vec jumpFactorShort
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
const c7x::uchar_vec jumpFactorChar
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
const c7x::uint_vec lastRunOffsets
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< uint8_t, uint8_t > DSPLIB_maxIndex_loopLogic< uint8_t, uint8_t >(size_t length, void *pSrc)
const c7x::uchar_vec lastRunOffsetsChar
#define INDEX_UNROLL_FACTOR
const c7x::ushort_vec lastRunOffsetsShort
metadata< T, TIndex > DSPLIB_maxIndex_loopLogic(size_t length, void *pSrc)
This function is the kernel loop helper function for the optimized implementation of the kernel....
const c7x::ulong_vec jumpFactorDp
metadata< int8_t, uint8_t > DSPLIB_maxIndex_loopLogic< int8_t, uint8_t >(size_t length, void *pSrc)
DSPLIB_STATUS DSPLIB_maxIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
const c7x::ulong_vec lastRunOffsetsDp
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_maxIndex.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 1 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MAXINDEX_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t blockSize
Size of the input buffer for different batches; set by DSPLIB_maxIndex_init and later retrieved and used by DSPLIB_maxIndex_exec.
index_vec maxIndices