DSPLIB User Guide
DSPLIB_maxIndex_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "../common/c71/DSPLIB_inlines.h"
48 #include "DSPLIB_maxIndex_priv.h"
49 #include "DSPLIB_types.h"
50 #include "c7x_scalable.h"
51 #include <algorithm> // std::min
52 #include <cmath> // floor
53 #include <cstdint>
54 #include <float.h>
55 #include <limits>
56 #include <numeric> // std::iota
57 #include <vector> // std::vector
58 
// Number of input vectors consumed per main-loop iteration in DSPLIB_maxIndex_loopLogic (one from SE0 and one
// from SE1); the init step also uses it to size the outer streaming-engine dimension.
59 #define INDEX_UNROLL_FACTOR 2
60 
61 // Per-element-type helper vectors for the lane-index bookkeeping in DSPLIB_maxIndex_loopLogic:
// jumpFactor*     - splat value added to the running lane-index vectors after every main-loop iteration
// lastRunOffsets* - ascending lane offsets 0, 1, 2, ... added to a base index to label the remainder vectors
// NOTE(review): the Short and Dp variants are not referenced in the visible part of this file; presumably they
// serve the 16-bit and 64-bit specializations - confirm.
62 const c7x::uint_vec jumpFactor = c7x::uint_vec(16);
63 const c7x::uint_vec lastRunOffsets = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
64 const c7x::ushort_vec jumpFactorShort = c7x::ushort_vec(32);
65 const c7x::ushort_vec lastRunOffsetsShort = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
66 const c7x::ulong_vec jumpFactorDp = c7x::ulong_vec(8);
67 const c7x::ulong_vec lastRunOffsetsDp = c7x::ulong_vec(0, 1, 2, 3);
68 const c7x::uchar_vec jumpFactorChar = c7x::uchar_vec(64);
69 
70 // clang-format off
71 const c7x::uchar_vec lastRunOffsetsChar = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
72  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
73 // clang-format on
74 
// Init step of the optimized-C maxIndex kernel: builds the streaming-engine templates for SE0 and SE1 from the
// block size and the SIMD width of dataType, and stores them in the kernel's private parameter block
// (pKerPrivArgs->bufPblock) at SE_SE0_PARAM_OFFSET and SE_SE1_PARAM_OFFSET.
// bufParamsIn, bufParamsOut and pKerInitArgs are not read in the visible body.
// NOTE(review): this listing is missing original line 76 (return type, function name and the parameter that
// declares `handle`) and original line 81 (the declaration of `status`); presumably
// `DSPLIB_STATUS DSPLIB_maxIndex_init_ci(DSPLIB_kernelHandle handle,` and `DSPLIB_STATUS status = DSPLIB_SUCCESS;`
// - confirm against the original source file.
75 template <typename dataType>
77  const DSPLIB_bufParams1D_t *bufParamsIn,
78  const DSPLIB_bufParams1D_t *bufParamsOut,
79  const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
80 {
82  __SE_TEMPLATE_v1 se0Params, se1Params;
83 
84  __SE_ELETYPE SE_ELETYPE;
85  __SE_VECLEN SE_VECLEN;
86 
87  DSPLIB_maxIndex_PrivArgs *pKerPrivArgs = (DSPLIB_maxIndex_PrivArgs *) handle;
88 
89  uint8_t *pBlock = pKerPrivArgs->bufPblock;
90  uint32_t blocksSize = pKerPrivArgs->blockSize;
91 
// derive the full-width vector type for dataType; its lane count is the SIMD width used below
92  typedef typename c7x::make_full_vector<dataType>::type vec;
93  uint32_t eleCount = c7x::element_count_of<vec>::value;
94  SE_VECLEN = c7x::se_veclen<vec>::value;
95  SE_ELETYPE = c7x::se_eletype<vec>::value;
96  uint32_t length = blocksSize;
97  uint32_t width = eleCount;
98 
99 #if DSPLIB_DEBUGPRINT
100  printf("Enter eleCount %d\n", eleCount);
101 #endif
102 
103  /**********************************************************************/
104  /* Prepare streaming engine 0,1 to fetch the input */
105  /**********************************************************************/
106  se0Params = __gen_SE_TEMPLATE_v1();
107 
108  // default SE0 parameters
109  se0Params.ICNT0 = width;
110  se0Params.ELETYPE = SE_ELETYPE;
111  se0Params.VECLEN = SE_VECLEN;
112  se0Params.DIMFMT = __SE_DIMFMT_1D;
113 
114  se1Params = __gen_SE_TEMPLATE_v1();
115 
116  // default SE1 parameters
117  se1Params.ICNT0 = width;
118  se1Params.ELETYPE = SE_ELETYPE;
119  se1Params.VECLEN = SE_VECLEN;
120  se1Params.DIMFMT = __SE_DIMFMT_1D;
121 
122  // variables to calculate and store compute loop's iteration counter
// NOTE(review): numBlocks is computed here but never read again in this function
123  uint32_t numBlocks = length / width;
124  uint32_t remBlocksSize = length % width;
125  if (remBlocksSize) {
126  numBlocks++;
127  }
128  else {
129  /* Nothing to do here */
130  }
131  // case: length of input <= width
132  // one SE fetch is length elements, rest of vec filled with '0'
133  if (length <= width) {
134 
135  // SE0 fetch length
136  se0Params.ICNT0 = length;
137  // SE1 not used
138  }
139 
140  // case: length of input is > width but < 2*width
141  // SE0 fetch is one full width, SE1 fetch is partial fetch, rest of vec filled with '0'
142  else if (length < 2 * width) {
143 
144  // SE0 full fetch
145  se0Params.ICNT0 = width;
146  // SE1 partial fetch
147  se1Params.ICNT0 = remBlocksSize;
148  }
149 
150  // case: len >= 2*width
151  // SE0 and SE1 fetches are full widths only
152  else {
153 
154  // printf("\ninside len > 4SIMD\n");
155  // SE0 Dim is 2D
156  se0Params.DIMFMT = __SE_DIMFMT_2D;
157  // SE0 jump length each get_adv is 2 widths
158  se0Params.DIM1 = 2 * width;
159  // SE only performs full fetches in multiples of INDEX_UNROLL_FACTOR, i.e. 2
// (integer division: any tail shorter than 2 widths is not covered by the SE templates)
160  se0Params.ICNT1 = length / (INDEX_UNROLL_FACTOR * width);
161  // SE0 fetches full widths
162  se0Params.ICNT0 = width;
163 
164  // SE1 fetches in same manner as SE0, but starts 1 width ahead
// (the one-width offset is applied by the execute step, which opens SE1 at pIn + eleCount)
165  se1Params = se0Params;
166  }
167 
// publish both templates in the private parameter block for the execute step to load
168  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET) = se0Params;
169  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET) = se1Params;
170 
171  return status;
172 }
173 
// NOTE(review): each of the eight declarations below lost its first line in this listing (original lines 174,
// 179, 184, 189, 194, 199, 204 and 209). They are presumably the explicit instantiations of the init template,
// one per supported element type - restore the heads from the original source file.
175  const DSPLIB_bufParams1D_t *bufParamsIn,
176  const DSPLIB_bufParams1D_t *bufParamsOut,
177  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
178 
180  const DSPLIB_bufParams1D_t *bufParamsIn,
181  const DSPLIB_bufParams1D_t *bufParamsOut,
182  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
183 
185  const DSPLIB_bufParams1D_t *bufParamsIn,
186  const DSPLIB_bufParams1D_t *bufParamsOut,
187  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
188 
190  const DSPLIB_bufParams1D_t *bufParamsIn,
191  const DSPLIB_bufParams1D_t *bufParamsOut,
192  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
193 
195  const DSPLIB_bufParams1D_t *bufParamsIn,
196  const DSPLIB_bufParams1D_t *bufParamsOut,
197  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
198 
200  const DSPLIB_bufParams1D_t *bufParamsIn,
201  const DSPLIB_bufParams1D_t *bufParamsOut,
202  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
203 
205  const DSPLIB_bufParams1D_t *bufParamsIn,
206  const DSPLIB_bufParams1D_t *bufParamsOut,
207  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
208 
210  const DSPLIB_bufParams1D_t *bufParamsIn,
211  const DSPLIB_bufParams1D_t *bufParamsOut,
212  const DSPLIB_maxIndex_InitArgs *pKerInitArgs);
213 
214 template <typename T, typename TIndex>
215 DSPLIB_STATUS DSPLIB_maxIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
216 {
217  DSPLIB_maxIndex_PrivArgs *pKerPrivArgs = (DSPLIB_maxIndex_PrivArgs *) handle;
218  uint32_t blockSize = pKerPrivArgs->blockSize;
219  uint32_t length = blockSize;
220  DSPLIB_STATUS status = DSPLIB_SUCCESS;
221 
222  __SE_TEMPLATE_v1 se0Params, se1Params;
223  // __SA_TEMPLATE_v1 sa0Params;
224 
225  T *restrict pInLocal = (T *) pIn;
226  uint32_t *restrict pOutLocal = (uint32_t *) pOut;
227 
228 #if DSPLIB_DEBUGPRINT
229  printf("Enter DSPLIB_maxIndex_exec_ci\n");
230 #endif
231 
232  typedef typename c7x::make_full_vector<T>::type vec;
233  uint32_t eleCount = c7x::element_count_of<vec>::value;
234  uint32_t width = eleCount;
235 #if DSPLIB_DEBUGPRINT
236  printf("Enter eleCount %d\n", eleCount);
237 #endif
238 
239  uint8_t *pBlock = pKerPrivArgs->bufPblock;
240  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
241  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
242 
243  // Input samples
244  __SE0_OPEN(pInLocal, se0Params);
245  if (length > width) {
246  __SE1_OPEN(pInLocal + eleCount, se1Params);
247  }
248 
249 #if DSPLIB_DEBUGPRINT
250  printf("DSPLIB_DEBUGPRINT blockSize %d\n", blockSize);
251 #endif
252 
253  size_t bitsInType = sizeof(T) * 8; // sizeof(T) is measured in bytes
254  bitsInType = (bitsInType > 32) ? 32 : bitsInType;
255 
256  size_t maxSingleBufferSize = pow(2, bitsInType);
257  uint32_t numBufferIterations = DSPLIB_ceilingDiv(length, maxSingleBufferSize);
258 
259  std::vector<T> maxVals(numBufferIterations);
260  std::vector<uint32_t> maxIndices(numBufferIterations);
261 
262  T *currentValuePtr;
263  T currentValue; // using this value so we don't dereference the pointer three different times per iteration
264  T largest;
265  metadata<T, TIndex> loopOutput;
266  size_t currentIterationSize;
267  TIndex maxIndex;
268  TIndex *currentIndexPtr;
269  TIndex currentIndex;
270  size_t i = 1;
271  for (uint32_t buffer = 0; buffer < numBufferIterations; buffer++) {
272 
273  currentIterationSize = std::min((size_t) maxSingleBufferSize, (size_t) (length - (maxSingleBufferSize * buffer)));
274 
275  loopOutput = DSPLIB_maxIndex_loopLogic<T, TIndex>(currentIterationSize, pInLocal);
276  // find the maximum index by looping through the min vector and getting the corresponding min index
277  // use pointer since .s[i] is problematic
278  currentValuePtr = (T *) &loopOutput.maxVals;
279  largest = *currentValuePtr++;
280  currentIndexPtr = (TIndex *) &loopOutput.maxIndices;
281  maxIndex = *currentIndexPtr++;
282  for (i = 1; i < c7x::element_count_of<vec>::value; i++) {
283  currentValue = *currentValuePtr;
284  currentIndex = *currentIndexPtr;
285  if (currentValue > largest) {
286  largest = currentValue;
287  maxIndex = currentIndex;
288  }
289  // need the first instance of the maximum value, so set the maximum index to the lower index if current value
290  // is same as current maximum value
291  else if (currentValue == largest) {
292  if (currentIndex < maxIndex) {
293  maxIndex = currentIndex;
294  }
295  }
296  else {
297  /* Nothing to do here */
298  }
299  currentValuePtr++;
300  currentIndexPtr++;
301  }
302  maxVals[buffer] = largest;
303  maxIndices[buffer] = ((uint32_t) maxIndex) + (buffer * maxSingleBufferSize);
304  }
305 
306  T largestVal = maxVals[0];
307  uint32_t largestIndex = maxIndices[0];
308  for (i = 1; i < maxVals.size(); i++) {
309  if (maxVals[i] > largestVal) {
310  largestVal = maxVals[i];
311  largestIndex = maxIndices[i];
312  }
313  }
314 
315  *pOutLocal = largestIndex;
316  // close SE0 and SE1
317  __SE0_CLOSE();
318  if (length > width) {
319  __SE1_CLOSE();
320  }
321 
322  return status;
323 }
324 
326 DSPLIB_maxIndex_exec_ci<int8_t, uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
327 
329 DSPLIB_maxIndex_exec_ci<uint8_t, uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
330 
332 DSPLIB_maxIndex_exec_ci<int16_t, uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
333 
335 DSPLIB_maxIndex_exec_ci<uint16_t, uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
336 
338 DSPLIB_maxIndex_exec_ci<int32_t, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
339 
341 DSPLIB_maxIndex_exec_ci<uint32_t, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
342 
344 DSPLIB_maxIndex_exec_ci<float, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
345 
347 DSPLIB_maxIndex_exec_ci<double, uint64_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
348 
349 template <typename T, typename TIndex> metadata<T, TIndex> DSPLIB_maxIndex_loopLogic(size_t length, void *pSrc)
350 {
351  // vector containing indices of maximum indices (starts with 0, ..., SIMD width but changes as we loop through)
352  c7x::uint_vec maxIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
353  c7x::uint_vec maxIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
354  c7x::uint_vec maxIndices1 = c7x::uint_vec(8, 9, 10, 11, 12, 13, 14, 15);
355  // re-defined for large widths so that there's no common vectors used between small and large widths
356  c7x::uint_vec maxIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
357  c7x::uint_vec maxIndicesB = c7x::uint_vec(8, 9, 10, 11, 12, 13, 14, 15);
358  c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
359  c7x::uint_vec secondHalfIndices = c7x::uint_vec(8, 9, 10, 11, 12, 13, 14, 15);
360  // derive c7x vector type from value template and index template
361  typedef typename c7x::make_full_vector<T>::type vec;
362  typedef typename c7x::make_full_vector<TIndex>::type index_vec;
363 
364  // holds all of the maximum values that have previously been read
365  vec maxVals0; // = T(std::numeric_limits<T>::lowest());
366  vec maxVals1; // = maxVals0;
367  __vpred maskOfMaxs;
368 
369  // holds the overall max vals
370  vec maxVals;
371  size_t width = c7x::element_count_of<vec>::value;
372  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
373  if (length <= width) {
374  maxVals = c7x::strm_eng<0, vec>::get_adv();
375  // fill the uninitialized values with MIN_VAL
376  for (size_t i = length; i < width; i++) {
377  maxVals.s[i] = std::numeric_limits<T>::lowest();
378  }
379  }
380  // can fill one width but only part of a second
381  else if (length < 2 * width) {
382  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
383  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
384  // fill the uninitialized values with MIN_VAL
385  size_t remElements = length % width;
386  for (size_t i = remElements; i < width; i++) {
387  maxVals1.s[i] = std::numeric_limits<T>::lowest();
388  }
389  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
390  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
391  maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
392 
393  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
394  // as well
395  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
396  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
397  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
398  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
399  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
400  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
401  index_vec zeroVec = c7x::uint_vec(0);
402  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
403  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
404 
405  maxIndices = nonTiebreakerVec + tiebreakerVec;
406  }
407  else {
408  // input vectors
409  vec inVec0, inVec1;
410  __vpred mask0, mask1;
411  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
412  // when not being used, which will increase the ii
413  vec maxValsA = T(std::numeric_limits<T>::lowest());
414  vec maxValsB = maxValsA;
415 
416  // holds the overall max vals
417  vec maxValsLarge = T(std::numeric_limits<T>::lowest());
418 
419  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
420 
421  for (size_t i = 0; i < numIterations; i += 1) {
422  inVec0 = c7x::strm_eng<0, vec>::get_adv();
423  mask0 = __cmp_gt_pred(inVec0, maxValsA);
424  maxValsA =
425  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
426  // have been shown to have larger values (contained in mask)
427  maxIndicesA =
428  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
429  // changed from the previous max values
430 
431  inVec1 = c7x::strm_eng<1, vec>::get_adv();
432  mask1 = __cmp_gt_pred(inVec1, maxValsB);
433  maxValsB = __select(mask1, inVec1, maxValsB);
434  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
435 
436  // update the new locations of the indices to be set for the next iteration
437  firstHalfIndices += jumpFactor;
438  secondHalfIndices += jumpFactor;
439  }
440 
441  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
442  // if no remainder block, go to end
443 
444  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
445  T *remStart = (T *) pSrc + length - width;
446 
447  if (remBlockSize != 0 && remVecLen == 1) {
448 
449  inVec0 = *(vec *) remStart;
450  firstHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
451  mask0 = __cmp_gt_pred(inVec0, maxValsA);
452  maxValsA =
453  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
454  // have been shown to have larger values (contained in mask)
455  maxIndicesA = __select(mask0, firstHalfIndices,
456  maxIndicesA); // set the max indices to be the indices whose values have changed from
457  // the previous max values
458  }
459 
460  else if (remBlockSize != 0 && remVecLen == 2) {
461  inVec0 = *(vec *) (remStart - width);
462  firstHalfIndices = c7x::uint_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
463  mask0 = __cmp_gt_pred(inVec0, maxValsA);
464  maxValsA =
465  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
466  // have been shown to have larger values (contained in mask)
467  maxIndicesA = __select(mask0, firstHalfIndices,
468  maxIndicesA); // set the max indices to be the indices whose values have changed from
469  // the previous max values
470 
471  // inVec1 = c7x::strm_eng<1, vec>::get_adv();
472  inVec1 = *(vec *) remStart;
473  secondHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
474  mask1 = __cmp_gt_pred(inVec1, maxValsB);
475  maxValsB = __select(mask1, inVec1, maxValsB);
476  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
477  }
478  else {
479  /* Nothing to do here */
480  }
481 
482  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
483  // as well
484  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
485  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
486  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
487  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
488  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
489  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
490  index_vec zeroVec = c7x::uint_vec(0);
491  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
492  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
493 
494  index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
495 
496  maxVals = maxValsLarge;
497  maxIndices = maxIndicesLarge;
498  }
499  metadata<T, TIndex> output;
500  output.maxVals = maxVals;
501  output.maxIndices = maxIndices;
502  return output;
503 }
504 
505 // explicit templatization for int8_t type
506 template <> metadata<int8_t, uint8_t> DSPLIB_maxIndex_loopLogic<int8_t, uint8_t>(size_t length, void *pSrc)
507 {
508 
509  // vector containing indices of maximum indices (starts with 0, ..., SIMD width but changes as we loop through)
510  c7x::uchar_vec maxIndices = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
511  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
512  c7x::uchar_vec maxIndices0 = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
513  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
514  c7x::uchar_vec maxIndices1 = c7x::uchar_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
515  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
516  // re-defined for large widths so that there's no common vectors used between small and large widths
517  c7x::uchar_vec maxIndicesA = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
518  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
519  c7x::uchar_vec maxIndicesB = c7x::uchar_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
520  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
521  c7x::uchar_vec firstHalfIndices = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
522  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
523  c7x::uchar_vec secondHalfIndices = c7x::uchar_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
524  49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
525 
526  typedef typename c7x::make_full_vector<int8_t>::type vec;
527  typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
528 
529  // holds all of the maximum values that have previously been read
530  vec maxVals0; // = int8_t(std::numeric_limits<int8_t>::lowest());
531  vec maxVals1; // = maxVals0;
532  __vpred maskOfMaxs;
533 
534  // holds the overall max vals
535  vec maxVals;
536  size_t width = c7x::element_count_of<vec>::value;
537 
538  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
539  if (length <= width) {
540  maxVals = c7x::strm_eng<0, vec>::get_adv();
541  // fill the uninitialized values with MIN_VAL
542  for (size_t i = length; i < width; i++) {
543  maxVals.s[i] = std::numeric_limits<int8_t>::lowest();
544  }
545  }
546  // can fill one width but only part of a second
547  else if (length < 2 * width) {
548  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
549  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
550  // fill the uninitialized values with MIN_VAL
551  size_t remElements = length % width;
552  for (size_t i = remElements; i < width; i++) {
553  maxVals1.s[i] = std::numeric_limits<int8_t>::lowest();
554  }
555  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
556  // as well
557  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
558  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
559  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
560  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
561  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
562  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
563  index_vec zeroVec = c7x::uchar_vec(0);
564  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
565  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
566 
567  maxIndices = nonTiebreakerVec + tiebreakerVec;
568  }
569  else {
570  // input vectors
571  vec inVec0, inVec1;
572  __vpred mask0, mask1;
573  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
574  // when not being used, which will increase the ii
575  vec maxValsA = int8_t(std::numeric_limits<int8_t>::lowest());
576  vec maxValsB = maxValsA;
577 
578  // holds the overall max vals
579  vec maxValsLarge = int8_t(std::numeric_limits<int8_t>::lowest());
580 
581  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
582 
583  for (size_t i = 0; i < numIterations; i += 1) {
584  inVec0 = c7x::strm_eng<0, vec>::get_adv();
585  mask0 = __cmp_gt_pred(inVec0, maxValsA);
586  maxValsA =
587  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
588  // have been shown to have larger values (contained in mask)
589  maxIndicesA =
590  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
591  // changed from the previous max values
592 
593  inVec1 = c7x::strm_eng<1, vec>::get_adv();
594  mask1 = __cmp_gt_pred(inVec1, maxValsB);
595  maxValsB = __select(mask1, inVec1, maxValsB);
596  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
597 
598  // update the new locations of the indices to be set for the next iteration
599  firstHalfIndices += jumpFactorChar;
600  secondHalfIndices += jumpFactorChar;
601  }
602 
603  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
604 
605  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
606  int8_t *remStart = (int8_t *) pSrc + length - width;
607 
608  if (remBlockSize != 0 && remVecLen == 1) {
609  inVec0 = *(vec *) remStart;
610 
611  firstHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
612  mask0 = __cmp_gt_pred(inVec0, maxValsA);
613  maxValsA =
614  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
615  // have been shown to have larger values (contained in mask)
616  maxIndicesA = __select(mask0, firstHalfIndices,
617  maxIndicesA); // set the max indices to be the indices whose values have changed from
618  // the previous max values
619  }
620 
621  else if (remBlockSize != 0 && remVecLen == 2) {
622  inVec0 = *(vec *) (remStart - width);
623  firstHalfIndices = c7x::uchar_vec(length - 2 * width) + lastRunOffsetsChar;
624  mask0 = __cmp_gt_pred(inVec0, maxValsA);
625  maxValsA =
626  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
627  // have been shown to have larger values (contained in mask)
628  maxIndicesA = __select(mask0, firstHalfIndices,
629  maxIndicesA); // set the max indices to be the indices whose values have changed from
630  // the previous max values
631 
632  inVec1 = *(vec *) remStart;
633  secondHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
634  mask1 = __cmp_gt_pred(inVec1, maxValsB);
635  maxValsB = __select(mask1, inVec1, maxValsB);
636  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
637  }
638  else {
639  /* Nothing to do here */
640  }
641 
642  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
643  // as well
644  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
645  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
646  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
647  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
648  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
649  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
650  index_vec zeroVec = c7x::uchar_vec(0);
651  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
652  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
653 
654  index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
655 
656  maxVals = maxValsLarge;
657  maxIndices = maxIndicesLarge;
658  }
660  output.maxVals = maxVals;
661  output.maxIndices = maxIndices;
662  return output;
663 }
664 
665 // explicit templatization for uint8_t type
666 template <> metadata<uint8_t, uint8_t> DSPLIB_maxIndex_loopLogic<uint8_t, uint8_t>(size_t length, void *pSrc)
667 {
668  // vector containing indices of maximum indices (starts with 0, ..., SIMD width but changes as we loop through)
669  c7x::uchar_vec maxIndices = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
670  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
671  c7x::uchar_vec maxIndices0 = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
672  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
673  c7x::uchar_vec maxIndices1 = c7x::uchar_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
674  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
675  // re-defined for large widths so that there's no common vectors used between small and large widths
676  c7x::uchar_vec maxIndicesA = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
677  21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
678  c7x::uchar_vec maxIndicesB = c7x::uchar_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
679  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
680  c7x::uchar_vec firstHalfIndices = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
681  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
682  c7x::uchar_vec secondHalfIndices = c7x::uchar_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
683  49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
684 
685  typedef typename c7x::make_full_vector<uint8_t>::type vec;
686  typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
687 
688  // holds all of the maximum values that have previously been read
689  vec maxVals0; // = uint8_t(std::numeric_limits<uint8_t>::lowest());
690  vec maxVals1; // = maxVals0;
691  __vpred maskOfMaxs;
692 
693  // holds the overall max vals
694  vec maxVals;
695  size_t width = c7x::element_count_of<vec>::value;
696 
697  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
698  if (length <= width) {
699  maxVals = c7x::strm_eng<0, vec>::get_adv();
700  // fill the uninitialized values with MIN_VAL
701  for (size_t i = length; i < width; i++) {
702  maxVals.s[i] = std::numeric_limits<uint8_t>::lowest();
703  }
704  }
705  // can fill one width but only part of a second
706  else if (length < 2 * width) {
707  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
708  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
709  // fill the uninitialized values with MIN_VAL
710  size_t remElements = length % width;
711  for (size_t i = remElements; i < width; i++) {
712  maxVals1.s[i] = std::numeric_limits<uint8_t>::lowest();
713  }
714  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
715  // as well
716  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
717  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
718  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
719  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
720  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
721  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
722  index_vec zeroVec = c7x::uchar_vec(0);
723  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
724  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
725 
726  maxIndices = nonTiebreakerVec + tiebreakerVec;
727  }
728  else {
729  // input vectors
730  vec inVec0, inVec1;
731  __vpred mask0, mask1;
732  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
733  // when not being used, which will increase the ii
734  vec maxValsA = uint8_t(std::numeric_limits<uint8_t>::lowest());
735  vec maxValsB = maxValsA;
736 
737  // holds the overall max vals
738  vec maxValsLarge = uint8_t(std::numeric_limits<uint8_t>::lowest());
739 
740  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
741 
742  for (size_t i = 0; i < numIterations; i += 1) {
743  inVec0 = c7x::strm_eng<0, vec>::get_adv();
744  mask0 = __cmp_gt_pred(inVec0, maxValsA);
745  maxValsA =
746  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
747  // have been shown to have larger values (contained in mask)
748  maxIndicesA =
749  __select(mask0, firstHalfIndices, maxIndicesA); // set the max indices to be the indices whose values have
750  // changed from the previous max values
751 
752  inVec1 = c7x::strm_eng<1, vec>::get_adv();
753  mask1 = __cmp_gt_pred(inVec1, maxValsB);
754  maxValsB = __select(mask1, inVec1, maxValsB);
755  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
756 
757  // update the new locations of the indices to be set for the next iteration
758  firstHalfIndices += jumpFactorChar;
759  secondHalfIndices += jumpFactorChar;
760  }
761 
762  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
763 
764  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
765  int8_t *remStart = (int8_t *) pSrc + length - width;
766 
767  if (remBlockSize != 0 && remVecLen == 1) {
768  inVec0 = *(vec *) remStart;
769 
770  firstHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
771  mask0 = __cmp_gt_pred(inVec0, maxValsA);
772  maxValsA =
773  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
774  // have been shown to have larger values (contained in mask)
775  maxIndicesA = __select(mask0, firstHalfIndices,
776  maxIndicesA); // set the max indices to be the indices whose values have changed from
777  // the previous max values
778  }
779 
780  else if (remBlockSize != 0 && remVecLen == 2) {
781  inVec0 = *(vec *) (remStart - width);
782  firstHalfIndices = c7x::uchar_vec(length - 2 * width) + lastRunOffsetsChar;
783  mask0 = __cmp_gt_pred(inVec0, maxValsA);
784  maxValsA =
785  __select(mask0, inVec0, maxValsA); // change the values in the max val vector depending on which positions
786  // have been shown to have larger values (contained in mask)
787  maxIndicesA = __select(mask0, firstHalfIndices,
788  maxIndicesA); // set the max indices to be the indices whose values have changed from
789  // the previous max values
790 
791  inVec1 = *(vec *) remStart;
792  secondHalfIndices = c7x::uchar_vec(length - width) + lastRunOffsetsChar;
793  mask1 = __cmp_gt_pred(inVec1, maxValsB);
794  maxValsB = __select(mask1, inVec1, maxValsB);
795  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
796  }
797  else {
798  /* Nothing to do here */
799  }
800  // get the maximum values and their corresponding indices into single vectors while properly handling tiebreakers
801  // as well
802  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
803  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
804  index_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
805  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
806  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
807  index_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
808  index_vec zeroVec = c7x::uchar_vec(0);
809  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
810  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
811 
812  index_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
813 
814  maxVals = maxValsLarge;
815  maxIndices = maxIndicesLarge;
816  }
817 
819  output.maxVals = maxVals;
820  output.maxIndices = maxIndices;
821  return output;
822 }
823 
824 // Explicit specialization of the max-index loop logic for int16_t input with uint16_t indices.
    // Produces a vector of per-lane maximum values and a vector of the element indices they came from,
    // stores both in `output` and returns it.
    // NOTE(review): the signature line (listing line 825) and the declaration of `output` (listing
    // line 972) do not appear in this listing - confirm both against the original source file.
826 {
827  // per-lane element indices that accompany the maximum values; maxIndices is the vector that is returned
828  c7x::ushort_vec maxIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
829  c7x::ushort_vec maxIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
830  c7x::ushort_vec maxIndices1 = c7x::ushort_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
831  // separate copies for the large-length path so that the small- and large-length paths share no vectors
832  c7x::ushort_vec maxIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
833  c7x::ushort_vec maxIndicesB = c7x::ushort_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
     // indices of the elements currently being compared from stream 0 / stream 1 in the large-length path
834  c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
835  c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
836 
837  typedef typename c7x::make_full_vector<int16_t>::type vec;
838  typedef typename c7x::make_full_vector<uint16_t>::type index_vec; // not referenced again in the visible body
839 
840  // the two input vectors read in the "more than one but fewer than two vectors" case below
841  vec maxVals0; // = int16_t(std::numeric_limits<int16_t>::lowest());
842  vec maxVals1; // = maxVals0;
843  __vpred maskOfMaxs;
844 
845  // holds the overall max vals that are written to the output
846  vec maxVals;
847  size_t width = c7x::element_count_of<vec>::value;
848 
849  // case 1: the input fits in one vector - read it once; maxIndices keeps its initial 0..15 values
850  if (length <= width) {
851  maxVals = c7x::strm_eng<0, vec>::get_adv();
852 
853  // overwrite the lanes from index `length` upward with the lowest int16_t value
854  for (size_t i = length; i < width; i++) {
855  maxVals.s[i] = std::numeric_limits<int16_t>::lowest();
856  }
857  }
858  // case 2: the input fills one vector but only part of a second
859  else if (length < 2 * width) {
860  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
861  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
862 
863  // overwrite the lanes of the second vector from index (length % width) upward with the lowest int16_t value
864  size_t remElements = length % width;
865 
866  for (size_t i = remElements; i < width; i++) {
867  maxVals1.s[i] = std::numeric_limits<int16_t>::lowest();
868  }
869  // merge the two vectors lane by lane into a single value vector and a single index vector; lanes whose two
870  // values are equal take the smaller of the two indices
871  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
872  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
873  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
874  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
875  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
876  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
877  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
878  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
879  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
880 
     // the two vectors are built from complementary uses of the tie mask, so the sum combines them into one index
     // per lane (assumes __select(p, a, b) yields a where p is set - TODO confirm against the intrinsic reference)
881  maxIndices = nonTiebreakerVec + tiebreakerVec;
882  }
     // case 3: two or more full vectors
883  else {
884  // input vectors
885  c7x::short_vec inVec0, inVec1;
886  __vpred mask0, mask1;
887  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
888  // when not being used, which will increase the ii
889  c7x::short_vec maxValsA = int16_t(std::numeric_limits<int16_t>::lowest());
890  c7x::short_vec maxValsB = maxValsA;
891 
892  // holds the overall max vals (this initial value is overwritten by the merge after the remainder handling)
893  c7x::short_vec maxValsLarge = int16_t(std::numeric_limits<int16_t>::lowest());
894 
     // each iteration consumes one vector from stream 0 and one from stream 1
     // (INDEX_UNROLL_FACTOR is defined outside this listing - presumably 2, TODO confirm)
895  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
896 
897  for (size_t i = 0; i < numIterations; i += 1) {
898  inVec0 = c7x::strm_eng<0, vec>::get_adv();
899  mask0 = __cmp_gt_pred(inVec0, maxValsA);
900  maxValsA =
901  __select(mask0, inVec0, maxValsA); // take the new value in every lane where it is strictly greater than the
902  // running maximum (those lanes are flagged in mask0)
903  maxIndicesA =
904  __select(mask0, firstHalfIndices, maxIndicesA); // record the index of the new value in exactly the lanes
905  // whose maximum was just replaced
906 
907  inVec1 = c7x::strm_eng<1, vec>::get_adv();
908  mask1 = __cmp_gt_pred(inVec1, maxValsB);
909  maxValsB = __select(mask1, inVec1, maxValsB);
910  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
911 
912  // advance the index vectors for the next iteration (jumpFactorShort is defined outside this listing)
913  firstHalfIndices += jumpFactorShort;
914  secondHalfIndices += jumpFactorShort;
915  }
916 
     // number of elements the loop above did not cover
917  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
918 
919  // remVecLen: number of vectors needed for the leftover elements; remStart: address of the last `width` elements
920 
921  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
922  int16_t *remStart = (int16_t *) pSrc + length - width;
923 
     // leftover fits in one vector: load the last `width` elements straight from memory (not from the stream)
924  if (remBlockSize != 0 && remVecLen == 1) {
925  inVec0 = *(vec *) remStart;
     // NOTE(review): the index base is held in 16-bit lanes, so it would wrap for length > 65535 - confirm the
     // caller bounds; lastRunOffsetsShort is defined outside this listing
926  firstHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
927 
928  mask0 = __cmp_gt_pred(inVec0, maxValsA);
929  maxValsA =
930  __select(mask0, inVec0, maxValsA); // take the new value in every lane where it is strictly greater than the
931  // running maximum (those lanes are flagged in mask0)
932  maxIndicesA = __select(mask0, firstHalfIndices,
933  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
934  // just replaced
935  }
936 
     // leftover needs two vectors: load the last 2 * width elements straight from memory
937  else if (remBlockSize != 0 && remVecLen == 2) {
938  inVec0 = *(vec *) (remStart - width);
939  firstHalfIndices = c7x::ushort_vec(length - 2 * width) + lastRunOffsetsShort;
940  mask0 = __cmp_gt_pred(inVec0, maxValsA);
941  maxValsA =
942  __select(mask0, inVec0, maxValsA); // take the new value in every lane where it is strictly greater than the
943  // running maximum (those lanes are flagged in mask0)
944  maxIndicesA = __select(mask0, firstHalfIndices,
945  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
946  // just replaced
947  inVec1 = *(vec *) remStart;
948  secondHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
949  mask1 = __cmp_gt_pred(inVec1, maxValsB);
950  maxValsB = __select(mask1, inVec1, maxValsB);
951  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
952  }
953  else {
954  /* Nothing to do here */
955  }
     // merge the A and B accumulators lane by lane; lanes whose two values are equal take the smaller index
956  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
957  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
958  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
959  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
960  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
961  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
962  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
963  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
964  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
965 
966  c7x::ushort_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
967 
968  maxVals = maxValsLarge;
969  maxIndices = maxIndicesLarge;
970  }
971 
     // hand both per-lane vectors back to the caller
973  output.maxVals = maxVals;
974  output.maxIndices = maxIndices;
975  return output;
976 }
977 
978 // Explicit specialization of the max-index loop logic for uint16_t input with uint16_t indices.
    // Produces a vector of per-lane maximum values and a vector of the element indices they came from,
    // stores both in `output` and returns it.
    // NOTE(review): the signature line (listing line 979) and the declaration of `output` (listing
    // line 1125) do not appear in this listing - confirm both against the original source file.
980 {
981  // per-lane element indices that accompany the maximum values; maxIndices is the vector that is returned
982  c7x::ushort_vec maxIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
983  c7x::ushort_vec maxIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
984  c7x::ushort_vec maxIndices1 = c7x::ushort_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
985  // separate copies for the large-length path so that the small- and large-length paths share no vectors
986  c7x::ushort_vec maxIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
987  c7x::ushort_vec maxIndicesB = c7x::ushort_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
     // indices of the elements currently being compared from stream 0 / stream 1 in the large-length path
988  c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
989  c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
990 
991  typedef typename c7x::make_full_vector<uint16_t>::type vec;
992  typedef typename c7x::make_full_vector<uint16_t>::type index_vec; // not referenced again in the visible body
993 
994  // the two input vectors read in the "more than one but fewer than two vectors" case below
995  vec maxVals0; // = uint16_t(std::numeric_limits<uint16_t>::lowest());
996  vec maxVals1; // = maxVals0;
997  __vpred maskOfMaxs;
998 
999  // holds the overall max vals that are written to the output
1000  vec maxVals;
1001  size_t width = c7x::element_count_of<vec>::value;
1002 
1003  // case 1: the input fits in one vector - read it once; maxIndices keeps its initial 0..15 values
1004  if (length <= width) {
1005  maxVals = c7x::strm_eng<0, vec>::get_adv();
1006 
1007  // overwrite the lanes from index `length` upward with the lowest uint16_t value
1008  for (size_t i = length; i < width; i++) {
1009  maxVals.s[i] = std::numeric_limits<uint16_t>::lowest();
1010  }
1011  }
1012  // case 2: the input fills one vector but only part of a second
1013  else if (length < 2 * width) {
1014  maxVals0 = c7x::strm_eng<0, vec>::get_adv();
1015  maxVals1 = c7x::strm_eng<1, vec>::get_adv();
1016 
1017  // overwrite the lanes of the second vector from index (length % width) upward with the lowest uint16_t value
1018  size_t remElements = length % width;
1019 
1020  for (size_t i = remElements; i < width; i++) {
1021  maxVals1.s[i] = std::numeric_limits<uint16_t>::lowest();
1022  }
1023  // merge the two vectors lane by lane into a single value vector and a single index vector; lanes whose two
1024  // values are equal take the smaller of the two indices
1025  maskOfMaxs = __cmp_gt_pred(maxVals0, maxVals1);
1026  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndices1, maxIndices0);
1027  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndices0, maxIndices1);
1028  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxVals0, maxVals1);
1029  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1030  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxs, maxIndices0, maxIndices1);
1031  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1032  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1033  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1034 
      // the two vectors are built from complementary uses of the tie mask, so the sum combines them into one index
      // per lane (assumes __select(p, a, b) yields a where p is set - TODO confirm against the intrinsic reference)
1035  maxIndices = nonTiebreakerVec + tiebreakerVec;
1036  }
      // case 3: two or more full vectors
1037  else {
1038  // input vectors
1039  c7x::ushort_vec inVec0, inVec1;
1040  __vpred mask0, mask1;
1041  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1042  // when not being used, which will increase the ii
1043  c7x::ushort_vec maxValsA = uint16_t(std::numeric_limits<uint16_t>::lowest());
1044  c7x::ushort_vec maxValsB = maxValsA;
1045 
1046  // holds the overall max vals (this initial value is overwritten by the merge after the remainder handling)
1047  c7x::ushort_vec maxValsLarge = uint16_t(std::numeric_limits<uint16_t>::lowest());
1048 
      // each iteration consumes one vector from stream 0 and one from stream 1
      // (INDEX_UNROLL_FACTOR is defined outside this listing - presumably 2, TODO confirm)
1049  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1050 
1051  for (size_t i = 0; i < numIterations; i += 1) {
1052  inVec0 = c7x::strm_eng<0, c7x::ushort_vec>::get_adv();
1053  mask0 = __cmp_gt_pred(inVec0, maxValsA);
1054  maxValsA =
1055  __select(mask0, inVec0, maxValsA); // take the new value in every lane where it is strictly greater than the
1056  // running maximum (those lanes are flagged in mask0)
1057  maxIndicesA =
1058  __select(mask0, firstHalfIndices, maxIndicesA); // record the index of the new value in exactly the lanes
1059  // whose maximum was just replaced
1060 
1061  inVec1 = c7x::strm_eng<1, c7x::ushort_vec>::get_adv();
1062  mask1 = __cmp_gt_pred(inVec1, maxValsB);
1063  maxValsB = __select(mask1, inVec1, maxValsB);
1064  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1065 
1066  // advance the index vectors for the next iteration (jumpFactorShort is defined outside this listing)
1067  firstHalfIndices += jumpFactorShort;
1068  secondHalfIndices += jumpFactorShort;
1069  }
1070 
      // number of elements the loop above did not cover
1071  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
1072 
1073  // remVecLen: number of vectors needed for the leftover elements; remStart: address of the last `width` elements
1074 
1075  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1076  uint16_t *remStart = (uint16_t *) pSrc + length - width;
1077 
      // leftover fits in one vector: load the last `width` elements straight from memory (not from the stream)
1078  if (remBlockSize != 0 && remVecLen == 1) {
1079  inVec0 = *(vec *) remStart;
      // NOTE(review): the index base is held in 16-bit lanes, so it would wrap for length > 65535 - confirm the
      // caller bounds; lastRunOffsetsShort is defined outside this listing
1080  firstHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
1081 
1082  mask0 = __cmp_gt_pred(inVec0, maxValsA);
1083  maxValsA =
1084  __select(mask0, inVec0, maxValsA); // take the new value in every lane where it is strictly greater than the
1085  // running maximum (those lanes are flagged in mask0)
1086  maxIndicesA = __select(mask0, firstHalfIndices,
1087  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
1088  // just replaced
1089  }
      // leftover needs two vectors: load the last 2 * width elements straight from memory
1090  else if (remBlockSize != 0 && remVecLen == 2) {
1091  inVec0 = *(vec *) (remStart - width);
1092  firstHalfIndices = c7x::ushort_vec(length - 2 * width) + lastRunOffsetsShort;
1093  mask0 = __cmp_gt_pred(inVec0, maxValsA);
1094  maxValsA =
1095  __select(mask0, inVec0, maxValsA); // take the new value in every lane where it is strictly greater than the
1096  // running maximum (those lanes are flagged in mask0)
1097  maxIndicesA = __select(mask0, firstHalfIndices,
1098  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
1099  // just replaced
1100  inVec1 = *(vec *) remStart;
1101  secondHalfIndices = c7x::ushort_vec(length - width) + lastRunOffsetsShort;
1102  mask1 = __cmp_gt_pred(inVec1, maxValsB);
1103  maxValsB = __select(mask1, inVec1, maxValsB);
1104  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1105  }
1106  else {
1107  /* Nothing to do here */
1108  }
      // merge the A and B accumulators lane by lane; lanes whose two values are equal take the smaller index
1109  __vpred maskOfMaxValues = __cmp_gt_pred(maxValsA, maxValsB);
1110  __vpred maskOfSmallerIndices = __cmp_ge_pred(maxIndicesB, maxIndicesA);
1111  c7x::ushort_vec smallestIndices = __select(maskOfSmallerIndices, maxIndicesA, maxIndicesB);
1112  __vpred maskOfTiebreakerValues = __cmp_eq_pred(maxValsA, maxValsB);
1113  maxValsLarge = __select(maskOfMaxValues, maxValsA, maxValsB);
1114  c7x::ushort_vec maxIndicesIgnoringTiebreaker = __select(maskOfMaxValues, maxIndicesA, maxIndicesB);
1115  c7x::ushort_vec zeroVec = c7x::ushort_vec(0);
1116  c7x::ushort_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, maxIndicesIgnoringTiebreaker);
1117  c7x::ushort_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1118 
1119  c7x::ushort_vec maxIndicesLarge = nonTiebreakerVec + tiebreakerVec;
1120 
1121  maxVals = maxValsLarge;
1122  maxIndices = maxIndicesLarge;
1123  }
1124 
      // hand both per-lane vectors back to the caller
1126  output.maxVals = maxVals;
1127  output.maxIndices = maxIndices;
1128  return output;
1129 }
1130 
1131 // Explicit specialization of the max-index loop logic for float input with uint32_t indices.
     // Produces a vector of per-lane maximum values and a vector of the element indices they came from,
     // stores both in `output` and returns it.
     // length: number of input elements; pSrc: start of the input buffer (used only for the remainder loads,
     // the other reads come from stream engines 0 and 1).
     // NOTE(review): the declaration of `output` (listing line 1266) does not appear in this listing - confirm
     // against the original source file.
1132 template <> metadata<float, uint32_t> DSPLIB_maxIndex_loopLogic<float, uint32_t>(size_t length, void *pSrc)
1133 {
1134  // per-lane element indices that accompany the maximum values; maxIndices is the vector that is returned
1135  c7x::uint_vec maxIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
1136  c7x::uint_vec maxIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
1137  c7x::uint_vec maxIndices1 = c7x::uint_vec(8, 9, 10, 11, 12, 13, 14, 15);
1138  // separate copies for the large-length path so that the small- and large-length paths share no vectors
1139  c7x::uint_vec maxIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
1140  c7x::uint_vec maxIndicesB = c7x::uint_vec(8, 9, 10, 11, 12, 13, 14, 15);
1141  c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7);
1142  c7x::uint_vec secondHalfIndices = c7x::uint_vec(8, 9, 10, 11, 12, 13, 14, 15);
1143  // the two input vectors read in the "more than one but fewer than two vectors" case below
1144  c7x::float_vec maxVals0; // = std::numeric_limits<float>::lowest();
1145  c7x::float_vec maxVals1; // = maxVals0;
1146  __vpred maskOfMaxs;
1147 
1148  size_t width = c7x::element_count_of<c7x::float_vec>::value;
1149 
1150  // holds the overall max vals that are written to the output
1151  c7x::float_vec maxVals;
1152  // case 1: the input fits in one vector - read it once; maxIndices keeps its initial values
1153  if (length <= width) {
1154  maxVals = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1155  // overwrite the lanes from index `length` upward with the lowest finite float value
1156  for (size_t i = length; i < width; i++) {
1157  maxVals.s[i] = std::numeric_limits<float>::lowest();
1158  }
1159  }
1160  // case 2: the input fills one vector but only part of a second
1161  else if (length < 2 * width) {
1162  maxVals0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1163  maxVals1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1164  // overwrite the lanes of the second vector from index (length % width) upward with the lowest finite float value
1165  size_t remElements = length % width;
1166  for (size_t i = remElements; i < width; i++) {
1167  maxVals1.s[i] = std::numeric_limits<float>::lowest();
1168  }
      // NOTE(review): the comparison is strict, so a lane whose two values are equal takes maxVals1 / maxIndices1
      // (the larger index), whereas the integer specializations keep the smaller index on ties - presumably the
      // caller's final reduction resolves this; confirm (assumes __select(p, a, b) yields a where p is set)
1169  maskOfMaxs = __cmp_lt_pred(maxVals1, maxVals0);
1170  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1171  maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
1172  }
      // case 3: two or more full vectors
1173  else {
1174  // input vectors
1175  c7x::float_vec inVec0, inVec1;
1176  __vpred mask0, mask1, maskOfMaxsLarge;
1177  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1178  // when not being used, which will increase the ii
1179  c7x::float_vec maxValsA = std::numeric_limits<float>::lowest();
1180  c7x::float_vec maxValsB = maxValsA;
1181 
1182  // holds the overall max vals (this initial value is overwritten by the merge after the remainder handling)
1183  c7x::float_vec maxValsLarge = std::numeric_limits<float>::lowest();
1184  // printf("length: %d, width: %d\n", length, width);
1185  // size_t numIterations = DSPLIB_ceilingDiv(length, width);
1186  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1187  // printf("\nnumIter: %d\n", numIterations);
1188  for (size_t i = 0; i < numIterations; i += 1) {
1189  inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1190  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1191  maxValsA =
1192  __select(mask0, inVec0, maxValsA); // take the new value in every lane where the running maximum is strictly
1193  // less than it (those lanes are flagged in mask0)
1194  maxIndicesA =
1195  __select(mask0, firstHalfIndices, maxIndicesA); // record the index of the new value in exactly the lanes
1196  // whose maximum was just replaced
1197 
1198  inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1199  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1200  maxValsB = __select(mask1, inVec1, maxValsB);
1201  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1202 
1203  // advance the index vectors for the next iteration (jumpFactor is defined outside this listing)
1204  firstHalfIndices += jumpFactor;
1205  secondHalfIndices += jumpFactor;
1206  }
1207 
      // number of elements the loop above did not cover
1208  int32_t remBlockSize = length - (numIterations * INDEX_UNROLL_FACTOR * width);
1209  // printf("\nrem block size: %d\n", remBlockSize);
1210 
1211  // remVecLen: number of vectors needed for the leftover elements; remStart: address of the last `width` elements
1212 
1213  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1214  float *remStart = (float *) pSrc + length - width;
1215 
1216  // leftover fits in one vector: load the last `width` elements straight from memory (not from the stream)
1217  if (remBlockSize != 0 && remVecLen == 1) {
1218 
1219  // printf("\ninside rem1\n");
1220  inVec0 = *(c7x::float_vec *) remStart;
1221  firstHalfIndices = c7x::uint_vec(length - (width)) + lastRunOffsets;
1222  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1223  maxValsA =
1224  __select(mask0, inVec0, maxValsA); // take the new value in every lane where the running maximum is strictly
1225  // less than it (those lanes are flagged in mask0)
1226  maxIndicesA = __select(mask0, firstHalfIndices,
1227  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
1228  // just replaced
1229  }
1230 
1231  // leftover needs two vectors: load the last 2 * width elements straight from memory
1232  else if (remBlockSize != 0 && remVecLen == 2) {
1233 
1234  // printf("\ninside rem 2\n");
1235  // inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1236  inVec0 = *(c7x::float_vec *) (remStart - width);
1237  firstHalfIndices = c7x::uint_vec(length - (2 * width)) + lastRunOffsets;
1238  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1239  maxValsA =
1240  __select(mask0, inVec0, maxValsA); // take the new value in every lane where the running maximum is strictly
1241  // less than it (those lanes are flagged in mask0)
1242  maxIndicesA = __select(mask0, firstHalfIndices,
1243  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
1244  // just replaced
1245 
1246  // inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1247  inVec1 = *(c7x::float_vec *) remStart;
1248  secondHalfIndices = c7x::uint_vec(length - (width)) + lastRunOffsets;
1249  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1250  maxValsB = __select(mask1, inVec1, maxValsB);
1251  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1252  }
1253  else {
1254  /* Nothing to do here */
1255  }
      // merge the A and B accumulators lane by lane
      // NOTE(review): strict comparison - a lane whose A and B values are equal takes the B index; there is no
      // explicit smaller-index tiebreak here, unlike the integer specializations - confirm this is intended
1256  maskOfMaxsLarge = __cmp_lt_pred(maxValsB, maxValsA);
1257  maxValsLarge = __select(maskOfMaxsLarge, maxValsA, maxValsB);
1258  c7x::uint_vec maxIndicesLarge = __select(maskOfMaxsLarge, maxIndicesA, maxIndicesB);
1259 
1260  // metadata<float, uint32_t> outputLarge;
1261  // outputLarge.maxVals = maxValsLarge;
1262  // outputLarge.maxIndices = maxIndicesLarge;
1263  maxVals = maxValsLarge;
1264  maxIndices = maxIndicesLarge;
1265  }
      // hand both per-lane vectors back to the caller
1267  output.maxVals = maxVals;
1268  output.maxIndices = maxIndices;
1269  return output;
1270 }
1271 
1272 // Explicit specialization of the max-index loop logic for double input with uint64_t indices.
     // Produces a vector of per-lane maximum values and a vector of the element indices they came from,
     // stores both in `output` and returns it.
     // length: number of input elements; pSrc: start of the input buffer (used only for the remainder loads,
     // the other reads come from stream engines 0 and 1).
     // NOTE(review): the declaration of `output` (listing line 1406) does not appear in this listing - confirm
     // against the original source file.
1273 template <> metadata<double, uint64_t> DSPLIB_maxIndex_loopLogic<double, uint64_t>(size_t length, void *pSrc)
1274 {
1275  // per-lane element indices that accompany the maximum values; maxIndices is the vector that is returned
1276  c7x::ulong_vec maxIndices = c7x::ulong_vec(0, 1, 2, 3);
1277  c7x::ulong_vec maxIndices0 = c7x::ulong_vec(0, 1, 2, 3);
1278  c7x::ulong_vec maxIndices1 = c7x::ulong_vec(4, 5, 6, 7);
1279  // separate copies for the large-length path so that the small- and large-length paths share no vectors
1280  c7x::ulong_vec maxIndicesA = c7x::ulong_vec(0, 1, 2, 3);
1281  c7x::ulong_vec maxIndicesB = c7x::ulong_vec(4, 5, 6, 7);
1282  c7x::ulong_vec firstHalfIndices = c7x::ulong_vec(0, 1, 2, 3);
1283  c7x::ulong_vec secondHalfIndices = c7x::ulong_vec(4, 5, 6, 7);
1284 
      // the two input vectors read in the "more than one but fewer than two vectors" case below
1285  c7x::double_vec maxVals0; // = std::numeric_limits<double>::lowest();
1286  c7x::double_vec maxVals1; // = maxVals0;
1287  __vpred maskOfMaxs;
1288 
1289  size_t width = c7x::element_count_of<c7x::double_vec>::value;
1290 
1291  // holds the overall max vals that are written to the output
1292  c7x::double_vec maxVals;
1293  // case 1: the input fits in one vector - read it once; maxIndices keeps its initial values
1294  if (length <= width) {
1295  maxVals = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1296  // overwrite the lanes from index `length` upward with the lowest finite double value
1297  for (size_t i = length; i < width; i++) {
1298  maxVals.s[i] = std::numeric_limits<double>::lowest();
1299  }
1300  }
1301  // case 2: the input fills one vector but only part of a second
1302  else if (length < 2 * width) {
1303  maxVals0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1304  maxVals1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1305  // overwrite the lanes of the second vector from index (length % width) upward with the lowest finite double value
1306  size_t remainingElement = length % width;
1307  for (size_t i = remainingElement; i < width; i++) {
1308  maxVals1.s[i] = std::numeric_limits<double>::lowest();
1309  }
      // NOTE(review): the comparison is strict, so a lane whose two values are equal takes maxVals1 / maxIndices1
      // (the larger index), whereas the integer specializations keep the smaller index on ties - presumably the
      // caller's final reduction resolves this; confirm (assumes __select(p, a, b) yields a where p is set)
1310  maskOfMaxs = __cmp_lt_pred(maxVals1, maxVals0);
1311  maxVals = __select(maskOfMaxs, maxVals0, maxVals1);
1312  maxIndices = __select(maskOfMaxs, maxIndices0, maxIndices1);
1313  }
      // case 3: two or more full vectors
1314  else {
1315  // input vectors
1316  c7x::double_vec inVec0, inVec1;
1317  __vpred mask0, mask1, maskOfMaxsLarge;
1318  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1319  // when not being used, which will increase the ii
1320  c7x::double_vec maxValsA = std::numeric_limits<double>::lowest();
1321  c7x::double_vec maxValsB = maxValsA;
1322 
1323  // holds the overall max vals (this initial value is overwritten by the merge after the remainder handling)
1324  c7x::double_vec maxValsLarge = std::numeric_limits<double>::lowest();
1325  // printf("length: %d, width: %d\n", length, width);
1326  // size_t numIterations = DSPLIB_ceilingDiv(length, width);
1327  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1328  // printf("\nnumIter: %d\n", numIterations);
1329  for (size_t i = 0; i < numIterations; i += 1) {
1330  inVec0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1331  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1332  maxValsA =
1333  __select(mask0, inVec0, maxValsA); // take the new value in every lane where the running maximum is strictly
1334  // less than it (those lanes are flagged in mask0)
1335  maxIndicesA =
1336  __select(mask0, firstHalfIndices, maxIndicesA); // record the index of the new value in exactly the lanes
1337  // whose maximum was just replaced
1338 
1339  inVec1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1340  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1341  maxValsB = __select(mask1, inVec1, maxValsB);
1342  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1343 
1344  // advance the index vectors for the next iteration (jumpFactorDp is defined outside this listing)
1345  firstHalfIndices += jumpFactorDp;
1346  secondHalfIndices += jumpFactorDp;
1347  }
1348 
      // number of elements the loop above did not cover
1349  int32_t remBlockSize = length - (numIterations * INDEX_UNROLL_FACTOR * width);
1350  // printf("\nrem block size: %d\n", remBlockSize);
1351 
1352  // remVecLen: number of vectors needed for the leftover elements; remStart: address of the last `width` elements
1353 
1354  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1355  double *remStart = (double *) pSrc + length - width;
1356 
1357  // leftover fits in one vector: load the last `width` elements straight from memory (not from the stream)
1358  if (remBlockSize != 0 && remVecLen == 1) {
1359 
1360  // printf("\ninside rem1\n");
1361  inVec0 = *(c7x::double_vec *) remStart;
1362  firstHalfIndices = c7x::ulong_vec(length - (width)) + lastRunOffsetsDp;
1363  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1364  maxValsA =
1365  __select(mask0, inVec0, maxValsA); // take the new value in every lane where the running maximum is strictly
1366  // less than it (those lanes are flagged in mask0)
1367  maxIndicesA = __select(mask0, firstHalfIndices,
1368  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
1369  // just replaced
1370  }
1371 
1372  // leftover needs two vectors: load the last 2 * width elements straight from memory
1373  else if (remBlockSize != 0 && remVecLen == 2) {
1374 
1375  // printf("\ninside rem 2\n");
1376  // inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1377  inVec0 = *(c7x::double_vec *) (remStart - width);
1378  firstHalfIndices = c7x::ulong_vec(length - (2 * width)) + lastRunOffsetsDp;
1379  mask0 = __cmp_lt_pred(maxValsA, inVec0);
1380  maxValsA =
1381  __select(mask0, inVec0, maxValsA); // take the new value in every lane where the running maximum is strictly
1382  // less than it (those lanes are flagged in mask0)
1383  maxIndicesA = __select(mask0, firstHalfIndices,
1384  maxIndicesA); // record the index of the new value in exactly the lanes whose maximum was
1385  // just replaced
1386 
1387  // inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1388  inVec1 = *(c7x::double_vec *) remStart;
1389  secondHalfIndices = c7x::ulong_vec(length - (width)) + lastRunOffsetsDp;
1390  mask1 = __cmp_lt_pred(maxValsB, inVec1);
1391  maxValsB = __select(mask1, inVec1, maxValsB);
1392  maxIndicesB = __select(mask1, secondHalfIndices, maxIndicesB);
1393  }
1394  else {
1395  /* Do nothing */
1396  }
1397 
      // merge the A and B accumulators lane by lane
      // NOTE(review): strict comparison - a lane whose A and B values are equal takes the B index; there is no
      // explicit smaller-index tiebreak here, unlike the integer specializations - confirm this is intended
1398  maskOfMaxsLarge = __cmp_lt_pred(maxValsB, maxValsA);
1399  maxValsLarge = __select(maskOfMaxsLarge, maxValsA, maxValsB);
1400  c7x::ulong_vec maxIndicesLarge = __select(maskOfMaxsLarge, maxIndicesA, maxIndicesB);
1401 
1402  maxVals = maxValsLarge;
1403  maxIndices = maxIndicesLarge;
1404  }
1405 
      // hand both per-lane vectors back to the caller
1407  output.maxVals = maxVals;
1408  output.maxIndices = maxIndices;
1409  return output;
1410 }
#define SE_SE0_PARAM_OFFSET
#define SE_SE1_PARAM_OFFSET
metadata< uint16_t, uint16_t > DSPLIB_maxIndex_loopLogic< uint16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< float, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< int16_t, uint16_t > DSPLIB_maxIndex_loopLogic< int16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
metadata< double, uint64_t > DSPLIB_maxIndex_loopLogic< double, uint64_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
DSPLIB_STATUS DSPLIB_maxIndex_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< double, uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
const c7x::uint_vec jumpFactor
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< float, uint32_t > DSPLIB_maxIndex_loopLogic< float, uint32_t >(size_t length, void *pSrc)
const c7x::ushort_vec jumpFactorShort
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
const c7x::uchar_vec jumpFactorChar
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< int8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
const c7x::uint_vec lastRunOffsets
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_maxIndex_exec_ci< uint8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< uint8_t, uint8_t > DSPLIB_maxIndex_loopLogic< uint8_t, uint8_t >(size_t length, void *pSrc)
const c7x::uchar_vec lastRunOffsetsChar
#define INDEX_UNROLL_FACTOR
const c7x::ushort_vec lastRunOffsetsShort
metadata< T, TIndex > DSPLIB_maxIndex_loopLogic(size_t length, void *pSrc)
This function is the kernel loop helper function for the optimized implementation of the kernel....
const c7x::ulong_vec jumpFactorDp
metadata< int8_t, uint8_t > DSPLIB_maxIndex_loopLogic< int8_t, uint8_t >(size_t length, void *pSrc)
DSPLIB_STATUS DSPLIB_maxIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
const c7x::ulong_vec lastRunOffsetsDp
template DSPLIB_STATUS DSPLIB_maxIndex_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_maxIndex_InitArgs *pKerInitArgs)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_maxIndex.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 1 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_MAXINDEX_IXX_IXX_OXX_PBLOCK_SIZE]
int32_t blockSize
Size of input buffer for different batches (see DSPLIB_maxIndex_init) that will be retrieved and used by DS...
index_vec maxIndices