DSPLIB User Guide
DSPLIB_minIndex_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date 10/2/22 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "../common/c71/DSPLIB_inlines.h"
48 #include "DSPLIB_minIndex_priv.h"
49 #include "DSPLIB_types.h"
50 #include "c7x_scalable.h"
51 #include <algorithm> // std::min
52 #include <cmath> // floor
53 #include <float.h>
54 #include <limits>
55 #include <numeric> // std::iota
56 #include <vector> // std::vector
57 
// Number of input vectors consumed per main-loop iteration of the loop-logic routines
// (one vector from streaming engine 0 and one from streaming engine 1).
58 #define INDEX_UNROLL_FACTOR 2
59 
// Index bookkeeping for the 16-lane (c7x::uint_vec) index vectors:
//  - jumpFactor is added to both running index vectors after every main-loop iteration,
//    i.e. it advances the indices by the two vectors (2 x 16 elements) just consumed.
//  - lastRunOffsets holds the lane numbers 0..15 and is added to a splatted base index
//    when the tail vectors are loaded directly from memory.
60 const c7x::uint_vec jumpFactor = c7x::uint_vec(32);
61 const c7x::uint_vec lastRunOffsets = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
// 32-lane (c7x::ushort_vec) counterparts. They are not referenced in the part of the file
// visible here; presumably used by 16-bit specializations further down -- TODO confirm.
62 const c7x::ushort_vec jumpFactorShort = c7x::ushort_vec(64);
63 
64 // clang-format off
65 const c7x::ushort_vec lastRunOffsetsShort = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
66  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
67 // clang-format on
68 
// 8-lane (c7x::ulong_vec) counterparts. Not referenced in the part of the file visible
// here; presumably used by 64-bit specializations further down -- TODO confirm.
69 const c7x::ulong_vec jumpFactorDp = c7x::ulong_vec(16);
70 const c7x::ulong_vec lastRunOffsetsDp = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
// 64-lane (c7x::uchar_vec) counterparts, used by the int8_t / uint8_t specializations:
// 128 = two 64-lane vectors per iteration, offsets are the lane numbers 0..63.
71 const c7x::uchar_vec jumpFactorChar = c7x::uchar_vec(128);
72 
73 // clang-format off
74 const c7x::uchar_vec lastRunOffsetsChar = c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
75  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
76  34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
77  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
78 // clang-format on
79 
// Init routine (templated on the element type): builds the streaming-engine templates for
// SE0 and SE1 and stores them in the kernel's private parameter block
// (pKerPrivArgs->bufPblock) at SE_SE0_PARAM_OFFSET and SE_SE1_PARAM_OFFSET.
//
// The stream shape depends on the block size (taken from pKerPrivArgs->blockSize) relative
// to the number of elements in one full vector of dataType:
//   - length <= width     : SE0 is a 1D stream of `length` elements; SE1 keeps its defaults.
//   - length <  2 * width : SE0 is one full vector, SE1 is a 1D stream of the remainder.
//   - otherwise           : SE0 and SE1 get the same 2D template, one full vector per fetch,
//                           stepping 2 * width elements between fetches.
//
// NOTE(review): the listing line that carries the return type, function name and first
// parameter (`handle` is used below) and the line that declares `status` (returned at the
// end) are not present in this listing.
80 template <typename dataType>
82  const DSPLIB_bufParams1D_t *bufParamsIn,
83  const DSPLIB_bufParams1D_t *bufParamsOut,
84  const DSPLIB_minIndex_InitArgs *pKerInitArgs)
85 {
87  __SE_TEMPLATE_v1 se0Params, se1Params;
88 
89  __SE_ELETYPE SE_ELETYPE;
90  __SE_VECLEN SE_VECLEN;
91 
// the opaque handle is the kernel's private argument structure
92  DSPLIB_minIndex_PrivArgs *pKerPrivArgs = (DSPLIB_minIndex_PrivArgs *) handle;
93 
94  uint8_t *pBlock = pKerPrivArgs->bufPblock;
95  uint32_t blocksSize = pKerPrivArgs->blockSize;
96 
// derive the full-width vector type for dataType and the matching SE element type / vector length
97  typedef typename c7x::make_full_vector<dataType>::type vec;
98  uint32_t eleCount = c7x::element_count_of<vec>::value;
99  SE_VECLEN = c7x::se_veclen<vec>::value;
100  SE_ELETYPE = c7x::se_eletype<vec>::value;
101  uint32_t length = blocksSize;
102  uint32_t width = eleCount;
103 
104 #if DSPLIB_DEBUGPRINT
105  printf("Enter eleCount %d\n", eleCount);
106 #endif
107 
108  /**********************************************************************/
109  /* Prepare streaming engine 0,1 to fetch the input */
110  /**********************************************************************/
111  se0Params = __gen_SE_TEMPLATE_v1();
112 
113  // default SE0 parameters
114  se0Params.ICNT0 = width;
115  se0Params.ELETYPE = SE_ELETYPE;
116  se0Params.VECLEN = SE_VECLEN;
117  se0Params.DIMFMT = __SE_DIMFMT_1D;
118 
119  se1Params = __gen_SE_TEMPLATE_v1();
120 
121  // default SE1 parameters
122  se1Params.ICNT0 = width;
123  se1Params.ELETYPE = SE_ELETYPE;
124  se1Params.VECLEN = SE_VECLEN;
125  se1Params.DIMFMT = __SE_DIMFMT_1D;
126 
127  // variables to calculate and store compute loop's iteration counter
// (numBlocks is the vector count rounded up; it is not read again in the lines visible here)
128  uint32_t numBlocks = length / width;
129  uint32_t remBlocksSize = length % width;
130  if (remBlocksSize) {
131  numBlocks++;
132  }
133  else {
134  /* Nothing to do here */
135  }
136  // case: length of input <= width
137  // one SE fetch is length elements, rest of vec filled with '0'
138  if (length <= width) {
139 
140  // SE0 fetch length
141  se0Params.ICNT0 = length;
142  // SE1 not used
143  }
144 
145  // case: length of input is > width but < 2*width
146  // SE0 fetch is one full width, SE1 fetch is partial fetch, rest of vec filled with '0'
147  else if (length < 2 * width) {
148  // printf("\ninit between 1 and 2\n");
149 
150  // SE0 full fetch
151  se0Params.ICNT0 = width;
152  // SE1 partial fetch
153  se1Params.ICNT0 = remBlocksSize;
154  }
155 
156  // case: len >= 2*width
157  // SE0 and SE1 fetches are full widths only
158  else {
159 
160  // SE0 Dim is 2D
161  se0Params.DIMFMT = __SE_DIMFMT_2D;
162  // SE0 jump length each get_adv is 2 widths
163  se0Params.DIM1 = 2 * width;
164  // SE only performs full fetches in multiples of INDEX_UNROLL_FACTOR, i.e. 2
// (any tail shorter than 2 * width is therefore not covered by the streams)
165  se0Params.ICNT1 = length / (INDEX_UNROLL_FACTOR * width);
166  // SE0 fetches full widths
167  se0Params.ICNT0 = width;
168 
169  // SE1 fetches in same manner as SE0, but starts 1 width ahead
// (the template itself is identical; the start address is not set in this function)
170  se1Params = se0Params;
171  }
172 
// publish both templates into the parameter block
173  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET) = se0Params;
174  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET) = se1Params;
175 
176  return status;
177 }
178 
// Eight declarations that repeat the parameter list of the init template above and end in
// `);` -- presumably its explicit instantiations, one per supported element type.
// NOTE(review): the first line of each declaration (which would name the element type and
// the first parameter) is not present in this listing, so the types cannot be told from here.
180  const DSPLIB_bufParams1D_t *bufParamsIn,
181  const DSPLIB_bufParams1D_t *bufParamsOut,
182  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
183 
185  const DSPLIB_bufParams1D_t *bufParamsIn,
186  const DSPLIB_bufParams1D_t *bufParamsOut,
187  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
188 
190  const DSPLIB_bufParams1D_t *bufParamsIn,
191  const DSPLIB_bufParams1D_t *bufParamsOut,
192  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
193 
195  const DSPLIB_bufParams1D_t *bufParamsIn,
196  const DSPLIB_bufParams1D_t *bufParamsOut,
197  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
198 
200  const DSPLIB_bufParams1D_t *bufParamsIn,
201  const DSPLIB_bufParams1D_t *bufParamsOut,
202  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
203 
205  const DSPLIB_bufParams1D_t *bufParamsIn,
206  const DSPLIB_bufParams1D_t *bufParamsOut,
207  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
208 
210  const DSPLIB_bufParams1D_t *bufParamsIn,
211  const DSPLIB_bufParams1D_t *bufParamsOut,
212  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
213 
215  const DSPLIB_bufParams1D_t *bufParamsIn,
216  const DSPLIB_bufParams1D_t *bufParamsOut,
217  const DSPLIB_minIndex_InitArgs *pKerInitArgs);
218 
219 template <typename T, typename TIndex> metadata<T, TIndex> DSPLIB_minIndex_loopLogic(size_t length, void *pSrc)
220 {
221  c7x::uint_vec minIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
222  c7x::uint_vec minIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
223  c7x::uint_vec minIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
224  // re-defined for large widths so that there's no common vectors used between small and large widths
225  c7x::uint_vec minIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
226  c7x::uint_vec minIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
227  c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
228  c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
229  // derive c7x vector type from value template and index template
230  typedef typename c7x::make_full_vector<T>::type vec;
231  typedef typename c7x::make_full_vector<TIndex>::type index_vec;
232 
233  // holds all of the minimum values that have previously been read
234  vec minVals0; // = T(std::numeric_limits<T>::max());
235  vec minVals1; // = minVals0;
236  __vpred maskOfmins;
237 
238  // holds the overall min vals
239  vec minVals;
240  size_t width = c7x::element_count_of<vec>::value;
241  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
242  if (length <= width) {
243  minVals = c7x::strm_eng<0, vec>::get_adv();
244  // fill the uninitialized values with MIN_VAL
245  for (size_t i = length; i < width; i++) {
246  minVals.s[i] = std::numeric_limits<T>::max();
247  }
248  }
249  // can fill one width but only part of a second
250  else if (length < 2 * width) {
251  minVals0 = c7x::strm_eng<0, vec>::get_adv();
252  minVals1 = c7x::strm_eng<1, vec>::get_adv();
253  // fill the uninitialized values with MIN_VAL
254  size_t remElements = length % width;
255  for (size_t i = remElements; i < width; i++) {
256  minVals1.s[i] = std::numeric_limits<T>::max();
257  }
258  maskOfmins = __cmp_gt_pred(minVals0, minVals1);
259  minVals = __select(maskOfmins, minVals1, minVals0);
260  minIndices = __select(maskOfmins, minIndices1, minIndices0);
261 
262  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
263  // as well
264  maskOfmins = __cmp_ge_pred(minVals0, minVals1);
265  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
266  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
267  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
268  minVals = __select(maskOfmins, minVals1, minVals0);
269  index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
270  index_vec zeroVec = c7x::uint_vec(0);
271  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
272  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
273 
274  minIndices = nonTiebreakerVec + tiebreakerVec;
275  }
276  else {
277  // input vectors
278  vec inVec0, inVec1;
279  __vpred mask0, mask1;
280  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
281  // when not being used, which will increase the ii
282  vec minValsA = T(std::numeric_limits<T>::max());
283  vec minValsB = minValsA;
284 
285  // holds the overall min vals
286  vec minValsLarge; // = T(std::numeric_limits<T>::max());
287 
288  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
289 
290  for (size_t i = 0; i < numIterations; i += 1) {
291  inVec0 = c7x::strm_eng<0, vec>::get_adv();
292  mask0 = __cmp_ge_pred(inVec0, minValsA);
293  minValsA =
294  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
295  // have been shown to have larger values (contained in mask)
296  minIndicesA =
297  __select(mask0, minIndicesA, firstHalfIndices); // set the min indices to be the indices whose values have
298  // changed from the previous min values
299 
300  inVec1 = c7x::strm_eng<1, vec>::get_adv();
301  mask1 = __cmp_ge_pred(inVec1, minValsB);
302  minValsB = __select(mask1, minValsB, inVec1);
303  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
304 
305  // update the new locations of the indices to be set for the next iteration
306  firstHalfIndices += jumpFactor;
307  secondHalfIndices += jumpFactor;
308  }
309 
310  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
311 
312  // if no remainder block, go to end
313 
314  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
315  T *remStart = (T *) pSrc + length - width;
316 
317  if (remBlockSize != 0 && remVecLen == 1) {
318  inVec0 = *(vec *) remStart;
319  firstHalfIndices = c7x::uint_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
320  mask0 = __cmp_ge_pred(inVec0, minValsA);
321  minValsA =
322  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
323  // have been shown to have larger values (contained in mask)
324  minIndicesA = __select(mask0, minIndicesA,
325  firstHalfIndices); // set the min indices to be the indices whose values have changed
326  // from the previous min values
327  }
328 
329  else if (remBlockSize != 0 && remVecLen == 2) {
330  inVec0 = *(vec *) (remStart - width);
331  firstHalfIndices = c7x::uint_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsets;
332  mask0 = __cmp_ge_pred(inVec0, minValsA);
333  minValsA =
334  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
335  // have been shown to have larger values (contained in mask)
336  minIndicesA = __select(mask0, minIndicesA,
337  firstHalfIndices); // set the min indices to be the indices whose values have changed
338  // from the previous min values
339 
340  inVec1 = *(vec *) remStart;
341  secondHalfIndices = firstHalfIndices + jumpFactor;
342  mask1 = __cmp_ge_pred(inVec1, minValsB);
343  minValsB = __select(mask1, minValsB, inVec1);
344  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
345  }
346  else {
347  /* Nothing to do here */
348  }
349  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
350  // as well
351  __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
352  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
353  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
354  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
355  minValsLarge = __select(maskOfminValues, minValsB, minValsA);
356  index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
357  index_vec zeroVec = c7x::uint_vec(0);
358  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
359  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
360 
361  index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
362 
363  minVals = minValsLarge;
364  minIndices = minIndicesLarge;
365  }
366  metadata<T, TIndex> output;
367  output.minVals = minVals;
368  output.minIndices = minIndices;
369  return output;
370 }
371 
372 // explicit templatization for int32, uint32_t
373 template <int32_t, uint32_t> metadata<int32_t, uint32_t> DSPLIB_minIndex_loopLogic(size_t length, void *pSrc);
374 
375 template <uint32_t, uint32_t> metadata<uint32_t, uint32_t> DSPLIB_minIndex_loopLogic(size_t length, void *pSrc);
376 
// Per-lane minimum search for int8_t data, with indices carried in uint8_t lanes.
// Same three-case structure as the primary template (length <= width, length < 2 * width,
// length >= 2 * width), but with 64-lane vectors and the *Char index constants.
// Returns the per-lane minima and their indices; no cross-lane reduction happens here.
// NOTE(review): an 8-bit index lane cannot represent element positions above 255 --
// confirm how callers bound `length` or post-process the indices.
377 // explicit specialization for int8_t data (uint8_t indices)
378 template <> metadata<int8_t, uint8_t> DSPLIB_minIndex_loopLogic<int8_t, uint8_t>(size_t length, void *pSrc)
379 {
380 
381  /* printf("\ninside int 8 loop logic with length: %d\n", length); */
// lane indices 0..63 (first vector) and 64..127 (second vector)
382  c7x::uchar_vec minIndices =
383  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
384  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
385  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
386  c7x::uchar_vec minIndices0 =
387  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
388  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
389  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
390  c7x::uchar_vec minIndices1 =
391  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
392  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
393  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
394  // re-defined for large widths so that there's no common vectors used between small and large widths
395  c7x::uchar_vec minIndicesA =
396  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
397  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
398  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
399  c7x::uchar_vec minIndicesB =
400  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
401  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
402  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
403  c7x::uchar_vec firstHalfIndices =
404  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
405  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
406  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
407  c7x::uchar_vec secondHalfIndices =
408  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
409  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
410  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
411 
412  typedef typename c7x::make_full_vector<int8_t>::type vec;
413  typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
414 
415  // the two input vectors of the "one and a bit" case
416  vec minVals0; // = int8_t(std::numeric_limits<int8_t>::max());
417  vec minVals1; // = minVals0;
418  __vpred maskOfmins;
419 
420  // holds the overall min vals
421  vec minVals;
422  size_t width = c7x::element_count_of<vec>::value;
423 
424  // can only fill part of one width - pad the lanes beyond length so they can never be the minimum
425  if (length <= width) {
426  minVals = c7x::strm_eng<0, vec>::get_adv();
427 
428  // overwrite the unused lanes with the largest int8_t value
429  for (size_t i = length; i < width; i++) {
430  minVals.s[i] = std::numeric_limits<int8_t>::max();
431  }
432  }
433  // can fill one width but only part of a second
434  else if (length < 2 * width) {
435  minVals0 = c7x::strm_eng<0, vec>::get_adv();
436  minVals1 = c7x::strm_eng<1, vec>::get_adv();
437  // overwrite the unused lanes of the partial vector with the largest int8_t value
438  size_t remElements = length % width;
439 
440  for (size_t i = remElements; i < width; i++) {
441  minVals1.s[i] = std::numeric_limits<int8_t>::max();
442  }
443  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
444  // as well (lanes with equal values take the smaller of the two indices)
445  maskOfmins = __cmp_ge_pred(minVals0, minVals1);
446  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
447  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
448  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
449  minVals = __select(maskOfmins, minVals1, minVals0);
450  index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
451  index_vec zeroVec = c7x::uchar_vec(0);
452  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
453  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
454 
// exactly one of the two vectors is non-zero in every lane
455  minIndices = nonTiebreakerVec + tiebreakerVec;
456  }
457  else {
458  // input vectors
459  vec inVec0, inVec1;
460  __vpred mask0, mask1;
461  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
462  // when not being used, which will increase the ii
463  vec minValsA = int8_t(std::numeric_limits<int8_t>::max());
464  vec minValsB = minValsA;
465 
466  // holds the overall min vals
467  vec minValsLarge; // = int8_t(std::numeric_limits<int8_t>::max());
468 
469  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
470 
471  for (size_t i = 0; i < numIterations; i += 1) {
472  inVec0 = c7x::strm_eng<0, vec>::get_adv();
473  mask0 = __cmp_ge_pred(inVec0, minValsA);
474  minValsA =
475  __select(mask0, minValsA, inVec0); // keep the running minimum where the input is >= it,
476  // otherwise take the new, smaller input value
477  minIndicesA =
478  __select(mask0, minIndicesA, firstHalfIndices); // record the current index in the lanes whose
479  // minimum was just replaced
480 
481  inVec1 = c7x::strm_eng<1, vec>::get_adv();
482  mask1 = __cmp_ge_pred(inVec1, minValsB);
483  minValsB = __select(mask1, minValsB, inVec1);
484  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
485 
486  // update the new locations of the indices to be set for the next iteration
487  firstHalfIndices += jumpFactorChar;
488  secondHalfIndices += jumpFactorChar;
489  }
490 
// elements not covered by the streamed loop
491  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
492 
493  // if no remainder block, go to end
494 
// number of vectors needed for the tail; the tail is read as full vectors ending at the
// last element, so it may re-read elements the loop has already seen
495  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
496  int8_t *remStart = (int8_t *) pSrc + length - width;
497 
498  if (remBlockSize != 0 && remVecLen == 1) {
499 
500  inVec0 = *(vec *) remStart;
501  firstHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsChar;
502  mask0 = __cmp_ge_pred(inVec0, minValsA);
503  minValsA =
504  __select(mask0, minValsA, inVec0); // keep the running minimum where the input is >= it,
505  // otherwise take the new, smaller input value
506  minIndicesA = __select(mask0, minIndicesA,
507  firstHalfIndices); // record the current index in the lanes whose minimum was just
508  // replaced
509  }
510 
511  else if (remBlockSize != 0 && remVecLen == 2) {
512  inVec0 = *(vec *) (remStart - width);
513  firstHalfIndices = c7x::uchar_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsetsChar;
514  mask0 = __cmp_ge_pred(inVec0, minValsA);
515  minValsA =
516  __select(mask0, minValsA, inVec0); // keep the running minimum where the input is >= it,
517  // otherwise take the new, smaller input value
518  minIndicesA = __select(mask0, minIndicesA,
519  firstHalfIndices); // record the current index in the lanes whose minimum was just
520  // replaced
521 
522  inVec1 = *(vec *) remStart;
523  secondHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsChar;
524  mask1 = __cmp_ge_pred(inVec1, minValsB);
525  minValsB = __select(mask1, minValsB, inVec1);
526  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
527  }
528  else {
529  /* Nothing to do here */
530  }
531  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
532  // as well (lanes with equal values take the smaller of the two indices)
533  __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
534  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
535  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
536  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
537  minValsLarge = __select(maskOfminValues, minValsB, minValsA);
538  index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
539  index_vec zeroVec = c7x::uchar_vec(0);
540  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
541  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
542 
543  index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
544 
545  minVals = minValsLarge;
546  minIndices = minIndicesLarge;
547  }
548 
// NOTE(review): the declaration of `output` is not present in this listing.
550  output.minVals = minVals;
551  output.minIndices = minIndices;
552  return output;
553 }
554 
555 // explicit templatization for uint8_t type
556 template <> metadata<uint8_t, uint8_t> DSPLIB_minIndex_loopLogic<uint8_t, uint8_t>(size_t length, void *pSrc)
557 {
558  c7x::uchar_vec minIndices =
559  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
560  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
561  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
562  c7x::uchar_vec minIndices0 =
563  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
564  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
565  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
566  c7x::uchar_vec minIndices1 =
567  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
568  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
569  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
570  // re-defined for large widths so that there's no common vectors used between small and large widths
571  c7x::uchar_vec minIndicesA =
572  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
573  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
574  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
575  c7x::uchar_vec minIndicesB =
576  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
577  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
578  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
579  c7x::uchar_vec firstHalfIndices =
580  c7x::uchar_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
581  27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
582  51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
583  c7x::uchar_vec secondHalfIndices =
584  c7x::uchar_vec(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
585  88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
586  110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
587 
588  typedef typename c7x::make_full_vector<uint8_t>::type vec;
589  typedef typename c7x::make_full_vector<uint8_t>::type index_vec;
590 
591  // holds all of the minimum values that have previously been read
592  vec minVals0; // = uint8_t(std::numeric_limits<uint8_t>::max());
593  vec minVals1; // = minVals0;
594  __vpred maskOfmins;
595 
596  // holds the overall min vals
597  vec minVals;
598  size_t width = c7x::element_count_of<vec>::value;
599 
600  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
601  if (length <= width) {
602  minVals = c7x::strm_eng<0, vec>::get_adv();
603 
604  // fill the uninitialized values with MIN_VAL
605  for (size_t i = length; i < width; i++) {
606  minVals.s[i] = std::numeric_limits<uint8_t>::max();
607  }
608  }
609  // can fill one width but only part of a second
610  else if (length < 2 * width) {
611  minVals0 = c7x::strm_eng<0, vec>::get_adv();
612  minVals1 = c7x::strm_eng<1, vec>::get_adv();
613  // fill the uninitialized values with MIN_VAL
614  size_t remElements = length % width;
615 
616  for (size_t i = remElements; i < width; i++) {
617  minVals1.s[i] = std::numeric_limits<uint8_t>::max();
618  }
619  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
620  // as well
621  maskOfmins = __cmp_ge_pred(minVals0, minVals1);
622  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
623  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
624  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
625  minVals = __select(maskOfmins, minVals1, minVals0);
626  index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
627  index_vec zeroVec = c7x::uchar_vec(0);
628  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
629  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
630 
631  minIndices = nonTiebreakerVec + tiebreakerVec;
632  }
633  else {
634  // input vectors
635  vec inVec0, inVec1;
636  __vpred mask0, mask1;
637  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
638  // when not being used, which will increase the ii
639  vec minValsA = uint8_t(std::numeric_limits<uint8_t>::max());
640  vec minValsB = minValsA;
641 
642  // holds the overall min vals
643  vec minValsLarge; // = uint8_t(std::numeric_limits<uint8_t>::max());
644 
645  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
646 
647  for (size_t i = 0; i < numIterations; i += 1) {
648  inVec0 = c7x::strm_eng<0, vec>::get_adv();
649  mask0 = __cmp_ge_pred(inVec0, minValsA);
650  minValsA =
651  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
652  // have been shown to have larger values (contained in mask)
653  minIndicesA =
654  __select(mask0, minIndicesA, firstHalfIndices); // set the min indices to be the indices whose values have
655  // changed from the previous min values
656 
657  inVec1 = c7x::strm_eng<1, vec>::get_adv();
658  mask1 = __cmp_ge_pred(inVec1, minValsB);
659  minValsB = __select(mask1, minValsB, inVec1);
660  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
661 
662  // update the new locations of the indices to be set for the next iteration
663  firstHalfIndices += jumpFactorChar;
664  secondHalfIndices += jumpFactorChar;
665  }
666 
667  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
668 
669  // if no remainder block, go to end
670 
671  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
672  uint8_t *remStart = (uint8_t *) pSrc + length - width;
673 
674  if (remBlockSize != 0 && remVecLen == 1) {
675 
676  inVec0 = *(vec *) remStart;
677  firstHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsChar;
678  mask0 = __cmp_ge_pred(inVec0, minValsA);
679  minValsA =
680  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
681  // have been shown to have larger values (contained in mask)
682  minIndicesA = __select(mask0, minIndicesA,
683  firstHalfIndices); // set the min indices to be the indices whose values have changed
684  // from the previous min values
685  }
686 
687  else if (remBlockSize != 0 && remVecLen == 2) {
688  inVec0 = *(vec *) (remStart - width);
689  firstHalfIndices = c7x::uchar_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsetsChar;
690  mask0 = __cmp_ge_pred(inVec0, minValsA);
691  minValsA =
692  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
693  // have been shown to have larger values (contained in mask)
694  minIndicesA = __select(mask0, minIndicesA,
695  firstHalfIndices); // set the min indices to be the indices whose values have changed
696  // from the previous min values
697 
698  inVec1 = *(vec *) remStart;
699  secondHalfIndices = c7x::uchar_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsChar;
700  mask1 = __cmp_ge_pred(inVec1, minValsB);
701  minValsB = __select(mask1, minValsB, inVec1);
702  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
703  }
704  else {
705  /* Nothing to do here */
706  }
707  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
708  // as well
709  __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
710  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
711  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
712  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
713  minValsLarge = __select(maskOfminValues, minValsB, minValsA);
714  index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
715  index_vec zeroVec = c7x::uchar_vec(0);
716  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
717  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
718 
719  index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
720 
721  minVals = minValsLarge;
722  minIndices = minIndicesLarge;
723  }
724 
726  output.minVals = minVals;
727  output.minIndices = minIndices;
728  return output;
729 }
730 
731 // explicit templatization for int16_t type
    //
    // Lane-wise minimum search over `length` int16_t samples. On return, `output.minVals` holds the smallest value
    // each vector lane has seen and `output.minIndices` holds the element index recorded for that value. Samples are
    // pulled from streaming engines 0 and 1 with get_adv(); the remainder of the large case is loaded directly from
    // `pSrc` instead.
    //
    // Three cases are distinguished by comparing `length` with the vector width:
    //   * length <= width            : one vector from stream 0, unused lanes padded with the int16_t maximum
    //   * width < length < 2 * width : one vector from each stream, merged lane by lane
    //   * length >= 2 * width        : unrolled loop over both streams, remainder vectors, then a lane-wise merge
    // When both candidates of a lane hold the same value, the merge keeps the smaller of the two indices.
    //
    // INDEX_UNROLL_FACTOR, jumpFactorShort and lastRunOffsetsShort come from outside this function.
    // NOTE(review): the signature line and the declaration of `output` are absent from this listing (the embedded
    // line numbering skips 732 and 888) -- presumably they mirror the float specialization; confirm against the
    // original file.
733 {
     // element indices carried by each lane: 0..31 for a first vector, 32..63 for a second vector
734  c7x::ushort_vec minIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
735  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
736  c7x::ushort_vec minIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
737  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
738  c7x::ushort_vec minIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
739  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
740  // re-defined for large widths so that there's no common vectors used between small and large widths
741  c7x::ushort_vec minIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
742  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
743  c7x::ushort_vec minIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
744  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
     // indices of the vectors currently being compared in the large case; advanced once per loop iteration
745  c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
746  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
747  c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
748  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
749
750  typedef typename c7x::make_full_vector<int16_t>::type vec;
751  typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
752
753  // the two input vectors of the two-vector case (assigned from the streams before use)
754  vec minVals0; // = int16_t(std::numeric_limits<int16_t>::max());
755  vec minVals1; // = minVals0;
756  __vpred maskOfmins;
757
758  // holds the overall min vals
759  vec minVals;
760  size_t width = c7x::element_count_of<vec>::value;
761
762  // at most one vector of data - pad the lanes past `length` with the largest int16_t value
763  if (length <= width) {
764  minVals = c7x::strm_eng<0, vec>::get_adv();
765
766  // overwrite lanes [length, width) with std::numeric_limits<int16_t>::max()
767  for (size_t i = length; i < width; i++) {
768  minVals.s[i] = std::numeric_limits<int16_t>::max();
769  }
770  }
771  // can fill one width but only part of a second
772  else if (length < 2 * width) {
773  minVals0 = c7x::strm_eng<0, vec>::get_adv();
774  minVals1 = c7x::strm_eng<1, vec>::get_adv();
775  // overwrite the lanes of the second vector that lie past the end of the data with the int16_t maximum
776  size_t remElements = length % width;
777
778  for (size_t i = remElements; i < width; i++) {
779  minVals1.s[i] = std::numeric_limits<int16_t>::max();
780  }
781  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
782  // as well
783  maskOfmins = __cmp_ge_pred(minVals0, minVals1);
784  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
785  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
786  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
787  minVals = __select(maskOfmins, minVals1, minVals0);
788  index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
789  index_vec zeroVec = c7x::ushort_vec(0);
     // exactly one of the next two vectors is non-zero in every lane, so their sum selects between them
790  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
791  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
792
793  minIndices = nonTiebreakerVec + tiebreakerVec;
794  }
795  else {
796  // input vectors
797  vec inVec0, inVec1;
798  __vpred mask0, mask1;
799  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
800  // when not being used, which will increase the ii
801  vec minValsA = int16_t(std::numeric_limits<int16_t>::max());
802  vec minValsB = minValsA;
803
804  // holds the overall min vals
805  vec minValsLarge; // = int16_t(std::numeric_limits<int16_t>::max());
806
807  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
808
     // each iteration consumes one vector from each stream; a lane is replaced only by a strictly smaller sample
809  for (size_t i = 0; i < numIterations; i += 1) {
810  inVec0 = c7x::strm_eng<0, vec>::get_adv();
811  mask0 = __cmp_ge_pred(inVec0, minValsA);
812  minValsA =
813  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
814  // have been shown to have larger values (contained in mask)
815  minIndicesA =
816  __select(mask0, minIndicesA, firstHalfIndices); // set the min indices to be the indices whose values have
817  // changed from the previous min values
818
819  inVec1 = c7x::strm_eng<1, vec>::get_adv();
820  mask1 = __cmp_ge_pred(inVec1, minValsB);
821  minValsB = __select(mask1, minValsB, inVec1);
822  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
823
824  // update the new locations of the indices to be set for the next iteration
825  firstHalfIndices += jumpFactorShort;
826  secondHalfIndices += jumpFactorShort;
827  }
828
     // number of elements the loop above did not cover
829  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
830
831  // if no remainder block, go to end
832
     // the remainder is read as whole vectors that end exactly at pSrc + length, so a load may overlap elements
     // the loop has already processed
833  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
834  int16_t *remStart = (int16_t *) pSrc + length - width;
835
836  if (remBlockSize != 0 && remVecLen == 1) {
837
838  inVec0 = *(vec *) remStart;
     // indices of the last `width` elements (presumably lastRunOffsetsShort holds the per-lane offsets - confirm)
839  firstHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsShort;
840  mask0 = __cmp_ge_pred(inVec0, minValsA);
841  minValsA =
842  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
843  // have been shown to have larger values (contained in mask)
844  minIndicesA = __select(mask0, minIndicesA,
845  firstHalfIndices); // set the min indices to be the indices whose values have changed
846  // from the previous min values
847  }
848
849  else if (remBlockSize != 0 && remVecLen == 2) {
     // second-to-last vector goes into accumulator A, last vector into accumulator B
850  inVec0 = *(vec *) (remStart - width);
851  firstHalfIndices =
852  c7x::ushort_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsetsShort;
853  mask0 = __cmp_ge_pred(inVec0, minValsA);
854  minValsA =
855  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
856  // have been shown to have larger values (contained in mask)
857  minIndicesA = __select(mask0, minIndicesA,
858  firstHalfIndices); // set the min indices to be the indices whose values have changed
859  // from the previous min values
860
861  inVec1 = *(vec *) remStart;
862  secondHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsShort;
863  mask1 = __cmp_ge_pred(inVec1, minValsB);
864  minValsB = __select(mask1, minValsB, inVec1);
865  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
866  }
867  else {
868  /* Nothing to do here */
869  }
870  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
871  // as well
872  __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
873  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
874  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
875  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
876  minValsLarge = __select(maskOfminValues, minValsB, minValsA);
877  index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
878  index_vec zeroVec = c7x::ushort_vec(0);
     // exactly one of the next two vectors is non-zero in every lane, so their sum selects between them
879  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
880  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
881
882  index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
883
884  minVals = minValsLarge;
885  minIndices = minIndicesLarge;
886  }
887
     // hand the per-lane minima and their indices back to the caller
889  output.minVals = minVals;
890  output.minIndices = minIndices;
891  return output;
892 }
893 
894 // explicit templatization for uint16_t type
896 {
897  c7x::ushort_vec minIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
898  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
899  c7x::ushort_vec minIndices0 = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
900  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
901  c7x::ushort_vec minIndices1 = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
902  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
903  // re-defined for large widths so that there's no common vectors used between small and large widths
904  c7x::ushort_vec minIndicesA = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
905  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
906  c7x::ushort_vec minIndicesB = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
907  50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
908  c7x::ushort_vec firstHalfIndices = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
909  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
910  c7x::ushort_vec secondHalfIndices = c7x::ushort_vec(32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
911  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
912 
913  typedef typename c7x::make_full_vector<uint16_t>::type vec;
914  typedef typename c7x::make_full_vector<uint16_t>::type index_vec;
915 
916  // holds all of the minimum values that have previously been read
917  vec minVals0; // = uint16_t(std::numeric_limits<uint16_t>::max());
918  vec minVals1; // = minVals0;
919  __vpred maskOfmins;
920 
921  // holds the overall min vals
922  vec minVals;
923  size_t width = c7x::element_count_of<vec>::value;
924 
925  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
926  if (length <= width) {
927  minVals = c7x::strm_eng<0, vec>::get_adv();
928 
929  // fill the uninitialized values with MIN_VAL
930  for (size_t i = length; i < width; i++) {
931  minVals.s[i] = std::numeric_limits<int16_t>::max();
932  }
933  }
934  // can fill one width but only part of a second
935  else if (length < 2 * width) {
936  minVals0 = c7x::strm_eng<0, vec>::get_adv();
937  minVals1 = c7x::strm_eng<1, vec>::get_adv();
938  // fill the uninitialized values with MIN_VAL
939  size_t remElements = length % width;
940 
941  for (size_t i = remElements; i < width; i++) {
942  minVals1.s[i] = std::numeric_limits<int16_t>::max();
943  }
944  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
945  // as well
946  maskOfmins = __cmp_ge_pred(minVals0, minVals1);
947  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndices1, minIndices0);
948  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndices0, minIndices1);
949  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minVals0, minVals1);
950  minVals = __select(maskOfmins, minVals1, minVals0);
951  index_vec minIndicesIgnoringTiebreaker = __select(maskOfmins, minIndices1, minIndices0);
952  index_vec zeroVec = c7x::ushort_vec(0);
953  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
954  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
955 
956  minIndices = nonTiebreakerVec + tiebreakerVec;
957  }
958  else {
959  // input vectors
960  vec inVec0, inVec1;
961  __vpred mask0, mask1;
962  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
963  // when not being used, which will increase the ii
964  vec minValsA = uint16_t(std::numeric_limits<uint16_t>::max());
965  vec minValsB = minValsA;
966 
967  // holds the overall min vals
968  vec minValsLarge; // = uint16_t(std::numeric_limits<uint16_t>::max());
969 
970  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
971 
972  for (size_t i = 0; i < numIterations; i += 1) {
973  inVec0 = c7x::strm_eng<0, vec>::get_adv();
974  mask0 = __cmp_ge_pred(inVec0, minValsA);
975  minValsA =
976  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
977  // have been shown to have larger values (contained in mask)
978  minIndicesA =
979  __select(mask0, minIndicesA, firstHalfIndices); // set the min indices to be the indices whose values have
980  // changed from the previous min values
981 
982  inVec1 = c7x::strm_eng<1, vec>::get_adv();
983  mask1 = __cmp_ge_pred(inVec1, minValsB);
984  minValsB = __select(mask1, minValsB, inVec1);
985  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
986 
987  // update the new locations of the indices to be set for the next iteration
988  firstHalfIndices += jumpFactorShort;
989  secondHalfIndices += jumpFactorShort;
990  }
991 
992  int32_t remBlockSize = length - (INDEX_UNROLL_FACTOR * numIterations * width);
993 
994  // if no remainder block, go to end
995 
996  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
997  uint16_t *remStart = (uint16_t *) pSrc + length - width;
998 
999  if (remBlockSize != 0 && remVecLen == 1) {
1000 
1001  inVec0 = *(vec *) remStart;
1002  firstHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsShort;
1003  mask0 = __cmp_ge_pred(inVec0, minValsA);
1004  minValsA =
1005  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
1006  // have been shown to have larger values (contained in mask)
1007  minIndicesA = __select(mask0, minIndicesA,
1008  firstHalfIndices); // set the min indices to be the indices whose values have changed
1009  // from the previous min values
1010  }
1011 
1012  else if (remBlockSize != 0 && remVecLen == 2) {
1013  inVec0 = *(vec *) (remStart - width);
1014  firstHalfIndices =
1015  c7x::ushort_vec(length - (2 * c7x::element_count_of<index_vec>::value)) + lastRunOffsetsShort;
1016  mask0 = __cmp_ge_pred(inVec0, minValsA);
1017  minValsA =
1018  __select(mask0, minValsA, inVec0); // change the values in the min val vector depending on which positions
1019  // have been shown to have larger values (contained in mask)
1020  minIndicesA = __select(mask0, minIndicesA,
1021  firstHalfIndices); // set the min indices to be the indices whose values have changed
1022  // from the previous min values
1023 
1024  inVec1 = *(vec *) remStart;
1025  secondHalfIndices = c7x::ushort_vec(length - (c7x::element_count_of<index_vec>::value)) + lastRunOffsetsShort;
1026  mask1 = __cmp_ge_pred(inVec1, minValsB);
1027  minValsB = __select(mask1, minValsB, inVec1);
1028  minIndicesB = __select(mask1, minIndicesB, secondHalfIndices);
1029  }
1030  else {
1031  /* Nothing to do here */
1032  }
1033  // get the minimum values and their corresponding indices into single vectors while properly handling tiebreakers
1034  // as well
1035  __vpred maskOfminValues = __cmp_ge_pred(minValsA, minValsB);
1036  __vpred maskOfSmallerIndices = __cmp_ge_pred(minIndicesB, minIndicesA);
1037  index_vec smallestIndices = __select(maskOfSmallerIndices, minIndicesA, minIndicesB);
1038  __vpred maskOfTiebreakerValues = __cmp_eq_pred(minValsA, minValsB);
1039  minValsLarge = __select(maskOfminValues, minValsB, minValsA);
1040  index_vec minIndicesIgnoringTiebreaker = __select(maskOfminValues, minIndicesB, minIndicesA);
1041  index_vec zeroVec = c7x::ushort_vec(0);
1042  index_vec nonTiebreakerVec = __select(maskOfTiebreakerValues, zeroVec, minIndicesIgnoringTiebreaker);
1043  index_vec tiebreakerVec = __select(maskOfTiebreakerValues, smallestIndices, zeroVec);
1044 
1045  index_vec minIndicesLarge = nonTiebreakerVec + tiebreakerVec;
1046 
1047  minVals = minValsLarge;
1048  minIndices = minIndicesLarge;
1049  }
1050 
1052  output.minVals = minVals;
1053  output.minIndices = minIndices;
1054  return output;
1055 }
1056 
1057 // explicit templatization for float type
1058 template <> metadata<float, uint32_t> DSPLIB_minIndex_loopLogic<float, uint32_t>(size_t length, void *pSrc)
1059 {
1060  c7x::uint_vec minIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1061  c7x::uint_vec minIndices0 = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1062  c7x::uint_vec minIndices1 = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1063  // re-defined for large widths so that there's no common vectors used between small and large widths
1064  c7x::uint_vec minIndicesA = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1065  c7x::uint_vec minIndicesB = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1066  c7x::uint_vec firstHalfIndices = c7x::uint_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1067  c7x::uint_vec secondHalfIndices = c7x::uint_vec(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
1068  // holds all of the minimum values that have previously been read
1069  c7x::float_vec minVals0; // = std::numeric_limits<float>::max();
1070  c7x::float_vec minVals1; // = minVals0;
1071  __vpred maskOfmins;
1072 
1073  size_t width = c7x::element_count_of<c7x::float_vec>::value;
1074 
1075  // holds the overall min vals
1076  c7x::float_vec minVals;
1077  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
1078  if (length <= width) {
1079  minVals = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1080  // fill the uninitialized values with MIN_VAL
1081  for (size_t i = length; i < width; i++) {
1082  minVals.s[i] = std::numeric_limits<float>::max();
1083  }
1084  }
1085  // can fill one width but only part of a second
1086  else if (length < 2 * width) {
1087  minVals0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1088  minVals1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1089  // fill the uninitialized values with MIN_VAL
1090  size_t remElements = length % width;
1091  for (size_t i = remElements; i < width; i++) {
1092  minVals1.s[i] = std::numeric_limits<float>::max();
1093  }
1094  maskOfmins = __cmp_lt_pred(minVals0, minVals1);
1095  minVals = __select(maskOfmins, minVals0, minVals1);
1096  minIndices = __select(maskOfmins, minIndices0, minIndices1);
1097  }
1098  else {
1099  // input vectors
1100  c7x::float_vec inVec0, inVec1;
1101  __vpred mask0, mask1, maskOfminsLarge;
1102  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1103  // when not being used, which will increase the ii
1104  c7x::float_vec minValsA = std::numeric_limits<float>::max();
1105  c7x::float_vec minValsB = minValsA;
1106 
1107  // holds the overall min vals
1108  c7x::float_vec minValsLarge; // = std::numeric_limits<float>::max();
1109  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1110  for (size_t i = 0; i < numIterations; i += 1) {
1111  inVec0 = c7x::strm_eng<0, c7x::float_vec>::get_adv();
1112  mask0 = __cmp_lt_pred(inVec0, minValsA);
1113  minValsA =
1114  __select(mask0, inVec0, minValsA); // change the values in the min val vector depending on which positions
1115  // have been shown to have larger values (contained in mask)
1116  minIndicesA =
1117  __select(mask0, firstHalfIndices, minIndicesA); // set the min indices to be the indices whose values have
1118  // changed from the previous min values
1119 
1120  inVec1 = c7x::strm_eng<1, c7x::float_vec>::get_adv();
1121  mask1 = __cmp_lt_pred(inVec1, minValsB);
1122  minValsB = __select(mask1, inVec1, minValsB);
1123  minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
1124 
1125  // update the new locations of the indices to be set for the next iteration
1126  firstHalfIndices += jumpFactor;
1127  secondHalfIndices += jumpFactor;
1128  }
1129 
1130  int32_t remBlockSize = length - (numIterations * INDEX_UNROLL_FACTOR * width);
1131 
1132  // if no remainder block, go to end
1133 
1134  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1135  float *remStart = (float *) pSrc + length - width;
1136 
1137  // if remainder 1
1138  if (remBlockSize != 0 && remVecLen == 1) {
1139 
1140  inVec0 = *(c7x::float_vec *) remStart;
1141  firstHalfIndices = c7x::uint_vec(length - (width)) + lastRunOffsets;
1142  mask0 = __cmp_lt_pred(inVec0, minValsA);
1143  minValsA =
1144  __select(mask0, inVec0, minValsA); // change the values in the min val vector depending on which positions
1145  // have been shown to have larger values (contained in mask)
1146  minIndicesA = __select(mask0, firstHalfIndices,
1147  minIndicesA); // set the min indices to be the indices whose values have changed from
1148  // the previous min values
1149  }
1150 
1151  // if remainder 2
1152  else if (remBlockSize != 0 && remVecLen == 2) {
1153 
1154  inVec0 = *(c7x::float_vec *) (remStart - width);
1155  firstHalfIndices = c7x::uint_vec(length - (2 * width)) + lastRunOffsets;
1156  mask0 = __cmp_lt_pred(inVec0, minValsA);
1157  minValsA =
1158  __select(mask0, inVec0, minValsA); // change the values in the min val vector depending on which positions
1159  // have been shown to have larger values (contained in mask)
1160  minIndicesA = __select(mask0, firstHalfIndices,
1161  minIndicesA); // set the min indices to be the indices whose values have changed from
1162  // the previous min values
1163 
1164  inVec1 = *(c7x::float_vec *) remStart;
1165  secondHalfIndices = c7x::uint_vec(length - (width)) + lastRunOffsets;
1166  mask1 = __cmp_lt_pred(inVec1, minValsB);
1167  minValsB = __select(mask1, inVec1, minValsB);
1168  minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
1169  }
1170  else {
1171  /* Nothing to do here */
1172  }
1173  maskOfminsLarge = __cmp_lt_pred(minValsA, minValsB);
1174  minValsLarge = __select(maskOfminsLarge, minValsA, minValsB);
1175  c7x::uint_vec minIndicesLarge = __select(maskOfminsLarge, minIndicesA, minIndicesB);
1176 
1177  minVals = minValsLarge;
1178  minIndices = minIndicesLarge;
1179  }
1181  output.minVals = minVals;
1182  output.minIndices = minIndices;
1183  return output;
1184 }
1185 
1186 // explicit templatization for double type
1187 template <> metadata<double, uint64_t> DSPLIB_minIndex_loopLogic<double, uint64_t>(size_t length, void *pSrc)
1188 {
1189  c7x::ulong_vec minIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1190  c7x::ulong_vec minIndices0 = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1191  c7x::ulong_vec minIndices1 = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1192  // re-defined for large widths so that there's no common vectors used between small and large widths
1193  c7x::ulong_vec minIndicesA = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1194  c7x::ulong_vec minIndicesB = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1195  c7x::ulong_vec firstHalfIndices = c7x::ulong_vec(0, 1, 2, 3, 4, 5, 6, 7);
1196  c7x::ulong_vec secondHalfIndices = c7x::ulong_vec(8, 9, 10, 11, 12, 13, 14, 15);
1197  // holds all of the minimum values that have previously been read
1198  c7x::double_vec minVals0; // = std::numeric_limits<double>::max();
1199  c7x::double_vec minVals1; // = minVals0;
1200  __vpred maskOfmins;
1201 
1202  size_t width = c7x::element_count_of<c7x::double_vec>::value;
1203 
1204  // holds the overall min vals
1205  c7x::double_vec minVals;
1206  // can only fill part of one width - all we need to do is fill in the uninitialized values with MIN_VAL
1207  if (length <= width) {
1208  minVals = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1209  // fill the uninitialized values with MIN_VAL
1210  for (size_t i = length; i < width; i++) {
1211  minVals.s[i] = std::numeric_limits<double>::max();
1212  }
1213  }
1214  // can fill one width but only part of a second
1215  else if (length < 2 * width) {
1216  minVals0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1217  minVals1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1218  // fill the uninitialized values with MIN_VAL
1219  size_t remainingElement = length % width;
1220  for (size_t i = remainingElement; i < width; i++) {
1221  minVals1.s[i] = std::numeric_limits<double>::max();
1222  }
1223  maskOfmins = __cmp_lt_pred(minVals0, minVals1);
1224  minVals = __select(maskOfmins, minVals0, minVals1);
1225  minIndices = __select(maskOfmins, minIndices0, minIndices1);
1226  }
1227  else {
1228  // input vectors
1229  c7x::double_vec inVec0, inVec1;
1230  __vpred mask0, mask1, maskOfminsLarge;
1231  // redefine the vectors used in small loops for large loops since .s[i] calls make random calls to the stack even
1232  // when not being used, which will increase the ii
1233  c7x::double_vec minValsA = std::numeric_limits<double>::max();
1234  c7x::double_vec minValsB = minValsA;
1235 
1236  // holds the overall min vals
1237  c7x::double_vec minValsLarge; // = std::numeric_limits<double>::max();
1238  size_t numIterations = length / (INDEX_UNROLL_FACTOR * width);
1239  for (size_t i = 0; i < numIterations; i += 1) {
1240  inVec0 = c7x::strm_eng<0, c7x::double_vec>::get_adv();
1241  mask0 = __cmp_lt_pred(inVec0, minValsA);
1242  minValsA =
1243  __select(mask0, inVec0, minValsA); // change the values in the min val vector depending on which positions
1244  // have been shown to have larger values (contained in mask)
1245  minIndicesA =
1246  __select(mask0, firstHalfIndices, minIndicesA); // set the min indices to be the indices whose values have
1247  // changed from the previous min values
1248 
1249  inVec1 = c7x::strm_eng<1, c7x::double_vec>::get_adv();
1250  mask1 = __cmp_lt_pred(inVec1, minValsB);
1251  minValsB = __select(mask1, inVec1, minValsB);
1252  minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
1253 
1254  // update the new locations of the indices to be set for the next iteration
1255  firstHalfIndices += jumpFactorDp;
1256  secondHalfIndices += jumpFactorDp;
1257  }
1258 
1259  int32_t remBlockSize = length - (numIterations * INDEX_UNROLL_FACTOR * width);
1260 
1261  // if no remainder block, go to end
1262 
1263  int32_t remVecLen = DSPLIB_ceilingDiv(remBlockSize, width);
1264  double *remStart = (double *) pSrc + length - width;
1265 
1266  // if remainder 1
1267  if (remBlockSize != 0 && remVecLen == 1) {
1268 
1269  inVec0 = *(c7x::double_vec *) remStart;
1270  firstHalfIndices = c7x::ulong_vec(length - (width)) + lastRunOffsetsDp;
1271  mask0 = __cmp_lt_pred(inVec0, minValsA);
1272  minValsA =
1273  __select(mask0, inVec0, minValsA); // change the values in the min val vector depending on which positions
1274  // have been shown to have larger values (contained in mask)
1275  minIndicesA = __select(mask0, firstHalfIndices,
1276  minIndicesA); // set the min indices to be the indices whose values have changed from
1277  // the previous min values
1278  }
1279 
1280  // if remainder 2
1281  else if (remBlockSize != 0 && remVecLen == 2) {
1282 
1283  inVec0 = *(c7x::double_vec *) (remStart - width);
1284  firstHalfIndices = c7x::ulong_vec(length - (2 * width)) + lastRunOffsetsDp;
1285  mask0 = __cmp_lt_pred(inVec0, minValsA);
1286  minValsA =
1287  __select(mask0, inVec0, minValsA); // change the values in the min val vector depending on which positions
1288  // have been shown to have larger values (contained in mask)
1289  minIndicesA = __select(mask0, firstHalfIndices,
1290  minIndicesA); // set the min indices to be the indices whose values have changed from
1291  // the previous min values
1292 
1293  inVec1 = *(c7x::double_vec *) remStart;
1294  secondHalfIndices = c7x::ulong_vec(length - (width)) + lastRunOffsetsDp;
1295  mask1 = __cmp_lt_pred(inVec1, minValsB);
1296  minValsB = __select(mask1, inVec1, minValsB);
1297  minIndicesB = __select(mask1, secondHalfIndices, minIndicesB);
1298  }
1299  else {
1300  /* Do nothing */
1301  }
1302 
1303  maskOfminsLarge = __cmp_lt_pred(minValsA, minValsB);
1304  minValsLarge = __select(maskOfminsLarge, minValsA, minValsB);
1305  c7x::ulong_vec minIndicesLarge = __select(maskOfminsLarge, minIndicesA, minIndicesB);
1306 
1307  minVals = minValsLarge;
1308  minIndices = minIndicesLarge;
1309  }
1310 
1312  output.minVals = minVals;
1313  output.minIndices = minIndices;
1314  return output;
1315 }
1316 
1317 template <typename T, typename TIndex>
1318 DSPLIB_STATUS DSPLIB_minIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
1319 {
1320  DSPLIB_minIndex_PrivArgs *pKerPrivArgs = (DSPLIB_minIndex_PrivArgs *) handle;
1321  uint32_t blockSize = pKerPrivArgs->blockSize;
1322  uint32_t length = blockSize;
1323  DSPLIB_STATUS status = DSPLIB_SUCCESS;
1324 
1325  __SE_TEMPLATE_v1 se0Params, se1Params;
1326  // __SA_TEMPLATE_v1 sa0Params;
1327 
1328  T *restrict pInLocal = (T *) pIn;
1329  uint32_t *restrict pOutLocal = (uint32_t *) pOut;
1330 
1331 #if DSPLIB_DEBUGPRINT
1332  printf("Enter DSPLIB_minIndex_exec_ci\n");
1333 #endif
1334 
1335  typedef typename c7x::make_full_vector<T>::type vec;
1336 
1337  uint32_t eleCount = c7x::element_count_of<vec>::value;
1338  uint32_t width = eleCount;
1339 #if DSPLIB_DEBUGPRINT
1340  printf("Enter eleCount %d\n", eleCount);
1341 #endif
1342 
1343  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1344  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE0_PARAM_OFFSET);
1345  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + SE_SE1_PARAM_OFFSET);
1346 
1347  // Input samples
1348  __SE0_OPEN(pInLocal, se0Params);
1349  if (length > width) {
1350  __SE1_OPEN(pInLocal + eleCount, se1Params);
1351  }
1352  else {
1353  /* Nothing to do here */
1354  }
1355 
1356 #if DSPLIB_DEBUGPRINT
1357  printf("DSPLIB_DEBUGPRINT blockSize %d\n", blockSize);
1358 #endif
1359 
1360  size_t bitsInType = sizeof(T) * 8; // sizeof(T) is measured in bytes
1361  bitsInType = (bitsInType > 32) ? 32 : bitsInType;
1362 
1363  size_t minSingleBufferSize = pow(2, bitsInType);
1364  uint32_t numBufferIterations = DSPLIB_ceilingDiv(length, minSingleBufferSize);
1365 
1366  std::vector<T> minVals(numBufferIterations);
1367  std::vector<uint32_t> minIndices(numBufferIterations);
1368  T *currentValuePtr;
1369  T currentValue; // using this value so we don't dereference the pointer three different times per iteration
1370  T smallest;
1371  metadata<T, TIndex> loopOutput;
1372  size_t currentIterationSize;
1373  TIndex minIndex;
1374  TIndex *currentIndexPtr;
1375  TIndex currentIndex;
1376  size_t i;
1377  for (uint32_t buffer = 0; buffer < numBufferIterations; buffer++) {
1378  currentIterationSize = std::min((size_t) minSingleBufferSize, (size_t) (length - (minSingleBufferSize * buffer)));
1379  loopOutput = DSPLIB_minIndex_loopLogic<T, TIndex>(currentIterationSize, pInLocal);
1380  // find the minimum index by looping through the min vector and getting the corresponding min index
1381  // use pointer since .s[i] is problematic
1382  currentValuePtr = (T *) &loopOutput.minVals;
1383  smallest = *currentValuePtr++;
1384  currentIndexPtr = (TIndex *) &loopOutput.minIndices;
1385  minIndex = *currentIndexPtr++;
1386  for (i = 1; i < c7x::element_count_of<vec>::value; i++) {
1387  currentValue = *currentValuePtr;
1388  currentIndex = *currentIndexPtr;
1389  if (currentValue < smallest) {
1390  smallest = currentValue;
1391  minIndex = currentIndex;
1392  }
1393  // need the first instance of the minimum value, so set the minimum index to the lower index if current value
1394  // is same as current minimum value
1395  else if (currentValue == smallest) {
1396  if (currentIndex < minIndex) {
1397  minIndex = currentIndex;
1398  }
1399  else {
1400  /* Nothing to do here */
1401  }
1402  }
1403  else {
1404  /* Nothing to do here */
1405  }
1406  currentValuePtr++;
1407  currentIndexPtr++;
1408  }
1409 
1410  minVals[buffer] = smallest;
1411  minIndices[buffer] = ((uint32_t) minIndex) + (buffer * minSingleBufferSize);
1412  }
1413  T smallestVal = minVals[0];
1414  uint32_t smallestIndex = minIndices[0];
1415  /* printf("\n%d, %d\n", minIndices[0], minVals[0]); */
1416  for (i = 1; i < minVals.size(); i++) {
1417  /* printf("\n%d, %d\n", minIndices[i], minVals[i]); */
1418 
1419  if (minVals[i] < smallestVal) {
1420  smallestVal = minVals[i];
1421  smallestIndex = minIndices[i];
1422  }
1423  else {
1424  /* Nothing to do here */
1425  }
1426  }
1427 
1428  /* printf("\n%d, %d\n", smallestIndex, smallestVal); */
1429  *pOutLocal = smallestIndex;
1430  // close SE0 and SE1
1431  __SE0_CLOSE();
1432  if (length > width) {
1433  __SE1_CLOSE();
1434  }
1435  else {
1436  /* Nothing to do here */
1437  }
1438 
1439  return status;
1440 }
1441 
1443 DSPLIB_minIndex_exec_ci<int8_t, uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1444 
1446 DSPLIB_minIndex_exec_ci<uint8_t, uint8_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1447 
1449 DSPLIB_minIndex_exec_ci<int16_t, uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1450 
1452 DSPLIB_minIndex_exec_ci<uint16_t, uint16_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1453 
1455 DSPLIB_minIndex_exec_ci<int32_t, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1456 
1458 DSPLIB_minIndex_exec_ci<uint32_t, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1459 
1461 DSPLIB_minIndex_exec_ci<float, uint32_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
1462 
1464 DSPLIB_minIndex_exec_ci<double, uint64_t>(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut);
#define SE_SE0_PARAM_OFFSET
#define SE_SE1_PARAM_OFFSET
metadata< T, TIndex > DSPLIB_minIndex_loopLogic(size_t length, void *pSrc)
This function is the kernel loop helper function for the optimized implementation of the kernel....
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< int16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
metadata< int8_t, uint8_t > DSPLIB_minIndex_loopLogic< int8_t, uint8_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< uint16_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
const c7x::uint_vec jumpFactor
const c7x::ushort_vec jumpFactorShort
const c7x::uchar_vec jumpFactorChar
DSPLIB_STATUS DSPLIB_minIndex_exec_ci(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
This function is the main execution function for the C7x implementation of the kernel....
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< float, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< uint16_t, uint16_t > DSPLIB_minIndex_loopLogic< uint16_t, uint16_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
metadata< uint8_t, uint8_t > DSPLIB_minIndex_loopLogic< uint8_t, uint8_t >(size_t length, void *pSrc)
metadata< int16_t, uint16_t > DSPLIB_minIndex_loopLogic< int16_t, uint16_t >(size_t length, void *pSrc)
DSPLIB_STATUS DSPLIB_minIndex_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
const c7x::uint_vec lastRunOffsets
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< int16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< int32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< double, uint64_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< uint8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< uint16_t, uint16_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< uint32_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
const c7x::uchar_vec lastRunOffsetsChar
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< int32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
metadata< double, uint64_t > DSPLIB_minIndex_loopLogic< double, uint64_t >(size_t length, void *pSrc)
#define INDEX_UNROLL_FACTOR
const c7x::ushort_vec lastRunOffsetsShort
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< uint32_t, uint32_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
template DSPLIB_STATUS DSPLIB_minIndex_init_ci< int8_t >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams1D_t *bufParamsIn, const DSPLIB_bufParams1D_t *bufParamsOut, const DSPLIB_minIndex_InitArgs *pKerInitArgs)
metadata< float, uint32_t > DSPLIB_minIndex_loopLogic< float, uint32_t >(size_t length, void *pSrc)
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< int8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
const c7x::ulong_vec jumpFactorDp
const c7x::ulong_vec lastRunOffsetsDp
template DSPLIB_STATUS DSPLIB_minIndex_exec_ci< uint8_t, uint8_t >(DSPLIB_kernelHandle handle, void *restrict pIn, void *restrict pOut)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_minIndex.
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 1 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Structure that is reserved for internal use by the kernel.
int32_t blockSize
Size of input buffer for different batches DSPLIB_minIndex_init that will be retrieved and used by DS...
uint8_t bufPblock[DSPLIB_MININDEX_IXX_IXX_OXX_PBLOCK_SIZE]
index_vec minIndices