DSPLIB User Guide
DSPLIB_lud_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date Sep 2023 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 
47 #include "../common/DSPLIB_inlines.h"
48 #include "DSPLIB_lud_common.h"
49 #include "DSPLIB_lud_priv.h"
50 
51 #define LUD_HIGH_PRECISION
52 /**********************************************************************
53  *
54  * INITIALIZATION
55  *
56  *********************************************************************/
57 
/*
 * Initialization routine of the LUD kernel, templated on the element type.
 * It fills the parameter block referenced by pKerPrivArgs->bufPblock with
 * streaming-engine (SE) and streaming-address-generator (SA) templates, each
 * stored at byte offset (slot * SE_PARAM_SIZE), and returns DSPLIB_SUCCESS.
 *
 * NOTE(review): the line carrying the return type, the function name and the
 * first parameter is absent from this listing (numbering jumps 58 -> 60).
 * `handle`, used below, is presumably that first parameter - confirm against
 * the original file.
 *
 * Slots written by the visible code:
 *    1,  2 : 1-D SA / SE templates of length `order`          (ARRAY SWAP)
 *    3     : transposing 3-D SE template                      (INDEX FINDING)
 *    4,  5 : single-element 2-D SA templates, DIM1 = colStrideOrder
 *    6     : seRefParams
 *    7,  8 : 4-D SE / SA matrix templates                     (MULTIPLE OF 4)
 *    9, 10 : 4-D SE / SA matrix templates                     (NON-MULTIPLE OF 4)
 *   11, 12 : 1-D SA / SE templates for unsigned short data    (P ARRAY SWAP)
 *   13, 14 : 2-D order x order SE / SA templates              (U and L extraction)
 *   15, 16 : SE / SA templates for the column update
 *   17     : address handed to DSPLIB_lud_blk_move_init_ci
 *   19..21 : scalar SE, matrix SE and matrix SA templates     (U generation)
 * Slot 18 is not written by the visible code.
 *
 * bufParamsA, bufParamsL, bufParamsU, bufParamsP and pKerInitArgs are not
 * referenced in the visible body.
 */
58 template <typename dataType>
60  const DSPLIB_bufParams2D_t *bufParamsA,
61  const DSPLIB_bufParams2D_t *bufParamsL,
62  const DSPLIB_bufParams2D_t *bufParamsU,
63  const DSPLIB_bufParams2D_t *bufParamsP,
64  const DSPLIB_ludInitArgs *pKerInitArgs)
65 {
66  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
67 
    /* The handle is reinterpreted as the kernel's private argument structure. */
68  DSPLIB_lud_PrivArgs *pKerPrivArgs = (DSPLIB_lud_PrivArgs *) handle;
69  uint8_t *pBlock = pKerPrivArgs->bufPblock;
70  int32_t strideOrder = pKerPrivArgs->strideOrder;
    /* strideP is read here but not used anywhere else in the visible body. */
71  int32_t strideP = pKerPrivArgs->strideP;
72  int32_t order = pKerPrivArgs->order;
    /* Presumably strideOrder is a byte stride, which makes this the row stride in elements - confirm. */
73  int32_t colStrideOrder = strideOrder / sizeof(dataType);
74 
    /* NOTE(review): listing line 75 is absent from this capture. */
76  DSPLIB_lud_blk_move_init_ci<dataType>(&pBlock[17 * SE_PARAM_SIZE], order, colStrideOrder);
77 
    /* SE/SA element type, vector length and lane count for a full vector of dataType. */
78  typedef typename c7x::make_full_vector<dataType>::type vec;
79  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
80  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
81  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
82  int32_t eleCount = c7x::element_count_of<vec>::value;
83 
84  /******************************* INDEX FINDING ******************************/
    /* 3-D template: one element per row, eleCount rows spaced colStrideOrder apart, */
    /* outer step of eleCount rows. ICNT2 is left unset here.                        */
85  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
86 
87  se0Params.ICNT0 = 1;
88  se0Params.ICNT1 = eleCount;
89  se0Params.DIM1 = colStrideOrder;
90  se0Params.DIM2 = colStrideOrder * eleCount;
91  se0Params.DIMFMT = __SE_DIMFMT_3D;
92  se0Params.ELETYPE = SE_ELETYPE;
93  se0Params.VECLEN = SE_VECLEN;
    /* Transpose granularity follows the element size: 32-bit for 4-byte types, 64-bit otherwise. */
94  if (sizeof(dataType) == 4) {
95  se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
96  }
97  else {
98  se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
99  }
100 
101  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = se0Params;
102  /******************************* ARRAY SWAP ******************************/
    /* Linear templates over `order` elements of dataType (slots 1 and 2). */
103  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
104  se0Params = __gen_SE_TEMPLATE_v1();
105 
106  sa0Params.ICNT0 = order;
107  sa0Params.DIMFMT = __SA_DIMFMT_1D;
108  sa0Params.VECLEN = SA_VECLEN;
109 
110  se0Params.ICNT0 = order;
111  se0Params.DIMFMT = __SE_DIMFMT_1D;
112  se0Params.ELETYPE = SE_ELETYPE;
113  se0Params.VECLEN = SE_VECLEN;
114 
115  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = sa0Params;
116  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = se0Params;
117 
118  /******************************* P ARRAY SWAP ******************************/
    /* Same linear layout as above, but typed for unsigned short elements (slots 11 and 12). */
119  typedef typename c7x::make_full_vector<unsigned short>::type vecShort;
120 
121  __SE_ELETYPE SE_ELETYPE_SHORT = c7x::se_eletype<vecShort>::value;
122  __SE_VECLEN SE_VECLEN_SHORT = c7x::se_veclen<vecShort>::value;
123  __SA_VECLEN SA_VECLEN_SHORT = c7x::sa_veclen<vecShort>::value;
124 
125  sa0Params = __gen_SA_TEMPLATE_v1();
126  se0Params = __gen_SE_TEMPLATE_v1();
127 
128  sa0Params.ICNT0 = order;
129  sa0Params.DIMFMT = __SA_DIMFMT_1D;
130  sa0Params.VECLEN = SA_VECLEN_SHORT;
131 
132  se0Params.ICNT0 = order;
133  se0Params.DIMFMT = __SE_DIMFMT_1D;
134  se0Params.ELETYPE = SE_ELETYPE_SHORT;
135  se0Params.VECLEN = SE_VECLEN_SHORT;
136 
137  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE)) = sa0Params;
138  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE)) = se0Params;
139 
140  /******************************* U_generateE PARAMS (MULTIPLE OF 4)**************************************/
    /* Templates built around tiles of (eleCount * 4) elements; stored in slots 4..8. */
    /* Iteration counts not assigned here keep the values from the __gen_* defaults.  */
141  __SE_TEMPLATE_v1 seRefParams = __gen_SE_TEMPLATE_v1();
142  __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
143  __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
144  __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
145 
146  sa0Params = __gen_SA_TEMPLATE_v1();
147 
148  int32_t lenTile4 = 4;
149 
    /* sa0Params and sa2Params are configured identically: one element per step, rows colStrideOrder apart. */
150  sa0Params.ICNT0 = 1;
151  sa0Params.DIMFMT = __SA_DIMFMT_2D;
152  sa0Params.VECLEN = __SA_VECLEN_1ELEM;
153  sa0Params.DIM1 = colStrideOrder;
154 
155  sa2Params.ICNT0 = 1;
156  sa2Params.DIMFMT = __SA_DIMFMT_2D;
157  sa2Params.VECLEN = __SA_VECLEN_1ELEM;
158  sa2Params.DIM1 = colStrideOrder;
159 
160  seRefParams.DIM1 = (eleCount * lenTile4);
161  seRefParams.DIM2 = 0;
162  seRefParams.DIMFMT = __SE_DIMFMT_3D;
163  seRefParams.VECLEN = SE_VECLEN;
164  seRefParams.ELETYPE = SE_ELETYPE;
165  seRefParams.ICNT0 = (eleCount * lenTile4);
166  seRefParams.DECDIM2 = __SE_DECDIM_DIM1;
167 
    /* The SE and SA matrix templates mirror each other field by field. */
168  seMatrixParams.ICNT0 = (eleCount * lenTile4);
169  seMatrixParams.DIM1 = colStrideOrder;
170  seMatrixParams.ICNT1 = lenTile4;
171  seMatrixParams.DIM2 = (eleCount * lenTile4);
172  seMatrixParams.DIM3 = (colStrideOrder * lenTile4);
173  seMatrixParams.DIMFMT = __SE_DIMFMT_4D;
174  seMatrixParams.ELETYPE = SE_ELETYPE;
175  seMatrixParams.VECLEN = SE_VECLEN;
176  seMatrixParams.DECDIM2 = __SE_DECDIM_DIM2;
177 
178  saMatrixParams.ICNT0 = (eleCount * lenTile4);
179  saMatrixParams.DIM1 = colStrideOrder;
180  saMatrixParams.ICNT1 = lenTile4;
181  saMatrixParams.DIM2 = (eleCount * lenTile4);
182  saMatrixParams.DIM3 = (colStrideOrder * lenTile4);
183  saMatrixParams.DIMFMT = __SA_DIMFMT_4D;
184  saMatrixParams.VECLEN = SA_VECLEN;
185  saMatrixParams.DECDIM2 = __SA_DECDIM_DIM2;
186 
187  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = sa0Params;
188  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = sa2Params;
189  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = seRefParams;
190  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = seMatrixParams;
191  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = saMatrixParams;
192 
193  /******************************* U_generateE PARAMS (NON-MULTIPLE OF 4)******************************/
    /* Variant with DIM1 / DIM2 exchanged relative to the block above and a single */
    /* iteration in the outermost dimension; stored in slots 9 and 10.             */
194  seMatrixParams = __gen_SE_TEMPLATE_v1();
195  saMatrixParams = __gen_SA_TEMPLATE_v1();
196 
197  seMatrixParams.ICNT0 = (eleCount * lenTile4);
198  seMatrixParams.DIM2 = colStrideOrder;
199  seMatrixParams.DIM1 = (eleCount * lenTile4);
200  seMatrixParams.ICNT3 = 1;
201  seMatrixParams.DIM3 = 0;
202  seMatrixParams.DIMFMT = __SE_DIMFMT_4D;
203  seMatrixParams.ELETYPE = SE_ELETYPE;
204  seMatrixParams.VECLEN = SE_VECLEN;
205  seMatrixParams.DECDIM2 = __SE_DECDIM_DIM1;
206 
207  saMatrixParams.ICNT0 = (eleCount * lenTile4);
208  saMatrixParams.DIM2 = colStrideOrder;
209  saMatrixParams.DIM1 = (eleCount * lenTile4);
210  saMatrixParams.ICNT3 = 1;
211  saMatrixParams.DIM3 = 0;
212  saMatrixParams.DIMFMT = __SA_DIMFMT_4D;
213  saMatrixParams.VECLEN = SA_VECLEN;
214  saMatrixParams.DECDIM2 = __SA_DECDIM_DIM1;
215 
216  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE)) = seMatrixParams;
217  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (10 * SE_PARAM_SIZE)) = saMatrixParams;
218 
219  /***************************************** U and L extraction ********************************************/
220 
    /* Plain 2-D walk over an order x order region with row stride colStrideOrder (slots 13 and 14). */
221  seMatrixParams = __gen_SE_TEMPLATE_v1();
222  saMatrixParams = __gen_SA_TEMPLATE_v1();
223 
224  seMatrixParams.ICNT0 = order;
225  seMatrixParams.ICNT1 = order;
226  seMatrixParams.DIM1 = colStrideOrder;
227  seMatrixParams.DIMFMT = __SE_DIMFMT_2D;
228  seMatrixParams.ELETYPE = SE_ELETYPE;
229  seMatrixParams.VECLEN = SE_VECLEN;
230 
231  saMatrixParams.ICNT0 = order;
232  saMatrixParams.ICNT1 = order;
233  saMatrixParams.DIM1 = colStrideOrder;
234  saMatrixParams.DIMFMT = __SA_DIMFMT_2D;
235  saMatrixParams.VECLEN = SA_VECLEN;
236 
237  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE)) = seMatrixParams;
238  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE)) = saMatrixParams;
239 
240  /***************************************** Col Update using 2 SE ********************************************/
241 
    /* The SE template steps two rows at a time (DIM1 = 2 * colStrideOrder) while the SA   */
    /* template steps one row at a time. ICNT1 is not assigned here for either template.   */
242  __SE_TEMPLATE_v1 seColParams = __gen_SE_TEMPLATE_v1();
243  __SA_TEMPLATE_v1 saColParams = __gen_SA_TEMPLATE_v1();
244 
245  seColParams.ICNT0 = eleCount;
246  seColParams.DIM1 = 2 * colStrideOrder;
247  seColParams.DIMFMT = __SE_DIMFMT_2D;
248  seColParams.ELETYPE = SE_ELETYPE;
249  seColParams.VECLEN = SE_VECLEN;
250 
251  saColParams.ICNT0 = 1;
252  saColParams.DIM1 = colStrideOrder;
253  saColParams.DIMFMT = __SA_DIMFMT_2D;
254  saColParams.VECLEN = __SA_VECLEN_1ELEM;
255 
256  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE)) = seColParams;
257  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE)) = saColParams;
258 
259  /************************************ U Generating Params ************************************/
260 
    /* Defaults for the templates that DSPLIB_lud_U_generate_exec_ci reads from slots 19..21. */
    /* That routine overwrites ICNT0/ICNT1/ICNT2, DIM2 and DECDIM1_WIDTH before opening them. */
261  int32_t lenTile8 = 8;
262  int32_t nTiles_8 = DSPLIB_ceilingDiv(order, (eleCount * lenTile8));
263 
264  __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
265  __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
266  seMatrixParams = __gen_SE_TEMPLATE_v1();
267  saMatrixParams = __gen_SA_TEMPLATE_v1();
268 
    /* One element per fetch with element duplication enabled, rows colStrideOrder apart; */
    /* DIM2 = 0 makes the outer dimension revisit the same addresses.                     */
269  seScalarParams.ICNT0 = 1;
270  seScalarParams.DIM1 = colStrideOrder;
271  seScalarParams.DIM2 = 0;
272  seScalarParams.ELEDUP = SE_ELEDUP;
273  seScalarParams.DIMFMT = __SE_DIMFMT_3D;
274  seScalarParams.VECLEN = SE_VECLEN;
275  seScalarParams.ELETYPE = SE_ELETYPE;
276 
277  seMatrixParams.ICNT0 = (eleCount * lenTile8);
278  seMatrixParams.DIM1 = colStrideOrder;
279  seMatrixParams.ICNT2 = nTiles_8;
280  seMatrixParams.DIM2 = (eleCount * lenTile8);
281  seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
282  seMatrixParams.ELETYPE = SE_ELETYPE;
283  seMatrixParams.VECLEN = SE_VECLEN;
284  seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
285  seMatrixParams.DECDIM1_WIDTH = order;
286 
287  saMatrixParams.ICNT0 = (eleCount * lenTile8);
288  saMatrixParams.DIM1 = colStrideOrder;
289  saMatrixParams.ICNT2 = nTiles_8;
290  saMatrixParams.DIM2 = (eleCount * lenTile8);
291  saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
292  saMatrixParams.VECLEN = SA_VECLEN;
293  saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
294  saMatrixParams.DECDIM1_WIDTH = order;
295 
296  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (19 * SE_PARAM_SIZE)) = seScalarParams;
297  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (20 * SE_PARAM_SIZE)) = seMatrixParams;
298  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (21 * SE_PARAM_SIZE)) = saMatrixParams;
299 
300  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
301 
302  return DSPLIB_SUCCESS;
303 }
304 
/*
 * NOTE(review): the opening line of each of the two declarations below is
 * absent from this listing (numbering jumps 304 -> 306 and 311 -> 313).
 * They are presumably explicit instantiations of the routine above for the
 * supported element types - confirm against the original file.
 */
306  const DSPLIB_bufParams2D_t *bufParamsA,
307  const DSPLIB_bufParams2D_t *bufParamsL,
308  const DSPLIB_bufParams2D_t *bufParamsU,
309  const DSPLIB_bufParams2D_t *bufParamsP,
310  const DSPLIB_ludInitArgs *pKerInitArgs);
311 
313  const DSPLIB_bufParams2D_t *bufParamsA,
314  const DSPLIB_bufParams2D_t *bufParamsL,
315  const DSPLIB_bufParams2D_t *bufParamsU,
316  const DSPLIB_bufParams2D_t *bufParamsP,
317  const DSPLIB_ludInitArgs *pKerInitArgs);
318 /**********************************************************************/
319 /* IMPLEMENTATION */
320 /**********************************************************************/
321 
/*
 * Returns a vector predicate built from idx with the mask intrinsic that
 * matches the element width of dataType: __mask_int for float and
 * __mask_long for double. The primary template is only declared; the two
 * specialisations below are the only definitions in this listing.
 * Presumably the predicate marks the first idx lanes - confirm against the
 * compiler's intrinsic documentation.
 */
322 template <typename dataType> inline __vpred getPMask(uint32_t idx);
323 
324 template <> inline __vpred getPMask<float>(uint32_t idx) { return __mask_int(idx); }
325 
326 template <> inline __vpred getPMask<double>(uint32_t idx) { return __mask_long(idx); }
327 
/*
 * Lane-index constants: 0..7 as float8 and 0..3 as double4.
 * NOTE(review): both are mutable namespace-scope objects with external
 * linkage; nothing in this listing writes to them.
 */
328 float8 idx_float = float8(0, 1, 2, 3, 4, 5, 6, 7);
329 double4 idx_double = double4(0, 1, 2, 3);
330 
/*
 * Returns the lane-index constant for the requested vector type. The primary
 * template is only declared; specialisations exist for
 * <c7x::float_vec, float> (returns idx_float) and
 * <c7x::double_vec, double> (returns idx_double).
 */
331 template <typename V, typename dataType> inline V getIdxVec();
332 
333 template <> inline c7x::float_vec getIdxVec<c7x::float_vec, float>() { return idx_float; }
334 
335 template <> inline c7x::double_vec getIdxVec<c7x::double_vec, double>() { return idx_double; }
336 
/*
 * Scans the data streamed through SE0 from pCol, takes absolute values, and
 * reports the index of the largest and of the smallest absolute value
 * through *max and *min.
 *
 *   pCol              base address passed to __SE0_OPEN
 *   nRows             number of elements to scan; split into nRows / eleCount
 *                     full vectors plus an optional partial vector
 *   idx_0_to_eleCount per-lane starting indices (by value); k is added to
 *                     every lane before use
 *   k                 offset added to the lane indices
 *   colStride         not referenced in the body
 *   max, min          outputs: index of the largest / smallest |value|
 *   se0Params         SE0 template (by value); ICNT2 is overwritten with the
 *                     number of vector fetches
 *
 * Indices are carried in vectors of the floating-point type `vec` and are
 * converted with an (int32_t) cast at the end.
 */
337 template <typename dataType, typename vec = typename c7x::make_full_vector<dataType>::type>
338 void static inline DSPLIB_lud_maxMinIndex_exec_ci(dataType *pCol,
339  int32_t nRows,
340  vec idx_0_to_eleCount,
341  int32_t k,
342  int32_t colStride,
343  int32_t *max,
344  int32_t *min,
345  __SE_TEMPLATE_v1 se0Params)
346 {
347  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
348 
349  int32_t eleCount = c7x::element_count_of<vec>::value;
350  int32_t nVec = nRows / eleCount;
351  int32_t remRows = nRows - (nVec * eleCount);
352 
    /* Running extremes start at the most negative / most positive finite value of dataType. */
353  vec maxValVec = (vec) (-std::numeric_limits<dataType>::max());
354  vec minValVec = (vec) (std::numeric_limits<dataType>::max());
355 
356  vec zeroVec = (vec) 0;
357 
    /* minMask takes zeroVec or minValVec per lane according to predMask; it is used only */
    /* for the partial vector. Presumably the first remRows lanes receive 0 - confirm.    */
358  __vpred predMask = getPMask<dataType>(remRows);
359  vec minMask = __select(predMask, zeroVec, minValVec);
360 
    /* One fetch per full vector plus one more when a partial vector remains. */
361  se0Params.ICNT2 = nVec + ((remRows > 0) ? 1 : 0);
362 
363  dataType maxVal = -std::numeric_limits<dataType>::max();
364  dataType minVal = std::numeric_limits<dataType>::max();
365  vec offSetVec = (vec) k;
366 
367  idx_0_to_eleCount += offSetVec;
368 
369  vec vCurrIdx = idx_0_to_eleCount;
370  vec vMaxIdx = idx_0_to_eleCount;
371  vec vMinIdx = idx_0_to_eleCount;
372 
373  __SE0_OPEN(pCol, se0Params);
374  __vpred vpMask;
    /* NOTE(review): the loop index below shadows the parameter k; the parameter has */
    /* already been consumed into offSetVec above, so the result is unaffected.      */
375  for (int32_t k = 0; k < nVec; k++) {
376  vec v1 = c7x::strm_eng<0, vec>::get_adv();
377  v1 = __abs(v1);
378 
    /* Per-lane update of the running maximum and its index; selection is driven by v1 <= running maximum. */
379  vpMask = __cmp_le_pred(v1, maxValVec);
380  maxValVec = __select(vpMask, maxValVec, v1);
381  vMaxIdx = __select(vpMask, vMaxIdx, vCurrIdx);
382 
    /* Per-lane update of the running minimum and its index; selection is driven by running minimum <= v1. */
383  vpMask = __cmp_le_pred(minValVec, v1);
384  minValVec = __select(vpMask, minValVec, v1);
385  vMinIdx = __select(vpMask, vMinIdx, vCurrIdx);
386 
    /* Every lane index advances by one vector width. */
387  vCurrIdx = vCurrIdx + (eleCount);
388  }
389  if (remRows) {
390  vec v1 = c7x::strm_eng<0, vec>::get_adv();
391  v1 = __abs(v1);
    /* v3 is the bitwise and-not combination of minMask and v1 and feeds the maximum search; */
    /* v4 = v3 + minMask feeds the minimum search. Presumably this keeps the lanes beyond    */
    /* remRows from winning either search - confirm against the __andn operand order.        */
392  vec v3 = c7x::reinterpret<vec>(__andn(c7x::as_uchar_vec(minMask), c7x::as_uchar_vec(v1)));
393  vec v4 = v3 + minMask;
394 
395  vpMask = __cmp_le_pred(v3, maxValVec);
396  maxValVec = __select(vpMask, maxValVec, v3);
397  vMaxIdx = __select(vpMask, vMaxIdx, vCurrIdx);
    /* NOTE(review): this comparison is __cmp_lt_pred, whereas the main loop uses */
    /* __cmp_le_pred for the minimum; ties are therefore resolved differently.    */
398  vpMask = __cmp_lt_pred(minValVec, v4);
399  minValVec = __select(vpMask, minValVec, v4);
400  vMinIdx = __select(vpMask, vMinIdx, vCurrIdx);
401 
402  vCurrIdx = vCurrIdx + (eleCount);
403  }
404 
    /* Cross-lane reduction. The index written to *max by the helper is then overwritten: */
    /* lanes holding the extreme value are selected and the smallest index among them is  */
    /* stored instead.                                                                    */
405  c7x_horizontal_max_with_index(maxValVec, vMaxIdx, &maxVal, max);
406  vpMask = __cmp_eq_pred(maxValVec, (vec) maxVal);
407  vec tmpIdx = __select(vpMask, vMaxIdx, (vec) (std::numeric_limits<dataType>::max()));
408  *max = (int32_t) c7x_horizontal_min_fp<dataType, vec>(tmpIdx);
409 
    /* Same two-step reduction for the minimum. */
410  c7x_horizontal_min_with_index(minValVec, vMinIdx, &minVal, min);
411  vpMask = __cmp_eq_pred(minValVec, (vec) minVal);
412  tmpIdx = __select(vpMask, vMinIdx, (vec) (std::numeric_limits<dataType>::max()));
413  *min = (int32_t) c7x_horizontal_min_fp<dataType, vec>(tmpIdx);
414 
415  __SE0_CLOSE();
416  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
417 }
/* Explicit instantiations for float and double. */
418 template void DSPLIB_lud_maxMinIndex_exec_ci<float>(float *pCol,
419  int32_t nRows,
420  typename c7x::make_full_vector<float>::type idx_0_to_eleCount,
421  int32_t k,
422  int32_t colStride,
423  int32_t *max,
424  int32_t *min,
425  __SE_TEMPLATE_v1 se0Params);
426 template void DSPLIB_lud_maxMinIndex_exec_ci<double>(double *pCol,
427  int32_t nRows,
428  typename c7x::make_full_vector<double>::type idx_0_to_eleCount,
429  int32_t k,
430  int32_t colStride,
431  int32_t *max,
432  int32_t *min,
433  __SE_TEMPLATE_v1 se0Params);
434 
/*
 * Exchanges the contents of pArray1 and pArray2 one vector at a time.
 * Both arrays are read through streaming engines (SE0 for pArray1, SE1 for
 * pArray2) and each vector is written to the opposite array with a
 * predicated store addressed by SA1 / SA0.
 *
 *   pArray1, pArray2  the two arrays to exchange
 *   nCols             not referenced in the body; the trip count is derived
 *                     from se1Params.ICNT0 instead
 *   sa1Params         template used to open both SA0 and SA1
 *   se1Params         template used to open both SE0 and SE1
 */
435 template <typename dataType>
436 static inline void DSPLIB_lud_array_swap_exec_ci(dataType *pArray1,
437  dataType *pArray2,
438  int32_t nCols,
439  __SA_TEMPLATE_v1 sa1Params,
440  __SE_TEMPLATE_v1 se1Params)
441 {
442  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
443 
444  typedef typename c7x::make_full_vector<dataType>::type vec;
445  int32_t eleCount = c7x::element_count_of<vec>::value;
    /* Number of vector-wide steps needed to cover ICNT0 elements, rounded up. */
446  int32_t nTiles = DSPLIB_ceilingDiv(se1Params.ICNT0, eleCount);
447 
448  __SE0_OPEN(pArray1, se1Params);
449  __SA0_OPEN(sa1Params);
450  __SE1_OPEN(pArray2, se1Params);
451  __SA1_OPEN(sa1Params);
452 
453  for (int32_t horizontal = 0; horizontal < nTiles; horizontal++) {
    /* Both source vectors are fetched before either store is issued. */
454  vec v11 = c7x::strm_eng<0, vec>::get_adv();
455  vec v21 = c7x::strm_eng<1, vec>::get_adv();
456 
    /* Data read from pArray1 goes to pArray2; the SA predicate gates the store. */
457  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
458  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pArray2);
459  __vstore_pred(sPred, pStoreVec, v11);
460 
    /* Data read from pArray2 goes to pArray1. */
461  sPred = c7x::strm_agen<0, vec>::get_vpred();
462  pStoreVec = c7x::strm_agen<0, vec>::get_adv(pArray1);
463  __vstore_pred(sPred, pStoreVec, v21);
464  }
465 
466  __SE0_CLOSE();
467  __SA0_CLOSE();
468  __SE1_CLOSE();
469  __SA1_CLOSE();
470 
471  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
472 }
/* Explicit instantiations for float, double and unsigned short. */
473 template void DSPLIB_lud_array_swap_exec_ci<float>(float *pArray1,
474  float *pArray2,
475  int32_t nCols,
476  __SA_TEMPLATE_v1 sa1Params,
477  __SE_TEMPLATE_v1 se1Params);
478 template void DSPLIB_lud_array_swap_exec_ci<double>(double *pArray1,
479  double *pArray2,
480  int32_t nCols,
481  __SA_TEMPLATE_v1 sa1Params,
482  __SE_TEMPLATE_v1 se1Params);
483 template void DSPLIB_lud_array_swap_exec_ci<unsigned short>(unsigned short *pArray1,
484  unsigned short *pArray2,
485  int32_t nCols,
486  __SA_TEMPLATE_v1 sa1Params,
487  __SE_TEMPLATE_v1 se1Params);
/*
 * Scales the data that follows pCol[0] by the reciprocal of pCol[0].
 * Values are fetched through two streaming engines (SE0 opened at
 * pCol + colStride, SE1 at pCol + 2 * colStride), multiplied by the
 * reciprocal, and stored through SA0 relative to pCol + colStride.
 *
 *   pCol          pCol[0] is the divisor; no check for a zero divisor is
 *                 visible here
 *   nRows         decremented by one on entry; the decremented value is the
 *                 number of stores issued
 *   colStride     element offset used to form the SE/SA base addresses
 *   saColParams   SA0 template (by value); ICNT1 is overwritten
 *   se0ColParams  SE0 template (by value); ICNT1 is overwritten
 *   se1ColParams  SE1 template (by value); ICNT1 is overwritten
 *
 * The addresses actually visited depend on the templates the caller
 * supplies.
 */
488 template <typename dataType>
489 void static inline DSPLIB_lud_U_colDiv_exec_ci(dataType *pCol,
490  int32_t nRows,
491  int32_t colStride,
492  __SA_TEMPLATE_v1 saColParams,
493  __SE_TEMPLATE_v1 se0ColParams,
494  __SE_TEMPLATE_v1 se1ColParams)
495 {
496  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
497 
498  typedef typename c7x::make_full_vector<dataType>::type vec;
    /* Work is split between the two engines: SE1 gets floor(nRows / 2) fetches, SE0 gets the rest. */
499  nRows = nRows - 1;
500  int32_t nBlocks = (int32_t) ((uint32_t) nRows >> (uint32_t) 1);
501  int32_t se0ICNT1 = nRows - nBlocks;
502  int32_t se1ICNT1 = nBlocks;
503 
    /* Without LUD_HIGH_PRECISION the reciprocal comes from __recip followed by two     */
    /* Newton-Raphson refinement steps r = r * (2 - d * r); with it (this file defines  */
    /* the macro near its top) a plain division is used.                                */
504 #ifndef LUD_HIGH_PRECISION
505  dataType diag = pCol[0];
506  dataType recipScalar = __recip(diag);
507  dataType twoP0 = 2.0;
508  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
509  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
510 
511  vec recip = (vec) recipScalar;
512 #else
513  vec recip = (vec) (1 / pCol[0]);
514 #endif
515  se0ColParams.ICNT1 = se0ICNT1;
516  se1ColParams.ICNT1 = se1ICNT1;
517  saColParams.ICNT1 = nRows;
518 
519  __SE0_OPEN(pCol + colStride, se0ColParams);
520  __SA0_OPEN(saColParams);
521 
    /* SE1 is opened (and later closed) only when it has at least one fetch to deliver. */
522  if (nBlocks > 0) {
523  __SE1_OPEN(pCol + (2 * colStride), se1ColParams);
524  }
525 
    /* Each iteration takes one vector from each engine and issues two SA0 stores, */
    /* first the SE0 result and then the SE1 result.                                */
526  for (int32_t horizontal = 0; horizontal < nRows - 1; horizontal += 2) {
527  vec v1 = c7x::strm_eng<0, vec>::get_adv();
528  vec v2 = c7x::strm_eng<1, vec>::get_adv();
529 
530  v1 *= recip;
531  v2 *= recip;
532 
533  __vpred sPred = c7x::strm_agen<0, vec>::get_vpred();
534  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv(pCol + colStride);
535  __vstore_pred(sPred, pStoreVec, v1);
536 
537  sPred = c7x::strm_agen<0, vec>::get_vpred();
538  pStoreVec = c7x::strm_agen<0, vec>::get_adv(pCol + colStride);
539  __vstore_pred(sPred, pStoreVec, v2);
540  }
541 
    /* The counts differ exactly when the decremented nRows is odd: one extra SE0 vector remains. */
542  if (se0ICNT1 != se1ICNT1) {
543  vec v1 = c7x::strm_eng<0, vec>::get_adv();
544 
545  v1 *= recip;
546 
547  __vpred sPred = c7x::strm_agen<0, vec>::get_vpred();
548  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv(pCol + colStride);
549  __vstore_pred(sPred, pStoreVec, v1);
550  }
551  __SE0_CLOSE();
552  __SA0_CLOSE();
553  if (nBlocks > 0) {
554  __SE1_CLOSE();
555  }
556 
557  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
558 }
/* Explicit instantiations for float and double. */
559 template void DSPLIB_lud_U_colDiv_exec_ci<float>(float *pCol,
560  int32_t nRows,
561  int32_t colStride,
562  __SA_TEMPLATE_v1 saColParams,
563  __SE_TEMPLATE_v1 se0ColParams,
564  __SE_TEMPLATE_v1 se1ColParams);
565 template void DSPLIB_lud_U_colDiv_exec_ci<double>(double *pCol,
566  int32_t nRows,
567  int32_t colStride,
568  __SA_TEMPLATE_v1 saColParams,
569  __SE_TEMPLATE_v1 se0ColParams,
570  __SE_TEMPLATE_v1 se1ColParams);
571 
572 template <typename dataType>
573 static inline void DSPLIB_lud_U_generate_exec_ci(dataType *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
574 {
575  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
576  typedef typename c7x::make_full_vector<dataType>::type vec;
577 
578  uint32_t eleCount = c7x::element_count_of<vec>::value;
579 
580  /* Updation params */
581  __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (19 * SE_PARAM_SIZE));
582  __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (20 * SE_PARAM_SIZE));
583  __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (21 * SE_PARAM_SIZE));
584  __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
585 
586  int32_t lenTile8 = 8;
587  int32_t lenTile4 = 4;
588  int32_t lenTile2 = 2;
589  int32_t lenTile1 = 1;
590 
591  int32_t nRows = (order - 1);
592  saRefParams.ICNT0 = nRows;
593 
594  int32_t nTiles1 = DSPLIB_ceilingDiv(nRows, (eleCount));
595  int32_t nTiles8 = nTiles1 / lenTile8;
596  nTiles1 -= nTiles8 * lenTile8;
597  int32_t nTiles4 = nTiles1 / lenTile4;
598  nTiles1 -= nTiles4 * lenTile4;
599  int32_t nTiles2 = nTiles1 / lenTile2;
600  nTiles1 -= nTiles2 * lenTile2;
601 
602  int32_t remainingCols = nRows;
603  int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
604  colLimit8 = (remainingCols < (colLimit8)) ? remainingCols : colLimit8;
605 
606  remainingCols = remainingCols - colLimit8;
607  int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
608  colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
609 
610  remainingCols = remainingCols - colLimit4;
611  int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
612  colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
613 
614  int32_t colLimit1 = remainingCols - colLimit2;
615 
616  dataType *pRefL = pLocalU + 1;
617  dataType *pStartL = pRefL + colStrideU;
618 
619  seScalarParams.ICNT1 = nRows;
620  seScalarParams.ICNT2 = nTiles8 + nTiles4 + nTiles2 + nTiles1;
621 
622  __SE0_OPEN(pLocalU + colStrideU, seScalarParams);
623 
624  __SA0_OPEN(saRefParams);
625 
626  if (nTiles8 > 0) {
627  /* 1 X (8*eleCount) TILE */
628 
629  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
630  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
631  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
632  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
633  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
634 
635  __SE1_OPEN(pStartL, seMatrixParams);
636  __SA1_OPEN(saMatrixParams);
637 
638  for (int32_t tile = 0; tile < nTiles8; tile++) {
639  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
640  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
641  vec sV1 = __vload_pred(lPred, pLoadVec);
642 
643  lPred = c7x::strm_agen<0, vec>::get_vpred();
644  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
645  vec sV2 = __vload_pred(lPred, pLoadVec);
646 
647  lPred = c7x::strm_agen<0, vec>::get_vpred();
648  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
649  vec sV3 = __vload_pred(lPred, pLoadVec);
650 
651  lPred = c7x::strm_agen<0, vec>::get_vpred();
652  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
653  vec sV4 = __vload_pred(lPred, pLoadVec);
654 
655  lPred = c7x::strm_agen<0, vec>::get_vpred();
656  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
657  vec sV5 = __vload_pred(lPred, pLoadVec);
658 
659  lPred = c7x::strm_agen<0, vec>::get_vpred();
660  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
661  vec sV6 = __vload_pred(lPred, pLoadVec);
662 
663  lPred = c7x::strm_agen<0, vec>::get_vpred();
664  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
665  vec sV7 = __vload_pred(lPred, pLoadVec);
666 
667  lPred = c7x::strm_agen<0, vec>::get_vpred();
668  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
669  vec sV8 = __vload_pred(lPred, pLoadVec);
670 
671  /* 7 + trip_cnt * 8 */
672  for (int32_t vertical = 0; vertical < nRows; vertical++) {
673  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
674 
675  vec v1 = c7x::strm_eng<1, vec>::get_adv();
676  vec v2 = c7x::strm_eng<1, vec>::get_adv();
677  vec v3 = c7x::strm_eng<1, vec>::get_adv();
678  vec v4 = c7x::strm_eng<1, vec>::get_adv();
679  vec v5 = c7x::strm_eng<1, vec>::get_adv();
680  vec v6 = c7x::strm_eng<1, vec>::get_adv();
681  vec v7 = c7x::strm_eng<1, vec>::get_adv();
682  vec v8 = c7x::strm_eng<1, vec>::get_adv();
683 
684  v1 -= sV1 * scalarDup;
685  v2 -= sV2 * scalarDup;
686  v3 -= sV3 * scalarDup;
687  v4 -= sV4 * scalarDup;
688  v5 -= sV5 * scalarDup;
689  v6 -= sV6 * scalarDup;
690  v7 -= sV7 * scalarDup;
691  v8 -= sV8 * scalarDup;
692 
693  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
694  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
695  __vstore_pred(sPred, pStoreVec, v1);
696 
697  sPred = c7x::strm_agen<1, vec>::get_vpred();
698  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
699  __vstore_pred(sPred, pStoreVec, v2);
700 
701  sPred = c7x::strm_agen<1, vec>::get_vpred();
702  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
703  __vstore_pred(sPred, pStoreVec, v3);
704 
705  sPred = c7x::strm_agen<1, vec>::get_vpred();
706  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
707  __vstore_pred(sPred, pStoreVec, v4);
708 
709  sPred = c7x::strm_agen<1, vec>::get_vpred();
710  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
711  __vstore_pred(sPred, pStoreVec, v5);
712 
713  sPred = c7x::strm_agen<1, vec>::get_vpred();
714  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
715  __vstore_pred(sPred, pStoreVec, v6);
716 
717  sPred = c7x::strm_agen<1, vec>::get_vpred();
718  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
719  __vstore_pred(sPred, pStoreVec, v7);
720 
721  sPred = c7x::strm_agen<1, vec>::get_vpred();
722  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
723  __vstore_pred(sPred, pStoreVec, v8);
724  }
725  }
726  __SE1_CLOSE();
727  __SA1_CLOSE();
728  }
729 
730  if (nTiles4 > 0) {
731  /* 1 X (4*eleCount) TILE */
732  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
733  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
734  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
735  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
736  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
737 
738  dataType *pSE1 = pStartL + colLimit8;
739  dataType *pSA1 = pStartL + colLimit8;
740  dataType *pSA0 = pRefL;
741 
742  __SE1_OPEN(pSE1, seMatrixParams);
743  __SA1_OPEN(saMatrixParams);
744 
745  for (int32_t tile = 0; tile < nTiles4; tile++) {
746  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
747  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
748  vec sV1 = __vload_pred(lPred, pLoadVec);
749 
750  lPred = c7x::strm_agen<0, vec>::get_vpred();
751  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
752  vec sV2 = __vload_pred(lPred, pLoadVec);
753 
754  lPred = c7x::strm_agen<0, vec>::get_vpred();
755  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
756  vec sV3 = __vload_pred(lPred, pLoadVec);
757 
758  lPred = c7x::strm_agen<0, vec>::get_vpred();
759  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
760  vec sV4 = __vload_pred(lPred, pLoadVec);
761 
762  /* 7 + trip_cnt * 4 */
763  for (int32_t vertical = 0; vertical < nRows; vertical++) {
764 
765  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
766 
767  vec v1 = c7x::strm_eng<1, vec>::get_adv();
768  vec v2 = c7x::strm_eng<1, vec>::get_adv();
769  vec v3 = c7x::strm_eng<1, vec>::get_adv();
770  vec v4 = c7x::strm_eng<1, vec>::get_adv();
771 
772  v1 -= sV1 * scalarDup;
773  v2 -= sV2 * scalarDup;
774  v3 -= sV3 * scalarDup;
775  v4 -= sV4 * scalarDup;
776 
777  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
778  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
779  __vstore_pred(sPred, pStoreVec, v1);
780 
781  sPred = c7x::strm_agen<1, vec>::get_vpred();
782  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
783  __vstore_pred(sPred, pStoreVec, v2);
784 
785  sPred = c7x::strm_agen<1, vec>::get_vpred();
786  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
787  __vstore_pred(sPred, pStoreVec, v3);
788 
789  sPred = c7x::strm_agen<1, vec>::get_vpred();
790  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
791  __vstore_pred(sPred, pStoreVec, v4);
792  }
793  }
794  __SE1_CLOSE();
795  __SA1_CLOSE();
796  }
797 
798  if (nTiles2 > 0) {
799  /* 1 X (2*eleCount) TILE */
800 
801  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
802  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
803  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
804  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
805  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
806 
807  dataType *pSE1 = pStartL + colLimit8 + colLimit4;
808  dataType *pSA1 = pStartL + colLimit8 + colLimit4;
809  dataType *pSA0 = pRefL;
810  __SE1_OPEN(pSE1, seMatrixParams);
811  __SA1_OPEN(saMatrixParams);
812 
813  for (int32_t tile = 0; tile < nTiles2; tile++) {
814  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
815  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
816  vec sV1 = __vload_pred(lPred, pLoadVec);
817 
818  lPred = c7x::strm_agen<0, vec>::get_vpred();
819  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
820  vec sV2 = __vload_pred(lPred, pLoadVec);
821  /* 7 + trip_cnt * 2 */
822  for (int32_t vertical = 0; vertical < nRows; vertical++) {
823  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
824 
825  vec v1 = c7x::strm_eng<1, vec>::get_adv();
826  vec v2 = c7x::strm_eng<1, vec>::get_adv();
827 
828  v1 -= sV1 * scalarDup;
829  v2 -= sV2 * scalarDup;
830 
831  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
832  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
833  __vstore_pred(sPred, pStoreVec, v1);
834 
835  sPred = c7x::strm_agen<1, vec>::get_vpred();
836  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
837  __vstore_pred(sPred, pStoreVec, v2);
838  }
839  }
840 
841  __SE1_CLOSE();
842  __SA1_CLOSE();
843  }
844  if (nTiles1 > 0) {
845  /* 1 X eleCount TILE */
846 
847  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
848  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
849  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
850  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
851  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
852 
853  dataType *pSE1 = pStartL + colLimit8 + colLimit4 + colLimit2;
854  dataType *pSA1 = pStartL + colLimit8 + colLimit4 + colLimit2;
855  dataType *pSA0 = pRefL;
856 
857  __SE1_OPEN(pSE1, seMatrixParams);
858  __SA1_OPEN(saMatrixParams);
859 
860  for (int32_t tile = 0; tile < nTiles1; tile++) {
861  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
862  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
863  vec sV1 = __vload_pred(lPred, pLoadVec);
864 
865  /* 7 + trip_cnt * 2 */
866  for (int32_t vertical = 0; vertical < nRows; vertical++) {
867  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
868 
869  vec v1 = c7x::strm_eng<1, vec>::get_adv();
870  v1 -= sV1 * scalarDup;
871 
872  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
873  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
874  __vstore_pred(sPred, pStoreVec, v1);
875  }
876  }
877 
878  __SE1_CLOSE();
879  __SA1_CLOSE();
880  }
881  __SE0_CLOSE();
882  __SA0_CLOSE();
883  __SA2_CLOSE();
884 
885  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
886 }
/* Explicit instantiations of DSPLIB_lud_U_generate_exec_ci for float and double. */
887 template void DSPLIB_lud_U_generate_exec_ci<float>(float *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock);
888 template void
889 DSPLIB_lud_U_generate_exec_ci<double>(double *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock);
890 
/**
 * Splits the matrix stored at pU into two outputs, one full vector at a time.
 *
 * Input vectors are read through streaming engine 0 (opened on pU). Results are
 * written through address generator 0 (based at pU) and address generator 1
 * (based at pL). Within each row the vectors are handled in three groups:
 *   - vectors ahead of the diagonal vector: pU receives zeros, pL receives the
 *     loaded data;
 *   - the diagonal vector: byteMask selects the part that stays in pU, the
 *     remaining part is written to pL with oneVec OR-ed in (presumably placing a
 *     1 on the diagonal lane);
 *   - vectors after the diagonal vector: nothing is stored through SA0 (pU keeps
 *     its contents), pL receives zeros.
 *
 * @param pU         Base pointer for the SE0 reads and the SA0 stores.
 * @param pL         Base pointer for the SA1 stores.
 * @param order      Used with the vector element count to derive the number of
 *                   vector-wide column blocks (iter1).
 * @param colStride  Not referenced in the body of this function.
 * @param pBlock     Parameter block; the SE template is read at offset
 *                   13 * SE_PARAM_SIZE and the SA template at 14 * SE_PARAM_SIZE.
 */
891 template <typename dataType>
892 void DSPLIB_lud_extract_exec_ci(dataType *pU, dataType *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
893 {
894  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
895 
896  typedef typename c7x::make_full_vector<dataType>::type vec;
897  typedef typename c7x::make_full_vector<unsigned char>::type ucharvec;
     /* Number of dataType elements held by one full vector. */
898  uint32_t eleCount = c7x::element_count_of<vec>::value;
     /* Width of one element in bits; used as the per-row shift amount below. */
899  const uchar shiftConst = 8 * sizeof(dataType);
900  vec zeroVec = (vec) 0;
     /* Count of leading vectors per row handled by the first inner loop; grows by one per block. */
901  int32_t frontZeroCount = 0;
     /* Number of vector-wide blocks (presumably ceil(order / eleCount) - confirm DSPLIB_ceilingDiv). */
902  int32_t iter1 = DSPLIB_ceilingDiv(order, eleCount);
903 
904  __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
905  __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE));
906 
     /* Both address generators share one template; SA0 is later based at pU and SA1 at pL. */
907  __SA0_OPEN(saMatrixParams);
908  __SA1_OPEN(saMatrixParams);
909  __SE0_OPEN(pU, seMatrixParams);
910 
911  for (int32_t block = 0; block < iter1; block++) {
912 
     /* Reset per block: mask with every byte set, and a vector with 1 in lane 0 only. */
913  ucharvec byteMask = (ucharvec) 0xFF;
914  vec oneVec = 0;
915  oneVec.s[0] = 1;
916 
917  for (uint32_t vertical = 0; vertical < eleCount; vertical++) {
918  int32_t horizontal = 0;
919  __vpred pred;
920  vec *pStoreVec;
921  vec uV;
     /* Vectors ahead of the diagonal vector: zeros go to pU (SA0), the loaded data goes to pL (SA1). */
922  for (; horizontal < frontZeroCount; horizontal++) {
923  uV = c7x::strm_eng<0, vec>::get_adv();
924 
925  pred = c7x::strm_agen<0, vec>::get_vpred();
926  pStoreVec = c7x::strm_agen<0, vec>::get_adv(pU);
927  __vstore_pred(pred, pStoreVec, zeroVec);
928 
929  pred = c7x::strm_agen<1, vec>::get_vpred();
930  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pL);
931  __vstore_pred(pred, pStoreVec, uV);
932  }
933 
     /* Diagonal vector: split the loaded data with byteMask. */
934  uV = c7x::strm_eng<0, vec>::get_adv();
935 
     /* lV presumably keeps the bytes excluded by byteMask (confirm __andn operand order); uV keeps the masked-in bytes. */
936  vec lV = c7x::reinterpret<vec>(__andn(byteMask, c7x::as_uchar_vec(uV)));
937  uV = c7x::reinterpret<vec>((byteMask & (c7x::as_uchar_vec(uV))));
938 
     /* Shift the mask by one element width for the next row (assumes __shift_left_full shifts across the whole vector). */
939  byteMask = c7x::as_uchar_vec(__shift_left_full(c7x::as_ulong_vec(byteMask), shiftConst));
940 
     /* OR the current oneVec into lV, then shift oneVec by one element width for the next row. */
941  lV = c7x::reinterpret<vec>(c7x::as_uchar_vec(oneVec) | c7x::as_uchar_vec(lV));
942  oneVec = c7x::reinterpret<vec>(__shift_left_full(c7x::as_ulong_vec(oneVec), shiftConst));
943 
944  pred = c7x::strm_agen<0, vec>::get_vpred();
945  pStoreVec = c7x::strm_agen<0, vec>::get_adv(pU);
946  __vstore_pred(pred, pStoreVec, uV);
947 
948  pred = c7x::strm_agen<1, vec>::get_vpred();
949  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pL);
950  __vstore_pred(pred, pStoreVec, lV);
951 
     /* Vectors after the diagonal vector: SE0 and SA0 are only advanced (their results are unused and
        nothing is stored through SA0); pL receives zeros through SA1. */
952  for (; horizontal < iter1 - 1; horizontal++) {
953  uV = c7x::strm_eng<0, vec>::get_adv();
954 
955  pStoreVec = c7x::strm_agen<0, vec>::get_adv(pU);
956 
957  pred = c7x::strm_agen<1, vec>::get_vpred();
958  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pL);
959  __vstore_pred(pred, pStoreVec, zeroVec);
960  }
961  }
     /* The next block has one more vector ahead of its diagonal vector. */
962  frontZeroCount++;
963  }
964  __SA0_CLOSE();
965  __SA1_CLOSE();
966  __SE0_CLOSE();
967 
968  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
969 }
/* Explicit instantiations of DSPLIB_lud_extract_exec_ci for float and double. */
970 template void
971 DSPLIB_lud_extract_exec_ci<float>(float *pU, float *pL, int32_t order, int32_t colStride, uint8_t *pBlock);
972 template void
973 DSPLIB_lud_extract_exec_ci<double>(double *pU, double *pL, int32_t order, int32_t colStride, uint8_t *pBlock);
974 
975 template <typename dataType>
977  void *restrict pA,
978  void *restrict pL,
979  void *restrict pU,
980  void *restrict pP)
981 {
982  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering C7x Optimized implementation");
983 
984  DSPLIB_lud_PrivArgs *pKerPrivArgs = (DSPLIB_lud_PrivArgs *) handle;
985  int32_t order = pKerPrivArgs->order;
986  int32_t strideOrder = pKerPrivArgs->strideOrder;
987  int32_t strideP = pKerPrivArgs->strideP;
988  uint8_t *pBlock = pKerPrivArgs->bufPblock;
989 
990  /* Typecast void pointers to respective data type */
991  dataType *pALocal = (dataType *) pA;
992  dataType *pLLocal = (dataType *) pL;
993  dataType *pULocal = (dataType *) pU;
994  unsigned short *pPLocal = (unsigned short *) pP;
995 
996  DSPLIB_DEBUGPRINTFN(0, "pALocal: %p pLLocal: %p pULocal: %p pPLocal: %p order: %d\n", pALocal, pLLocal, pULocal,
997  pPLocal, order);
998 
999  int min_row, max_row, k;
1000 
1001  int32_t dataSize = sizeof(dataType);
1002  int32_t dataSizeP = sizeof(unsigned short);
1003 
1004  int32_t orderStride = strideOrder / dataSize;
1005  int32_t orderPStride = strideP / dataSizeP;
1006  int32_t nRows = order;
1007 
1008  __SA_TEMPLATE_v1 saSwap1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1009  __SE_TEMPLATE_v1 seSwap1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1010 
1011  __SA_TEMPLATE_v1 saSwap2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE));
1012  __SE_TEMPLATE_v1 seSwap2Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE));
1013 
1014  __SA_TEMPLATE_v1 saColParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE));
1015  __SE_TEMPLATE_v1 se0ColParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
1016  __SE_TEMPLATE_v1 se1ColParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
1017  __SE_TEMPLATE_v1 se0MinMax = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1018 
1019  /* ------------------------------------------------------------------- */
1020  /* generate identify matrix */
1021  /* ------------------------------------------------------------------- */
1022  DSPLIB_lud_identity_matrix_generate_exec_ci<unsigned short>(pPLocal, nRows, orderPStride, pBlock);
1023 
1024  DSPLIB_lud_blk_move_exec_ci<dataType>(pULocal, pALocal, order, order, orderStride, orderStride,
1025  &pBlock[17 * SE_PARAM_SIZE]);
1026 
1027  /* ------------------------------------------------------------------- */
1028  /* LU decomposition */
1029  /* ------------------------------------------------------------------- */
1030  typedef typename c7x::make_full_vector<dataType>::type vec;
1031 
1032  vec idx_0_to_eleCount = getIdxVec<vec, dataType>();
1033 
1034  for (k = 0; k < order - 1; k++) {
1035  DSPLIB_lud_maxMinIndex_exec_ci(&pULocal[k + (k * orderStride)], (order - k), idx_0_to_eleCount, k, orderStride,
1036  &max_row, &min_row, se0MinMax);
1037  if (k != max_row) {
1038  /* swap rows if necessary */
1039  DSPLIB_lud_array_swap_exec_ci<dataType>(&pULocal[min_row * orderStride], &pULocal[max_row * orderStride],
1040  order, saSwap1Params, seSwap1Params);
1042  &pPLocal[min_row * orderPStride], &pPLocal[max_row * orderPStride], order, saSwap2Params, seSwap2Params);
1043  }
1044  /* generate U matrix */
1045  DSPLIB_lud_U_colDiv_exec_ci<dataType>(&pULocal[k + k * orderStride], (order - k), orderStride, saColParams,
1046  se0ColParams, se1ColParams);
1047  DSPLIB_lud_U_generate_exec_ci<dataType>(&pULocal[k + k * orderStride], (order - k), orderStride, pBlock);
1048  }
1049 
1050  /* Extract lower triangular entries from L into U and set L lower entries to zero */
1051  DSPLIB_lud_extract_exec_ci<dataType>(pULocal, pLLocal, order, orderStride, pBlock);
1052 
1053  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting C7x Optimized implementation");
1054 
1055  return DSPLIB_SUCCESS;
1056 }
1057 
/* Explicit instantiations of DSPLIB_lud_exec_ci for float and double. */
1058 template DSPLIB_STATUS DSPLIB_lud_exec_ci<float>(DSPLIB_kernelHandle handle,
1059  void *restrict pA,
1060  void *restrict pL,
1061  void *restrict pU,
1062  void *restrict pP);
1063 
1064 template DSPLIB_STATUS DSPLIB_lud_exec_ci<double>(DSPLIB_kernelHandle handle,
1065  void *restrict pA,
1066  void *restrict pL,
1067  void *restrict pU,
1068  void *restrict pP);
1069 
1070 /* ======================================================================== */
1071 /* End of file: DSPLIB_lud_ci.cpp */
1072 /* ======================================================================== */
template void DSPLIB_lud_U_generate_exec_ci< float >(float *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
__vpred getPMask< float >(uint32_t idx)
template DSPLIB_STATUS DSPLIB_lud_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsA, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_ludInitArgs *pKerInitArgs)
DSPLIB_STATUS DSPLIB_lud_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsA, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_ludInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template void DSPLIB_lud_maxMinIndex_exec_ci< float >(float *pCol, int32_t nRows, typename c7x::make_full_vector< float >::type idx_0_to_eleCount, int32_t k, int32_t colStride, int32_t *max, int32_t *min, __SE_TEMPLATE_v1 se0Params)
template void DSPLIB_lud_U_colDiv_exec_ci< float >(float *pCol, int32_t nRows, int32_t colStride, __SA_TEMPLATE_v1 saColParams, __SE_TEMPLATE_v1 se0ColParams, __SE_TEMPLATE_v1 se1ColParams)
template void DSPLIB_lud_array_swap_exec_ci< unsigned short >(unsigned short *pArray1, unsigned short *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
template DSPLIB_STATUS DSPLIB_lud_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pL, void *restrict pU, void *restrict pP)
static void DSPLIB_lud_maxMinIndex_exec_ci(dataType *pCol, int32_t nRows, vec idx_0_to_eleCount, int32_t k, int32_t colStride, int32_t *max, int32_t *min, __SE_TEMPLATE_v1 se0Params)
__vpred getPMask(uint32_t idx)
template void DSPLIB_lud_array_swap_exec_ci< double >(double *pArray1, double *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
template void DSPLIB_lud_U_generate_exec_ci< double >(double *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
template void DSPLIB_lud_extract_exec_ci< double >(double *pU, double *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
template void DSPLIB_lud_maxMinIndex_exec_ci< double >(double *pCol, int32_t nRows, typename c7x::make_full_vector< double >::type idx_0_to_eleCount, int32_t k, int32_t colStride, int32_t *max, int32_t *min, __SE_TEMPLATE_v1 se0Params)
V getIdxVec()
__vpred getPMask< double >(uint32_t idx)
static void DSPLIB_lud_U_colDiv_exec_ci(dataType *pCol, int32_t nRows, int32_t colStride, __SA_TEMPLATE_v1 saColParams, __SE_TEMPLATE_v1 se0ColParams, __SE_TEMPLATE_v1 se1ColParams)
static void DSPLIB_lud_array_swap_exec_ci(dataType *pArray1, dataType *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
template DSPLIB_STATUS DSPLIB_lud_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pL, void *restrict pU, void *restrict pP)
template DSPLIB_STATUS DSPLIB_lud_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsA, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_ludInitArgs *pKerInitArgs)
void DSPLIB_lud_extract_exec_ci(dataType *pU, dataType *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
template void DSPLIB_lud_array_swap_exec_ci< float >(float *pArray1, float *pArray2, int32_t nCols, __SA_TEMPLATE_v1 sa1Params, __SE_TEMPLATE_v1 se1Params)
double4 idx_double
DSPLIB_STATUS DSPLIB_lud_exec_ci(DSPLIB_kernelHandle handle, void *restrict pA, void *restrict pL, void *restrict pU, void *restrict pP)
This function is the main execution function for the C7x implementation of the kernel....
static void DSPLIB_lud_U_generate_exec_ci(dataType *pLocalU, int32_t order, int32_t colStrideU, uint8_t *pBlock)
template void DSPLIB_lud_U_colDiv_exec_ci< double >(double *pCol, int32_t nRows, int32_t colStride, __SA_TEMPLATE_v1 saColParams, __SE_TEMPLATE_v1 se0ColParams, __SE_TEMPLATE_v1 se1ColParams)
template void DSPLIB_lud_extract_exec_ci< float >(float *pU, float *pL, int32_t order, int32_t colStride, uint8_t *pBlock)
float8 idx_float
template void DSPLIB_lud_identity_matrix_generate_init_ci< unsigned short >(uint8_t *pBlock, int32_t order, int32_t stride)
template void DSPLIB_lud_identity_matrix_generate_exec_ci< unsigned short >(unsigned short *pMat, int32_t order, int32_t colStride, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_lud.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 2 dimensional buffer descriptor.
Structure containing the parameters to initialize the kernel.
Definition: DSPLIB_lud.h:117
Structure that is reserved for internal use by the kernel.
uint8_t bufPblock[DSPLIB_LUD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters
int32_t order
Size of input buffer for different batches DSPLIB_lud_init that will be retrieved and used by DSPLIB_...
int32_t strideOrder
Stride between rows of input and output data matrix
int32_t strideP
Stride between rows of output data matrix P