DSPLIB User Guide
DSPLIB_lud_inv_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************/
5 /* Copyright (C) 2017 Texas Instruments Incorporated - https://www.ti.com/
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * Redistributions of source code must retain the above copyright
12  * notice, this list of conditions and the following disclaimer.
13  *
14  * Redistributions in binary form must reproduce the above copyright
15  * notice, this list of conditions and the following disclaimer in the
16  * documentation and/or other materials provided with the
17  * distribution.
18  *
19  * Neither the name of Texas Instruments Incorporated nor the names of
20  * its contributors may be used to endorse or promote products derived
21  * from this software without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  *
35  ******************************************************************************/
36 
37 /******************************************************************************
38  * Version 1.0 Date Aug 2023 Author: Asheesh Bhardwaj
39  *****************************************************************************/
40 
41 /*******************************************************************************
42  *
43  * INCLUDES
44  *
45  ******************************************************************************/
46 #include "DSPLIB_lud_common.h"
47 #include "DSPLIB_lud_inv_priv.h"
48 
49 #define LUD_INV_HIGH_PRECISION
50 /**********************************************************************
51  *
52  * INITIALIZATION
53  *
54  *********************************************************************/
55 
/**
 * @brief Pre-builds the SE/SA templates used by the row-permutation kernels
 *        and stores them in the kernel parameter block.
 *
 * The handle is cast to DSPLIB_lud_inv_PrivArgs; `order`, `strideOrder`,
 * `strideP` and `bufPblock` are read from it. Templates are written into
 * `bufPblock` at offsets that are multiples of SE_PARAM_SIZE:
 *   - slot 6: SE read template for rows of `dataType`
 *   - slot 7: SA template for row stores
 *   - slot 8: SA template that walks the permutation array one element at a time
 *   - slot 9: SE read template for rows of uint16_t (stride taken from strideP)
 *
 * ICNT1 is not set on the 2D templates here; the kernels that load them
 * assign it before opening the streams.
 *
 * @tparam dataType Element type of the matrix (the file instantiates float/double kernels).
 * @param  handle   Kernel handle; treated as a DSPLIB_lud_inv_PrivArgs pointer.
 */
56 template <typename dataType> void DSPLIB_lud_inv_permuteRows_init_ci(DSPLIB_kernelHandle handle)
57 {
58  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
59 
60  DSPLIB_lud_inv_PrivArgs *pKerPrivArgs = (DSPLIB_lud_inv_PrivArgs *) handle;
61  uint8_t *pBlock = pKerPrivArgs->bufPblock;
62  int32_t order = pKerPrivArgs->order;
63  int32_t stride = pKerPrivArgs->strideOrder;
    // strideOrder is divided by the element size, i.e. converted from bytes to elements.
64  int32_t colStride = stride / sizeof(dataType);
65 
66  typedef typename c7x::make_full_vector<dataType>::type vec;
67  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
68  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
69  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
70 
    // Slot 6: 2D read of `order` elements per row, stepping two rows at a time
    // (DIM1 = 2 * colStride). The consumer opens two engines from this same
    // template, the second one starting one row further down.
71  __SE_TEMPLATE_v1 seMatReadParams = __gen_SE_TEMPLATE_v1();
72  seMatReadParams.ICNT0 = order;
73  seMatReadParams.DIM1 = colStride * 2;
74  seMatReadParams.DIMFMT = __SE_DIMFMT_2D;
75  seMatReadParams.ELETYPE = SE_ELETYPE;
76  seMatReadParams.VECLEN = SE_VECLEN;
77 
    // Slot 7: 2D store address generator with DIM1 = 0, so the second dimension
    // does not move the offset; the row base is supplied by the caller per store.
78  __SA_TEMPLATE_v1 saRowWriteParams = __gen_SA_TEMPLATE_v1();
79  saRowWriteParams.ICNT0 = order;
80  saRowWriteParams.DIM1 = 0;
81  saRowWriteParams.DIMFMT = __SA_DIMFMT_2D;
82  saRowWriteParams.VECLEN = SA_VECLEN;
83 
    // Slot 8: 1D address generator over `order` entries, one element per advance.
84  __SA_TEMPLATE_v1 saPermParams = __gen_SA_TEMPLATE_v1();
85  saPermParams.ICNT0 = order;
86  saPermParams.DIMFMT = __SA_DIMFMT_1D;
87  saPermParams.VECLEN = __SA_VECLEN_1ELEM;
88 
89  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = seMatReadParams;
90  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = saRowWriteParams;
91  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = saPermParams;
92 
    // Same read layout as slot 6, rebuilt for uint16_t rows using strideP.
    // SA_VECLEN is reassigned below but not used again in this function.
93  typedef typename c7x::make_full_vector<uint16_t>::type vecUINT16;
94  SE_ELETYPE = c7x::se_eletype<vecUINT16>::value;
95  SE_VECLEN = c7x::se_veclen<vecUINT16>::value;
96  SA_VECLEN = c7x::sa_veclen<vecUINT16>::value;
97  int32_t pStride = pKerPrivArgs->strideP;
98  int32_t colPStride = pStride / sizeof(uint16_t);
99 
100  seMatReadParams = __gen_SE_TEMPLATE_v1();
101  seMatReadParams.ICNT0 = order;
102  seMatReadParams.DIM1 = colPStride * 2;
103  seMatReadParams.DIMFMT = __SE_DIMFMT_2D;
104  seMatReadParams.ELETYPE = SE_ELETYPE;
105  seMatReadParams.VECLEN = SE_VECLEN;
106 
107  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE)) = seMatReadParams;
108 
109  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
110 }
113 
/**
 * @brief Pre-builds the SE/SA templates used by the factor-copy and
 *        matrix-update steps and stores them in the kernel parameter block.
 *
 * Also forwards to DSPLIB_lud_identity_matrix_generate_init_ci with the same
 * parameter block, order and stride (what that helper writes is not visible here).
 *
 * Slots written into `bufPblock` (offsets are multiples of SE_PARAM_SIZE):
 *   - slot 1: transposing 3D SE template for the factor copy
 *   - slot 2: 1D SA template of `order` elements
 *   - slot 3: SE template with element duplication (scalar broadcast)
 *   - slot 4: 3D SE template for reading matrix tiles
 *   - slot 5: 3D SA template mirroring slot 4 for stores
 *
 * Several counts (e.g. ICNT2 of slot 1, ICNT0/ICNT1 of slot 3, ICNT1 of
 * slots 4/5) are left at their generated defaults here and are filled in by
 * the kernels that load the templates.
 *
 * @tparam dataType Matrix element type; sizeof(dataType) == 4 selects the
 *                  32-bit transpose mode, anything else the 64-bit mode.
 * @param  handle   Kernel handle; treated as a DSPLIB_lud_inv_PrivArgs pointer.
 */
114 template <typename dataType> void DSPLIB_lud_inv_opt_init_ci(DSPLIB_kernelHandle handle)
115 {
116  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
117  DSPLIB_lud_inv_PrivArgs *pKerPrivArgs = (DSPLIB_lud_inv_PrivArgs *) handle;
118  uint8_t *pBlock = pKerPrivArgs->bufPblock;
119  int32_t strideOrder = pKerPrivArgs->strideOrder;
120  int32_t order = pKerPrivArgs->order;
    // Row stride converted from bytes to elements.
121  int32_t colStrideOrder = strideOrder / sizeof(dataType);
122 
123  DSPLIB_lud_identity_matrix_generate_init_ci<dataType>(pBlock, order, strideOrder);
124 
125  typedef typename c7x::make_full_vector<dataType>::type vec;
126 
127  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
128  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
129  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
130 
131  /* Factor copy params */
132  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
133  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
134 
    // NOTE(review): this typedef repeats the identical one a few lines above; it is redundant.
135  typedef typename c7x::make_full_vector<dataType>::type vec;
136 
    // One element per row, eleCount rows per fetch, with transpose enabled:
    // each fetch gathers eleCount elements that sit one row apart.
    // DIM2 moves on by eleCount rows for the next fetch.
137  uint32_t eleCount = c7x::element_count_of<vec>::value;
138  se0Params.ICNT0 = 1;
139  se0Params.ICNT1 = eleCount;
140  se0Params.DIM1 = colStrideOrder;
141  se0Params.DIM2 = colStrideOrder * eleCount;
142  se0Params.DIMFMT = __SE_DIMFMT_3D;
143  se0Params.ELETYPE = SE_ELETYPE;
144  se0Params.VECLEN = SE_VECLEN;
145  if (sizeof(dataType) == 4) {
146  se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
147  }
148  else {
149  se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
150  }
151 
152  sa0Params.ICNT0 = order;
153  sa0Params.VECLEN = SA_VECLEN;
154  sa0Params.DIMFMT = __SA_DIMFMT_1D;
155 
156  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = se0Params;
157  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = sa0Params;
158  /* Mat-update Params */
159 
    // A tile is 8 full vectors wide; nTiles_8 is the number of such tiles
    // needed to cover `order` columns (rounded up).
160  int32_t lenTile8 = 8;
161  int32_t nTiles_8 = DSPLIB_ceilingDiv(order, (eleCount * lenTile8));
162 
163  __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
164 
165  __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
166  __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
167  __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
168 
    // Scalar stream: DIM1 = 0 re-reads the same data in the second dimension,
    // and ELEDUP replicates each element across the vector.
169  seScalarParams.DIM1 = 0;
170  seScalarParams.ELEDUP = SE_ELEDUP;
171  seScalarParams.DIMFMT = __SE_DIMFMT_2D;
172  seScalarParams.VECLEN = SE_VECLEN;
173  seScalarParams.ELETYPE = SE_ELETYPE;
174 
    // Matrix read stream: tile width along dim 0, rows along dim 1, tiles along
    // dim 2. DECDIM1 is tied to DIM2 with width `order`, which bounds the
    // total extent covered across tiles.
175  seMatrixParams.ICNT0 = (eleCount * lenTile8);
176  seMatrixParams.DIM1 = colStrideOrder;
177  seMatrixParams.ICNT2 = nTiles_8;
178  seMatrixParams.DIM2 = (eleCount * lenTile8);
179  seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
180  seMatrixParams.ELETYPE = SE_ELETYPE;
181  seMatrixParams.VECLEN = SE_VECLEN;
182  seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
183  seMatrixParams.DECDIM1_WIDTH = order;
184 
    // Store-side address generator with the same geometry as the read stream.
185  saMatrixParams.ICNT0 = (eleCount * lenTile8);
186  saMatrixParams.DIM1 = colStrideOrder;
187  saMatrixParams.ICNT2 = nTiles_8;
188  saMatrixParams.DIM2 = (eleCount * lenTile8);
189  saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
190  saMatrixParams.VECLEN = SA_VECLEN;
191  saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
192  saMatrixParams.DECDIM1_WIDTH = order;
193 
194  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = seScalarParams;
195  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = seMatrixParams;
196  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = saMatrixParams;
197 
198  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
199 }
202 
/**
 * @brief Top-level init for the LUD-inverse kernel: configures the embedded
 *        matrix-transpose and matrix-multiply sub-kernels, then builds the
 *        stream templates via DSPLIB_lud_inv_opt_init_ci and
 *        DSPLIB_lud_inv_permuteRows_init_ci.
 *
 * NOTE(review): the listing line that carries the return type, the function
 * name and the first parameter (listing line 204) is missing from this
 * rendering. The body uses a `handle` variable and returns DSPLIB_SUCCESS, so
 * the missing line presumably declares a status-returning function taking a
 * DSPLIB_kernelHandle `handle` — confirm against the original file.
 *
 * @param bufParamsP     Not referenced in the body shown here.
 * @param bufParamsL     Passed to the matrix-multiply init.
 * @param bufParamsU     Supplies dim_x / dim_y / stride_y for the transpose setup;
 *                       also passed to both sub-kernel inits.
 * @param bufParamsinvA  Supplies stride_y for the transpose output; also passed
 *                       to both sub-kernel inits.
 * @param pKerInitArgs   Only `funcStyle` is read; it is copied to both sub-kernels.
 * @return DSPLIB_SUCCESS unconditionally (sub-kernel init results are not checked).
 */
203 template <typename dataType>
205  const DSPLIB_bufParams2D_t *bufParamsP,
206  const DSPLIB_bufParams2D_t *bufParamsL,
207  const DSPLIB_bufParams2D_t *bufParamsU,
208  const DSPLIB_bufParams2D_t *bufParamsinvA,
209  const DSPLIB_lud_invInitArgs *pKerInitArgs)
210 {
211  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
212  DSPLIB_lud_inv_PrivArgs *pKerPrivArgs = (DSPLIB_lud_inv_PrivArgs *) handle;
    // Row stride converted from bytes to elements.
213  int32_t colStrideOrder = pKerPrivArgs->strideOrder / sizeof(dataType);
214 
    // The sub-kernel private argument structs live inside this kernel's private args.
215  DSPLIB_matMul_PrivArgs *pMatMulKerPrivArgs = &pKerPrivArgs->pMatMulKerPrivArgs;
216  DSPLIB_matTrans_PrivArgs *pMatTransKerPrivArgs = &pKerPrivArgs->pMatTransKerPrivArgs;
217 
218  DSPLIB_matMul_InitArgs kerInitArgsMatMul;
219  DSPLIB_matTransInitArgs kerInitArgsMatTrans;
220 
    // Only funcStyle is set on the mat-mul init args; any other members stay uninitialised.
221  kerInitArgsMatMul.funcStyle = pKerInitArgs->funcStyle;
222  kerInitArgsMatTrans.funcStyle = pKerInitArgs->funcStyle;
223  kerInitArgsMatTrans.dimX = bufParamsU->dim_x;
224  kerInitArgsMatTrans.dimY = bufParamsU->dim_y;
225 
    // All three mat-mul operands use the same element stride.
226  pMatMulKerPrivArgs->strideIn0Elements = colStrideOrder;
227  pMatMulKerPrivArgs->strideIn1Elements = colStrideOrder;
228  pMatMulKerPrivArgs->strideOutElements = colStrideOrder;
229 
230  pMatTransKerPrivArgs->widthIn = bufParamsU->dim_x;
231  pMatTransKerPrivArgs->heightIn = bufParamsU->dim_y;
232  pMatTransKerPrivArgs->strideIn = bufParamsU->stride_y;
233  pMatTransKerPrivArgs->strideOut = bufParamsinvA->stride_y;
234 
    // Square order x order product.
235  pMatMulKerPrivArgs->M = pKerPrivArgs->order;
236  pMatMulKerPrivArgs->N = pKerPrivArgs->order;
237  pMatMulKerPrivArgs->K = pKerPrivArgs->order;
238 
239  DSPLIB_matTrans_init_ci<dataType>(pMatTransKerPrivArgs, bufParamsU, bufParamsinvA, &kerInitArgsMatTrans);
240  DSPLIB_matMul_init_ci<dataType>(pMatMulKerPrivArgs, bufParamsL, bufParamsinvA, bufParamsU, &kerInitArgsMatMul);
241 
242  DSPLIB_lud_inv_opt_init_ci<dataType>(handle);
243  DSPLIB_lud_inv_permuteRows_init_ci<dataType>(handle);
244 
245  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
246 
247  return DSPLIB_SUCCESS;
248 }
249 
/* NOTE(review): the two declarations below have lost their opening lines
 * (listing lines 250 and 257) in this rendering. Their parameter lists match
 * the init function defined directly above, so they are presumably its
 * explicit float and double instantiations — confirm against the original file. */
251  const DSPLIB_bufParams2D_t *bufParamsP,
252  const DSPLIB_bufParams2D_t *bufParamsL,
253  const DSPLIB_bufParams2D_t *bufParamsU,
254  const DSPLIB_bufParams2D_t *bufParamsinvA,
255  const DSPLIB_lud_invInitArgs *pKerInitArgs);
256 
258  const DSPLIB_bufParams2D_t *bufParamsP,
259  const DSPLIB_bufParams2D_t *bufParamsL,
260  const DSPLIB_bufParams2D_t *bufParamsU,
261  const DSPLIB_bufParams2D_t *bufParamsinvA,
262  const DSPLIB_lud_invInitArgs *pKerInitArgs);
263 
264 /**********************************************************************
265  *
266  * IMPLEMENTATION
267  *
268  *********************************************************************/
269 
/**
 * @brief Copies each row of `pIn` to the row of `pOut` selected by the
 *        permutation array: input row i is stored at
 *        pOut + permuteOrder[i] * colStride.
 *
 * Rows are processed two at a time. SE0 starts at pIn and SE1 one row below
 * (pIn + colStride); both use the template from parameter-block slot 6, which
 * the init code configures to step two rows per outer iteration. SA0/SA1
 * (slot 7) generate the store addresses and predicates for the two rows, and
 * SA2 (slot 8) walks `permuteOrder` one entry per advance. When `order` is
 * odd, the last row is handled by SE0/SA0 alone.
 *
 * NOTE(review): none of the streams opened here are closed in this function —
 * confirm that the caller (or a later open) takes care of that.
 *
 * @tparam dataType     Matrix element type (explicitly instantiated for float and double).
 * @param  pIn          Source matrix.
 * @param  order        Number of rows and of elements per row that are copied.
 * @param  colStride    Row stride in elements, applied to both pIn and pOut.
 * @param  permuteOrder Destination row index for each source row; `order` entries are read.
 * @param  pOut         Destination matrix.
 * @param  pBlock       Parameter block holding the pre-built templates (slots 6, 7, 8).
 */
270 template <typename dataType>
271 void DSPLIB_lud_inv_permuteRows_ci(dataType *pIn,
272  int32_t order,
273  int32_t colStride,
274  uint32_t *permuteOrder,
275  dataType *pOut,
276  uint8_t *pBlock)
277 {
278  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
279 
280  typedef typename c7x::make_full_vector<dataType>::type vec;
281  int32_t eleCount = c7x::element_count_of<vec>::value;
282 
    // Both engines (and both row address generators) start from the same stored template.
283  __SE_TEMPLATE_v1 se0Params, se1Params;
284  __SA_TEMPLATE_v1 sa0Params, sa1Params, sa2Params;
285  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
286  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
287  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
288  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
289  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
290 
    // nVec: vectors needed to cover one row.
    // SE1 handles order/2 rows; SE0 handles the rest (one more when order is odd).
291  int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
292  int32_t se1ICNT1 = order / 2;
293  int32_t se0ICNT1 = order - se1ICNT1;
294  se0Params.ICNT1 = sa0Params.ICNT1 = se0ICNT1;
295  se1Params.ICNT1 = sa1Params.ICNT1 = se1ICNT1;
296 
297  __SE0_OPEN(pIn, se0Params);
298  __SA0_OPEN(sa0Params);
299  __SA2_OPEN(sa2Params);
300 
    // SE1/SA1 are only opened when there is at least one full pair of rows.
301  if (se1ICNT1 > 0) {
302  __SE1_OPEN(pIn + colStride, se1Params);
303  __SA1_OPEN(sa1Params);
304 
305  for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
    // Two consecutive SA2 advances: the first entry belongs to the SE0 row,
    // the second to the SE1 row. Each is scaled to an element offset into pOut.
306  uint32_t *loadPerm1 = c7x::strm_agen<2, uint32_t>::get_adv(permuteOrder);
307  uint32_t offset1 = *loadPerm1 * colStride;
308 
309  uint32_t *loadPerm2 = c7x::strm_agen<2, uint32_t>::get_adv(permuteOrder);
310  uint32_t offset2 = *loadPerm2 * colStride;
311 
312  for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
313  vec v1 = c7x::strm_eng<0, vec>::get_adv();
314  vec v2 = c7x::strm_eng<1, vec>::get_adv();
315 
    // The predicate is read before the advance so it matches the address being stored to.
316  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
317  vec *pStore1 = c7x::strm_agen<0, vec>::get_adv(pOut + offset1);
318  __vstore_pred(pred1, pStore1, v1);
319 
320  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
321  vec *pStore2 = c7x::strm_agen<1, vec>::get_adv(pOut + offset2);
322  __vstore_pred(pred2, pStore2, v2);
323  }
324  }
325  }
326 
    // Odd order: one row is left over on SE0/SA0.
327  if (se0ICNT1 != se1ICNT1) {
328  uint32_t *loadPerm1 = c7x::strm_agen<2, uint32_t>::get_adv(permuteOrder);
329  uint32_t offset1 = *loadPerm1 * colStride;
330 
331  for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
332  vec v1 = c7x::strm_eng<0, vec>::get_adv();
333 
334  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
335  vec *pStore1 = c7x::strm_agen<0, vec>::get_adv(pOut + offset1);
336  __vstore_pred(pred1, pStore1, v1);
337  }
338  }
339 
340  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
341 }
/* Explicit instantiations for the two supported element types. */
342 template void DSPLIB_lud_inv_permuteRows_ci<float>(float *pIn,
343  int32_t order,
344  int32_t colStride,
345  uint32_t *permuteOrder,
346  float *pOut,
347  uint8_t *pBlock);
348 template void DSPLIB_lud_inv_permuteRows_ci<double>(double *pIn,
349  int32_t order,
350  int32_t colStride,
351  uint32_t *permuteOrder,
352  double *pOut,
353  uint8_t *pBlock);
354 
/**
 * @brief For every row of the uint16 matrix `pIn`, finds the column index of
 *        the element equal to 1 and writes it to permuteOrder[row].
 *
 * Rows are scanned two at a time: SE0 starts at pIn, SE1 one row below
 * (pIn + colPStride), both using the template from parameter-block slot 9.
 * While scanning a row, non-zero lanes overwrite a running (value, index)
 * pair; afterwards the lane whose value equals 1 is located and its recorded
 * index is extracted. When `order` is odd the final row is scanned with SE0 only.
 *
 * This relies on each row holding a single 1 among zeros (presumably a
 * permutation matrix — confirm with the caller); a row without a 1 yields
 * whatever the lane-detect and extract intrinsics return for an empty predicate.
 *
 * NOTE(review):
 *  - `sa0Params` / `sa1Params` are never loaded or opened; only their ICNT1
 *    field is written as a side effect of the chained assignments.
 *  - SA2 is opened but no strm_agen<2> access occurs in this function;
 *    permuteOrder is written with plain indexing.
 *  - The index vector is built from 16 literal lane values — confirm this
 *    matches the lane count of `vec` on every supported target.
 *  - SE0/SE1/SA2 are not closed here.
 *
 * @param pIn          Source matrix of unsigned short elements.
 * @param order        Number of rows, and of elements scanned per row.
 * @param colPStride   Row stride of pIn in elements.
 * @param permuteOrder Output array; entries [0, order) are written.
 * @param pBlock       Parameter block holding the pre-built templates (slots 8, 9).
 */
355 void DSPLIB_lud_inv_permuteIndex_ci(unsigned short *pIn,
356  int32_t order,
357  int32_t colPStride,
358  uint32_t *permuteOrder,
359  uint8_t *pBlock)
360 {
361  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
362 
363  typedef typename c7x::make_full_vector<uint16_t>::type vec;
364  int32_t eleCount = c7x::element_count_of<vec>::value;
365 
366  __SE_TEMPLATE_v1 se0Params, se1Params;
367  __SA_TEMPLATE_v1 sa0Params, sa1Params, sa2Params;
368  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
369  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
370  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
371 
    // nVec: vectors needed to cover one row.
    // SE1 handles order/2 rows; SE0 handles the rest (one more when order is odd).
372  int32_t nVec = DSPLIB_ceilingDiv(order, eleCount);
373  int32_t se1ICNT1 = order / 2;
374  int32_t se0ICNT1 = order - se1ICNT1;
375 
376  se0Params.ICNT1 = sa0Params.ICNT1 = se0ICNT1;
377  se1Params.ICNT1 = sa1Params.ICNT1 = se1ICNT1;
378 
379  __SE0_OPEN(pIn, se0Params);
380  __SA2_OPEN(sa2Params);
381 
382  vec vecZero = (vec) 0;
383  vec vecOne = (vec) 1;
384 
385  vec idx_0_to_eleCount;
386 
    // Per-lane column index for the first vector of a row.
387  idx_0_to_eleCount = c7x::ushort_vec(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
388 
    // `vertical` is declared outside the loop because the odd-row tail below
    // reuses its final value as the output index.
389  int32_t vertical = 0;
390  if (se1ICNT1 > 0) {
391  __SE1_OPEN(pIn + colPStride, se1Params);
392 
393  for (vertical = 0; vertical < order - 1; vertical += 2) {
394 
395  vec maxValVec1 = (vec) 0;
396  vec maxValVec2 = (vec) 0;
    // NOTE(review): these two start uninitialised (the initialiser is commented
    // out), unlike vMaxIdx1 in the tail branch. Only lanes that saw a non-zero
    // element are assigned before the final extraction reads them.
397  vec vMaxIdx1; // = idx_0_to_eleCount;
398  vec vMaxIdx2; // = idx_0_to_eleCount;
399  vec vCurrIdx1 = idx_0_to_eleCount;
400  vec vCurrIdx2 = idx_0_to_eleCount;
401 
402  for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
403  vec v1 = c7x::strm_eng<0, vec>::get_adv();
404  vec v2 = c7x::strm_eng<1, vec>::get_adv();
405 
    // Predicate marks lanes whose element is zero.
406  __vpred cmpPred1 = __cmp_eq_pred(vecZero, v1);
407  __vpred cmpPred2 = __cmp_eq_pred(vecZero, v2);
408 
    // Zero lanes keep the running value/index; other lanes take the new ones
    // (assuming __select returns its first operand where the predicate is set — confirm).
409  maxValVec1 = __select(cmpPred1, maxValVec1, v1);
410  maxValVec2 = __select(cmpPred2, maxValVec2, v2);
411 
412  vMaxIdx1 = __select(cmpPred1, vMaxIdx1, vCurrIdx1);
413  vMaxIdx2 = __select(cmpPred2, vMaxIdx2, vCurrIdx2);
414 
    // Advance the per-lane column indices to the next vector of the row.
415  vCurrIdx1 = vCurrIdx1 + (uint16_t) eleCount;
416  vCurrIdx2 = vCurrIdx2 + (uint16_t) eleCount;
417  }
418 
    // Locate the lane holding the value 1 and read the column index recorded for it.
    // The >> 1 presumably converts a byte-granular predicate bit position into a
    // 16-bit lane number — confirm against the intrinsic documentation.
419  __vpred cmpPredFinal1 = __cmp_eq_pred(vecOne, maxValVec1);
420  uint32_t tempIdx1 = __rightmost_bit_detect_short(cmpPredFinal1) >> 1;
421  uint32_t finalIdx1 = __vgetuh_vrd(vMaxIdx1, tempIdx1);
422 
423  __vpred cmpPredFinal2 = __cmp_eq_pred(vecOne, maxValVec2);
424  uint32_t tempIdx2 = __rightmost_bit_detect_short(cmpPredFinal2) >> 1;
425  uint32_t finalIdx2 = __vgetuh_vrd(vMaxIdx2, tempIdx2);
426 
427  permuteOrder[vertical + 0] = finalIdx1;
428  permuteOrder[vertical + 1] = finalIdx2;
429  }
430  }
431 
    // Odd order: one row is left on SE0. `vertical` is order - 1 here
    // (or 0 when order == 1 and the paired loop never ran).
432  if (se0ICNT1 != se1ICNT1) {
433 
434  vec maxValVec1 = (vec) 0;
435  vec vMaxIdx1 = idx_0_to_eleCount;
436  vec vCurrIdx1 = idx_0_to_eleCount;
437 
438  for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
439  vec v1 = c7x::strm_eng<0, vec>::get_adv();
440 
441  __vpred cmpPred1 = __cmp_eq_pred(vecZero, v1);
442 
443  maxValVec1 = __select(cmpPred1, maxValVec1, v1);
444 
445  vMaxIdx1 = __select(cmpPred1, vMaxIdx1, vCurrIdx1);
446 
447  vCurrIdx1 = vCurrIdx1 + (uint16_t) eleCount;
448  }
449 
450  __vpred cmpPredFinal1 = __cmp_eq_pred(vecOne, maxValVec1);
451  uint32_t tempIdx1 = __rightmost_bit_detect_short(cmpPredFinal1) >> 1;
452  uint32_t finalIdx1 = __vgetuh_vrd(vMaxIdx1, tempIdx1);
453 
454  permuteOrder[vertical + 0] = finalIdx1;
455  }
456 
457  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
458 }
459 
/**
 * @brief Reads `nRows` elements through the streaming engines starting at
 *        `pCol`, multiplies them by `scaleVec`, and stores the products
 *        contiguously into `pFactor`.
 *
 * The work is split in two halves that run in lock-step: SE0/SA0 handle the
 * first nVec/2 vectors (written at pFactor), SE1/SA1 handle the remaining
 * vectors (written at pFactor + se0ICNT2 * eleCount). SE1 starts
 * se0ICNT2 * eleCount rows further into the source (scaled by colStride).
 * SE1/SA1 always carry the extra vector when nVec is odd, and SA1's element
 * count bounds the tail so stores past nRows are predicated off.
 * SE0/SA0 are only opened and closed when they have work.
 *
 * NOTE(review): `sum` is initialised to 0 and never modified, so the function
 * always returns 0. `pBlock` is not used in the body.
 *
 * @tparam dataType   Element type (explicitly instantiated for float and double).
 * @tparam vec        Full-width vector of dataType.
 * @param  pCol       Start of the source data.
 * @param  colStride  Source row stride in elements.
 * @param  nRows      Number of elements to scale and store.
 * @param  pFactor    Destination buffer for the scaled values.
 * @param  scaleVec   Multiplier applied to every fetched vector.
 * @param  pBlock     Unused here.
 * @param  se0Params  SE template for the first half (passed by value; ICNT2 is set locally).
 * @param  se1Params  SE template for the second half (passed by value; ICNT2 is set locally).
 * @param  sa0Params  SA template for the first half (ICNT0 is set locally).
 * @param  sa1Params  SA template for the second half (ICNT0 is set locally).
 * @return Always 0 (see note above).
 */
460 template <typename dataType, typename vec = typename c7x::make_full_vector<dataType>::type>
461 static inline dataType DSPLIB_lud_inv_factor_exec_ci(dataType *pCol,
462  int32_t colStride,
463  int32_t nRows,
464  dataType *pFactor,
465  vec scaleVec,
466  uint8_t *pBlock,
467  __SE_TEMPLATE_v1 se0Params,
468  __SE_TEMPLATE_v1 se1Params,
469  __SA_TEMPLATE_v1 sa0Params,
470  __SA_TEMPLATE_v1 sa1Params)
471 {
472 
473  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
474 
475  dataType sum = 0;
476  uint32_t eleCount = c7x::element_count_of<vec>::value;
    // nVec vectors cover nRows elements; SE0 takes the floor half, SE1 the rest.
477  int32_t nVec = DSPLIB_ceilingDiv(nRows, eleCount);
478  int32_t se0ICNT2 = nVec / 2;
479  int32_t se1ICNT2 = nVec - se0ICNT2;
480  se0Params.ICNT2 = se0ICNT2;
481  se1Params.ICNT2 = se1ICNT2;
482  dataType *pSE0 = pCol;
483  dataType *pSE1 = pCol + (se0ICNT2 * colStride * eleCount);
484 
    // SE1 always has at least as much work as SE0, so it is opened unconditionally.
485  __SE1_OPEN(pSE1, se1Params);
486  if (se0ICNT2 > 0) {
487  __SE0_OPEN(pSE0, se0Params);
488  }
489 
490  int32_t vertical;
    // SA0 covers whole vectors only; SA1 covers whatever remains, including a partial tail.
491  sa0Params.ICNT0 = (se0ICNT2 * eleCount);
492  sa1Params.ICNT0 = nRows - ((se0ICNT2 * eleCount));
493  dataType *pFactorHalf = pFactor + (se0ICNT2 * eleCount);
494 
495  if (sa0Params.ICNT0) {
496  __SA0_OPEN(sa0Params);
497  }
498  __SA1_OPEN(sa1Params);
499 
    // Main loop, unrolled by two: each pass consumes two vectors from each engine.
500  for (vertical = 0; vertical < se0ICNT2 - 1; vertical += 2) {
501  vec v1 = c7x::strm_eng<0, vec>::get_adv();
502  vec v2 = c7x::strm_eng<1, vec>::get_adv();
503  vec v3 = c7x::strm_eng<0, vec>::get_adv();
504  vec v4 = c7x::strm_eng<1, vec>::get_adv();
505 
506  v1 *= scaleVec;
507  v2 *= scaleVec;
508  v3 *= scaleVec;
509  v4 *= scaleVec;
510  __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
511  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
512  __vstore_pred(pred, pStoreVec, v1);
513 
514  pred = c7x::strm_agen<1, vec>::get_vpred();
515  pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
516  __vstore_pred(pred, pStoreVec, v2);
517 
518  pred = c7x::strm_agen<0, vec>::get_vpred();
519  pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
520  __vstore_pred(pred, pStoreVec, v3);
521 
522  pred = c7x::strm_agen<1, vec>::get_vpred();
523  pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
524  __vstore_pred(pred, pStoreVec, v4);
525  }
526 
    // Remainder of the unrolled loop: at most one more vector pair.
527  for (; vertical < se0ICNT2; vertical++) {
528  vec v1 = c7x::strm_eng<0, vec>::get_adv();
529  vec v2 = c7x::strm_eng<1, vec>::get_adv();
530 
531  v1 *= scaleVec;
532  v2 *= scaleVec;
533 
534  __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
535  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
536  __vstore_pred(pred, pStoreVec, v1);
537 
538  pred = c7x::strm_agen<1, vec>::get_vpred();
539  pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
540  __vstore_pred(pred, pStoreVec, v2);
541  }
    // Odd nVec: SE1 holds one vector more than SE0.
542  if (se0ICNT2 != se1ICNT2) {
543  vec v1 = c7x::strm_eng<1, vec>::get_adv();
544 
545  v1 *= scaleVec;
546 
547  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
548  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
549  __vstore_pred(pred, pStoreVec, v1);
550  }
551 
    // Close only what was opened above.
552  if (sa0Params.ICNT0) {
553  __SA0_CLOSE();
554  }
555  __SA1_CLOSE();
556  __SE1_CLOSE();
557 
558  if (se0ICNT2 > 0) {
559  __SE0_CLOSE();
560  }
561 
562  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return sum: %15.10f\n", sum);
563 
564  return sum;
565 }
/* Explicit instantiations for the two supported element types. */
566 template float DSPLIB_lud_inv_factor_exec_ci<float, typename c7x::make_full_vector<float>::type>(
567  float *pCol,
568  int32_t colStride,
569  int32_t nRows,
570  float *pFactor,
571  typename c7x::make_full_vector<float>::type scale,
572  uint8_t *pBlock,
573  __SE_TEMPLATE_v1 se0Params,
574  __SE_TEMPLATE_v1 se1Params,
575  __SA_TEMPLATE_v1 sa0Params,
576  __SA_TEMPLATE_v1 sa1Params);
577 template double DSPLIB_lud_inv_factor_exec_ci<double, typename c7x::make_full_vector<double>::type>(
578  double *pCol,
579  int32_t colStride,
580  int32_t nRows,
581  double *pFactor,
582  typename c7x::make_full_vector<double>::type scale,
583  uint8_t *pBlock,
584  __SE_TEMPLATE_v1 se0Params,
585  __SE_TEMPLATE_v1 se1Params,
586  __SA_TEMPLATE_v1 sa0Params,
587  __SA_TEMPLATE_v1 sa1Params);
588 
589 template <typename dataType>
590 void DSPLIB_lud_inv_invU_exec_ci(dataType *pLocalU,
591  dataType *pLocalInvU,
592  int32_t nCols,
593  int32_t colStrideOrder,
594  int32_t colInvAStride,
595  dataType *factArray,
596  uint8_t *pBlock)
597 {
598  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
599  typedef typename c7x::make_full_vector<dataType>::type vec;
600  uint32_t eleCount = c7x::element_count_of<vec>::value;
601 
602  /* Factor copy params */
603  __SE_TEMPLATE_v1 se0ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
604  __SE_TEMPLATE_v1 se1ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
605  __SA_TEMPLATE_v1 sa0ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
606  __SA_TEMPLATE_v1 sa1ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
607 
608  /* Updation params */
609  __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
610  __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
611  __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
612  __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
613  __SA_TEMPLATE_v1 saRefStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
614 
615  int32_t lenTile8 = 8;
616  int32_t lenTile4 = 4;
617  int32_t lenTile2 = 2;
618  int32_t lenTile1 = 1;
619 
620  int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
621  int32_t nTiles8 = nTiles1 / lenTile8; // left shift
622  nTiles1 -= nTiles8 * lenTile8;
623  int32_t nTiles4 = nTiles1 / lenTile4;
624  nTiles1 -= nTiles4 * lenTile4;
625  int32_t nTiles2 = nTiles1 / lenTile2;
626  nTiles1 -= nTiles2 * lenTile2;
627 
628  int32_t remainingCols = nCols; // comment
629  int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
630  colLimit8 = (remainingCols < colLimit8) ? remainingCols : colLimit8;
631 
632  remainingCols = remainingCols - colLimit8;
633  int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
634  colLimit4 = (remainingCols < colLimit4) ? remainingCols : colLimit4;
635 
636  remainingCols = remainingCols - colLimit4;
637  int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
638  colLimit2 = (remainingCols < colLimit2) ? remainingCols : colLimit2;
639 
640  int32_t colLimit1 = remainingCols - colLimit2;
641  seScalarParams.ICNT1 = 2 * (nTiles8 + nTiles4 + nTiles2 + nTiles1);
642 
643  for (int32_t col = nCols - 1; col >= 0; col--) {
644  dataType *pLastU = pLocalU + (colStrideOrder * col);
645  dataType *pLastInvU = pLocalInvU + (colStrideOrder * col);
646 
647  dataType diag = pLocalU[col + col * colStrideOrder];
648 #ifdef LUD_INV_HIGH_PRECISION
649  vec divVec = (vec) (1 / diag);
650 #else
651  dataType recipScalar = __recip(diag);
652  dataType twoP0 = 2.0;
653 
654  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
655  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
656  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
657  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
658 
659  vec divVec = (vec) (recipScalar);
660 #endif
661  if (col > 0) {
662  DSPLIB_lud_inv_factor_exec_ci<dataType, vec>(&pLocalU[col], colStrideOrder, col, factArray, divVec, pBlock,
663  se0ParamsFact, se1ParamsFact, sa0ParamsFact, sa1ParamsFact);
664  seScalarParams.ICNT0 = col;
665  __SE0_OPEN(factArray, seScalarParams);
666  }
667 
668  __SA0_OPEN(saRefParams);
669  __SA2_OPEN(saRefStoreParams);
670 
671  if (nTiles8 > 0) {
672  /* 1 X (8 * eleCount) TILE */
673 
674  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
675  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
676  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
677  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
678  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
679 
680  if (col) {
681  __SE1_OPEN(pLocalU, seMatrixParams);
682  __SA1_OPEN(saMatrixParams);
683  }
684 
685  for (int32_t tile = 0; tile < nTiles8; tile++) {
686 
687  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
688  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
689  vec sV1 = __vload_pred(lPred, pLoadVec);
690 
691  lPred = c7x::strm_agen<0, vec>::get_vpred();
692  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
693  vec sV2 = __vload_pred(lPred, pLoadVec);
694 
695  lPred = c7x::strm_agen<0, vec>::get_vpred();
696  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
697  vec sV3 = __vload_pred(lPred, pLoadVec);
698 
699  lPred = c7x::strm_agen<0, vec>::get_vpred();
700  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
701  vec sV4 = __vload_pred(lPred, pLoadVec);
702 
703  lPred = c7x::strm_agen<0, vec>::get_vpred();
704  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
705  vec sV5 = __vload_pred(lPred, pLoadVec);
706 
707  lPred = c7x::strm_agen<0, vec>::get_vpred();
708  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
709  vec sV6 = __vload_pred(lPred, pLoadVec);
710 
711  lPred = c7x::strm_agen<0, vec>::get_vpred();
712  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
713  vec sV7 = __vload_pred(lPred, pLoadVec);
714 
715  lPred = c7x::strm_agen<0, vec>::get_vpred();
716  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastU);
717  vec sV8 = __vload_pred(lPred, pLoadVec);
718 
719  for (int32_t vertical = 0; vertical < col; vertical++) {
720 
721  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
722 
723  vec v1 = c7x::strm_eng<1, vec>::get_adv();
724  vec v2 = c7x::strm_eng<1, vec>::get_adv();
725  vec v3 = c7x::strm_eng<1, vec>::get_adv();
726  vec v4 = c7x::strm_eng<1, vec>::get_adv();
727  vec v5 = c7x::strm_eng<1, vec>::get_adv();
728  vec v6 = c7x::strm_eng<1, vec>::get_adv();
729  vec v7 = c7x::strm_eng<1, vec>::get_adv();
730  vec v8 = c7x::strm_eng<1, vec>::get_adv();
731 
732  v1 -= sV1 * scalarDup;
733  v2 -= sV2 * scalarDup;
734  v3 -= sV3 * scalarDup;
735  v4 -= sV4 * scalarDup;
736  v5 -= sV5 * scalarDup;
737  v6 -= sV6 * scalarDup;
738  v7 -= sV7 * scalarDup;
739  v8 -= sV8 * scalarDup;
740 
741  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
742  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
743  __vstore_pred(sPred, pStoreVec, v1);
744 
745  sPred = c7x::strm_agen<1, vec>::get_vpred();
746  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
747  __vstore_pred(sPred, pStoreVec, v2);
748 
749  sPred = c7x::strm_agen<1, vec>::get_vpred();
750  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
751  __vstore_pred(sPred, pStoreVec, v3);
752 
753  sPred = c7x::strm_agen<1, vec>::get_vpred();
754  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
755  __vstore_pred(sPred, pStoreVec, v4);
756 
757  sPred = c7x::strm_agen<1, vec>::get_vpred();
758  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
759  __vstore_pred(sPred, pStoreVec, v5);
760 
761  sPred = c7x::strm_agen<1, vec>::get_vpred();
762  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
763  __vstore_pred(sPred, pStoreVec, v6);
764 
765  sPred = c7x::strm_agen<1, vec>::get_vpred();
766  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
767  __vstore_pred(sPred, pStoreVec, v7);
768 
769  sPred = c7x::strm_agen<1, vec>::get_vpred();
770  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalU);
771  __vstore_pred(sPred, pStoreVec, v8);
772  }
773 
774  sV1 *= divVec;
775  sV2 *= divVec;
776  sV3 *= divVec;
777  sV4 *= divVec;
778  sV5 *= divVec;
779  sV6 *= divVec;
780  sV7 *= divVec;
781  sV8 *= divVec;
782 
783  lPred = c7x::strm_agen<2, vec>::get_vpred();
784  vec *psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
785  __vstore_pred(lPred, psV, sV1);
786 
787  lPred = c7x::strm_agen<2, vec>::get_vpred();
788  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
789  __vstore_pred(lPred, psV, sV2);
790 
791  lPred = c7x::strm_agen<2, vec>::get_vpred();
792  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
793  __vstore_pred(lPred, psV, sV3);
794 
795  lPred = c7x::strm_agen<2, vec>::get_vpred();
796  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
797  __vstore_pred(lPred, psV, sV4);
798 
799  lPred = c7x::strm_agen<2, vec>::get_vpred();
800  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
801  __vstore_pred(lPred, psV, sV5);
802 
803  lPred = c7x::strm_agen<2, vec>::get_vpred();
804  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
805  __vstore_pred(lPred, psV, sV6);
806 
807  lPred = c7x::strm_agen<2, vec>::get_vpred();
808  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
809  __vstore_pred(lPred, psV, sV7);
810 
811  lPred = c7x::strm_agen<2, vec>::get_vpred();
812  psV = c7x::strm_agen<2, vec>::get_adv(pLastU);
813  __vstore_pred(lPred, psV, sV8);
814  }
815  __SE1_CLOSE();
816  __SA1_CLOSE();
817  }
818 
819  if (nTiles4 > 0) {
820  /* 1 X (4 * eleCount) TILE */
821 
822  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
823  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
824  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
825  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
826  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
827 
828  dataType *pSE1 = pLocalU + colLimit8;
829  dataType *pSA1 = pLocalU + colLimit8;
830  dataType *pSA0 = pLastU;
831  dataType *pSA2 = pLastU;
832 
833  if (col) {
834  __SE1_OPEN(pSE1, seMatrixParams);
835  __SA1_OPEN(saMatrixParams);
836  }
837 
838  for (int32_t tile = 0; tile < nTiles4; tile++) {
839  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
840  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
841  vec sV1 = __vload_pred(lPred, pLoadVec);
842 
843  lPred = c7x::strm_agen<0, vec>::get_vpred();
844  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
845  vec sV2 = __vload_pred(lPred, pLoadVec);
846 
847  lPred = c7x::strm_agen<0, vec>::get_vpred();
848  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
849  vec sV3 = __vload_pred(lPred, pLoadVec);
850 
851  lPred = c7x::strm_agen<0, vec>::get_vpred();
852  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
853  vec sV4 = __vload_pred(lPred, pLoadVec);
854 
855  for (int32_t vertical = 0; vertical < col; vertical++) {
856  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
857 
858  vec v1 = c7x::strm_eng<1, vec>::get_adv();
859  vec v2 = c7x::strm_eng<1, vec>::get_adv();
860  vec v3 = c7x::strm_eng<1, vec>::get_adv();
861  vec v4 = c7x::strm_eng<1, vec>::get_adv();
862 
863  v1 -= sV1 * scalarDup;
864  v2 -= sV2 * scalarDup;
865  v3 -= sV3 * scalarDup;
866  v4 -= sV4 * scalarDup;
867 
868  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
869  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
870  __vstore_pred(sPred, pStoreVec, v1);
871 
872  sPred = c7x::strm_agen<1, vec>::get_vpred();
873  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
874  __vstore_pred(sPred, pStoreVec, v2);
875 
876  sPred = c7x::strm_agen<1, vec>::get_vpred();
877  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
878  __vstore_pred(sPred, pStoreVec, v3);
879 
880  sPred = c7x::strm_agen<1, vec>::get_vpred();
881  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
882  __vstore_pred(sPred, pStoreVec, v4);
883  }
884 
885  sV1 *= divVec;
886  sV2 *= divVec;
887  sV3 *= divVec;
888  sV4 *= divVec;
889 
890  lPred = c7x::strm_agen<2, vec>::get_vpred();
891  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
892  __vstore_pred(lPred, psV, sV1);
893 
894  lPred = c7x::strm_agen<2, vec>::get_vpred();
895  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
896  __vstore_pred(lPred, psV, sV2);
897 
898  lPred = c7x::strm_agen<2, vec>::get_vpred();
899  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
900  __vstore_pred(lPred, psV, sV3);
901 
902  lPred = c7x::strm_agen<2, vec>::get_vpred();
903  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
904  __vstore_pred(lPred, psV, sV4);
905  }
906  __SE1_CLOSE();
907  __SA1_CLOSE();
908  }
909 
910  if (nTiles2 > 0) {
911  /* 1 X (2*eleCount) TILE */
912 
913  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
914  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
915  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
916  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
917  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
918 
919  dataType *pSE1 = pLocalU + colLimit8 + colLimit4;
920  dataType *pSA1 = pLocalU + colLimit8 + colLimit4;
921  dataType *pSA0 = pLastU;
922  dataType *pSA2 = pLastU;
923 
924  if (col) {
925  __SE1_OPEN(pSE1, seMatrixParams);
926  __SA1_OPEN(saMatrixParams);
927  }
928 
929  for (int32_t tile = 0; tile < nTiles2; tile++) {
930  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
931  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
932  vec sV1 = __vload_pred(lPred, pLoadVec);
933 
934  lPred = c7x::strm_agen<0, vec>::get_vpred();
935  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
936  vec sV2 = __vload_pred(lPred, pLoadVec);
937 
938  for (int32_t vertical = 0; vertical < col; vertical++) {
939  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
940 
941  vec v1 = c7x::strm_eng<1, vec>::get_adv();
942  vec v2 = c7x::strm_eng<1, vec>::get_adv();
943 
944  v1 -= sV1 * scalarDup;
945  v2 -= sV2 * scalarDup;
946 
947  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
948  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
949  __vstore_pred(sPred, pStoreVec, v1);
950 
951  sPred = c7x::strm_agen<1, vec>::get_vpred();
952  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
953  __vstore_pred(sPred, pStoreVec, v2);
954  }
955 
956  sV1 *= divVec;
957  sV2 *= divVec;
958 
959  lPred = c7x::strm_agen<2, vec>::get_vpred();
960  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
961  __vstore_pred(lPred, psV, sV1);
962 
963  lPred = c7x::strm_agen<2, vec>::get_vpred();
964  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
965  __vstore_pred(lPred, psV, sV2);
966  }
967 
968  __SE1_CLOSE();
969  __SA1_CLOSE();
970  }
971 
972  if (nTiles1 > 0) {
973  /* 1 X (1*eleCount) TILE */
974 
975  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
976  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
977  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
978  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
979  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
980 
981  dataType *pSE1 = pLocalU + colLimit8 + colLimit4 + colLimit2;
982  dataType *pSA1 = pLocalU + colLimit8 + colLimit4 + colLimit2;
983  dataType *pSA0 = pLastU;
984  dataType *pSA2 = pLastU;
985 
986  if (col) {
987  __SE1_OPEN(pSE1, seMatrixParams);
988  __SA1_OPEN(saMatrixParams);
989  }
990 
991  for (int32_t tile = 0; tile < nTiles1; tile++) {
992  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
993  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
994  vec sV1 = __vload_pred(lPred, pLoadVec);
995 
996  for (int32_t vertical = 0; vertical < col; vertical++) {
997  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
998 
999  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1000 
1001  v1 -= sV1 * scalarDup;
1002 
1003  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1004  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1005  __vstore_pred(sPred, pStoreVec, v1);
1006  }
1007 
1008  sV1 *= divVec;
1009 
1010  lPred = c7x::strm_agen<2, vec>::get_vpred();
1011  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1012  __vstore_pred(lPred, psV, sV1);
1013  }
1014 
1015  __SE1_CLOSE();
1016  __SA1_CLOSE();
1017  }
1018 
1019  __SA0_CLOSE();
1020  __SA2_CLOSE();
1021 
1022  __SA0_OPEN(saRefParams);
1023  __SA2_OPEN(saRefStoreParams);
1024 
1025  if (nTiles8 > 0) {
1026  /* 1 X (8*eleCount) TILE */
1027 
1028  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
1029  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1030  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
1031  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
1032  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
1033 
1034  if (col) {
1035  __SE1_OPEN(pLocalInvU, seMatrixParams);
1036  __SA1_OPEN(saMatrixParams);
1037  }
1038 
1039  for (int32_t tile = 0; tile < nTiles8; tile++) {
1040  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1041  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1042  vec sV1 = __vload_pred(lPred, pLoadVec);
1043 
1044  lPred = c7x::strm_agen<0, vec>::get_vpred();
1045  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1046  vec sV2 = __vload_pred(lPred, pLoadVec);
1047 
1048  lPred = c7x::strm_agen<0, vec>::get_vpred();
1049  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1050  vec sV3 = __vload_pred(lPred, pLoadVec);
1051 
1052  lPred = c7x::strm_agen<0, vec>::get_vpred();
1053  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1054  vec sV4 = __vload_pred(lPred, pLoadVec);
1055 
1056  lPred = c7x::strm_agen<0, vec>::get_vpred();
1057  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1058  vec sV5 = __vload_pred(lPred, pLoadVec);
1059 
1060  lPred = c7x::strm_agen<0, vec>::get_vpred();
1061  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1062  vec sV6 = __vload_pred(lPred, pLoadVec);
1063 
1064  lPred = c7x::strm_agen<0, vec>::get_vpred();
1065  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1066  vec sV7 = __vload_pred(lPred, pLoadVec);
1067 
1068  lPred = c7x::strm_agen<0, vec>::get_vpred();
1069  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvU);
1070  vec sV8 = __vload_pred(lPred, pLoadVec);
1071  for (int32_t vertical = 0; vertical < col; vertical++) {
1072  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1073 
1074  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1075  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1076  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1077  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1078  vec v5 = c7x::strm_eng<1, vec>::get_adv();
1079  vec v6 = c7x::strm_eng<1, vec>::get_adv();
1080  vec v7 = c7x::strm_eng<1, vec>::get_adv();
1081  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1082 
1083  v1 -= sV1 * scalarDup;
1084  v2 -= sV2 * scalarDup;
1085  v3 -= sV3 * scalarDup;
1086  v4 -= sV4 * scalarDup;
1087  v5 -= sV5 * scalarDup;
1088  v6 -= sV6 * scalarDup;
1089  v7 -= sV7 * scalarDup;
1090  v8 -= sV8 * scalarDup;
1091 
1092  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1093  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1094  __vstore_pred(sPred, pStoreVec, v1);
1095 
1096  sPred = c7x::strm_agen<1, vec>::get_vpred();
1097  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1098  __vstore_pred(sPred, pStoreVec, v2);
1099 
1100  sPred = c7x::strm_agen<1, vec>::get_vpred();
1101  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1102  __vstore_pred(sPred, pStoreVec, v3);
1103 
1104  sPred = c7x::strm_agen<1, vec>::get_vpred();
1105  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1106  __vstore_pred(sPred, pStoreVec, v4);
1107 
1108  sPred = c7x::strm_agen<1, vec>::get_vpred();
1109  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1110  __vstore_pred(sPred, pStoreVec, v5);
1111 
1112  sPred = c7x::strm_agen<1, vec>::get_vpred();
1113  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1114  __vstore_pred(sPred, pStoreVec, v6);
1115 
1116  sPred = c7x::strm_agen<1, vec>::get_vpred();
1117  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1118  __vstore_pred(sPred, pStoreVec, v7);
1119 
1120  sPred = c7x::strm_agen<1, vec>::get_vpred();
1121  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvU);
1122  __vstore_pred(sPred, pStoreVec, v8);
1123  }
1124 
1125  sV1 *= divVec;
1126  sV2 *= divVec;
1127  sV3 *= divVec;
1128  sV4 *= divVec;
1129  sV5 *= divVec;
1130  sV6 *= divVec;
1131  sV7 *= divVec;
1132  sV8 *= divVec;
1133 
1134  lPred = c7x::strm_agen<2, vec>::get_vpred();
1135  vec *psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1136  __vstore_pred(lPred, psV, sV1);
1137 
1138  lPred = c7x::strm_agen<2, vec>::get_vpred();
1139  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1140  __vstore_pred(lPred, psV, sV2);
1141 
1142  lPred = c7x::strm_agen<2, vec>::get_vpred();
1143  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1144  __vstore_pred(lPred, psV, sV3);
1145 
1146  lPred = c7x::strm_agen<2, vec>::get_vpred();
1147  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1148  __vstore_pred(lPred, psV, sV4);
1149 
1150  lPred = c7x::strm_agen<2, vec>::get_vpred();
1151  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1152  __vstore_pred(lPred, psV, sV5);
1153 
1154  lPred = c7x::strm_agen<2, vec>::get_vpred();
1155  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1156  __vstore_pred(lPred, psV, sV6);
1157 
1158  lPred = c7x::strm_agen<2, vec>::get_vpred();
1159  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1160  __vstore_pred(lPred, psV, sV7);
1161 
1162  lPred = c7x::strm_agen<2, vec>::get_vpred();
1163  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvU);
1164  __vstore_pred(lPred, psV, sV8);
1165  }
1166  __SE1_CLOSE();
1167  __SA1_CLOSE();
1168  }
1169 
1170  if (nTiles4 > 0) {
1171  /* 1 X (4*eleCount) TILE */
1172 
1173  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
1174  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1175  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
1176  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
1177  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
1178 
1179  dataType *pSE1 = pLocalInvU + colLimit8;
1180  dataType *pSA1 = pLocalInvU + colLimit8;
1181  dataType *pSA0 = pLastInvU;
1182  dataType *pSA2 = pLastInvU;
1183 
1184  if (col) {
1185  __SE1_OPEN(pSE1, seMatrixParams);
1186  __SA1_OPEN(saMatrixParams);
1187  }
1188 
1189  for (int32_t tile = 0; tile < nTiles4; tile++) {
1190  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1191  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1192  vec sV1 = __vload_pred(lPred, pLoadVec);
1193 
1194  lPred = c7x::strm_agen<0, vec>::get_vpred();
1195  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1196  vec sV2 = __vload_pred(lPred, pLoadVec);
1197 
1198  lPred = c7x::strm_agen<0, vec>::get_vpred();
1199  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1200  vec sV3 = __vload_pred(lPred, pLoadVec);
1201 
1202  lPred = c7x::strm_agen<0, vec>::get_vpred();
1203  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1204  vec sV4 = __vload_pred(lPred, pLoadVec);
1205  for (int32_t vertical = 0; vertical < col; vertical++) {
1206  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1207 
1208  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1209  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1210  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1211  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1212 
1213  v1 -= sV1 * scalarDup;
1214  v2 -= sV2 * scalarDup;
1215  v3 -= sV3 * scalarDup;
1216  v4 -= sV4 * scalarDup;
1217 
1218  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1219  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1220  __vstore_pred(sPred, pStoreVec, v1);
1221 
1222  sPred = c7x::strm_agen<1, vec>::get_vpred();
1223  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1224  __vstore_pred(sPred, pStoreVec, v2);
1225 
1226  sPred = c7x::strm_agen<1, vec>::get_vpred();
1227  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1228  __vstore_pred(sPred, pStoreVec, v3);
1229 
1230  sPred = c7x::strm_agen<1, vec>::get_vpred();
1231  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1232  __vstore_pred(sPred, pStoreVec, v4);
1233  }
1234 
1235  sV1 *= divVec;
1236  sV2 *= divVec;
1237  sV3 *= divVec;
1238  sV4 *= divVec;
1239 
1240  lPred = c7x::strm_agen<2, vec>::get_vpred();
1241  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1242  __vstore_pred(lPred, psV, sV1);
1243 
1244  lPred = c7x::strm_agen<2, vec>::get_vpred();
1245  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1246  __vstore_pred(lPred, psV, sV2);
1247 
1248  lPred = c7x::strm_agen<2, vec>::get_vpred();
1249  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1250  __vstore_pred(lPred, psV, sV3);
1251 
1252  lPred = c7x::strm_agen<2, vec>::get_vpred();
1253  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1254  __vstore_pred(lPred, psV, sV4);
1255  }
1256  __SE1_CLOSE();
1257  __SA1_CLOSE();
1258  }
1259 
1260  if (nTiles2 > 0) {
1261  /* 1 X (2*eleCount) TILE */
1262 
1263  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1264  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1265  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1266  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1267  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1268 
1269  dataType *pSE1 = pLocalInvU + colLimit8 + colLimit4;
1270  dataType *pSA1 = pLocalInvU + colLimit8 + colLimit4;
1271  dataType *pSA0 = pLastInvU;
1272  dataType *pSA2 = pLastInvU;
1273 
1274  if (col) {
1275  __SE1_OPEN(pSE1, seMatrixParams);
1276  __SA1_OPEN(saMatrixParams);
1277  }
1278 
1279  for (int32_t tile = 0; tile < nTiles2; tile++) {
1280  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1281  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1282  vec sV1 = __vload_pred(lPred, pLoadVec);
1283 
1284  lPred = c7x::strm_agen<0, vec>::get_vpred();
1285  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1286  vec sV2 = __vload_pred(lPred, pLoadVec);
1287 
1288  for (int32_t vertical = 0; vertical < col; vertical++) {
1289  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1290 
1291  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1292  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1293 
1294  v1 -= sV1 * scalarDup;
1295  v2 -= sV2 * scalarDup;
1296 
1297  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1298  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1299  __vstore_pred(sPred, pStoreVec, v1);
1300 
1301  sPred = c7x::strm_agen<1, vec>::get_vpred();
1302  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1303  __vstore_pred(sPred, pStoreVec, v2);
1304  }
1305 
1306  sV1 *= divVec;
1307  sV2 *= divVec;
1308 
1309  lPred = c7x::strm_agen<2, vec>::get_vpred();
1310  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1311  __vstore_pred(lPred, psV, sV1);
1312 
1313  lPred = c7x::strm_agen<2, vec>::get_vpred();
1314  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1315  __vstore_pred(lPred, psV, sV2);
1316  }
1317 
1318  __SE1_CLOSE();
1319  __SA1_CLOSE();
1320  }
1321 
1322  if (nTiles1 > 0) {
1323  /* 1 X (1*eleCount) TILE */
1324 
1325  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1326  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1327  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1328  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1329  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1330 
1331  dataType *pSE1 = pLocalInvU + colLimit8 + colLimit4 + colLimit2;
1332  dataType *pSA1 = pLocalInvU + colLimit8 + colLimit4 + colLimit2;
1333  dataType *pSA0 = pLastInvU;
1334  dataType *pSA2 = pLastInvU;
1335 
1336  if (col) {
1337  __SE1_OPEN(pSE1, seMatrixParams);
1338  __SA1_OPEN(saMatrixParams);
1339  }
1340 
1341  for (int32_t tile = 0; tile < nTiles1; tile++) {
1342  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1343  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1344  vec sV1 = __vload_pred(lPred, pLoadVec);
1345 
1346  for (int32_t vertical = 0; vertical < col; vertical++) {
1347  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1348 
1349  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1350 
1351  v1 -= sV1 * scalarDup;
1352 
1353  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1354  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1355  __vstore_pred(sPred, pStoreVec, v1);
1356  }
1357 
1358  sV1 *= divVec;
1359 
1360  lPred = c7x::strm_agen<2, vec>::get_vpred();
1361  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1362  __vstore_pred(lPred, psV, sV1);
1363  }
1364 
1365  __SE1_CLOSE();
1366  __SA1_CLOSE();
1367  }
1368  __SE0_CLOSE();
1369  __SA0_CLOSE();
1370  __SA2_CLOSE();
1371  }
1372 
1373  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
1374 }
/* Explicit instantiation definition of DSPLIB_lud_inv_invU_exec_ci for
 * dataType = float: asks the compiler to emit the single-precision version of
 * the function template in this translation unit. The parameter list has to
 * match the template's signature with dataType replaced by float.
 * NOTE(review): presumably required because the template body is defined in
 * this source file rather than in a header -- confirm against the callers. */
1375 template void DSPLIB_lud_inv_invU_exec_ci<float>(float *pLocalU,
1376  float *pLocalInvU,
1377  int32_t nCols,
1378  int32_t colStrideOrder,
1379  int32_t colInvAStride,
1380  float *factArray,
1381  uint8_t *pBlock);
/* Explicit instantiation definition of DSPLIB_lud_inv_invU_exec_ci for
 * dataType = double: asks the compiler to emit the double-precision version of
 * the function template in this translation unit. The parameter list has to
 * match the template's signature with dataType replaced by double.
 * NOTE(review): presumably required because the template body is defined in
 * this source file rather than in a header -- confirm against the callers. */
1382 template void DSPLIB_lud_inv_invU_exec_ci<double>(double *pLocalU,
1383  double *pLocalInvU,
1384  int32_t nCols,
1385  int32_t colStrideOrder,
1386  int32_t colInvAStride,
1387  double *factArray,
1388  uint8_t *pBlock);
1389 
1390 template <typename dataType>
1391 void DSPLIB_lud_inv_invL_exec_ci(dataType *pLocalL,
1392  dataType *pLocalInvL,
1393  int32_t nCols,
1394  int32_t colStrideOrder,
1395  int32_t colInvLStride,
1396  dataType *factArray,
1397  uint8_t *pBlock)
1398 {
1399  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
1400  typedef typename c7x::make_full_vector<dataType>::type vec;
1401 
1402  /* Factor copy params */
1403  __SE_TEMPLATE_v1 se0ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1404  __SE_TEMPLATE_v1 se1ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1405  __SA_TEMPLATE_v1 sa0ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1406  __SA_TEMPLATE_v1 sa1ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1407 
1408  uint32_t eleCount = c7x::element_count_of<vec>::value;
1409 
1410  /* Updation params */
1411  __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1412  __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1413  __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1414  __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1415  __SA_TEMPLATE_v1 saRefStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1416 
1417  int32_t lenTile8 = 8;
1418  int32_t lenTile4 = 4;
1419  int32_t lenTile2 = 2;
1420  int32_t lenTile1 = 1;
1421 
1422  int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
1423  int32_t nTiles8 = nTiles1 / lenTile8;
1424  nTiles1 -= nTiles8 * lenTile8;
1425  int32_t nTiles4 = nTiles1 / lenTile4;
1426  nTiles1 -= nTiles4 * lenTile4;
1427  int32_t nTiles2 = nTiles1 / lenTile2;
1428  nTiles1 -= nTiles2 * lenTile2;
1429 
1430  int32_t remainingCols = nCols;
1431  int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
1432  colLimit8 = (remainingCols < (colLimit8)) ? remainingCols : colLimit8;
1433 
1434  remainingCols = remainingCols - colLimit8;
1435  int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
1436  colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
1437 
1438  remainingCols = remainingCols - colLimit4;
1439  int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
1440  colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
1441 
1442  int32_t colLimit1 = remainingCols - colLimit2;
1443  seScalarParams.ICNT1 = 2 * (nTiles8 + nTiles4 + nTiles2 + nTiles1);
1444 
1445  for (int32_t col = 0; col < nCols; col++) {
1446  dataType *pRefL = pLocalL + (colStrideOrder * col);
1447  dataType *pRefInvL = pLocalInvL + (colInvLStride * col);
1448  dataType *pStartL = pLocalL + (colStrideOrder * (col + 1));
1449  dataType *pStartInvL = pLocalInvL + (colStrideOrder * (col + 1));
1450  int32_t nRows = (nCols - 1) - col;
1451 
1452  dataType diag = pLocalL[col + col * colStrideOrder];
1453 #ifdef LUD_INV_HIGH_PRECISION
1454  vec divVec = (vec) (1 / diag);
1455 #else
1456  dataType recipScalar = __recip(diag);
1457  dataType twoP0 = 2.0;
1458 
1459  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
1460  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
1461 
1462  vec divVec = (vec) recipScalar;
1463 #endif
1464  if (nRows > 0) {
1465  DSPLIB_lud_inv_factor_exec_ci<dataType, vec>(&pStartL[col], colStrideOrder, nRows, factArray, divVec, pBlock,
1466  se0ParamsFact, se1ParamsFact, sa0ParamsFact, sa1ParamsFact);
1467  seScalarParams.ICNT0 = nRows;
1468  __SE0_OPEN(factArray, seScalarParams);
1469  }
1470 
1471  __SA0_OPEN(saRefParams);
1472  __SA2_OPEN(saRefStoreParams);
1473 
1474  if (nTiles8 > 0) {
1475  /* 1 X (8*eleCount) TILE */
1476 
1477  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
1478  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1479  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
1480  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
1481  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
1482 
1483  if (nRows) {
1484  __SE1_OPEN(pStartL, seMatrixParams);
1485  __SA1_OPEN(saMatrixParams);
1486  }
1487 
1488  for (int32_t tile = 0; tile < nTiles8; tile++) {
1489  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1490  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1491  vec sV1 = __vload_pred(lPred, pLoadVec);
1492 
1493  lPred = c7x::strm_agen<0, vec>::get_vpred();
1494  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1495  vec sV2 = __vload_pred(lPred, pLoadVec);
1496 
1497  lPred = c7x::strm_agen<0, vec>::get_vpred();
1498  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1499  vec sV3 = __vload_pred(lPred, pLoadVec);
1500 
1501  lPred = c7x::strm_agen<0, vec>::get_vpred();
1502  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1503  vec sV4 = __vload_pred(lPred, pLoadVec);
1504 
1505  lPred = c7x::strm_agen<0, vec>::get_vpred();
1506  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1507  vec sV5 = __vload_pred(lPred, pLoadVec);
1508 
1509  lPred = c7x::strm_agen<0, vec>::get_vpred();
1510  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1511  vec sV6 = __vload_pred(lPred, pLoadVec);
1512 
1513  lPred = c7x::strm_agen<0, vec>::get_vpred();
1514  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1515  vec sV7 = __vload_pred(lPred, pLoadVec);
1516 
1517  lPred = c7x::strm_agen<0, vec>::get_vpred();
1518  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefL);
1519  vec sV8 = __vload_pred(lPred, pLoadVec);
1520 
1521  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1522 
1523  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1524 
1525  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1526  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1527  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1528  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1529  vec v5 = c7x::strm_eng<1, vec>::get_adv();
1530  vec v6 = c7x::strm_eng<1, vec>::get_adv();
1531  vec v7 = c7x::strm_eng<1, vec>::get_adv();
1532  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1533 
1534  v1 -= sV1 * scalarDup;
1535  v2 -= sV2 * scalarDup;
1536  v3 -= sV3 * scalarDup;
1537  v4 -= sV4 * scalarDup;
1538  v5 -= sV5 * scalarDup;
1539  v6 -= sV6 * scalarDup;
1540  v7 -= sV7 * scalarDup;
1541  v8 -= sV8 * scalarDup;
1542 
1543  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1544  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1545  __vstore_pred(sPred, pStoreVec, v1);
1546 
1547  sPred = c7x::strm_agen<1, vec>::get_vpred();
1548  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1549  __vstore_pred(sPred, pStoreVec, v2);
1550 
1551  sPred = c7x::strm_agen<1, vec>::get_vpred();
1552  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1553  __vstore_pred(sPred, pStoreVec, v3);
1554 
1555  sPred = c7x::strm_agen<1, vec>::get_vpred();
1556  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1557  __vstore_pred(sPred, pStoreVec, v4);
1558 
1559  sPred = c7x::strm_agen<1, vec>::get_vpred();
1560  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1561  __vstore_pred(sPred, pStoreVec, v5);
1562 
1563  sPred = c7x::strm_agen<1, vec>::get_vpred();
1564  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1565  __vstore_pred(sPred, pStoreVec, v6);
1566 
1567  sPred = c7x::strm_agen<1, vec>::get_vpred();
1568  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1569  __vstore_pred(sPred, pStoreVec, v7);
1570 
1571  sPred = c7x::strm_agen<1, vec>::get_vpred();
1572  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartL);
1573  __vstore_pred(sPred, pStoreVec, v8);
1574  }
1575  }
1576  __SE1_CLOSE();
1577  __SA1_CLOSE();
1578  }
1579 
1580  if (nTiles4 > 0) {
1581  /* 1 X (4*eleCount) TILE */
1582 
1583  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
1584  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1585  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
1586  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
1587  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
1588 
1589  dataType *pSE1 = pStartL + colLimit8;
1590  dataType *pSA1 = pStartL + colLimit8;
1591  dataType *pSA0 = pRefL;
1592 
1593  if (nRows) {
1594  __SE1_OPEN(pSE1, seMatrixParams);
1595  __SA1_OPEN(saMatrixParams);
1596  }
1597 
1598  for (int32_t tile = 0; tile < nTiles4; tile++) {
1599  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1600  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1601  vec sV1 = __vload_pred(lPred, pLoadVec);
1602 
1603  lPred = c7x::strm_agen<0, vec>::get_vpred();
1604  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1605  vec sV2 = __vload_pred(lPred, pLoadVec);
1606 
1607  lPred = c7x::strm_agen<0, vec>::get_vpred();
1608  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1609  vec sV3 = __vload_pred(lPred, pLoadVec);
1610 
1611  lPred = c7x::strm_agen<0, vec>::get_vpred();
1612  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1613  vec sV4 = __vload_pred(lPred, pLoadVec);
1614 
1615  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1616  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1617 
1618  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1619  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1620  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1621  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1622 
1623  v1 -= sV1 * scalarDup;
1624  v2 -= sV2 * scalarDup;
1625  v3 -= sV3 * scalarDup;
1626  v4 -= sV4 * scalarDup;
1627 
1628  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1629  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1630  __vstore_pred(sPred, pStoreVec, v1);
1631 
1632  sPred = c7x::strm_agen<1, vec>::get_vpred();
1633  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1634  __vstore_pred(sPred, pStoreVec, v2);
1635 
1636  sPred = c7x::strm_agen<1, vec>::get_vpred();
1637  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1638  __vstore_pred(sPred, pStoreVec, v3);
1639 
1640  sPred = c7x::strm_agen<1, vec>::get_vpred();
1641  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1642  __vstore_pred(sPred, pStoreVec, v4);
1643  }
1644  }
1645  __SE1_CLOSE();
1646  __SA1_CLOSE();
1647  }
1648 
1649  if (nTiles2 > 0) {
1650  /* 1 X (2*eleCount) TILE */
1651 
1652  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1653  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1654  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1655  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1656  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1657 
1658  dataType *pSE1 = pStartL + colLimit8 + colLimit4;
1659  dataType *pSA1 = pStartL + colLimit8 + colLimit4;
1660  dataType *pSA0 = pRefL;
1661 
1662  if (nRows) {
1663  __SE1_OPEN(pSE1, seMatrixParams);
1664  __SA1_OPEN(saMatrixParams);
1665  }
1666 
1667  for (int32_t tile = 0; tile < nTiles2; tile++) {
1668  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1669  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1670  vec sV1 = __vload_pred(lPred, pLoadVec);
1671 
1672  lPred = c7x::strm_agen<0, vec>::get_vpred();
1673  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1674  vec sV2 = __vload_pred(lPred, pLoadVec);
1675 
1676  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1677  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1678 
1679  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1680  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1681 
1682  v1 -= sV1 * scalarDup;
1683  v2 -= sV2 * scalarDup;
1684 
1685  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1686  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1687  __vstore_pred(sPred, pStoreVec, v1);
1688 
1689  sPred = c7x::strm_agen<1, vec>::get_vpred();
1690  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1691  __vstore_pred(sPred, pStoreVec, v2);
1692  }
1693  }
1694 
1695  __SE1_CLOSE();
1696  __SA1_CLOSE();
1697  }
1698  if (nTiles1 > 0) {
1699  /* 1 X (1*eleCount) TILE */
1700 
1701  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1702  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1703  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1704  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1705  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1706 
1707  dataType *pSE1 = pStartL + colLimit8 + colLimit4 + colLimit2;
1708  dataType *pSA1 = pStartL + colLimit8 + colLimit4 + colLimit2;
1709  dataType *pSA0 = pRefL;
1710 
1711  if (nRows) {
1712  __SE1_OPEN(pSE1, seMatrixParams);
1713  __SA1_OPEN(saMatrixParams);
1714  }
1715 
1716  for (int32_t tile = 0; tile < nTiles1; tile++) {
1717  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1718  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1719  vec sV1 = __vload_pred(lPred, pLoadVec);
1720 
1721  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1722  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1723 
1724  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1725 
1726  v1 -= sV1 * scalarDup;
1727 
1728  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1729  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1730  __vstore_pred(sPred, pStoreVec, v1);
1731  }
1732  }
1733 
1734  __SE1_CLOSE();
1735  __SA1_CLOSE();
1736  }
1737 
1738  __SA0_CLOSE();
1739  __SA2_CLOSE();
1740 
1741  __SA0_OPEN(saRefParams);
1742  __SA2_OPEN(saRefStoreParams);
1743 
1744  if (nTiles8 > 0) {
1745  /* 1 X (8*eleCount) TILE */
1746 
1747  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
1748  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1749  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
1750  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
1751  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
1752 
1753  if (nRows) {
1754  __SE1_OPEN(pStartInvL, seMatrixParams);
1755  __SA1_OPEN(saMatrixParams);
1756  }
1757 
1758  for (int32_t tile = 0; tile < nTiles8; tile++) {
1759  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1760  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1761  vec sV1 = __vload_pred(lPred, pLoadVec);
1762 
1763  lPred = c7x::strm_agen<0, vec>::get_vpred();
1764  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1765  vec sV2 = __vload_pred(lPred, pLoadVec);
1766 
1767  lPred = c7x::strm_agen<0, vec>::get_vpred();
1768  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1769  vec sV3 = __vload_pred(lPred, pLoadVec);
1770 
1771  lPred = c7x::strm_agen<0, vec>::get_vpred();
1772  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1773  vec sV4 = __vload_pred(lPred, pLoadVec);
1774 
1775  lPred = c7x::strm_agen<0, vec>::get_vpred();
1776  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1777  vec sV5 = __vload_pred(lPred, pLoadVec);
1778 
1779  lPred = c7x::strm_agen<0, vec>::get_vpred();
1780  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1781  vec sV6 = __vload_pred(lPred, pLoadVec);
1782 
1783  lPred = c7x::strm_agen<0, vec>::get_vpred();
1784  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1785  vec sV7 = __vload_pred(lPred, pLoadVec);
1786 
1787  lPred = c7x::strm_agen<0, vec>::get_vpred();
1788  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pRefInvL);
1789  vec sV8 = __vload_pred(lPred, pLoadVec);
1790  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1791  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1792 
1793  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1794  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1795  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1796  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1797  vec v5 = c7x::strm_eng<1, vec>::get_adv();
1798  vec v6 = c7x::strm_eng<1, vec>::get_adv();
1799  vec v7 = c7x::strm_eng<1, vec>::get_adv();
1800  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1801 
1802  v1 -= sV1 * scalarDup;
1803  v2 -= sV2 * scalarDup;
1804  v3 -= sV3 * scalarDup;
1805  v4 -= sV4 * scalarDup;
1806  v5 -= sV5 * scalarDup;
1807  v6 -= sV6 * scalarDup;
1808  v7 -= sV7 * scalarDup;
1809  v8 -= sV8 * scalarDup;
1810 
1811  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1812  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1813  __vstore_pred(sPred, pStoreVec, v1);
1814 
1815  sPred = c7x::strm_agen<1, vec>::get_vpred();
1816  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1817  __vstore_pred(sPred, pStoreVec, v2);
1818 
1819  sPred = c7x::strm_agen<1, vec>::get_vpred();
1820  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1821  __vstore_pred(sPred, pStoreVec, v3);
1822 
1823  sPred = c7x::strm_agen<1, vec>::get_vpred();
1824  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1825  __vstore_pred(sPred, pStoreVec, v4);
1826 
1827  sPred = c7x::strm_agen<1, vec>::get_vpred();
1828  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1829  __vstore_pred(sPred, pStoreVec, v5);
1830 
1831  sPred = c7x::strm_agen<1, vec>::get_vpred();
1832  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1833  __vstore_pred(sPred, pStoreVec, v6);
1834 
1835  sPred = c7x::strm_agen<1, vec>::get_vpred();
1836  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1837  __vstore_pred(sPred, pStoreVec, v7);
1838 
1839  sPred = c7x::strm_agen<1, vec>::get_vpred();
1840  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pStartInvL);
1841  __vstore_pred(sPred, pStoreVec, v8);
1842  }
1843  }
1844  __SE1_CLOSE();
1845  __SA1_CLOSE();
1846  }
1847 
1848  if (nTiles4 > 0) {
1849 
1850  /* 1 X (4*eleCount) TILE */
1851 
1852  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
1853  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1854  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
1855  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
1856  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
1857 
1858  dataType *pSE1 = pStartInvL + colLimit8;
1859  dataType *pSA1 = pStartInvL + colLimit8;
1860  dataType *pSA0 = pRefInvL;
1861 
1862  if (nRows) {
1863  __SE1_OPEN(pSE1, seMatrixParams);
1864  __SA1_OPEN(saMatrixParams);
1865  }
1866 
1867  for (int32_t tile = 0; tile < nTiles4; tile++) {
1868  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1869  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1870  vec sV1 = __vload_pred(lPred, pLoadVec);
1871 
1872  lPred = c7x::strm_agen<0, vec>::get_vpred();
1873  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1874  vec sV2 = __vload_pred(lPred, pLoadVec);
1875 
1876  lPred = c7x::strm_agen<0, vec>::get_vpred();
1877  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1878  vec sV3 = __vload_pred(lPred, pLoadVec);
1879 
1880  lPred = c7x::strm_agen<0, vec>::get_vpred();
1881  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1882  vec sV4 = __vload_pred(lPred, pLoadVec);
1883 
1884  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1885  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1886 
1887  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1888  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1889  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1890  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1891 
1892  v1 -= sV1 * scalarDup;
1893  v2 -= sV2 * scalarDup;
1894  v3 -= sV3 * scalarDup;
1895  v4 -= sV4 * scalarDup;
1896 
1897  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1898  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1899  __vstore_pred(sPred, pStoreVec, v1);
1900 
1901  sPred = c7x::strm_agen<1, vec>::get_vpred();
1902  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1903  __vstore_pred(sPred, pStoreVec, v2);
1904 
1905  sPred = c7x::strm_agen<1, vec>::get_vpred();
1906  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1907  __vstore_pred(sPred, pStoreVec, v3);
1908 
1909  sPred = c7x::strm_agen<1, vec>::get_vpred();
1910  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1911  __vstore_pred(sPred, pStoreVec, v4);
1912  }
1913  }
1914  __SE1_CLOSE();
1915  __SA1_CLOSE();
1916  }
1917 
1918  if (nTiles2 > 0) {
1919  /* 1 X (2*eleCount) TILE */
1920 
1921  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1922  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1923  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1924  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1925  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1926 
1927  dataType *pSE1 = pStartInvL + colLimit8 + colLimit4;
1928  dataType *pSA1 = pStartInvL + colLimit8 + colLimit4;
1929  dataType *pSA0 = pRefInvL;
1930 
1931  if (nRows) {
1932  __SE1_OPEN(pSE1, seMatrixParams);
1933  __SA1_OPEN(saMatrixParams);
1934  }
1935 
1936  for (int32_t tile = 0; tile < nTiles2; tile++) {
1937  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1938  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1939  vec sV1 = __vload_pred(lPred, pLoadVec);
1940 
1941  lPred = c7x::strm_agen<0, vec>::get_vpred();
1942  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1943  vec sV2 = __vload_pred(lPred, pLoadVec);
1944 
1945  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1946  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1947 
1948  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1949  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1950 
1951  v1 -= sV1 * scalarDup;
1952  v2 -= sV2 * scalarDup;
1953 
1954  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1955  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1956  __vstore_pred(sPred, pStoreVec, v1);
1957 
1958  sPred = c7x::strm_agen<1, vec>::get_vpred();
1959  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1960  __vstore_pred(sPred, pStoreVec, v2);
1961  }
1962  }
1963 
1964  __SE1_CLOSE();
1965  __SA1_CLOSE();
1966  }
1967 
1968  if (nTiles1 > 0) {
1969  /* 1 X (1*eleCount) TILE */
1970 
1971  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1972  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
1973  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1974  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1975  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1976 
1977  dataType *pSE1 = pStartInvL + colLimit8 + colLimit4 + colLimit2;
1978  dataType *pSA1 = pStartInvL + colLimit8 + colLimit4 + colLimit2;
1979  dataType *pSA0 = pRefInvL;
1980 
1981  if (nRows) {
1982  __SE1_OPEN(pSE1, seMatrixParams);
1983  __SA1_OPEN(saMatrixParams);
1984  }
1985 
1986  for (int32_t tile = 0; tile < nTiles1; tile++) {
1987  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1988  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1989  vec sV1 = __vload_pred(lPred, pLoadVec);
1990 
1991  for (int32_t vertical = 0; vertical < nRows; vertical++) {
1992  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1993 
1994  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1995 
1996  v1 -= sV1 * scalarDup;
1997 
1998  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1999  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
2000  __vstore_pred(sPred, pStoreVec, v1);
2001  }
2002  }
2003 
2004  __SE1_CLOSE();
2005  __SA1_CLOSE();
2006  }
2007  __SE0_CLOSE();
2008  __SA0_CLOSE();
2009  __SA2_CLOSE();
2010  }
2011 
2012  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
2013 }
2014 template void DSPLIB_lud_inv_invL_exec_ci<float>(float *pLocalL,
2015  float *pLocalInvL,
2016  int32_t nCols,
2017  int32_t colStrideOrder,
2018  int32_t colInvLStride,
2019  float *factArray,
2020  uint8_t *pBlock);
2021 template void DSPLIB_lud_inv_invL_exec_ci<double>(double *pLocalL,
2022  double *pLocalInvL,
2023  int32_t nCols,
2024  int32_t colStrideOrder,
2025  int32_t colInvLStride,
2026  double *factArray,
2027  uint8_t *pBlock);
2028 
2029 template <typename dataType>
2031  void *restrict pP,
2032  void *restrict pL,
2033  void *restrict pU,
2034  void *restrict pinvA,
2035  void *restrict pStratch)
2036 {
2037  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
2038 
2039  DSPLIB_lud_inv_PrivArgs *pKerPrivArgs = (DSPLIB_lud_inv_PrivArgs *) handle;
2040 
2041  int32_t order = pKerPrivArgs->order;
2042  int32_t strideOrder = pKerPrivArgs->strideOrder;
2043  int32_t strideP = pKerPrivArgs->strideP;
2044  int32_t dataSize = sizeof(dataType);
2045  int32_t dataSizeP = sizeof(uint16_t);
2046 
2047  int32_t orderStride = strideOrder / dataSize;
2048  int32_t orderPStride = strideP / dataSizeP;
2049 
2050  /* Typecast void pointers to respective data type */
2051  unsigned short *pPLocal = (unsigned short *) pP;
2052  dataType *pLLocal = (dataType *) pL;
2053  dataType *pULocal = (dataType *) pU;
2054  dataType *pinvALocal = (dataType *) pinvA;
2055  dataType *pFactArray = (dataType *) pStratch;
2056  uint8_t *pBlock = pKerPrivArgs->bufPblock;
2057  DSPLIB_matMul_PrivArgs *pMatMulKerPrivArgs = &pKerPrivArgs->pMatMulKerPrivArgs;
2058 
2059  DSPLIB_DEBUGPRINTFN(0, "pPLocal: %p pLLocal: %p pULocal: %p pinvALocal: %p order: %d\n", pPLocal, pLLocal, pULocal,
2060  pinvALocal, order);
2061 
2062  dataType *invL, *invU, *invU_x_invL;
2063  /* set inv_A matrix to identity */
2064  invL = &pinvALocal[0];
2065  DSPLIB_lud_identity_matrix_generate_exec_ci<dataType>(invL, order, orderStride, pBlock);
2066  DSPLIB_lud_inv_invL_exec_ci<dataType>(pLLocal, invL, order, orderStride, orderStride, pFactArray, pBlock);
2067 
2068  /* set invU matrix to identity */
2069  invU = &pLLocal[0];
2070  DSPLIB_lud_identity_matrix_generate_exec_ci<dataType>(invU, order, orderStride, pBlock);
2071 
2072  /* use Gauss Jordan algorithm to invert U whose result is in L */
2073  DSPLIB_lud_inv_invU_exec_ci(pULocal, invU, order, orderStride, orderStride, pFactArray, pBlock);
2074 
2075  /* -----------------------------------------------------------------------------------
2076  To compute "inv_A=inv(U)*inv(L)*P".
2077  - Multiply invU and invL
2078  - As P is permutation matrix we use indices of "1" present in each row of Permutation Matrix (pPLocaL)to shuffle the
2079  columns of invU_x_invL
2080  --------------------------------------------------------------------------------------*/
2081  invU_x_invL = &pULocal[0];
2082  /* Multiply invU * invL */
2083  DSPLIB_matMul_exec_ci<dataType>(pMatMulKerPrivArgs, invU, invL, invU_x_invL);
2084 
2085  uint32_t *permuteOrder = (uint32_t *) pFactArray;
2086  /* Get the indices of "1" present in each row of "pPLocal" and store in "permuteOrder" */
2087  DSPLIB_lud_inv_permuteIndex_ci(pPLocal, order, orderPStride, permuteOrder, pBlock);
2088 
2089  /* To shuffle the columns first transpose the matrix invU_x_invL*/
2090  DSPLIB_matTrans_exec_ci<dataType>(&pKerPrivArgs->pMatTransKerPrivArgs, invU_x_invL, pinvALocal);
2091 
2092  /* Based on indices present in "permuteOrder" shuffle the rows of transposed invU_x_invL */
2093  DSPLIB_lud_inv_permuteRows_ci<dataType>(pinvALocal, order, orderStride, permuteOrder, invU_x_invL, pBlock);
2094 
2095  /* Transpose back invU_x_invL to get the final result i.e. shuffled columns */
2096  DSPLIB_matTrans_exec_ci<dataType>(&pKerPrivArgs->pMatTransKerPrivArgs, invU_x_invL, pinvALocal);
2097 
2098  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
2099  return DSPLIB_SUCCESS;
2100 }
2101 
2102 // explicit instantiation for the different data type versions
2104  void *restrict pP,
2105  void *restrict pL,
2106  void *restrict pU,
2107  void *restrict pinvA,
2108  void *restrict pStratch);
2109 
2111  void *restrict pP,
2112  void *restrict pL,
2113  void *restrict pU,
2114  void *restrict pinvA,
2115  void *restrict pStratch);
2116 
2117 /* ======================================================================== */
2118 /* End of file: DSPLIB_lud_inv_ci.cpp */
2119 /* ======================================================================== */
template void DSPLIB_lud_inv_invU_exec_ci< double >(double *pLocalU, double *pLocalInvU, int32_t nCols, int32_t colStrideOrder, int32_t colInvAStride, double *factArray, uint8_t *pBlock)
template void DSPLIB_lud_inv_opt_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_inv_invU_exec_ci< float >(float *pLocalU, float *pLocalInvU, int32_t nCols, int32_t colStrideOrder, int32_t colInvAStride, float *factArray, uint8_t *pBlock)
template DSPLIB_STATUS DSPLIB_lud_inv_init_ci< float >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsinvA, const DSPLIB_lud_invInitArgs *pKerInitArgs)
template void DSPLIB_lud_inv_invL_exec_ci< float >(float *pLocalL, float *pLocalInvL, int32_t nCols, int32_t colStrideOrder, int32_t colInvLStride, float *factArray, uint8_t *pBlock)
template void DSPLIB_lud_inv_permuteRows_ci< float >(float *pIn, int32_t order, int32_t colStride, uint32_t *permuteOrder, float *pOut, uint8_t *pBlock)
void DSPLIB_lud_inv_opt_init_ci(DSPLIB_kernelHandle handle)
void DSPLIB_lud_inv_permuteIndex_ci(unsigned short *pIn, int32_t order, int32_t colPStride, uint32_t *permuteOrder, uint8_t *pBlock)
template void DSPLIB_lud_inv_permuteRows_init_ci< double >(DSPLIB_kernelHandle handle)
void DSPLIB_lud_inv_permuteRows_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_lud_inv_permuteRows_ci< double >(double *pIn, int32_t order, int32_t colStride, uint32_t *permuteOrder, double *pOut, uint8_t *pBlock)
template void DSPLIB_lud_inv_permuteRows_init_ci< float >(DSPLIB_kernelHandle handle)
DSPLIB_STATUS DSPLIB_lud_inv_init_ci(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsinvA, const DSPLIB_lud_invInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template void DSPLIB_lud_inv_invL_exec_ci< double >(double *pLocalL, double *pLocalInvL, int32_t nCols, int32_t colStrideOrder, int32_t colInvLStride, double *factArray, uint8_t *pBlock)
template DSPLIB_STATUS DSPLIB_lud_inv_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pinvA, void *restrict pStratch)
template DSPLIB_STATUS DSPLIB_lud_inv_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pinvA, void *restrict pStratch)
template DSPLIB_STATUS DSPLIB_lud_inv_init_ci< double >(DSPLIB_kernelHandle handle, const DSPLIB_bufParams2D_t *bufParamsP, const DSPLIB_bufParams2D_t *bufParamsL, const DSPLIB_bufParams2D_t *bufParamsU, const DSPLIB_bufParams2D_t *bufParamsinvA, const DSPLIB_lud_invInitArgs *pKerInitArgs)
static dataType DSPLIB_lud_inv_factor_exec_ci(dataType *pCol, int32_t colStride, int32_t nRows, dataType *pFactor, vec scaleVec, uint8_t *pBlock, __SE_TEMPLATE_v1 se0Params, __SE_TEMPLATE_v1 se1Params, __SA_TEMPLATE_v1 sa0Params, __SA_TEMPLATE_v1 sa1Params)
template void DSPLIB_lud_inv_opt_init_ci< double >(DSPLIB_kernelHandle handle)
void DSPLIB_lud_inv_invU_exec_ci(dataType *pLocalU, dataType *pLocalInvU, int32_t nCols, int32_t colStrideOrder, int32_t colInvAStride, dataType *factArray, uint8_t *pBlock)
void DSPLIB_lud_inv_permuteRows_ci(dataType *pIn, int32_t order, int32_t colStride, uint32_t *permuteOrder, dataType *pOut, uint8_t *pBlock)
DSPLIB_STATUS DSPLIB_lud_inv_exec_ci(DSPLIB_kernelHandle handle, void *restrict pP, void *restrict pL, void *restrict pU, void *restrict pinvA, void *restrict pStratch)
This function is the main execution function for the C7x implementation of the kernel....
void DSPLIB_lud_inv_invL_exec_ci(dataType *pLocalL, dataType *pLocalInvL, int32_t nCols, int32_t colStrideOrder, int32_t colInvLStride, dataType *factArray, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_lud_inv.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 2 dimensional buffer descriptor.
int32_t stride_y
Stride in Y dimension in bytes.
uint32_t dim_x
Width of buffer in X dimension in elements.
uint32_t dim_y
Height of buffer in Y dimension in elements.
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
DSPLIB_matMul_PrivArgs pMatMulKerPrivArgs
Privargs for the matMul kernel.
uint8_t bufPblock[DSPLIB_LUD_INV_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters
DSPLIB_matTrans_PrivArgs pMatTransKerPrivArgs
Privargs for the matTrans kernel.
int32_t strideOrder
Stride between rows of input and output data matrix
int32_t order
Size of input buffer for different batches DSPLIB_lud_inv_init that will be retrieved and used by DSP...
int32_t strideP
Stride between rows of output data matrix P
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
Structure containing the parameters to initialize the kernel.
uint32_t dimX
Size of input data.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
int32_t strideOut
Stride between rows of output data matrix
uint32_t heightIn
Height of input data matrix
int32_t strideIn
Stride between rows of input data matrix
uint32_t widthIn
Size of input buffer for different batches DSPLIB_matTrans_init that will be retrieved and used by DS...