DSPLIB User Guide
DSPLIB_qrd_inverse_ci.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2016 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 /*******************************************************************************
29  *
30  * INCLUDES
31  *
32  ******************************************************************************/
33 #include "DSPLIB_qrd_common.h"
35 
36 /*******************************************************************************
37  *
38  * INITIALIZATION
39  *
40  ******************************************************************************/
/**
 * @brief Build the streaming-engine (SE) and streaming-address-generator (SA)
 *        templates for the "factor" step and store them in the kernel's
 *        parameter block.
 *
 * Two templates are written into pKerPrivArgs->bufPblock:
 *   - offset 1 * SE_PARAM_SIZE : SE template (3D, transpose mode selected from
 *                                the element size)
 *   - offset 2 * SE_PARAM_SIZE : SA template (1D, ICNT0 = widthR)
 *
 * DSPLIB_qrd_inverse_R_invA_exec_ci reads both slots back and hands copies to
 * DSPLIB_qrd_inverse_factor_exec_ci, which fills in ICNT2 (SE) and overrides
 * ICNT0 (SA) before opening the streams.
 *
 * @tparam dataType  Element type; the code only distinguishes 4-byte elements
 *                   from everything else when choosing the transpose mode.
 * @param  handle    Kernel handle; interpreted as DSPLIB_qrd_inverse_PrivArgs *.
 */
41 template <typename dataType> void DSPLIB_qrd_inverse_factor_init_ci(DSPLIB_kernelHandle handle)
42 {
43  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
 // The private-argument pointer must be recovered from the opaque handle before
 // any field is read; without this declaration pKerPrivArgs is undeclared here.
44  DSPLIB_qrd_inverse_PrivArgs *pKerPrivArgs = (DSPLIB_qrd_inverse_PrivArgs *) handle;
45  uint8_t *pBlock = pKerPrivArgs->bufPblock;
46 
47  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
48  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
49 
50  int32_t strideR = pKerPrivArgs->strideR;
51  int32_t nCols = pKerPrivArgs->widthR;
 // strideR divided by the element size gives the row stride in elements.
52  int32_t colStrideR = strideR / sizeof(dataType);
53 
54  typedef typename c7x::make_full_vector<dataType>::type vec;
55 
56  uint32_t eleCount = c7x::element_count_of<vec>::value;
57  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
58  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
59  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
60 
 // SE: one element per step, eleCount steps of colStrideR, then jump ahead by
 // eleCount rows. ICNT2 is intentionally not set here; the exec function
 // assigns it per call.
61  se0Params.ICNT0 = 1;
62  se0Params.ICNT1 = eleCount;
63  se0Params.DIM1 = colStrideR;
64  se0Params.DIM2 = colStrideR * eleCount;
65  se0Params.DIMFMT = __SE_DIMFMT_3D;
66  se0Params.ELETYPE = SE_ELETYPE;
67  se0Params.VECLEN = SE_VECLEN;
 // Transpose granule follows the element width: 32-bit for 4-byte elements,
 // 64-bit otherwise.
68  if (sizeof(dataType) == 4) {
69  se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
70  }
71  else {
72  se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
73  }
74 
 // SA: linear 1D addressing over nCols elements (ICNT0 is overridden at exec time).
75  sa0Params.ICNT0 = nCols;
76  sa0Params.VECLEN = SA_VECLEN;
77  sa0Params.DIMFMT = __SA_DIMFMT_1D;
78 
 // Persist the templates in slots 1 and 2 of the parameter block.
79  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (SE_PARAM_SIZE)) = se0Params;
80  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = sa0Params;
81 
82  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
83 }
86 
/**
 * @brief Build the SE/SA templates used by the row-elimination loops of
 *        DSPLIB_qrd_inverse_R_invA_exec_ci and store them in the kernel's
 *        parameter block.
 *
 * Three templates are written into pKerPrivArgs->bufPblock:
 *   - offset 3 * SE_PARAM_SIZE : seScalarParams (2D, element duplication, DIM1 = 0)
 *   - offset 4 * SE_PARAM_SIZE : seMatrixParams (3D, DECDIM1 on DIM2)
 *   - offset 5 * SE_PARAM_SIZE : saMatrixParams (3D, DECDIM1 on DIM2)
 *
 * Several counts are only provisional here: the exec function overwrites
 * seScalarParams.ICNT0/ICNT1 and the matrix templates' ICNT0/ICNT1/ICNT2/DIM2/
 * DECDIM1_WIDTH for each tile size before opening the streams.
 *
 * @tparam dataType  Element type of the matrices.
 * @param  handle    Kernel handle; interpreted as DSPLIB_qrd_inverse_PrivArgs *.
 */
87 template <typename dataType> void DSPLIB_qrd_inverse_R_invA_init_ci(DSPLIB_kernelHandle handle)
88 {
89  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
 // The private-argument pointer must be recovered from the opaque handle before
 // any field is read; without this declaration pKerPrivArgs is undeclared here.
90  DSPLIB_qrd_inverse_PrivArgs *pKerPrivArgs = (DSPLIB_qrd_inverse_PrivArgs *) handle;
91  uint8_t *pBlock = pKerPrivArgs->bufPblock;
92  int32_t nCols = pKerPrivArgs->widthR;
93  int32_t strideR = pKerPrivArgs->strideR;
 // strideR divided by the element size gives the row stride in elements.
94  int32_t colStrideR = strideR / sizeof(dataType);
95 
96  typedef typename c7x::make_full_vector<dataType>::type vec;
97 
 // Default tiling: each tile spans 8 full vectors of a row.
98  int32_t lenTile8 = 8;
99  uint32_t eleCount = c7x::element_count_of<vec>::value;
100  int32_t nTiles_8 = DSPLIB_ceilingDiv(nCols, (eleCount * lenTile8));
101 
102  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
103  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
104  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
105  __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
106 
107  __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
108  __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
109  __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
110 
 // Scalar stream: DIM1 = 0 re-reads the same ICNT0 elements ICNT1 times, with
 // each element duplicated across the vector (ELEDUP). ICNT0 is set at exec time.
111  seScalarParams.ICNT1 = nTiles_8;
112  seScalarParams.DIM1 = 0;
113  seScalarParams.ELEDUP = SE_ELEDUP;
114  seScalarParams.DIMFMT = __SE_DIMFMT_2D;
115  seScalarParams.VECLEN = SE_VECLEN;
116  seScalarParams.ELETYPE = SE_ELETYPE;
117 
 // Matrix read stream: a tile of (eleCount * 8) elements per row, rows stepped
 // by colStrideR, tiles stepped by the tile width. ICNT1 is set at exec time.
 // DECDIM1 bounds the DIM2 walk to nCols elements.
118  seMatrixParams.ICNT0 = (eleCount * lenTile8);
119  seMatrixParams.DIM1 = colStrideR;
120  seMatrixParams.ICNT2 = nTiles_8;
121  seMatrixParams.DIM2 = (eleCount * lenTile8);
122  seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
123  seMatrixParams.ELETYPE = SE_ELETYPE;
124  seMatrixParams.VECLEN = SE_VECLEN;
125  seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
126  seMatrixParams.DECDIM1_WIDTH = nCols;
127 
 // Matrix store addressing mirrors the read stream's geometry.
128  saMatrixParams.ICNT0 = (eleCount * lenTile8);
129  saMatrixParams.DIM1 = colStrideR;
130  saMatrixParams.ICNT2 = nTiles_8;
131  saMatrixParams.DIM2 = (eleCount * lenTile8);
132  saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
133  saMatrixParams.VECLEN = SA_VECLEN;
134  saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
135  saMatrixParams.DECDIM1_WIDTH = nCols;
136 
 // Persist the templates in slots 3, 4 and 5 of the parameter block.
137  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = seScalarParams;
138  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = seMatrixParams;
139  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = saMatrixParams;
140 
141  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
142 }
145 
/**
 * Top-level init for the QRD-inverse kernel: fills the private arguments of
 * the embedded matrix-transpose and matrix-multiply sub-kernels and calls
 * every init routine that populates the parameter block.
 *
 * NOTE(review): listing line 147 (return type, function name and first
 * parameter) is absent from this listing. The body returns a DSPLIB_STATUS and
 * uses a variable named `handle`, so the missing line presumably declares
 * `DSPLIB_STATUS <name>(DSPLIB_kernelHandle handle,` - confirm against the
 * original source.
 *
 * Parameters visible in the listing:
 *   bufParamsQ          - dim_x / dim_y / stride_y are read for the transpose setup.
 *   bufParamsR          - stride_y is used as the transpose output stride.
 *   bufParamsInvA       - passed as both the first input and the output of the
 *                         matrix-multiply init.
 *   bufParamsInvAFinal  - not referenced in the visible body.
 *   pKerInitArgs        - only funcStyle is read.
 *
 * Returns `status`, which is initialised to DSPLIB_SUCCESS and never
 * reassigned in the visible body (results of the called init routines are
 * not captured).
 */
146 template <typename dataType>
148  DSPLIB_bufParams2D_t *bufParamsQ,
149  DSPLIB_bufParams2D_t *bufParamsR,
150  DSPLIB_bufParams2D_t *bufParamsInvA,
151  DSPLIB_bufParams2D_t *bufParamsInvAFinal,
152  const DSPLIB_qrdInvInitArgs *pKerInitArgs)
153 {
154  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
155  DSPLIB_STATUS status = DSPLIB_SUCCESS;
 // The handle is the kernel's private-argument structure; the sub-kernel
 // private arguments are members embedded inside it.
156  DSPLIB_qrd_inverse_PrivArgs *pKerPrivArgs = (DSPLIB_qrd_inverse_PrivArgs *) handle;
157  DSPLIB_matMul_PrivArgs *pMatMulKerPrivArgs = &pKerPrivArgs->pMatMulKerPrivArgs;
158  DSPLIB_matTrans_PrivArgs *pMatTransKerPrivArgs = &pKerPrivArgs->pMatTransKerPrivArgs;
159 
160  DSPLIB_matMul_InitArgs kerInitArgsMatMul;
161  DSPLIB_matTransInitArgs kerInitArgsMatTrans;
162 
 // Both sub-kernels inherit the caller's function style; the transpose also
 // takes its dimensions from Q.
163  kerInitArgsMatMul.funcStyle = pKerInitArgs->funcStyle;
164  kerInitArgsMatTrans.funcStyle = pKerInitArgs->funcStyle;
165  kerInitArgsMatTrans.dimX = bufParamsQ->dim_x;
166  kerInitArgsMatTrans.dimY = bufParamsQ->dim_y;
167 
 // Strides divided by the element size; input 0 and the output both use the
 // InvA stride, input 1 uses the R stride.
168  uint32_t strideIn0Elements = pKerPrivArgs->strideInvA / sizeof(dataType);
169  uint32_t strideIn1Elements = pKerPrivArgs->strideR / sizeof(dataType);
170  uint32_t strideOutElements = pKerPrivArgs->strideInvA / sizeof(dataType);
171 
172  pMatTransKerPrivArgs->widthIn = bufParamsQ->dim_x;
173  pMatTransKerPrivArgs->heightIn = bufParamsQ->dim_y;
174  pMatTransKerPrivArgs->strideIn = bufParamsQ->stride_y;
175  pMatTransKerPrivArgs->strideOut = bufParamsR->stride_y;
176 
 // Matrix-multiply sizes: M and K are both heightR, N is widthR.
177  pMatMulKerPrivArgs->M = pKerPrivArgs->heightR;
178  pMatMulKerPrivArgs->N = pKerPrivArgs->widthR;
179  pMatMulKerPrivArgs->K = pKerPrivArgs->heightR;
180 
181  pMatMulKerPrivArgs->strideIn0Elements = strideIn0Elements;
182  pMatMulKerPrivArgs->strideIn1Elements = strideIn1Elements;
183  pMatMulKerPrivArgs->strideOutElements = strideOutElements;
184 
 // Sub-initialisers, in the order the original code calls them.
 // NOTE(review): several of them write into the same bufPblock; whether this
 // call order matters is not visible from here - do not reorder without checking.
185  DSPLIB_qrd_identity_matrix_generate_init_ci<dataType>(pKerPrivArgs->heightR, pKerPrivArgs->strideR,
186  pKerPrivArgs->bufPblock);
187  DSPLIB_qrd_inverse_R_invA_init_ci<dataType>(handle);
188 
189  DSPLIB_matTrans_init_ci<dataType>(pMatTransKerPrivArgs, bufParamsQ, bufParamsR, &kerInitArgsMatTrans);
190 
191  DSPLIB_matMul_init_ci<dataType>(pMatMulKerPrivArgs, bufParamsInvA, bufParamsR, bufParamsInvA, &kerInitArgsMatMul);
192  DSPLIB_qrd_inverse_factor_init_ci<dataType>(handle);
193  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", status);
194  return status;
195 }
// NOTE(review): listing lines 196 and 202 are absent. The two parameter lists
// below repeat the signature above and each ends in ");", so they are
// presumably explicit instantiations of that function for two element types -
// confirm against the original source.
197  DSPLIB_bufParams2D_t *bufParamsQ,
198  DSPLIB_bufParams2D_t *bufParamsR,
199  DSPLIB_bufParams2D_t *bufParamsInvA,
200  DSPLIB_bufParams2D_t *bufParamsInvAFinal,
201  const DSPLIB_qrdInvInitArgs *pKerInitArgs);
203  DSPLIB_bufParams2D_t *bufParamsQ,
204  DSPLIB_bufParams2D_t *bufParamsR,
205  DSPLIB_bufParams2D_t *bufParamsInvA,
206  DSPLIB_bufParams2D_t *bufParamsInvAFinal,
207  const DSPLIB_qrdInvInitArgs *pKerInitArgs);
208 
209 /*******************************************************************************
210  *
211  * IMPLEMENTATION
212  *
213  ******************************************************************************/
214 
/**
 * Scale a strided run of nRows elements starting at pR by scaleVec and write
 * the products contiguously to pFactor.
 *
 * The work is split across both streaming engines and two address generators:
 * with nVec = ceil(nRows / eleCount) vectors in total, SE0/SA0 handle the
 * first nVec / 2 vectors and SE1/SA1 handle the remaining ones (one more than
 * SE0 when nVec is odd). Stores are predicated by the address generators.
 *
 * @tparam dataType   Element type.
 * @tparam vec        Full-width vector of dataType.
 * @param pR          Start of the data read through the streaming engines.
 * @param colStrideR  Stride in elements, used to place SE1's start address.
 * @param nRows       Number of elements to produce.
 * @param pFactor     Destination buffer for the scaled values.
 * @param scaleVec    Multiplier applied to every loaded vector.
 * @param pBlock      Not referenced in this function body.
 * @param se0Params   SE template for stream 0 (local copy; ICNT2 is overwritten).
 * @param se1Params   SE template for stream 1 (local copy; ICNT2 is overwritten).
 * @param sa0Params   SA template for generator 0 (local copy; ICNT0 is overwritten).
 * @param sa1Params   SA template for generator 1 (local copy; ICNT0 is overwritten).
 * @return `sum`, which is initialised to 0 and never modified in this body.
 *
 * NOTE(review): presumably the templates are the ones built by
 * DSPLIB_qrd_inverse_factor_init_ci (transposed, row-strided reads) - confirm
 * against the caller.
 */
215 template <typename dataType, typename vec = typename c7x::make_full_vector<dataType>::type>
216 static inline dataType DSPLIB_qrd_inverse_factor_exec_ci(dataType *pR,
217  int32_t colStrideR,
218  int32_t nRows,
219  dataType *pFactor,
220  vec scaleVec,
221  uint8_t *pBlock,
222  __SE_TEMPLATE_v1 se0Params,
223  __SE_TEMPLATE_v1 se1Params,
224  __SA_TEMPLATE_v1 sa0Params,
225  __SA_TEMPLATE_v1 sa1Params)
226 {
227 
228  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
229  dataType sum = 0;
230 
231  uint32_t eleCount = c7x::element_count_of<vec>::value;
232 
 // Split the nVec vectors between the two engines; SE1 takes the extra one
 // when nVec is odd.
233  int32_t nVec = DSPLIB_ceilingDiv(nRows, eleCount);
234  int32_t se0ICNT2 = nVec / 2;
235  int32_t se1ICNT2 = nVec - se0ICNT2;
236 
237  se0Params.ICNT2 = se0ICNT2;
238 
239  se1Params.ICNT2 = se1ICNT2;
240 
 // SE1 starts where SE0's share ends: se0ICNT2 vectors of eleCount elements,
 // each element colStrideR apart.
241  dataType *pSE0 = pR;
242  dataType *pSE1 = pR + (se0ICNT2 * colStrideR * eleCount);
243 
 // SE0 is only opened when it has work (se0ICNT2 is 0 when nVec == 1).
244  __SE1_OPEN(pSE1, se1Params);
245  if (se0ICNT2 > 0) {
246  __SE0_OPEN(pSE0, se0Params);
247  }
248 
249  int32_t vertical;
250 
 // Element counts for the two store generators: SA0 covers SE0's full
 // vectors, SA1 covers whatever remains of nRows.
251  sa0Params.ICNT0 = (se0ICNT2 * eleCount);
252 
253  sa1Params.ICNT0 = nRows - ((se0ICNT2 * eleCount));
254 
 // SA1 writes to the second part of pFactor, right after SA0's region.
255  dataType *pFactorHalf = pFactor + (se0ICNT2 * eleCount);
256  if (sa0Params.ICNT0){
257  __SA0_OPEN(sa0Params);
258  }
259 
260  __SA1_OPEN(sa1Params);
261 
 // Main loop, unrolled by two: each iteration consumes two vectors from each
 // engine and stores four scaled vectors.
262  for (vertical = 0; vertical < se0ICNT2 - 1; vertical += 2) {
263  vec v1 = c7x::strm_eng<0, vec>::get_adv();
264  vec v2 = c7x::strm_eng<1, vec>::get_adv();
265  vec v3 = c7x::strm_eng<0, vec>::get_adv();
266  vec v4 = c7x::strm_eng<1, vec>::get_adv();
267 
268  __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
269  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
270  __vstore_pred(pred, pStoreVec, scaleVec * v1);
271 
272  pred = c7x::strm_agen<1, vec>::get_vpred();
273  pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
274  __vstore_pred(pred, pStoreVec, scaleVec * v2);
275 
276  pred = c7x::strm_agen<0, vec>::get_vpred();
277  pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
278  __vstore_pred(pred, pStoreVec, scaleVec * v3);
279 
280  pred = c7x::strm_agen<1, vec>::get_vpred();
281  pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
282  __vstore_pred(pred, pStoreVec, scaleVec * v4);
283  }
284 
 // Remainder of the unrolled loop: at most one more vector pair.
285  for (; vertical < se0ICNT2; vertical++) {
286  vec v1 = c7x::strm_eng<0, vec>::get_adv();
287  vec v2 = c7x::strm_eng<1, vec>::get_adv();
288 
289  __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
290  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv((dataType *) pFactor);
291  __vstore_pred(pred, pStoreVec, scaleVec * v1);
292 
293  pred = c7x::strm_agen<1, vec>::get_vpred();
294  pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
295  __vstore_pred(pred, pStoreVec, scaleVec * v2);
296  }
 // Odd vector count: SE1 holds one vector that has no SE0 partner.
297  if (se0ICNT2 != se1ICNT2) {
298  vec v1 = c7x::strm_eng<1, vec>::get_adv();
299 
300  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
301  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv((dataType *) pFactorHalf);
302  __vstore_pred(pred, pStoreVec, scaleVec * v1);
303  }
304 
 // Close only what was opened above (the SA0/SE0 conditions mirror the opens).
305  if (sa0Params.ICNT0){
306  __SA0_CLOSE();
307  }
308 
309  __SA1_CLOSE();
310 
311  __SE1_CLOSE();
312 
313  if (se0ICNT2 > 0) {
314  __SE0_CLOSE();
315  }
316 
317  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
318  return sum;
319 }
// Explicit instantiation of DSPLIB_qrd_inverse_factor_exec_ci for float,
// using the full-width float vector type.
320 template float DSPLIB_qrd_inverse_factor_exec_ci<float, typename c7x::make_full_vector<float>::type>(
321  float *pR,
322  int32_t colStrideR,
323  int32_t nRows,
324  float *pFactor,
325  typename c7x::make_full_vector<float>::type scale,
326  uint8_t *pBlock,
327  __SE_TEMPLATE_v1 se0Params,
328  __SE_TEMPLATE_v1 se1Params,
329  __SA_TEMPLATE_v1 sa0Params,
330  __SA_TEMPLATE_v1 sa1Params);
// Explicit instantiation of DSPLIB_qrd_inverse_factor_exec_ci for double,
// using the full-width double vector type.
331 template double DSPLIB_qrd_inverse_factor_exec_ci<double, typename c7x::make_full_vector<double>::type>(
332  double *pR,
333  int32_t colStrideR,
334  int32_t nRows,
335  double *pFactor,
336  typename c7x::make_full_vector<double>::type scale,
337  uint8_t *pBlock,
338  __SE_TEMPLATE_v1 se0Params,
339  __SE_TEMPLATE_v1 se1Params,
340  __SA_TEMPLATE_v1 sa0Params,
341  __SA_TEMPLATE_v1 sa1Params);
342 
343 template <typename dataType>
344 void DSPLIB_qrd_inverse_R_invA_exec_ci(dataType *pLocalR,
345  dataType *pLocalInvA,
346  int32_t nCols,
347  int32_t colStrideR,
348  int32_t colInvAStride,
349  dataType *factArray,
350  uint8_t *pBlock)
351 {
352  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
353  typedef typename c7x::make_full_vector<dataType>::type vec;
354 
355  __SE_TEMPLATE_v1 se0ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (SE_PARAM_SIZE));
356  __SE_TEMPLATE_v1 se1ParamsFact = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (SE_PARAM_SIZE));
357  __SA_TEMPLATE_v1 sa0ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
358  __SA_TEMPLATE_v1 sa1ParamsFact = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
359 
360  uint32_t eleCount = c7x::element_count_of<vec>::value;
361 
362  __SE_TEMPLATE_v1 seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
363  __SE_TEMPLATE_v1 seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
364  __SA_TEMPLATE_v1 saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
365  __SA_TEMPLATE_v1 saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
366  __SA_TEMPLATE_v1 saRefStoreParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
367 
368  int32_t lenTile8 = 8; // hard code
369  int32_t lenTile4 = 4;
370  int32_t lenTile2 = 2;
371  int32_t lenTile1 = 1;
372 
373  int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
374  int32_t nTiles8 = nTiles1 / lenTile8; // left shift
375  nTiles1 -= nTiles8 * lenTile8;
376  int32_t nTiles4 = nTiles1 / lenTile4;
377  nTiles1 -= nTiles4 * lenTile4;
378  int32_t nTiles2 = nTiles1 / lenTile2;
379  nTiles1 -= nTiles2 * lenTile2;
380 
381  int32_t remainingCols = nCols; // comment
382  int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
383  colLimit8 = (remainingCols < colLimit8) ? remainingCols : colLimit8;
384 
385  remainingCols = remainingCols - colLimit8;
386  int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
387  colLimit4 = (remainingCols < colLimit4) ? remainingCols : colLimit4;
388 
389  remainingCols = remainingCols - colLimit4;
390  int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
391  colLimit2 = (remainingCols < colLimit2) ? remainingCols : colLimit2;
392 
393  int32_t colLimit1 = remainingCols - colLimit2;
394  seScalarParams.ICNT1 = 2 * (nTiles8 + nTiles4 + nTiles2 + nTiles1);
395 
396  for (int32_t col = nCols - 1; col >= 0; col--) {
397  dataType *pLastR = pLocalR + (colStrideR * col);
398  dataType *pLastInvA = pLocalInvA + (colStrideR * col);
399 
400  dataType diag = pLocalR[col + col * colStrideR];
401 
402  dataType recipScalar = __recip(diag);
403  dataType twoP0 = 2.0;
404  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
405  recipScalar = recipScalar * (twoP0 - (diag * recipScalar));
406 
407  vec divVec = (vec) recipScalar;
408 
409  if (col > 0) {
410  DSPLIB_qrd_inverse_factor_exec_ci<dataType, vec>(&pLocalR[col], colStrideR, col, factArray, divVec, pBlock,
411  se0ParamsFact, se1ParamsFact, sa0ParamsFact, sa1ParamsFact);
412  seScalarParams.ICNT0 = col;
413  __SE0_OPEN(factArray, seScalarParams);
414  }
415 
416  __SA0_OPEN(saRefParams);
417  __SA2_OPEN(saRefStoreParams);
418 
419  if (nTiles8 > 0) {
420  /* 1 X (8 * eleCount) TILE */
421 
422  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
423  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
424  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
425  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
426  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
427 
428  if (col) {
429  __SE1_OPEN(pLocalR, seMatrixParams);
430  __SA1_OPEN(saMatrixParams);
431  }
432 
433  for (int32_t tile = 0; tile < nTiles8; tile++) {
434  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
435  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
436  vec sV1 = __vload_pred(lPred, pLoadVec);
437 
438  lPred = c7x::strm_agen<0, vec>::get_vpred();
439  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
440  vec sV2 = __vload_pred(lPred, pLoadVec);
441 
442  lPred = c7x::strm_agen<0, vec>::get_vpred();
443  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
444  vec sV3 = __vload_pred(lPred, pLoadVec);
445 
446  lPred = c7x::strm_agen<0, vec>::get_vpred();
447  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
448  vec sV4 = __vload_pred(lPred, pLoadVec);
449 
450  lPred = c7x::strm_agen<0, vec>::get_vpred();
451  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
452  vec sV5 = __vload_pred(lPred, pLoadVec);
453 
454  lPred = c7x::strm_agen<0, vec>::get_vpred();
455  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
456  vec sV6 = __vload_pred(lPred, pLoadVec);
457 
458  lPred = c7x::strm_agen<0, vec>::get_vpred();
459  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
460  vec sV7 = __vload_pred(lPred, pLoadVec);
461 
462  lPred = c7x::strm_agen<0, vec>::get_vpred();
463  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastR);
464  vec sV8 = __vload_pred(lPred, pLoadVec);
465 
466  for (int32_t vertical = 0; vertical < col; vertical++) {
467  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
468 
469  vec v1 = c7x::strm_eng<1, vec>::get_adv();
470  vec v2 = c7x::strm_eng<1, vec>::get_adv();
471  vec v3 = c7x::strm_eng<1, vec>::get_adv();
472  vec v4 = c7x::strm_eng<1, vec>::get_adv();
473  vec v5 = c7x::strm_eng<1, vec>::get_adv();
474  vec v6 = c7x::strm_eng<1, vec>::get_adv();
475  vec v7 = c7x::strm_eng<1, vec>::get_adv();
476  vec v8 = c7x::strm_eng<1, vec>::get_adv();
477 
478  v1 -= sV1 * scalarDup;
479  v2 -= sV2 * scalarDup;
480  v3 -= sV3 * scalarDup;
481  v4 -= sV4 * scalarDup;
482  v5 -= sV5 * scalarDup;
483  v6 -= sV6 * scalarDup;
484  v7 -= sV7 * scalarDup;
485  v8 -= sV8 * scalarDup;
486 
487  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
488  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
489  __vstore_pred(sPred, pStoreVec, v1);
490 
491  sPred = c7x::strm_agen<1, vec>::get_vpred();
492  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
493  __vstore_pred(sPred, pStoreVec, v2);
494 
495  sPred = c7x::strm_agen<1, vec>::get_vpred();
496  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
497  __vstore_pred(sPred, pStoreVec, v3);
498 
499  sPred = c7x::strm_agen<1, vec>::get_vpred();
500  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
501  __vstore_pred(sPred, pStoreVec, v4);
502 
503  sPred = c7x::strm_agen<1, vec>::get_vpred();
504  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
505  __vstore_pred(sPred, pStoreVec, v5);
506 
507  sPred = c7x::strm_agen<1, vec>::get_vpred();
508  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
509  __vstore_pred(sPred, pStoreVec, v6);
510 
511  sPred = c7x::strm_agen<1, vec>::get_vpred();
512  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
513  __vstore_pred(sPred, pStoreVec, v7);
514 
515  sPred = c7x::strm_agen<1, vec>::get_vpred();
516  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalR);
517  __vstore_pred(sPred, pStoreVec, v8);
518  }
519 
520  sV1 *= divVec;
521  sV2 *= divVec;
522  sV3 *= divVec;
523  sV4 *= divVec;
524  sV5 *= divVec;
525  sV6 *= divVec;
526  sV7 *= divVec;
527  sV8 *= divVec;
528 
529  lPred = c7x::strm_agen<2, vec>::get_vpred();
530  vec *psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
531  __vstore_pred(lPred, psV, sV1);
532 
533  lPred = c7x::strm_agen<2, vec>::get_vpred();
534  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
535  __vstore_pred(lPred, psV, sV2);
536 
537  lPred = c7x::strm_agen<2, vec>::get_vpred();
538  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
539  __vstore_pred(lPred, psV, sV3);
540 
541  lPred = c7x::strm_agen<2, vec>::get_vpred();
542  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
543  __vstore_pred(lPred, psV, sV4);
544 
545  lPred = c7x::strm_agen<2, vec>::get_vpred();
546  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
547  __vstore_pred(lPred, psV, sV5);
548 
549  lPred = c7x::strm_agen<2, vec>::get_vpred();
550  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
551  __vstore_pred(lPred, psV, sV6);
552 
553  lPred = c7x::strm_agen<2, vec>::get_vpred();
554  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
555  __vstore_pred(lPred, psV, sV7);
556 
557  lPred = c7x::strm_agen<2, vec>::get_vpred();
558  psV = c7x::strm_agen<2, vec>::get_adv(pLastR);
559  __vstore_pred(lPred, psV, sV8);
560  }
561  __SE1_CLOSE();
562  __SA1_CLOSE();
563  }
564 
565  if (nTiles4 > 0) {
566  /* 1 X (4 * eleCount) TILE */
567 
568  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
569  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
570  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
571  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
572  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
573 
574  dataType *pSE1 = pLocalR + colLimit8;
575  dataType *pSA1 = pLocalR + colLimit8;
576  dataType *pSA0 = pLastR; // + colLimit8;
577  dataType *pSA2 = pLastR; // + colLimit8;
578 
579  if (col) {
580  __SE1_OPEN(pSE1, seMatrixParams);
581  __SA1_OPEN(saMatrixParams);
582  }
583 
584  for (int32_t tile = 0; tile < nTiles4; tile++) {
585  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
586  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
587  vec sV1 = __vload_pred(lPred, pLoadVec);
588 
589  lPred = c7x::strm_agen<0, vec>::get_vpred();
590  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
591  vec sV2 = __vload_pred(lPred, pLoadVec);
592 
593  lPred = c7x::strm_agen<0, vec>::get_vpred();
594  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
595  vec sV3 = __vload_pred(lPred, pLoadVec);
596 
597  lPred = c7x::strm_agen<0, vec>::get_vpred();
598  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
599  vec sV4 = __vload_pred(lPred, pLoadVec);
600 
601  for (int32_t vertical = 0; vertical < col; vertical++) {
602  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
603 
604  vec v1 = c7x::strm_eng<1, vec>::get_adv();
605  vec v2 = c7x::strm_eng<1, vec>::get_adv();
606  vec v3 = c7x::strm_eng<1, vec>::get_adv();
607  vec v4 = c7x::strm_eng<1, vec>::get_adv();
608 
609  v1 -= sV1 * scalarDup;
610  v2 -= sV2 * scalarDup;
611  v3 -= sV3 * scalarDup;
612  v4 -= sV4 * scalarDup;
613 
614  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
615  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
616  __vstore_pred(sPred, pStoreVec, v1);
617 
618  sPred = c7x::strm_agen<1, vec>::get_vpred();
619  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
620  __vstore_pred(sPred, pStoreVec, v2);
621 
622  sPred = c7x::strm_agen<1, vec>::get_vpred();
623  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
624  __vstore_pred(sPred, pStoreVec, v3);
625 
626  sPred = c7x::strm_agen<1, vec>::get_vpred();
627  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
628  __vstore_pred(sPred, pStoreVec, v4);
629  }
630 
631  sV1 *= divVec;
632  sV2 *= divVec;
633  sV3 *= divVec;
634  sV4 *= divVec;
635 
636  lPred = c7x::strm_agen<2, vec>::get_vpred();
637  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
638  __vstore_pred(lPred, psV, sV1);
639 
640  lPred = c7x::strm_agen<2, vec>::get_vpred();
641  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
642  __vstore_pred(lPred, psV, sV2);
643 
644  lPred = c7x::strm_agen<2, vec>::get_vpred();
645  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
646  __vstore_pred(lPred, psV, sV3);
647 
648  lPred = c7x::strm_agen<2, vec>::get_vpred();
649  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
650  __vstore_pred(lPred, psV, sV4);
651  }
652  __SE1_CLOSE();
653  __SA1_CLOSE();
654  }
655 
656  if (nTiles2 > 0) {
657  /* 1 X (2*eleCount) TILE */
658 
659  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
660  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
661  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
662  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
663  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
664 
665  dataType *pSE1 = pLocalR + colLimit8 + colLimit4;
666  dataType *pSA1 = pLocalR + colLimit8 + colLimit4;
667  dataType *pSA0 = pLastR; // + colLimit8 + colLimit4;
668  dataType *pSA2 = pLastR; // + colLimit8 + colLimit4;
669 
670  if (col) {
671  __SE1_OPEN(pSE1, seMatrixParams);
672  __SA1_OPEN(saMatrixParams);
673  }
674 
675  for (int32_t tile = 0; tile < nTiles2; tile++) {
676  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
677  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
678  vec sV1 = __vload_pred(lPred, pLoadVec);
679 
680  lPred = c7x::strm_agen<0, vec>::get_vpred();
681  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
682  vec sV2 = __vload_pred(lPred, pLoadVec);
683  for (int32_t vertical = 0; vertical < col; vertical++) {
684  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
685 
686  vec v1 = c7x::strm_eng<1, vec>::get_adv();
687  vec v2 = c7x::strm_eng<1, vec>::get_adv();
688 
689  v1 -= sV1 * scalarDup;
690  v2 -= sV2 * scalarDup;
691 
692  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
693  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
694  __vstore_pred(sPred, pStoreVec, v1);
695 
696  sPred = c7x::strm_agen<1, vec>::get_vpred();
697  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
698  __vstore_pred(sPred, pStoreVec, v2);
699  }
700 
701  sV1 *= divVec;
702  sV2 *= divVec;
703 
704  lPred = c7x::strm_agen<2, vec>::get_vpred();
705  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
706  __vstore_pred(lPred, psV, sV1);
707 
708  lPred = c7x::strm_agen<2, vec>::get_vpred();
709  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
710  __vstore_pred(lPred, psV, sV2);
711  }
712  __SE1_CLOSE();
713  __SA1_CLOSE();
714  }
715  if (nTiles1 > 0) {
716  /* 1 X (1*eleCount) TILE */
717 
718  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
719  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
720  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
721  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
722  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
723 
724  dataType *pSE1 = pLocalR + colLimit8 + colLimit4 + colLimit2;
725  dataType *pSA1 = pLocalR + colLimit8 + colLimit4 + colLimit2;
726  dataType *pSA0 = pLastR; // + colLimit8 + colLimit4;
727  dataType *pSA2 = pLastR; // + colLimit8 + colLimit4;
728 
729  if (col) {
730  __SE1_OPEN(pSE1, seMatrixParams);
731  __SA1_OPEN(saMatrixParams);
732  }
733 
734  for (int32_t tile = 0; tile < nTiles1; tile++) {
735  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
736  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
737  vec sV1 = __vload_pred(lPred, pLoadVec);
738 
739  for (int32_t vertical = 0; vertical < col; vertical++) {
740  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
741 
742  vec v1 = c7x::strm_eng<1, vec>::get_adv();
743 
744  v1 -= sV1 * scalarDup;
745 
746  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
747  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
748  __vstore_pred(sPred, pStoreVec, v1);
749  }
750 
751  sV1 *= divVec;
752 
753  lPred = c7x::strm_agen<2, vec>::get_vpred();
754  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
755  __vstore_pred(lPred, psV, sV1);
756  }
757  __SE1_CLOSE();
758  __SA1_CLOSE();
759  }
760  __SA0_CLOSE();
761  __SA2_CLOSE();
762 
763  __SA0_OPEN(saRefParams);
764  __SA2_OPEN(saRefStoreParams);
765 
766  if (nTiles8 > 0) {
767  /* 1 X (8*eleCount) TILE */
768 
769  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile8;
770  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
771  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles8;
772  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile8;
773  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit8;
774 
775  if (col) {
776  __SE1_OPEN(pLocalInvA, seMatrixParams);
777  __SA1_OPEN(saMatrixParams);
778  }
779 
780  for (int32_t tile = 0; tile < nTiles8; tile++) {
781  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
782  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
783  vec sV1 = __vload_pred(lPred, pLoadVec);
784 
785  lPred = c7x::strm_agen<0, vec>::get_vpred();
786  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
787  vec sV2 = __vload_pred(lPred, pLoadVec);
788 
789  lPred = c7x::strm_agen<0, vec>::get_vpred();
790  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
791  vec sV3 = __vload_pred(lPred, pLoadVec);
792 
793  lPred = c7x::strm_agen<0, vec>::get_vpred();
794  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
795  vec sV4 = __vload_pred(lPred, pLoadVec);
796 
797  lPred = c7x::strm_agen<0, vec>::get_vpred();
798  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
799  vec sV5 = __vload_pred(lPred, pLoadVec);
800 
801  lPred = c7x::strm_agen<0, vec>::get_vpred();
802  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
803  vec sV6 = __vload_pred(lPred, pLoadVec);
804 
805  lPred = c7x::strm_agen<0, vec>::get_vpred();
806  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
807  vec sV7 = __vload_pred(lPred, pLoadVec);
808 
809  lPred = c7x::strm_agen<0, vec>::get_vpred();
810  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLastInvA);
811  vec sV8 = __vload_pred(lPred, pLoadVec);
812 
813  for (int32_t vertical = 0; vertical < col; vertical++) {
814  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
815 
816  vec v1 = c7x::strm_eng<1, vec>::get_adv();
817  vec v2 = c7x::strm_eng<1, vec>::get_adv();
818  vec v3 = c7x::strm_eng<1, vec>::get_adv();
819  vec v4 = c7x::strm_eng<1, vec>::get_adv();
820  vec v5 = c7x::strm_eng<1, vec>::get_adv();
821  vec v6 = c7x::strm_eng<1, vec>::get_adv();
822  vec v7 = c7x::strm_eng<1, vec>::get_adv();
823  vec v8 = c7x::strm_eng<1, vec>::get_adv();
824 
825  v1 -= sV1 * scalarDup;
826  v2 -= sV2 * scalarDup;
827  v3 -= sV3 * scalarDup;
828  v4 -= sV4 * scalarDup;
829  v5 -= sV5 * scalarDup;
830  v6 -= sV6 * scalarDup;
831  v7 -= sV7 * scalarDup;
832  v8 -= sV8 * scalarDup;
833 
834  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
835  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
836  __vstore_pred(sPred, pStoreVec, v1);
837 
838  sPred = c7x::strm_agen<1, vec>::get_vpred();
839  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
840  __vstore_pred(sPred, pStoreVec, v2);
841 
842  sPred = c7x::strm_agen<1, vec>::get_vpred();
843  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
844  __vstore_pred(sPred, pStoreVec, v3);
845 
846  sPred = c7x::strm_agen<1, vec>::get_vpred();
847  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
848  __vstore_pred(sPred, pStoreVec, v4);
849 
850  sPred = c7x::strm_agen<1, vec>::get_vpred();
851  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
852  __vstore_pred(sPred, pStoreVec, v5);
853 
854  sPred = c7x::strm_agen<1, vec>::get_vpred();
855  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
856  __vstore_pred(sPred, pStoreVec, v6);
857 
858  sPred = c7x::strm_agen<1, vec>::get_vpred();
859  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
860  __vstore_pred(sPred, pStoreVec, v7);
861 
862  sPred = c7x::strm_agen<1, vec>::get_vpred();
863  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pLocalInvA);
864  __vstore_pred(sPred, pStoreVec, v8);
865  }
866 
867  sV1 *= divVec;
868  sV2 *= divVec;
869  sV3 *= divVec;
870  sV4 *= divVec;
871  sV5 *= divVec;
872  sV6 *= divVec;
873  sV7 *= divVec;
874  sV8 *= divVec;
875 
876  lPred = c7x::strm_agen<2, vec>::get_vpred();
877  vec *psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
878  __vstore_pred(lPred, psV, sV1);
879 
880  lPred = c7x::strm_agen<2, vec>::get_vpred();
881  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
882  __vstore_pred(lPred, psV, sV2);
883 
884  lPred = c7x::strm_agen<2, vec>::get_vpred();
885  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
886  __vstore_pred(lPred, psV, sV3);
887 
888  lPred = c7x::strm_agen<2, vec>::get_vpred();
889  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
890  __vstore_pred(lPred, psV, sV4);
891 
892  lPred = c7x::strm_agen<2, vec>::get_vpred();
893  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
894  __vstore_pred(lPred, psV, sV5);
895 
896  lPred = c7x::strm_agen<2, vec>::get_vpred();
897  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
898  __vstore_pred(lPred, psV, sV6);
899 
900  lPred = c7x::strm_agen<2, vec>::get_vpred();
901  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
902  __vstore_pred(lPred, psV, sV7);
903 
904  lPred = c7x::strm_agen<2, vec>::get_vpred();
905  psV = c7x::strm_agen<2, vec>::get_adv(pLastInvA);
906  __vstore_pred(lPred, psV, sV8);
907  }
908  __SE1_CLOSE();
909  __SA1_CLOSE();
910  }
911 
912  if (nTiles4 > 0) {
913  /* 1 X (4*eleCount) TILE */
914 
915  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
916  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
917  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
918  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile4;
919  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
920 
921  dataType *pSE1 = pLocalInvA + colLimit8;
922  dataType *pSA1 = pLocalInvA + colLimit8;
923  dataType *pSA0 = pLastInvA; // + colLimit8;
924  dataType *pSA2 = pLastInvA; // + colLimit8;
925 
926  if (col) {
927  __SE1_OPEN(pSE1, seMatrixParams);
928  __SA1_OPEN(saMatrixParams);
929  }
930 
931  for (int32_t tile = 0; tile < nTiles4; tile++) {
932  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
933  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
934  vec sV1 = __vload_pred(lPred, pLoadVec);
935 
936  lPred = c7x::strm_agen<0, vec>::get_vpred();
937  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
938  vec sV2 = __vload_pred(lPred, pLoadVec);
939 
940  lPred = c7x::strm_agen<0, vec>::get_vpred();
941  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
942  vec sV3 = __vload_pred(lPred, pLoadVec);
943 
944  lPred = c7x::strm_agen<0, vec>::get_vpred();
945  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
946  vec sV4 = __vload_pred(lPred, pLoadVec);
947 
948  for (int32_t vertical = 0; vertical < col; vertical++) {
949  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
950 
951  vec v1 = c7x::strm_eng<1, vec>::get_adv();
952  vec v2 = c7x::strm_eng<1, vec>::get_adv();
953  vec v3 = c7x::strm_eng<1, vec>::get_adv();
954  vec v4 = c7x::strm_eng<1, vec>::get_adv();
955 
956  v1 -= sV1 * scalarDup;
957  v2 -= sV2 * scalarDup;
958  v3 -= sV3 * scalarDup;
959  v4 -= sV4 * scalarDup;
960 
961  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
962  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
963  __vstore_pred(sPred, pStoreVec, v1);
964 
965  sPred = c7x::strm_agen<1, vec>::get_vpred();
966  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
967  __vstore_pred(sPred, pStoreVec, v2);
968 
969  sPred = c7x::strm_agen<1, vec>::get_vpred();
970  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
971  __vstore_pred(sPred, pStoreVec, v3);
972 
973  sPred = c7x::strm_agen<1, vec>::get_vpred();
974  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
975  __vstore_pred(sPred, pStoreVec, v4);
976  }
977 
978  sV1 *= divVec;
979  sV2 *= divVec;
980  sV3 *= divVec;
981  sV4 *= divVec;
982 
983  lPred = c7x::strm_agen<2, vec>::get_vpred();
984  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
985  __vstore_pred(lPred, psV, sV1);
986 
987  lPred = c7x::strm_agen<2, vec>::get_vpred();
988  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
989  __vstore_pred(lPred, psV, sV2);
990 
991  lPred = c7x::strm_agen<2, vec>::get_vpred();
992  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
993  __vstore_pred(lPred, psV, sV3);
994 
995  lPred = c7x::strm_agen<2, vec>::get_vpred();
996  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
997  __vstore_pred(lPred, psV, sV4);
998  }
999  __SE1_CLOSE();
1000  __SA1_CLOSE();
1001  }
1002 
1003  if (nTiles2 > 0) {
1004  /* 1 X (2*eleCount) TILE */
1005  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
1006  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1007  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
1008  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile2;
1009  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
1010 
1011  dataType *pSE1 = pLocalInvA + colLimit8 + colLimit4;
1012  dataType *pSA1 = pLocalInvA + colLimit8 + colLimit4;
1013  dataType *pSA0 = pLastInvA; // + colLimit8;
1014  dataType *pSA2 = pLastInvA; // + colLimit8;
1015 
1016  if (col) {
1017  __SE1_OPEN(pSE1, seMatrixParams);
1018  __SA1_OPEN(saMatrixParams);
1019  }
1020 
1021  for (int32_t tile = 0; tile < nTiles2; tile++) {
1022  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1023  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1024  vec sV1 = __vload_pred(lPred, pLoadVec);
1025 
1026  lPred = c7x::strm_agen<0, vec>::get_vpred();
1027  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1028  vec sV2 = __vload_pred(lPred, pLoadVec);
1029 
1030  for (int32_t vertical = 0; vertical < col; vertical++) {
1031  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1032 
1033  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1034  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1035 
1036  v1 -= sV1 * scalarDup;
1037  v2 -= sV2 * scalarDup;
1038 
1039  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1040  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1041  __vstore_pred(sPred, pStoreVec, v1);
1042 
1043  sPred = c7x::strm_agen<1, vec>::get_vpred();
1044  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1045  __vstore_pred(sPred, pStoreVec, v2);
1046  }
1047 
1048  sV1 *= divVec;
1049  sV2 *= divVec;
1050 
1051  lPred = c7x::strm_agen<2, vec>::get_vpred();
1052  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1053  __vstore_pred(lPred, psV, sV1);
1054 
1055  lPred = c7x::strm_agen<2, vec>::get_vpred();
1056  psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1057  __vstore_pred(lPred, psV, sV2);
1058  }
1059 
1060  __SE1_CLOSE();
1061  __SA1_CLOSE();
1062  }
1063  if (nTiles1 > 0) {
1064  /* 1 X (1*eleCount) TILE */
1065  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
1066  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = col;
1067  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
1068  seMatrixParams.DIM2 = saMatrixParams.DIM2 = eleCount * lenTile1;
1069  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
1070 
1071  dataType *pSE1 = pLocalInvA + colLimit8 + colLimit4 + colLimit2;
1072  dataType *pSA1 = pLocalInvA + colLimit8 + colLimit4 + colLimit2;
1073  dataType *pSA0 = pLastInvA; // + colLimit8;
1074  dataType *pSA2 = pLastInvA; // + colLimit8;
1075 
1076  if (col) {
1077  __SE1_OPEN(pSE1, seMatrixParams);
1078  __SA1_OPEN(saMatrixParams);
1079  }
1080 
1081  for (int32_t tile = 0; tile < nTiles1; tile++) {
1082  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1083  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
1084  vec sV1 = __vload_pred(lPred, pLoadVec);
1085 
1086  for (int32_t vertical = 0; vertical < col; vertical++) {
1087  vec scalarDup = c7x::strm_eng<0, vec>::get_adv();
1088 
1089  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1090 
1091  v1 -= sV1 * scalarDup;
1092 
1093  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1094  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSA1);
1095  __vstore_pred(sPred, pStoreVec, v1);
1096  }
1097 
1098  sV1 *= divVec;
1099 
1100  lPred = c7x::strm_agen<2, vec>::get_vpred();
1101  vec *psV = c7x::strm_agen<2, vec>::get_adv(pSA2);
1102  __vstore_pred(lPred, psV, sV1);
1103  }
1104 
1105  __SE1_CLOSE();
1106  __SA1_CLOSE();
1107  }
1108 
1109  __SA0_CLOSE();
1110  __SA2_CLOSE();
1111  }
1112 
1113  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
1114 }
// Explicit instantiations of DSPLIB_qrd_inverse_R_invA_exec_ci for the float and
// double element types; both are called through the dataType template parameter
// by DSPLIB_qrd_inverse_exec_ci in this file.
1115 template void DSPLIB_qrd_inverse_R_invA_exec_ci<float>(float *pLocalR,
1116  float *pLocalInvA,
1117  int32_t nCols,
1118  int32_t colStrideR,
1119  int32_t colInvAStride,
1120  float *factArray,
1121  uint8_t *pBlock);
1122 template void DSPLIB_qrd_inverse_R_invA_exec_ci<double>(double *pLocalR,
1123  double *pLocalInvA,
1124  int32_t nCols,
1125  int32_t colStrideR,
1126  int32_t colInvAStride,
1127  double *factArray,
1128  uint8_t *pBlock);
1129 
/*
 * Main execution function for the C7x implementation of the QRD-inverse kernel.
 *
 * Sequence performed, in order:
 *   1. fill pInvAScratch with an identity matrix (heightR rows),
 *   2. run DSPLIB_qrd_inverse_R_invA_exec_ci on pR and pInvAScratch (widthR columns),
 *   3. call DSPLIB_matTrans_exec_ci with pQ and pR,
 *   4. call DSPLIB_matMul_exec_ci with pInvAScratch and pR, producing pInvA.
 *
 * handle        kernel handle; reinterpreted as DSPLIB_qrd_inverse_PrivArgs *
 * pQ            Q matrix, read as dataType *
 * pR            R matrix, read as dataType *; also passed to the transpose and
 *               multiply steps (NOTE(review): looks like it is overwritten with Q' -- confirm)
 * pInvA         final result buffer, written by the matrix multiply
 * pInvAScratch  working buffer that holds the identity / intermediate inverse
 * pScratch      forwarded as the factArray argument of the R-inverse step
 *
 * strideR and strideInvA are divided by sizeof(dataType) before being passed on.
 * Returns DSPLIB_SUCCESS; status is never changed after initialization.
 */
1130 template <typename dataType>
1131 DSPLIB_STATUS DSPLIB_qrd_inverse_exec_ci(DSPLIB_kernelHandle handle,
1132  void *restrict pQ,
1133  void *restrict pR,
1134  void *restrict pInvA,
1135  void *restrict pInvAScratch,
1136  void *restrict pScratch)
1137 {
1138  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
1139 
1140  DSPLIB_STATUS status = DSPLIB_SUCCESS;
1141 
1142  DSPLIB_qrd_inverse_PrivArgs *pKerPrivArgs = (DSPLIB_qrd_inverse_PrivArgs *) handle;
1143  int32_t strideR = pKerPrivArgs->strideR;
1144  int32_t heightR = pKerPrivArgs->heightR;
1145  int32_t widthR = pKerPrivArgs->widthR;
1146  int32_t strideInvA = pKerPrivArgs->strideInvA;
1147  int32_t dataSize = sizeof(dataType);
1148  uint8_t *pBlock = pKerPrivArgs->bufPblock;
1149  DSPLIB_matMul_PrivArgs *pMatMulKerPrivArgs = &pKerPrivArgs->pMatMulKerPrivArgs;
1150  DSPLIB_matTrans_PrivArgs *pMatTransKerPrivArgs = &pKerPrivArgs->pMatTransKerPrivArgs;
1151 
1152  /* Typecast void pointers to respective data type */
1153  dataType *pLocalQ = (dataType *) pQ;
1154  dataType *pLocalR = (dataType *) pR;
1155  dataType *pLocalInvA = (dataType *) pInvA;
1156  dataType *pLocalInvAScratch = (dataType *) pInvAScratch;
1157  dataType *pFactArray = (dataType *) pScratch;
1158 
1159  int32_t colStrideR = strideR / dataSize;
1160  int32_t colInvAStride = strideInvA / dataSize;
1161 
1162  DSPLIB_DEBUGPRINTFN(0, "pLocalQ: %p pLocalR: %p pLocalInvA: %p widthR: %d heightR: %d\n", pLocalQ, pLocalR,
1163  pLocalInvA, widthR, heightR);
1164 
1165  /* ------------------------------------------------------------------- */
1166  /* Invert R in 'pLocalInvAScratch', then form 'pLocalInvA' from Q'. */
1167  /* ------------------------------------------------------------------- */
1168 
1169  /* set pLocalInvAScratch matrix to identity */
1170 
1171  DSPLIB_qrd_identity_matrix_generate_exec_ci<dataType>(pLocalInvAScratch, heightR, colInvAStride, pBlock);
1172  DSPLIB_qrd_inverse_R_invA_exec_ci<dataType>(pLocalR, pLocalInvAScratch, widthR, colStrideR, colInvAStride,
1173  pFactArray, pBlock);
1174  /* pLocalInvA = inv_R * Q' (NOTE(review): Q' appears to be written into pLocalR, overwriting R -- confirm) */
1175  DSPLIB_matTrans_exec_ci<dataType>(pMatTransKerPrivArgs, pLocalQ, pLocalR);
1176  DSPLIB_matMul_exec_ci<dataType>(pMatMulKerPrivArgs, pLocalInvAScratch, pLocalR, pLocalInvA);
1177  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", status);
1178 
1179  return (status);
1180 }
1181 
1182 // explicit instantiation for the different data type versions
1183 template DSPLIB_STATUS DSPLIB_qrd_inverse_exec_ci<float>(DSPLIB_kernelHandle handle,
1184  void *restrict pQ,
1185  void *restrict pR,
1186  void *restrict pInvA,
1187  void *restrict pInvScratch,
1188  void *restrict pScratch);
1189 
1190 template DSPLIB_STATUS DSPLIB_qrd_inverse_exec_ci<double>(DSPLIB_kernelHandle handle,
1191  void *restrict pQ,
1192  void *restrict pR,
1193  void *restrict pInvA,
1194  void *restrict pInvScratch,
1195  void *restrict pScratch);
1196 /* ======================================================================== */
1197 /* End of file: DSPLIB_qrd_inverse_ci.cpp */
1198 /* ======================================================================== */
template DSPLIB_STATUS DSPLIB_qrd_inverse_exec_ci< float >(DSPLIB_kernelHandle handle, void *restrict pQ, void *restrict pR, void *restrict pInvA, void *restrict pInvScratch, void *restrict pScratch)
void DSPLIB_qrd_inverse_R_invA_exec_ci(dataType *pLocalR, dataType *pLocalInvA, int32_t nCols, int32_t colStrideR, int32_t colInvAStride, dataType *factArray, uint8_t *pBlock)
static dataType DSPLIB_qrd_inverse_factor_exec_ci(dataType *pR, int32_t colStrideR, int32_t nRows, dataType *pFactor, vec scaleVec, uint8_t *pBlock, __SE_TEMPLATE_v1 se0Params, __SE_TEMPLATE_v1 se1Params, __SA_TEMPLATE_v1 sa0Params, __SA_TEMPLATE_v1 sa1Params)
template void DSPLIB_qrd_inverse_R_invA_init_ci< double >(DSPLIB_kernelHandle handle)
DSPLIB_STATUS DSPLIB_qrd_inverse_init_ci(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsQ, DSPLIB_bufParams2D_t *bufParamsR, DSPLIB_bufParams2D_t *bufParamsInvA, DSPLIB_bufParams2D_t *bufParamsInvAFinal, const DSPLIB_qrdInvInitArgs *pKerInitArgs)
This function is the initialization function for the C7x implementation of the kernel....
template void DSPLIB_qrd_inverse_factor_init_ci< float >(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_qrd_inverse_exec_ci< double >(DSPLIB_kernelHandle handle, void *restrict pQ, void *restrict pR, void *restrict pInvA, void *restrict pInvScratch, void *restrict pScratch)
template void DSPLIB_qrd_inverse_R_invA_exec_ci< float >(float *pLocalR, float *pLocalInvA, int32_t nCols, int32_t colStrideR, int32_t colInvAStride, float *factArray, uint8_t *pBlock)
DSPLIB_STATUS DSPLIB_qrd_inverse_exec_ci(DSPLIB_kernelHandle handle, void *restrict pQ, void *restrict pR, void *restrict pInvA, void *restrict pInvAScratch, void *restrict pScratch)
This function is the main execution function for the C7x implementation of the kernel....
void DSPLIB_qrd_inverse_R_invA_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_qrd_inverse_R_invA_init_ci< float >(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_qrd_inverse_init_ci< float >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsQ, DSPLIB_bufParams2D_t *bufParamsR, DSPLIB_bufParams2D_t *bufParamsInvA, DSPLIB_bufParams2D_t *bufParamsInvAFinal, const DSPLIB_qrdInvInitArgs *pKerInitArgs)
template void DSPLIB_qrd_inverse_factor_init_ci< double >(DSPLIB_kernelHandle handle)
template DSPLIB_STATUS DSPLIB_qrd_inverse_init_ci< double >(DSPLIB_kernelHandle handle, DSPLIB_bufParams2D_t *bufParamsQ, DSPLIB_bufParams2D_t *bufParamsR, DSPLIB_bufParams2D_t *bufParamsInvA, DSPLIB_bufParams2D_t *bufParamsInvAFinal, const DSPLIB_qrdInvInitArgs *pKerInitArgs)
void DSPLIB_qrd_inverse_factor_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_qrd_inverse_R_invA_exec_ci< double >(double *pLocalR, double *pLocalInvA, int32_t nCols, int32_t colStrideR, int32_t colInvAStride, double *factArray, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_qrd_inverse.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
DSPLIB_STATUS_NAME
The enumeration of all status codes.
Definition: DSPLIB_types.h:151
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
A structure for a 2 dimensional buffer descriptor.
int32_t stride_y
Stride in Y dimension in bytes.
uint32_t dim_x
Width of buffer in X dimension in elements.
uint32_t dim_y
Height of buffer in Y dimension in elements.
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
Structure containing the parameters to initialize the kernel.
uint32_t dimX
Size of input data.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
int32_t strideOut
Stride between rows of output data matrix
uint32_t heightIn
Height of input data matrix
int32_t strideIn
Stride between rows of input data matrix
uint32_t widthIn
Size of input buffer for different batches DSPLIB_matTrans_init that will be retrieved and used by DS...
Structure containing the parameters to initialize the kernel.
int8_t funcStyle
Variant of the function refer to DSPLIB_FUNCTION_STYLE
Structure that is reserved for internal use by the kernel.
uint32_t heightR
Height of input data matrix
DSPLIB_matMul_PrivArgs pMatMulKerPrivArgs
Privargs for the matMul kernel.
int32_t strideR
Stride between rows of R output data matrix
DSPLIB_matTrans_PrivArgs pMatTransKerPrivArgs
Privargs for the matTrans kernel.
uint8_t bufPblock[DSPLIB_QRD_INVERSE_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters
uint32_t widthR
Size of input buffer for different batches DSPLIB_qrd_inverse_init that will be retrieved and used by...
int32_t strideInvA
Stride between rows of input data matrix