DSPLIB User Guide
DSPLIB_qrd_ci_opt.cpp
Go to the documentation of this file.
1 /*******************************************************************************
2 **+--------------------------------------------------------------------------+**
3 **| **** |**
4 **| **** |**
5 **| ******o*** |**
6 **| ********_///_**** |**
7 **| ***** /_//_/ **** |**
8 **| ** ** (__/ **** |**
9 **| ********* |**
10 **| **** |**
11 **| *** |**
12 **| |**
13 **| Copyright (c) 2016 Texas Instruments Incorporated |**
14 **| ALL RIGHTS RESERVED |**
15 **| |**
16 **| Permission to use, copy, modify, or distribute this software, |**
17 **| whether in part or in whole, for any purpose is forbidden without |**
18 **| a signed licensing agreement and NDA from Texas Instruments |**
19 **| Incorporated (TI). |**
20 **| |**
21 **| TI makes no representation or warranties with respect to the |**
22 **| performance of this computer program, and specifically disclaims |**
23 **| any responsibility for any damages, special or consequential, |**
24 **| connected with the use of this program. |**
25 **| |**
26 **+--------------------------------------------------------------------------+**
27 *******************************************************************************/
28 
29 /*******************************************************************************
30  *
31  * INCLUDES
32  *
33  ******************************************************************************/
34 #include "DSPLIB_qrd_priv.h"
35 
36 /*******************************************************************************
37  *
38  * INITIALIZATION
39  *
40  ******************************************************************************/
41 template <typename dataType> void DSPLIB_R_column_init_ci(DSPLIB_kernelHandle handle)
42 {
43  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
44 
45  DSPLIB_qrd_PrivArgs *pKerPrivArgs = (DSPLIB_qrd_PrivArgs *) handle;
46  uint8_t *pBlock = pKerPrivArgs->bufPblock;
47  int32_t strideR = pKerPrivArgs->strideR;
48  int32_t colStrideR = strideR / sizeof(dataType);
49  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
50 
51  sa0Params.ICNT0 = 1;
52  sa0Params.DIMFMT = __SA_DIMFMT_2D;
53  sa0Params.VECLEN = __SA_VECLEN_1ELEM;
54  sa0Params.DIM1 = 2 * colStrideR;
55 
56  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = sa0Params;
57  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
58 }
61 
62 template <typename dataType> void DSPLIB_Q_matrix_init_ci(DSPLIB_kernelHandle handle)
63 {
64  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
65  DSPLIB_qrd_PrivArgs *pKerPrivArgs = (DSPLIB_qrd_PrivArgs *) handle;
66  uint8_t *pBlock = pKerPrivArgs->bufPblock;
67  int32_t strideQ = pKerPrivArgs->strideQ;
68  int32_t strideR = pKerPrivArgs->strideR;
69  int32_t nRows = pKerPrivArgs->heightA;
70 
71  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
72  __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
73  __SE_TEMPLATE_v1 se2Params = __gen_SE_TEMPLATE_v1();
74 
75  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
76  __SA_TEMPLATE_v1 sa1Params = __gen_SA_TEMPLATE_v1();
77  __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
78  __SA_TEMPLATE_v1 sa5Params = __gen_SA_TEMPLATE_v1();
79 
80  typedef typename c7x::make_full_vector<dataType>::type vec;
81  int32_t eleCount = c7x::element_count_of<vec>::value;
82  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
83  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
84  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
85  int32_t colStrideQ = strideQ / sizeof(dataType);
86  int32_t colStrideR = strideR / sizeof(dataType);
87 
88  int32_t nVec = DSPLIB_ceilingDiv(nRows, eleCount);
89  int32_t se0TICNT2 = nVec / 2;
90  int32_t se1TICNT2 = nVec - se0TICNT2;
91 
92  se0Params.ICNT1 = eleCount;
93  se0Params.DIM1 = colStrideQ;
94  se0Params.DIM2 = eleCount * colStrideQ;
95  se0Params.DIMFMT = __SE_DIMFMT_3D;
96  se0Params.ELETYPE = SE_ELETYPE;
97  se0Params.VECLEN = SE_VECLEN;
98  se0Params.ICNT2 = se0TICNT2;
99  if (sizeof(dataType) == 4) {
100  se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
101  }
102  else {
103  se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
104  }
105 
106  se1Params.ICNT2 = se1TICNT2;
107  se1Params.ICNT1 = eleCount;
108  se1Params.DIM1 = colStrideQ;
109  se1Params.DIM2 = eleCount * colStrideQ;
110  se1Params.DIMFMT = __SE_DIMFMT_3D;
111  se1Params.ELETYPE = SE_ELETYPE;
112  se1Params.VECLEN = SE_VECLEN;
113  if (sizeof(dataType) == 4) {
114  se1Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
115  }
116  else {
117  se1Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
118  }
119 
120  sa2Params.DIM1 = 0;
121  sa2Params.DIMFMT = __SA_DIMFMT_2D;
122  sa2Params.VECLEN = __SA_VECLEN_1ELEM;
123  sa2Params.ICNT0 = nRows;
124  sa2Params.ICNT1 = se1TICNT2;
125 
126  sa0Params.ICNT0 = se0TICNT2 * eleCount;
127  sa0Params.DIMFMT = __SA_DIMFMT_1D;
128  sa0Params.VECLEN = SA_VECLEN;
129 
130  sa1Params.DIMFMT = __SA_DIMFMT_1D;
131  sa1Params.VECLEN = SA_VECLEN;
132  sa1Params.ICNT0 = nRows - (se0TICNT2 * eleCount);
133 
134  int32_t se1ICNT1 = nRows / 2;
135  int32_t se0ICNT1 = nRows - se1ICNT1;
136 
137  int32_t lenTile = 8;
138  se2Params.ICNT0 = (eleCount * lenTile);
139  se2Params.DIM1 = colStrideR * 2;
140  se2Params.DIM2 = (eleCount * lenTile);
141  se2Params.ICNT1 = se0ICNT1;
142  se2Params.DIMFMT = __SE_DIMFMT_3D;
143  se2Params.ELETYPE = SE_ELETYPE;
144  se2Params.VECLEN = SE_VECLEN;
145  se2Params.DECDIM1 = __SE_DECDIM_DIM2;
146 
147  sa5Params.ICNT0 = eleCount;
148  sa5Params.DIMFMT = __SA_DIMFMT_1D;
149  sa5Params.VECLEN = SA_VECLEN;
150 
151  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = se0Params;
152  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = se1Params;
153  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = sa0Params;
154  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = sa1Params;
155  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = sa2Params;
156  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE)) = se2Params;
157  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE)) = sa5Params;
158 
159  int32_t lenTile8 = 8;
160  __SE_TEMPLATE_v1 seScalarParams = __gen_SE_TEMPLATE_v1();
161  __SE_TEMPLATE_v1 seMatrixParams = __gen_SE_TEMPLATE_v1();
162 
163  __SA_TEMPLATE_v1 saMatrixParams = __gen_SA_TEMPLATE_v1();
164 
165  __SE_ELEDUP SE_ELEDUP = c7x::se_eledup<dataType>::value;
166 
167  seScalarParams.DIM1 = 0;
168  seScalarParams.ELEDUP = SE_ELEDUP;
169  seScalarParams.DIMFMT = __SE_DIMFMT_2D;
170  seScalarParams.VECLEN = SE_VECLEN;
171  seScalarParams.ELETYPE = SE_ELETYPE;
172 
173  seMatrixParams.ICNT0 = (eleCount * lenTile8);
174  seMatrixParams.DIM1 = colStrideR;
175  seMatrixParams.DIM2 = (eleCount * lenTile8);
176  seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
177  seMatrixParams.ELETYPE = SE_ELETYPE;
178  seMatrixParams.VECLEN = SE_VECLEN;
179  seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
180 
181  saMatrixParams.ICNT0 = (eleCount * lenTile8);
182  saMatrixParams.DIM1 = colStrideR;
183  saMatrixParams.DIM2 = (eleCount * lenTile8);
184  saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
185  saMatrixParams.VECLEN = SA_VECLEN;
186  saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
187 
188  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE)) = seScalarParams;
189  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE)) = seMatrixParams;
190 
191  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE)) = saMatrixParams;
192 
193  int32_t lenTile4 = 4;
194  seScalarParams = __gen_SE_TEMPLATE_v1();
195  seMatrixParams = __gen_SE_TEMPLATE_v1();
196 
197  saMatrixParams = __gen_SA_TEMPLATE_v1();
198 
199  SE_ELEDUP = c7x::se_eledup<dataType>::value;
200 
201  seMatrixParams.ICNT0 = (eleCount * lenTile4);
202  seMatrixParams.DIM1 = colStrideQ;
203  seMatrixParams.DIM2 = (eleCount * lenTile4);
204  seMatrixParams.DIMFMT = __SE_DIMFMT_3D;
205  seMatrixParams.ELETYPE = SE_ELETYPE;
206  seMatrixParams.VECLEN = SE_VECLEN;
207  seMatrixParams.DECDIM1 = __SE_DECDIM_DIM2;
208 
209  saMatrixParams.ICNT0 = (eleCount * lenTile4);
210  saMatrixParams.DIM1 = colStrideQ;
211  saMatrixParams.DIM2 = (eleCount * lenTile4);
212  saMatrixParams.DIMFMT = __SA_DIMFMT_3D;
213  saMatrixParams.VECLEN = SA_VECLEN;
214  saMatrixParams.DECDIM1 = __SA_DECDIM_DIM2;
215 
216  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE)) = seMatrixParams;
217  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE)) = saMatrixParams;
218 
219  se2Params = __gen_SE_TEMPLATE_v1();
220  se2Params.ICNT0 = (eleCount * lenTile4);
221  se2Params.DIM1 = colStrideR * 2;
222  se2Params.DIM2 = (eleCount * lenTile4);
223  se2Params.ICNT1 = se0ICNT1;
224  se2Params.DIMFMT = __SE_DIMFMT_3D;
225  se2Params.ELETYPE = SE_ELETYPE;
226  se2Params.VECLEN = SE_VECLEN;
227  se2Params.DECDIM1 = __SE_DECDIM_DIM2;
228  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE)) = se2Params;
229 
230  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
231 }
234 
235 /*******************************************************************************
236  *
237  * IMPLEMENTATION
238  *
239  ******************************************************************************/
240 
/**
 * @brief Two-pass update of the R sub-matrix that starts one element past pLocalR.
 *
 * Pass 1 accumulates, for every column, the sum over rows of (matrix element *
 * pLocalU[row]), multiplies it by scale and stores it to pSum.
 * Pass 2 re-reads the same matrix region and subtracts pSum[column] * pLocalU[row]
 * from every element, writing the result back in place.
 *
 * Columns are processed in tiles of 8, 4, 2 and 1 full vectors. The stream
 * templates are loaded from pBlock slots 8, 9, 11, 13, 14 and 15 (slot * SE_PARAM_SIZE)
 * and patched with the run-time counts before each stream is opened.
 * NOTE(review): the row/column addressing described here assumes the templates are
 * the ones written by DSPLIB_Q_matrix_init_ci (slot 9: DIM1 = 2 * colStrideR, i.e.
 * every second row; slot 14/15: DIM1 = colStrideR) - confirm against the caller.
 *
 * @param pLocalR     Base pointer of the R region; streams start at pLocalR + 1.
 * @param pLocalU     Vector of per-row multipliers, read nRows elements per tile.
 * @param pSum        Scratch buffer receiving the scaled column sums (nCols elements).
 * @param scale       Factor applied to every column sum in pass 1.
 * @param colStrideR  Row stride of R in elements.
 * @param nRows       Number of rows processed.
 * @param nCols       Number of columns processed.
 * @param pBlock      Parameter block holding the pre-built SE/SA templates.
 */
241 template <typename dataType>
242 void DSPLIB_qrd_R_matrix_exec_ci(dataType *pLocalR,
243  dataType *pLocalU,
244  dataType *pSum,
245  dataType scale,
246  int32_t colStrideR,
247  int32_t nRows,
248  int32_t nCols,
249  uint8_t *pBlock)
250 {
251  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
252 
253  typedef typename c7x::make_full_vector<dataType>::type vec;
  /* All matrix streams below start one element past pLocalR. */
254  dataType *rStore = pLocalR + 1;
255 
256  int32_t eleCount = c7x::element_count_of<vec>::value;
257  int32_t lenTile8 = 8;
258  int32_t lenTile4 = 4;
259  int32_t lenTile2 = 2;
260  int32_t lenTile1 = 1;
261 
  /* Split ceil(nCols / eleCount) vectors into tiles of 8, 4, 2 and 1 vectors.
   * After the 8-wide tiles are removed fewer than 8 vectors remain, so nTiles4,
   * nTiles2 and the final nTiles1 are each 0 or 1. */
262  int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
263  int32_t nTiles8 = nTiles1 / lenTile8;
264  nTiles1 -= nTiles8 * lenTile8;
265  int32_t nTiles4 = nTiles1 / lenTile4;
266  nTiles1 -= nTiles4 * lenTile4;
267  int32_t nTiles2 = nTiles1 / lenTile2;
268  nTiles1 -= nTiles2 * lenTile2;
269 
  /* colLimitN = number of columns covered by the N-wide tile group, clamped to
   * the columns still remaining. Used both as DECDIM1_WIDTH and as the column
   * offset of the following tile group. */
270  int32_t remainingCols = nCols;
271  int32_t colLimit8 = nTiles8 * lenTile8 * eleCount;
272  colLimit8 = (remainingCols < (colLimit8)) ? remainingCols : colLimit8;
273 
274  remainingCols = remainingCols - colLimit8;
275  int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
276  colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
277 
278  remainingCols = remainingCols - colLimit4;
279  int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
280  colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
281 
282  int32_t colLimit1 = remainingCols - colLimit2;
283 
284  __SE_TEMPLATE_v1 se0Params;
285  __SE_TEMPLATE_v1 se1Params;
286 
  /* Both SE templates start from the same slot-9 template; they differ only in
   * the base address and ICNT1 assigned below. */
287  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
288  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (9 * SE_PARAM_SIZE));
289 
290  __SA_TEMPLATE_v1 sa0Params;
291  __SA_TEMPLATE_v1 sa1Params;
292 
293  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
294  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (11 * SE_PARAM_SIZE));
295 
  /* SA1 addresses pSum in pass 1; ICNT0 = nCols bounds the predicated stores. */
296  sa1Params.ICNT0 = nCols;
297 
298  __SA1_OPEN(sa1Params);
299 
300 
  /* SA0 addresses pLocalU: nRows elements per tile, once per tile of any width. */
301  sa0Params.ICNT0 = nRows;
302  sa0Params.ICNT1 = nTiles8 + nTiles4 + nTiles2 + nTiles1;
303  __SA0_OPEN(sa0Params);
304 
  /* Rows are split between the two engines: SE0 gets ceil(nRows / 2) rows,
   * SE1 gets floor(nRows / 2) rows. */
305  int32_t se1ICNT1 = nRows / 2;
306  int32_t se0ICNT1 = nRows - se1ICNT1;
307  se0Params.ICNT1 = se0ICNT1;
308  se1Params.ICNT1 = se1ICNT1;
309 
  /* ---------------- Pass 1: scaled column sums into pSum ---------------- */
310  if (nTiles8 > 0) {
311  se0Params.ICNT2 = nTiles8;
312  se0Params.DECDIM1_WIDTH = colLimit8;
313  se0Params.ICNT0 = (eleCount * lenTile8);
314 
315  se1Params.ICNT2 = nTiles8;
316  se1Params.DECDIM1_WIDTH = colLimit8;
317  se1Params.ICNT0 = (eleCount * lenTile8);
318 
  /* sV0x accumulate the SE0 rows, sV1x the SE1 rows; they are reset again at
   * the top of every tile iteration. */
319  vec sV01 = (vec) 0;
320  vec sV02 = (vec) 0;
321  vec sV03 = (vec) 0;
322  vec sV04 = (vec) 0;
323  vec sV05 = (vec) 0;
324  vec sV06 = (vec) 0;
325  vec sV07 = (vec) 0;
326  vec sV08 = (vec) 0;
327 
328  vec sV11 = (vec) 0;
329  vec sV12 = (vec) 0;
330  vec sV13 = (vec) 0;
331  vec sV14 = (vec) 0;
332  vec sV15 = (vec) 0;
333  vec sV16 = (vec) 0;
334  vec sV17 = (vec) 0;
335  vec sV18 = (vec) 0;
336 
  /* SE1 starts one row (colStrideR elements) after SE0. */
337  __SE0_OPEN(rStore, se0Params);
338  __SE1_OPEN(rStore + colStrideR, se1Params);
339 
340 
341  for (int tile = 0; tile < nTiles8; tile++) {
342 
343  sV01 = (vec) 0;
344  sV02 = (vec) 0;
345  sV03 = (vec) 0;
346  sV04 = (vec) 0;
347  sV05 = (vec) 0;
348  sV06 = (vec) 0;
349  sV07 = (vec) 0;
350  sV08 = (vec) 0;
351 
352  sV11 = (vec) 0;
353  sV12 = (vec) 0;
354  sV13 = (vec) 0;
355  sV14 = (vec) 0;
356  sV15 = (vec) 0;
357  sV16 = (vec) 0;
358  sV17 = (vec) 0;
359  sV18 = (vec) 0;
  /* Each iteration consumes one row from SE0 and one from SE1, together with
   * two consecutive pLocalU elements (uV1 for SE0, uV2 for SE1). */
360  for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
361  vec v01 = c7x::strm_eng<0, vec>::get_adv();
362  vec v02 = c7x::strm_eng<0, vec>::get_adv();
363  vec v03 = c7x::strm_eng<0, vec>::get_adv();
364  vec v04 = c7x::strm_eng<0, vec>::get_adv();
365  vec v05 = c7x::strm_eng<0, vec>::get_adv();
366  vec v06 = c7x::strm_eng<0, vec>::get_adv();
367  vec v07 = c7x::strm_eng<0, vec>::get_adv();
368  vec v08 = c7x::strm_eng<0, vec>::get_adv();
369 
370  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
371  vec uV1 = __vload_dup(pU);
372 
373  vec v11 = c7x::strm_eng<1, vec>::get_adv();
374  vec v12 = c7x::strm_eng<1, vec>::get_adv();
375  vec v13 = c7x::strm_eng<1, vec>::get_adv();
376  vec v14 = c7x::strm_eng<1, vec>::get_adv();
377  vec v15 = c7x::strm_eng<1, vec>::get_adv();
378  vec v16 = c7x::strm_eng<1, vec>::get_adv();
379  vec v17 = c7x::strm_eng<1, vec>::get_adv();
380  vec v18 = c7x::strm_eng<1, vec>::get_adv();
381 
382  pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
383  vec uV2 = __vload_dup(pU);
384 
385  sV01 += v01 * uV1;
386  sV02 += v02 * uV1;
387  sV03 += v03 * uV1;
388  sV04 += v04 * uV1;
389  sV05 += v05 * uV1;
390  sV06 += v06 * uV1;
391  sV07 += v07 * uV1;
392  sV08 += v08 * uV1;
393 
394  sV11 += v11 * uV2;
395  sV12 += v12 * uV2;
396  sV13 += v13 * uV2;
397  sV14 += v14 * uV2;
398  sV15 += v15 * uV2;
399  sV16 += v16 * uV2;
400  sV17 += v17 * uV2;
401  sV18 += v18 * uV2;
402  }
403 
  /* nRows odd: SE0 holds one more row than SE1; fold it in with the last
   * pLocalU element of this tile. */
404  if (se1ICNT1 != se0ICNT1) /* For last odd numbered row */
405  {
406  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
407  vec uV1 = __vload_dup(pU);
408 
409  vec v01 = c7x::strm_eng<0, vec>::get_adv();
410  vec v02 = c7x::strm_eng<0, vec>::get_adv();
411  vec v03 = c7x::strm_eng<0, vec>::get_adv();
412  vec v04 = c7x::strm_eng<0, vec>::get_adv();
413  vec v05 = c7x::strm_eng<0, vec>::get_adv();
414  vec v06 = c7x::strm_eng<0, vec>::get_adv();
415  vec v07 = c7x::strm_eng<0, vec>::get_adv();
416  vec v08 = c7x::strm_eng<0, vec>::get_adv();
417 
418  sV01 += v01 * uV1;
419  sV02 += v02 * uV1;
420  sV03 += v03 * uV1;
421  sV04 += v04 * uV1;
422  sV05 += v05 * uV1;
423  sV06 += v06 * uV1;
424  sV07 += v07 * uV1;
425  sV08 += v08 * uV1;
426  }
427 
  /* Combine the two partial sums, then apply the scale factor. */
428  sV01 += sV11;
429  sV02 += sV12;
430  sV03 += sV13;
431  sV04 += sV14;
432  sV05 += sV15;
433  sV06 += sV16;
434  sV07 += sV17;
435  sV08 += sV18;
436 
437  sV01 *= scale;
438  sV02 *= scale;
439  sV03 *= scale;
440  sV04 *= scale;
441  sV05 *= scale;
442  sV06 *= scale;
443  sV07 *= scale;
444  sV08 *= scale;
445 
  /* Predicated stores of the eight result vectors to pSum via SA1. */
446  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
447  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
448  __vstore_pred(pred, pStoreVec, sV01);
449 
450  pred = c7x::strm_agen<1, vec>::get_vpred();
451  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
452  __vstore_pred(pred, pStoreVec, sV02);
453 
454  pred = c7x::strm_agen<1, vec>::get_vpred();
455  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
456  __vstore_pred(pred, pStoreVec, sV03);
457 
458  pred = c7x::strm_agen<1, vec>::get_vpred();
459  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
460  __vstore_pred(pred, pStoreVec, sV04);
461 
462  pred = c7x::strm_agen<1, vec>::get_vpred();
463  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
464  __vstore_pred(pred, pStoreVec, sV05);
465 
466  pred = c7x::strm_agen<1, vec>::get_vpred();
467  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
468  __vstore_pred(pred, pStoreVec, sV06);
469 
470  pred = c7x::strm_agen<1, vec>::get_vpred();
471  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
472  __vstore_pred(pred, pStoreVec, sV07);
473 
474  pred = c7x::strm_agen<1, vec>::get_vpred();
475  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
476  __vstore_pred(pred, pStoreVec, sV08);
477  }
478  __SE0_CLOSE();
479  __SE1_CLOSE();
480  }
  /* Same computation for the 4-vector-wide tile; it starts colLimit8 columns in. */
481  if (nTiles4 > 0) {
482 
483  se0Params.ICNT2 = nTiles4;
484  se0Params.DECDIM1_WIDTH = colLimit4;
485  se0Params.ICNT0 = (eleCount * lenTile4);
486 
487  se1Params.ICNT2 = nTiles4;
488  se1Params.DECDIM1_WIDTH = colLimit4;
489  se1Params.ICNT0 = (eleCount * lenTile4);
490 
491  vec sV01 = (vec) 0;
492  vec sV02 = (vec) 0;
493  vec sV03 = (vec) 0;
494  vec sV04 = (vec) 0;
495 
496  vec sV11 = (vec) 0;
497  vec sV12 = (vec) 0;
498  vec sV13 = (vec) 0;
499  vec sV14 = (vec) 0;
500 
501  dataType *pSE0 = rStore + colLimit8;
502  dataType *pSE1 = pSE0 + colStrideR;
503 
504  __SE0_OPEN(pSE0, se0Params);
505 
506  __SE1_OPEN(pSE1, se1Params);
507 
508  for (int tile = 0; tile < nTiles4; tile++) {
509 
510  sV01 = (vec) 0;
511  sV02 = (vec) 0;
512  sV03 = (vec) 0;
513  sV04 = (vec) 0;
514 
515  sV11 = (vec) 0;
516  sV12 = (vec) 0;
517  sV13 = (vec) 0;
518  sV14 = (vec) 0;
519 
520  for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
521 
522  vec v01 = c7x::strm_eng<0, vec>::get_adv();
523  vec v02 = c7x::strm_eng<0, vec>::get_adv();
524  vec v03 = c7x::strm_eng<0, vec>::get_adv();
525  vec v04 = c7x::strm_eng<0, vec>::get_adv();
526  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
527  vec uV1 = __vload_dup(pU);
528 
529  vec v11 = c7x::strm_eng<1, vec>::get_adv();
530  vec v12 = c7x::strm_eng<1, vec>::get_adv();
531  vec v13 = c7x::strm_eng<1, vec>::get_adv();
532  vec v14 = c7x::strm_eng<1, vec>::get_adv();
533 
534  pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
535  vec uV2 = __vload_dup(pU);
536 
537  sV01 += v01 * uV1;
538  sV02 += v02 * uV1;
539  sV03 += v03 * uV1;
540  sV04 += v04 * uV1;
541 
542  sV11 += v11 * uV2;
543  sV12 += v12 * uV2;
544  sV13 += v13 * uV2;
545  sV14 += v14 * uV2;
546  }
547 
548  if (se1ICNT1 != se0ICNT1) /* For last odd numbered row */
549  {
550  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
551  vec uV1 = __vload_dup(pU);
552 
553  vec v01 = c7x::strm_eng<0, vec>::get_adv();
554  vec v02 = c7x::strm_eng<0, vec>::get_adv();
555  vec v03 = c7x::strm_eng<0, vec>::get_adv();
556  vec v04 = c7x::strm_eng<0, vec>::get_adv();
557 
558  sV01 += v01 * uV1;
559  sV02 += v02 * uV1;
560  sV03 += v03 * uV1;
561  sV04 += v04 * uV1;
562  }
563 
564  sV01 += sV11;
565  sV02 += sV12;
566  sV03 += sV13;
567  sV04 += sV14;
568 
569  sV01 *= scale;
570  sV02 *= scale;
571  sV03 *= scale;
572  sV04 *= scale;
573 
574  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
575  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
576  __vstore_pred(pred, pStoreVec, sV01);
577 
578  pred = c7x::strm_agen<1, vec>::get_vpred();
579  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
580  __vstore_pred(pred, pStoreVec, sV02);
581 
582  pred = c7x::strm_agen<1, vec>::get_vpred();
583  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
584  __vstore_pred(pred, pStoreVec, sV03);
585 
586  pred = c7x::strm_agen<1, vec>::get_vpred();
587  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
588  __vstore_pred(pred, pStoreVec, sV04);
589  }
590  __SE0_CLOSE();
591  __SE1_CLOSE();
592  }
  /* 2-vector-wide tile; starts after the 8- and 4-wide column groups. */
593  if (nTiles2 > 0) {
594 
595  se0Params.ICNT2 = nTiles2;
596  se0Params.DECDIM1_WIDTH = colLimit2;
597  se0Params.ICNT0 = (eleCount * lenTile2);
598 
599  se1Params.ICNT2 = nTiles2;
600  se1Params.DECDIM1_WIDTH = colLimit2;
601  se1Params.ICNT0 = (eleCount * lenTile2);
602 
603  vec sV01 = (vec) 0;
604  vec sV02 = (vec) 0;
605 
606  vec sV11 = (vec) 0;
607  vec sV12 = (vec) 0;
608 
609  dataType *pSE0 = rStore + colLimit8 + colLimit4;
610  dataType *pSE1 = pSE0 + colStrideR;
611 
612  __SE0_OPEN(pSE0, se0Params);
613  __SE1_OPEN(pSE1, se1Params);
614 
615 
616  for (int tile = 0; tile < nTiles2; tile++) {
617  sV01 = (vec) 0;
618  sV02 = (vec) 0;
619 
620  sV11 = (vec) 0;
621  sV12 = (vec) 0;
622 
623  for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
624  vec v01 = c7x::strm_eng<0, vec>::get_adv();
625  vec v02 = c7x::strm_eng<0, vec>::get_adv();
626 
627  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
628  vec uV1 = __vload_dup(pU);
629 
630  vec v11 = c7x::strm_eng<1, vec>::get_adv();
631  vec v12 = c7x::strm_eng<1, vec>::get_adv();
632 
633  pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
634  vec uV2 = __vload_dup(pU);
635 
636  sV01 += v01 * uV1;
637  sV02 += v02 * uV1;
638 
639  sV11 += v11 * uV2;
640  sV12 += v12 * uV2;
641  }
642 
643  if (se1ICNT1 != se0ICNT1) /* For last odd numbered row */
644  {
645  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
646  vec uV1 = __vload_dup(pU);
647 
648  vec v01 = c7x::strm_eng<0, vec>::get_adv();
649  vec v02 = c7x::strm_eng<0, vec>::get_adv();
650 
651  sV01 += v01 * uV1;
652  sV02 += v02 * uV1;
653  }
654 
655  sV01 += sV11;
656  sV02 += sV12;
657 
658  sV01 *= scale;
659  sV02 *= scale;
660 
661  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
662  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
663  __vstore_pred(pred, pStoreVec, sV01);
664 
665  pred = c7x::strm_agen<1, vec>::get_vpred();
666  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
667  __vstore_pred(pred, pStoreVec, sV02);
668  }
669  __SE0_CLOSE();
670  __SE1_CLOSE();
671  }
  /* Final single-vector tile covering whatever columns are left (colLimit1). */
672  if (nTiles1 > 0) {
  /* ICNT1 already holds these values from above; the reassignment is a no-op. */
673  se0Params.ICNT1 = se0ICNT1;
674  se0Params.ICNT2 = nTiles1;
675  se0Params.DECDIM1_WIDTH = colLimit1;
676  se0Params.ICNT0 = (eleCount * lenTile1);
677 
678  se1Params.ICNT1 = se1ICNT1;
679  se1Params.ICNT2 = nTiles1;
680  se1Params.DECDIM1_WIDTH = colLimit1;
681  se1Params.ICNT0 = (eleCount * lenTile1);
682 
683  vec sV01 = (vec) 0;
684 
685  vec sV11 = (vec) 0;
686 
687  dataType *pSE0 = rStore + colLimit8 + colLimit4 + colLimit2;
688  dataType *pSE1 = pSE0 + colStrideR;
689 
690  __SE0_OPEN(pSE0, se0Params);
691  __SE1_OPEN(pSE1, se1Params);
692 
693 
694  for (int tile = 0; tile < nTiles1; tile++) {
695 
696  sV01 = (vec) 0;
697 
698  sV11 = (vec) 0;
699 
700  for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
701  vec v01 = c7x::strm_eng<0, vec>::get_adv();
702 
703  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
704  vec uV1 = __vload_dup(pU);
705 
706  vec v11 = c7x::strm_eng<1, vec>::get_adv();
707 
708  pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
709  vec uV2 = __vload_dup(pU);
710 
711  sV01 += v01 * uV1;
712 
713  sV11 += v11 * uV2;
714  }
715 
716  if (se1ICNT1 != se0ICNT1) /* For last odd numbered row */
717  {
718  dataType *pU = c7x::strm_agen<0, dataType>::get_adv(pLocalU);
719  vec uV1 = __vload_dup(pU);
720 
721  vec v01 = c7x::strm_eng<0, vec>::get_adv();
722 
723  sV01 += v01 * uV1;
724  }
725 
726  sV01 += sV11;
727 
728  sV01 *= scale;
729 
730  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
731  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSum);
732  __vstore_pred(pred, pStoreVec, sV01);
733  }
734  __SE0_CLOSE();
735  __SE1_CLOSE();
736  }
737  __SA0_CLOSE();
738  __SA1_CLOSE();
739 
  /* ------------- Pass 2: matrix -= pSum[col] * pLocalU[row] ------------- */
740  __SE_TEMPLATE_v1 seScalarParams;
741  __SE_TEMPLATE_v1 seMatrixParams;
742 
743  __SA_TEMPLATE_v1 saMatrixParams;
744 
745  seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
746  seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE));
747  saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
748 
  /* NOTE(review): sa1Params.ICNT0 was already set to nCols before pass 1; both
   * assignments below repeat that value. */
749  sa1Params.ICNT0 = nCols;
750  seScalarParams.ICNT0 = nRows;
751  seScalarParams.ICNT1 = nTiles8 + nTiles4 + nTiles2 + nTiles1;
752 
753  sa1Params.ICNT0 = nCols;
754 
  /* SA0 now reads pSum back (opened once, so it keeps advancing across all tile
   * groups). SE0 streams pLocalU, nRows elements per tile; presumably each fetch
   * is one element replicated across the vector via the template's ELEDUP setting. */
755  __SA0_OPEN(sa1Params);
756  __SE0_OPEN(pLocalU, seScalarParams);
757 
758  seMatrixParams.ICNT1 = saMatrixParams.ICNT1 = nRows;
759 
760  if (nTiles8 > 0) {
  /* ICNT0 is not patched here: the slot-14/15 values loaded above are used as-is. */
761  seMatrixParams.ICNT2 = nTiles8;
762  seMatrixParams.DECDIM1_WIDTH = colLimit8;
763 
764  saMatrixParams.ICNT2 = nTiles8;
765  saMatrixParams.DECDIM1_WIDTH = colLimit8;
766 
  /* SE1 reads the matrix tile, SA1 generates the matching store addresses. */
767  __SE1_OPEN(rStore, seMatrixParams);
768  __SA1_OPEN(saMatrixParams);
769 
770  for (int32_t tile = 0; tile < nTiles8; tile++) {
  /* Load this tile's eight column-sum vectors once; they are reused for every row. */
771  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
772  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
773  vec sV1 = __vload_pred(lPred, pLoadVec);
774 
775  lPred = c7x::strm_agen<0, vec>::get_vpred();
776  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
777  vec sV2 = __vload_pred(lPred, pLoadVec);
778 
779  lPred = c7x::strm_agen<0, vec>::get_vpred();
780  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
781  vec sV3 = __vload_pred(lPred, pLoadVec);
782 
783  lPred = c7x::strm_agen<0, vec>::get_vpred();
784  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
785  vec sV4 = __vload_pred(lPred, pLoadVec);
786 
787  lPred = c7x::strm_agen<0, vec>::get_vpred();
788  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
789  vec sV5 = __vload_pred(lPred, pLoadVec);
790 
791  lPred = c7x::strm_agen<0, vec>::get_vpred();
792  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
793  vec sV6 = __vload_pred(lPred, pLoadVec);
794 
795  lPred = c7x::strm_agen<0, vec>::get_vpred();
796  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
797  vec sV7 = __vload_pred(lPred, pLoadVec);
798 
799  lPred = c7x::strm_agen<0, vec>::get_vpred();
800  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSum);
801  vec sV8 = __vload_pred(lPred, pLoadVec);
802 
803  for (int32_t vertical = 0; vertical < nRows; vertical++) {
804 
805  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
806 
807  vec v1 = c7x::strm_eng<1, vec>::get_adv();
808  vec v2 = c7x::strm_eng<1, vec>::get_adv();
809  vec v3 = c7x::strm_eng<1, vec>::get_adv();
810  vec v4 = c7x::strm_eng<1, vec>::get_adv();
811  vec v5 = c7x::strm_eng<1, vec>::get_adv();
812  vec v6 = c7x::strm_eng<1, vec>::get_adv();
813  vec v7 = c7x::strm_eng<1, vec>::get_adv();
814  vec v8 = c7x::strm_eng<1, vec>::get_adv();
815 
816  v1 -= sV1 * scalarDup1;
817  v2 -= sV2 * scalarDup1;
818  v3 -= sV3 * scalarDup1;
819  v4 -= sV4 * scalarDup1;
820  v5 -= sV5 * scalarDup1;
821  v6 -= sV6 * scalarDup1;
822  v7 -= sV7 * scalarDup1;
823  v8 -= sV8 * scalarDup1;
824 
  /* Write the updated row back in place with predicated stores. */
825  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
826  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
827  __vstore_pred(sPred, pStoreVec, v1);
828 
829  sPred = c7x::strm_agen<1, vec>::get_vpred();
830  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
831  __vstore_pred(sPred, pStoreVec, v2);
832 
833  sPred = c7x::strm_agen<1, vec>::get_vpred();
834  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
835  __vstore_pred(sPred, pStoreVec, v3);
836 
837  sPred = c7x::strm_agen<1, vec>::get_vpred();
838  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
839  __vstore_pred(sPred, pStoreVec, v4);
840 
841  sPred = c7x::strm_agen<1, vec>::get_vpred();
842  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
843  __vstore_pred(sPred, pStoreVec, v5);
844 
845  sPred = c7x::strm_agen<1, vec>::get_vpred();
846  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
847  __vstore_pred(sPred, pStoreVec, v6);
848 
849  sPred = c7x::strm_agen<1, vec>::get_vpred();
850  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
851  __vstore_pred(sPred, pStoreVec, v7);
852 
853  sPred = c7x::strm_agen<1, vec>::get_vpred();
854  pStoreVec = c7x::strm_agen<1, vec>::get_adv(rStore);
855  __vstore_pred(sPred, pStoreVec, v8);
856  }
857  }
858 
859  __SE1_CLOSE();
860  __SA1_CLOSE();
861  }
862 
  /* 4-vector-wide tile: same update, starting colLimit8 columns in. */
863  if (nTiles4 > 0) {
864 
865  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile4;
866  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles4;
867  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit4;
868 
869  dataType *pSE1 = rStore + colLimit8;
  /* pSA0 is just pSum; the column offset comes from SA0's running position. */
870  dataType *pSA0 = pSum;
871 
872  __SE1_OPEN(pSE1, seMatrixParams);
873  __SA1_OPEN(saMatrixParams);
874 
875  for (int32_t tile = 0; tile < nTiles4; tile++) {
876  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
877  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
878  vec sV1 = __vload_pred(lPred, pLoadVec);
879 
880  lPred = c7x::strm_agen<0, vec>::get_vpred();
881  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
882  vec sV2 = __vload_pred(lPred, pLoadVec);
883 
884  lPred = c7x::strm_agen<0, vec>::get_vpred();
885  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
886  vec sV3 = __vload_pred(lPred, pLoadVec);
887 
888  lPred = c7x::strm_agen<0, vec>::get_vpred();
889  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
890  vec sV4 = __vload_pred(lPred, pLoadVec);
891 
892  for (int32_t vertical = 0; vertical < nRows; vertical++) {
893  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
894 
895  vec v1 = c7x::strm_eng<1, vec>::get_adv();
896  vec v2 = c7x::strm_eng<1, vec>::get_adv();
897  vec v3 = c7x::strm_eng<1, vec>::get_adv();
898  vec v4 = c7x::strm_eng<1, vec>::get_adv();
899 
900  v1 -= sV1 * scalarDup1;
901  v2 -= sV2 * scalarDup1;
902  v3 -= sV3 * scalarDup1;
903  v4 -= sV4 * scalarDup1;
904 
905  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
906  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
907  __vstore_pred(sPred, pStoreVec, v1);
908 
909  sPred = c7x::strm_agen<1, vec>::get_vpred();
910  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
911  __vstore_pred(sPred, pStoreVec, v2);
912 
913  sPred = c7x::strm_agen<1, vec>::get_vpred();
914  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
915  __vstore_pred(sPred, pStoreVec, v3);
916 
917  sPred = c7x::strm_agen<1, vec>::get_vpred();
918  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
919  __vstore_pred(sPred, pStoreVec, v4);
920  }
921  }
922 
923  __SE1_CLOSE();
924  __SA1_CLOSE();
925  }
  /* 2-vector-wide tile. */
926  if (nTiles2 > 0) {
927 
928  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile2;
929  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles2;
930  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit2;
931 
932  dataType *pSE1 = rStore + colLimit8 + colLimit4;
933  dataType *pSA0 = pSum;
934 
935  __SE1_OPEN(pSE1, seMatrixParams);
936  __SA1_OPEN(saMatrixParams);
937 
938  for (int32_t tile = 0; tile < nTiles2; tile++) {
939  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
940  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
941  vec sV1 = __vload_pred(lPred, pLoadVec);
942 
943  lPred = c7x::strm_agen<0, vec>::get_vpred();
944  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
945  vec sV2 = __vload_pred(lPred, pLoadVec);
946 
947  for (int32_t vertical = 0; vertical < nRows; vertical++) {
948  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
949 
950  vec v1 = c7x::strm_eng<1, vec>::get_adv();
951  vec v2 = c7x::strm_eng<1, vec>::get_adv();
952 
953  v1 -= sV1 * scalarDup1;
954  v2 -= sV2 * scalarDup1;
955 
956  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
957  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
958  __vstore_pred(sPred, pStoreVec, v1);
959 
960  sPred = c7x::strm_agen<1, vec>::get_vpred();
961  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
962  __vstore_pred(sPred, pStoreVec, v2);
963  }
964  }
965 
966  __SE1_CLOSE();
967  __SA1_CLOSE();
968  }
  /* Final single-vector tile. */
969  if (nTiles1 > 0) {
970 
971  seMatrixParams.ICNT0 = saMatrixParams.ICNT0 = eleCount * lenTile1;
972  seMatrixParams.ICNT2 = saMatrixParams.ICNT2 = nTiles1;
973  seMatrixParams.DECDIM1_WIDTH = saMatrixParams.DECDIM1_WIDTH = colLimit1;
974 
975  dataType *pSE1 = rStore + colLimit8 + colLimit4 + colLimit2;
976  dataType *pSA0 = pSum;
977 
978  __SE1_OPEN(pSE1, seMatrixParams);
979  __SA1_OPEN(saMatrixParams);
980 
981  for (int32_t tile = 0; tile < nTiles1; tile++) {
982  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
983  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pSA0);
984  vec sV1 = __vload_pred(lPred, pLoadVec);
985 
986  for (int32_t vertical = 0; vertical < nRows; vertical++) {
987  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
988 
989  vec v1 = c7x::strm_eng<1, vec>::get_adv();
990  v1 -= sV1 * scalarDup1;
991 
992  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
993  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
994  __vstore_pred(sPred, pStoreVec, v1);
995  }
996  }
997 
998  __SE1_CLOSE();
999  __SA1_CLOSE();
1000  }
  /* SE0 (pLocalU) and SA0 (pSum) stayed open across all pass-2 tile groups. */
1001  __SE0_CLOSE();
1002  __SA0_CLOSE();
1003  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
1004 }
1005 
/* Explicit instantiations of DSPLIB_qrd_R_matrix_exec_ci for the float and
 * double element types, so that the template's code for these two types is
 * emitted in this translation unit. */
1006 template void DSPLIB_qrd_R_matrix_exec_ci<float>(float *pLocalR,
1007  float *pLocalU,
1008  float *sum,
1009  float scale,
1010  int32_t colStrideR,
1011  int32_t nRows,
1012  int32_t nCols,
1013  uint8_t *pBlock);
1014 template void DSPLIB_qrd_R_matrix_exec_ci<double>(double *pLocalR,
1015  double *pLocalU,
1016  double *sum,
1017  double scale,
1018  int32_t colStrideR,
1019  int32_t nRows,
1020  int32_t nCols,
1021  uint8_t *pBlock);
1022 
/**
 * @brief Two-phase update of the Q data using the streaming engines (SE0/SE1)
 *        and streaming address generators (SA0/SA1/SA2).
 *
 * Phase 1 ("Q SUM CALC"): the outer loop runs se0Params.ICNT2 times. Each pass
 * consumes nCols vectors from SE0, nCols vectors from SE1 and nCols elements
 * of pLocalU (each duplicated across a vector), accumulates the products,
 * multiplies the two totals by `scale` and stores one vector through SA0 and
 * one through SA1 into pSum. If the SE1 template has a different ICNT2 than
 * the SE0 template, one extra SE1-only pass is executed.
 *
 * Phase 2 ("Q UPDATION"): pSum is streamed through SE0, pLocalU is loaded
 * vector-wise through SA0, and every vector fetched from pLocalQ through SE1
 * has (pLocalU vector * pSum value) subtracted before it is stored through
 * SA1 using the same base pointer SE1 was opened on. The column vectors are
 * processed in groups of 4, then 2, then 1 vectors.
 *
 * NOTE(review): the address patterns of all streams come from templates built
 * elsewhere and stored in pBlock; the description above covers only what this
 * function itself fetches and stores.
 *
 * @param pLocalQ    Base of the Q data; read in both phases, written in phase 2.
 * @param pLocalU    Vector read element-wise (SA2 + __vload_dup) in phase 1 and
 *                   vector-wise (SA0) in phase 2.
 * @param pSum       Scratch buffer written in phase 1 and read back in phase 2.
 * @param scale      Scalar broadcast to a vector and multiplied into the
 *                   phase-1 sums before they are stored.
 * @param colStrideQ Used only to compute the SE1 start address in phase 1.
 * @param nRows      Written to ICNT0 of the scalar stream and ICNT1 of the
 *                   matrix streams in phase 2; bounds the row loops.
 * @param nCols      Written to ICNT0 of the phase-1 streams and of the SA0
 *                   reference stream in phase 2; determines the tile counts.
 * @param pBlock     Buffer of SE/SA templates laid out in SE_PARAM_SIZE slots;
 *                   slots 2, 4-8, 13, 16 and 17 are read here.
 */
1023 template <typename dataType>
1024 void DSPLIB_qrd_Q_matrix_exec_ci(dataType *pLocalQ,
1025  dataType *pLocalU,
1026  dataType *pSum,
1027  dataType scale,
1028  int32_t colStrideQ,
1029  int32_t nRows,
1030  int32_t nCols,
1031  uint8_t *pBlock)
1032 {
1033  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
1034 
1035  typedef typename c7x::make_full_vector<dataType>::type vec;
1036 
 /* Number of dataType elements held by one full vector. */
1037  int32_t eleCount = c7x::element_count_of<vec>::value;
1038 
1039  /************** Q SUM CALC ************/
1040 
 /* Local copies of the stored templates (slots 4..8) so the counts can be
  * patched below without modifying pBlock. */
1041  __SE_TEMPLATE_v1 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1042  __SE_TEMPLATE_v1 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1043  __SA_TEMPLATE_v1 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
1044  __SA_TEMPLATE_v1 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
1045  __SA_TEMPLATE_v1 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
1046 
 /* ICNT2 of each SE template is the number of outer passes for that stream. */
1047  int32_t se0TICNT2 = se0Params.ICNT2;
1048  int32_t se1TICNT2 = se1Params.ICNT2;
1049 
1050  se0Params.ICNT0 = se1Params.ICNT0 = nCols;
1051 
1052  sa2Params.ICNT0 = nCols;
1053 
 /* SE1 and SA1 start se0TICNT2 vectors' worth of elements past SE0 and SA0
  * (the SE1 offset is additionally multiplied by colStrideQ). */
1054  dataType *pSE0 = pLocalQ;
1055  dataType *pSE1Local = pLocalQ + (se0TICNT2 * eleCount * colStrideQ);
1056  dataType *pSA0 = pSum;
1057  dataType *pSA1 = pSum + (se0TICNT2 * eleCount);
1058 
 /* Broadcast the scalar scale factor to every vector lane. */
1059  vec scaleV = (vec) (scale);
1060 
 /* SE1/SA1/SA2 are always opened; SE0/SA0 only when SE0 has passes to run. */
1061  __SA1_OPEN(sa1Params);
1062  __SA2_OPEN(sa2Params);
1063  __SE1_OPEN(pSE1Local, se1Params);
1064  if (se0TICNT2 > 0) {
1065  __SA0_OPEN(sa0Params);
1066  __SE0_OPEN(pSE0, se0Params);
1067  }
1068 
1069  for (int32_t verticalCnt = 0; verticalCnt < se0TICNT2; verticalCnt++) {
 /* Eight independent accumulators: odd-numbered ones collect SE0 products,
  * even-numbered ones collect SE1 products. */
1070  vec sV1 = (vec) 0;
1071  vec sV2 = (vec) 0;
1072  vec sV3 = (vec) 0;
1073  vec sV4 = (vec) 0;
1074  vec sV5 = (vec) 0;
1075  vec sV6 = (vec) 0;
1076  vec sV7 = (vec) 0;
1077  vec sV8 = (vec) 0;
1078 
1079  int32_t horizontal = 0;
1080 
 /* Main loop: four pLocalU elements per iteration. */
1081  for (; horizontal < (nCols) -3; horizontal += 4) {
1082 
1083  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1084  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1085 
1086  vec v3 = c7x::strm_eng<0, vec>::get_adv();
1087  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1088 
1089  vec v5 = c7x::strm_eng<0, vec>::get_adv();
1090  vec v6 = c7x::strm_eng<1, vec>::get_adv();
1091 
1092  vec v7 = c7x::strm_eng<0, vec>::get_adv();
1093  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1094 
 /* SA2 yields the address of the next pLocalU element; __vload_dup
  * replicates that element across the whole vector. */
1095  dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1096  vec u1 = __vload_dup(pU1);
1097 
1098  dataType *pU2 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1099  vec u2 = __vload_dup(pU2);
1100 
1101  dataType *pU3 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1102  vec u3 = __vload_dup(pU3);
1103 
1104  dataType *pU4 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1105  vec u4 = __vload_dup(pU4);
1106 
1107  sV1 += v1 * u1;
1108  sV2 += v2 * u1;
1109  sV3 += v3 * u2;
1110  sV4 += v4 * u2;
1111  sV5 += v5 * u3;
1112  sV6 += v6 * u3;
1113  sV7 += v7 * u4;
1114  sV8 += v8 * u4;
1115  }
 /* Fold accumulators 7/8 into 5/6; 1..4 stay live for the next loop. */
1116  sV5 = sV5 + sV7;
1117  sV6 = sV6 + sV8;
1118 
 /* Remainder loop: two pLocalU elements per iteration. */
1119  for (; horizontal < (nCols) -1; horizontal += 2) {
1120  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1121  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1122 
1123  vec v3 = c7x::strm_eng<0, vec>::get_adv();
1124  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1125 
1126  dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1127  vec u1 = __vload_dup(pU1);
1128 
1129  dataType *pU2 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1130  vec u2 = __vload_dup(pU2);
1131 
1132  sV1 += v1 * u1;
1133  sV2 += v2 * u1;
1134  sV3 += v3 * u2;
1135  sV4 += v4 * u2;
1136  }
 /* Fold accumulators 3/4 into 1/2. */
1137  sV1 = sV1 + sV3;
1138  sV2 = sV2 + sV4;
1139 
 /* Final remainder: one pLocalU element per iteration. */
1140  for (; horizontal < (nCols); horizontal++) {
1141  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1142  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1143 
1144  dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1145  vec u1 = __vload_dup(pU1);
1146 
1147  sV1 += v1 * u1;
1148  sV2 += v2 * u1;
1149  }
1150 
 /* Complete the reduction: sV1 holds the SE0 total, sV2 the SE1 total. */
1151  sV1 = sV1 + sV5;
1152  sV2 = sV2 + sV6;
1153 
 /* Scale and store both totals with the predicates supplied by SA0/SA1. */
1154  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1155  vec *pStoreVec1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1156  __vstore_pred(pred1, pStoreVec1, sV1 * scaleV);
1157 
1158  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1159  vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1160  __vstore_pred(pred2, pStoreVec2, sV2 * scaleV);
1161  }
1162 
 /* One extra pass that uses only SE1/SA1 when the two SE templates carry
  * different ICNT2 values.
  * NOTE(review): presumably se1 ICNT2 == se0 ICNT2 + 1 in that case - confirm
  * against the init code that builds the templates. */
1163  if (se1TICNT2 != se0TICNT2) {
1164 
1165  vec sV2 = (vec) 0;
1166  for (int32_t horizontal = 0; horizontal < nCols; horizontal++) {
1167  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1168 
1169  dataType *pU1 = c7x::strm_agen<2, dataType>::get_adv(pLocalU);
1170  vec u1 = __vload_dup(pU1);
1171 
1172  sV2 += v2 * u1;
1173  }
1174  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1175  vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1176  __vstore_pred(pred2, pStoreVec2, sV2 * scaleV);
1177  }
1178 
 /* Close the phase-1 streams, mirroring the conditional open of SE0/SA0. */
1179  __SA1_CLOSE();
1180  __SA2_CLOSE();
1181  __SE1_CLOSE();
1182  if (se0TICNT2 > 0) {
1183  __SA0_CLOSE();
1184  __SE0_CLOSE();
1185  }
1186  /****************************** Q UPDATION ****************************
1187  ***********************************************************************/
1188  dataType *qStore = pLocalQ;
1189 
1190  __SE_TEMPLATE_v1 seScalarParams;
1191  __SE_TEMPLATE_v1 seMatrixParams;
1192 
1193  __SA_TEMPLATE_v1 saMatrixParams;
1194  __SA_TEMPLATE_v1 saRefParams;
1195 
 /* Local copies of the phase-2 templates (slots 13, 16, 17 and 2). */
1196  seScalarParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
1197  seMatrixParams = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE));
1198  saMatrixParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE));
1199  saRefParams = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
1200 
 /* Tile widths, expressed in vectors. */
1201  int32_t lenTile4 = 4;
1202  int32_t lenTile2 = 2;
1203  int32_t lenTile1 = 1;
1204 
 /* Split the ceil(nCols / eleCount) column vectors into as many 4-vector
  * tiles as fit, then 2-vector tiles, then single-vector tiles. */
1205  int32_t nTiles1 = DSPLIB_ceilingDiv(nCols, (eleCount));
1206  int32_t nTiles4 = nTiles1 / lenTile4;
1207  nTiles1 -= nTiles4 * lenTile4;
1208  int32_t nTiles2 = nTiles1 / lenTile2;
1209  nTiles1 -= nTiles2 * lenTile2;
1210 
 /* Number of columns covered by each tile group, clamped to the columns
  * still remaining; these values become DECDIM1_WIDTH of the matrix streams. */
1211  int32_t remainingCols = nCols;
1212  int32_t colLimit4 = nTiles4 * lenTile4 * eleCount;
1213  colLimit4 = (remainingCols < (colLimit4)) ? remainingCols : colLimit4;
1214 
1215  remainingCols = remainingCols - colLimit4;
1216  int32_t colLimit2 = nTiles2 * lenTile2 * eleCount;
1217  colLimit2 = (remainingCols < (colLimit2)) ? remainingCols : colLimit2;
1218 
1219  int32_t colLimit1 = remainingCols - colLimit2;
1220 
 /* The scalar stream over pSum delivers nRows values per tile, repeated for
  * every tile of every group. */
1221  seScalarParams.ICNT0 = nRows;
1222  seScalarParams.ICNT1 = nTiles4 + nTiles2 + nTiles1;
1223 
1224  seMatrixParams.ICNT1 = nRows;
1225  saMatrixParams.ICNT1 = nRows;
1226 
1227  saRefParams.ICNT0 = nCols;
1228 
 /* SE0 (pSum) and SA0 (pLocalU) stay open across all three tile groups;
  * SE1/SA1 are re-opened per group with updated counts and base address. */
1229  __SE0_OPEN(pSum, seScalarParams);
1230  __SA0_OPEN(saRefParams);
 /* ---- 4-vector tiles. ICNT0 of the matrix templates is not overridden for
  * this group; the value stored in pBlock is used as-is. ---- */
1231  if (nTiles4 > 0) {
1232  seMatrixParams.ICNT2 = nTiles4;
1233  seMatrixParams.DECDIM1_WIDTH = colLimit4;
1234 
1235  saMatrixParams.ICNT2 = nTiles4;
1236  saMatrixParams.DECDIM1_WIDTH = colLimit4;
1237 
1238  __SE1_OPEN(qStore, seMatrixParams);
1239  __SA1_OPEN(saMatrixParams);
1240 
1241  for (int32_t tile = 0; tile < nTiles4; tile++) {
 /* Load the four pLocalU vectors of this tile (predicated by SA0). */
1242  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1243  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1244  vec sV1 = __vload_pred(lPred, pLoadVec);
1245 
1246  lPred = c7x::strm_agen<0, vec>::get_vpred();
1247  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1248  vec sV2 = __vload_pred(lPred, pLoadVec);
1249 
1250  lPred = c7x::strm_agen<0, vec>::get_vpred();
1251  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1252  vec sV3 = __vload_pred(lPred, pLoadVec);
1253 
1254  lPred = c7x::strm_agen<0, vec>::get_vpred();
1255  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1256  vec sV4 = __vload_pred(lPred, pLoadVec);
1257 
1258  int32_t vertical = 0;
1259 
 /* Four rows per iteration: 4 pSum values and 16 matrix vectors. */
1260  for (; vertical < nRows - 3; vertical += 4) {
1261  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1262  vec scalarDup2 = c7x::strm_eng<0, vec>::get_adv();
1263  vec scalarDup3 = c7x::strm_eng<0, vec>::get_adv();
1264  vec scalarDup4 = c7x::strm_eng<0, vec>::get_adv();
1265 
1266  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1267  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1268  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1269  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1270  vec v5 = c7x::strm_eng<1, vec>::get_adv();
1271  vec v6 = c7x::strm_eng<1, vec>::get_adv();
1272  vec v7 = c7x::strm_eng<1, vec>::get_adv();
1273  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1274 
1275  vec v9 = c7x::strm_eng<1, vec>::get_adv();
1276  vec v10 = c7x::strm_eng<1, vec>::get_adv();
1277  vec v11 = c7x::strm_eng<1, vec>::get_adv();
1278  vec v12 = c7x::strm_eng<1, vec>::get_adv();
1279  vec v13 = c7x::strm_eng<1, vec>::get_adv();
1280  vec v14 = c7x::strm_eng<1, vec>::get_adv();
1281  vec v15 = c7x::strm_eng<1, vec>::get_adv();
1282  vec v16 = c7x::strm_eng<1, vec>::get_adv();
1283 
1284  v1 -= sV1 * scalarDup1;
1285  v2 -= sV2 * scalarDup1;
1286  v3 -= sV3 * scalarDup1;
1287  v4 -= sV4 * scalarDup1;
1288  v5 -= sV1 * scalarDup2;
1289  v6 -= sV2 * scalarDup2;
1290  v7 -= sV3 * scalarDup2;
1291  v8 -= sV4 * scalarDup2;
1292 
1293  v9 -= sV1 * scalarDup3;
1294  v10 -= sV2 * scalarDup3;
1295  v11 -= sV3 * scalarDup3;
1296  v12 -= sV4 * scalarDup3;
1297  v13 -= sV1 * scalarDup4;
1298  v14 -= sV2 * scalarDup4;
1299  v15 -= sV3 * scalarDup4;
1300  v16 -= sV4 * scalarDup4;
1301 
 /* Store the 16 updated vectors in the order they were fetched. */
1302  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1303  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1304  __vstore_pred(sPred, pStoreVec, v1);
1305 
1306  sPred = c7x::strm_agen<1, vec>::get_vpred();
1307  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1308  __vstore_pred(sPred, pStoreVec, v2);
1309 
1310  sPred = c7x::strm_agen<1, vec>::get_vpred();
1311  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1312  __vstore_pred(sPred, pStoreVec, v3);
1313 
1314  sPred = c7x::strm_agen<1, vec>::get_vpred();
1315  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1316  __vstore_pred(sPred, pStoreVec, v4);
1317 
1318  sPred = c7x::strm_agen<1, vec>::get_vpred();
1319  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1320  __vstore_pred(sPred, pStoreVec, v5);
1321 
1322  sPred = c7x::strm_agen<1, vec>::get_vpred();
1323  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1324  __vstore_pred(sPred, pStoreVec, v6);
1325 
1326  sPred = c7x::strm_agen<1, vec>::get_vpred();
1327  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1328  __vstore_pred(sPred, pStoreVec, v7);
1329 
1330  sPred = c7x::strm_agen<1, vec>::get_vpred();
1331  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1332  __vstore_pred(sPred, pStoreVec, v8);
1333 
1334  sPred = c7x::strm_agen<1, vec>::get_vpred();
1335  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1336  __vstore_pred(sPred, pStoreVec, v9);
1337 
1338  sPred = c7x::strm_agen<1, vec>::get_vpred();
1339  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1340  __vstore_pred(sPred, pStoreVec, v10);
1341 
1342  sPred = c7x::strm_agen<1, vec>::get_vpred();
1343  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1344  __vstore_pred(sPred, pStoreVec, v11);
1345 
1346  sPred = c7x::strm_agen<1, vec>::get_vpred();
1347  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1348  __vstore_pred(sPred, pStoreVec, v12);
1349 
1350  sPred = c7x::strm_agen<1, vec>::get_vpred();
1351  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1352  __vstore_pred(sPred, pStoreVec, v13);
1353 
1354  sPred = c7x::strm_agen<1, vec>::get_vpred();
1355  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1356  __vstore_pred(sPred, pStoreVec, v14);
1357 
1358  sPred = c7x::strm_agen<1, vec>::get_vpred();
1359  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1360  __vstore_pred(sPred, pStoreVec, v15);
1361 
1362  sPred = c7x::strm_agen<1, vec>::get_vpred();
1363  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1364  __vstore_pred(sPred, pStoreVec, v16);
1365  }
1366 
 /* Leftover rows (nRows not a multiple of 4): one row per iteration. */
1367  for (; vertical < nRows; vertical++) {
1368  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1369 
1370  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1371  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1372  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1373  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1374 
1375  v1 -= sV1 * scalarDup1;
1376  v2 -= sV2 * scalarDup1;
1377  v3 -= sV3 * scalarDup1;
1378  v4 -= sV4 * scalarDup1;
1379 
1380  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1381  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1382  __vstore_pred(sPred, pStoreVec, v1);
1383 
1384  sPred = c7x::strm_agen<1, vec>::get_vpred();
1385  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1386  __vstore_pred(sPred, pStoreVec, v2);
1387 
1388  sPred = c7x::strm_agen<1, vec>::get_vpred();
1389  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1390  __vstore_pred(sPred, pStoreVec, v3);
1391 
1392  sPred = c7x::strm_agen<1, vec>::get_vpred();
1393  pStoreVec = c7x::strm_agen<1, vec>::get_adv(qStore);
1394  __vstore_pred(sPred, pStoreVec, v4);
1395  }
1396  }
1397  __SE1_CLOSE();
1398  __SA1_CLOSE();
1399  }
 /* ---- 2-vector tiles, starting colLimit4 elements past qStore. ---- */
1400  if (nTiles2 > 0) {
1401  seMatrixParams.ICNT2 = nTiles2;
1402  seMatrixParams.DECDIM1_WIDTH = colLimit2;
1403  seMatrixParams.ICNT0 = (eleCount * lenTile2);
1404 
1405  saMatrixParams.ICNT2 = nTiles2;
1406  saMatrixParams.DECDIM1_WIDTH = colLimit2;
1407  saMatrixParams.ICNT0 = (eleCount * lenTile2);
1408 
1409  dataType *pSE1 = qStore + colLimit4;
1410 
1411  __SE1_OPEN(pSE1, seMatrixParams);
1412  __SA1_OPEN(saMatrixParams);
1413 
1414  for (int32_t tile = 0; tile < nTiles2; tile++) {
 /* Load the two pLocalU vectors of this tile; SA0 continues from where
  * the previous tile group left off. */
1415  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1416  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1417  vec sV1 = __vload_pred(lPred, pLoadVec);
1418 
1419  lPred = c7x::strm_agen<0, vec>::get_vpred();
1420  pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1421  vec sV2 = __vload_pred(lPred, pLoadVec);
1422 
1423  int32_t vertical = 0;
1424 
 /* Four rows per iteration: 4 pSum values and 8 matrix vectors. */
1425  for (; vertical < nRows - 3; vertical += 4) {
1426  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1427  vec scalarDup2 = c7x::strm_eng<0, vec>::get_adv();
1428  vec scalarDup3 = c7x::strm_eng<0, vec>::get_adv();
1429  vec scalarDup4 = c7x::strm_eng<0, vec>::get_adv();
1430 
1431  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1432  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1433  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1434  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1435  vec v5 = c7x::strm_eng<1, vec>::get_adv();
1436  vec v6 = c7x::strm_eng<1, vec>::get_adv();
1437  vec v7 = c7x::strm_eng<1, vec>::get_adv();
1438  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1439 
1440  v1 -= sV1 * scalarDup1;
1441  v2 -= sV2 * scalarDup1;
1442  v3 -= sV1 * scalarDup2;
1443  v4 -= sV2 * scalarDup2;
1444  v5 -= sV1 * scalarDup3;
1445  v6 -= sV2 * scalarDup3;
1446  v7 -= sV1 * scalarDup4;
1447  v8 -= sV2 * scalarDup4;
1448 
1449  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1450  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1451  __vstore_pred(sPred, pStoreVec, v1);
1452 
1453  sPred = c7x::strm_agen<1, vec>::get_vpred();
1454  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1455  __vstore_pred(sPred, pStoreVec, v2);
1456 
1457  sPred = c7x::strm_agen<1, vec>::get_vpred();
1458  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1459  __vstore_pred(sPred, pStoreVec, v3);
1460 
1461  sPred = c7x::strm_agen<1, vec>::get_vpred();
1462  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1463  __vstore_pred(sPred, pStoreVec, v4);
1464 
1465  sPred = c7x::strm_agen<1, vec>::get_vpred();
1466  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1467  __vstore_pred(sPred, pStoreVec, v5);
1468 
1469  sPred = c7x::strm_agen<1, vec>::get_vpred();
1470  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1471  __vstore_pred(sPred, pStoreVec, v6);
1472 
1473  sPred = c7x::strm_agen<1, vec>::get_vpred();
1474  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1475  __vstore_pred(sPred, pStoreVec, v7);
1476 
1477  sPred = c7x::strm_agen<1, vec>::get_vpred();
1478  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1479  __vstore_pred(sPred, pStoreVec, v8);
1480  }
1481 
 /* Leftover rows: one row per iteration. */
1482  for (; vertical < nRows; vertical++) {
1483  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1484 
1485  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1486  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1487 
1488  v1 -= sV1 * scalarDup1;
1489  v2 -= sV2 * scalarDup1;
1490 
1491  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1492  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1493  __vstore_pred(sPred, pStoreVec, v1);
1494 
1495  sPred = c7x::strm_agen<1, vec>::get_vpred();
1496  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1497  __vstore_pred(sPred, pStoreVec, v2);
1498  }
1499  }
1500  __SE1_CLOSE();
1501  __SA1_CLOSE();
1502  }
 /* ---- Single-vector tiles, starting colLimit4 + colLimit2 elements past
  * qStore. ---- */
1503  if (nTiles1 > 0) {
1504  seMatrixParams.ICNT2 = nTiles1;
1505  seMatrixParams.DECDIM1_WIDTH = colLimit1;
1506  seMatrixParams.ICNT0 = (eleCount * lenTile1);
1507 
1508  saMatrixParams.ICNT2 = nTiles1;
1509  saMatrixParams.DECDIM1_WIDTH = colLimit1;
1510  saMatrixParams.ICNT0 = (eleCount * lenTile1);
1511 
1512  dataType *pSE1 = qStore + colLimit4 + colLimit2;
1513 
1514  __SE1_OPEN(pSE1, seMatrixParams);
1515  __SA1_OPEN(saMatrixParams);
1516 
1517  for (int32_t tile = 0; tile < nTiles1; tile++) {
 /* Load the single pLocalU vector of this tile. */
1518  __vpred lPred = c7x::strm_agen<0, vec>::get_vpred();
1519  vec *pLoadVec = c7x::strm_agen<0, vec>::get_adv(pLocalU);
1520  vec sV1 = __vload_pred(lPred, pLoadVec);
1521 
1522  int32_t vertical = 0;
1523 
 /* Four rows per iteration: 4 pSum values and 4 matrix vectors. */
1524  for (; vertical < nRows - 3; vertical += 4) {
1525  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1526  vec scalarDup2 = c7x::strm_eng<0, vec>::get_adv();
1527  vec scalarDup3 = c7x::strm_eng<0, vec>::get_adv();
1528  vec scalarDup4 = c7x::strm_eng<0, vec>::get_adv();
1529 
1530  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1531  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1532  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1533  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1534 
1535  v1 -= sV1 * scalarDup1;
1536  v2 -= sV1 * scalarDup2;
1537  v3 -= sV1 * scalarDup3;
1538  v4 -= sV1 * scalarDup4;
1539 
1540  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1541  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1542  __vstore_pred(sPred, pStoreVec, v1);
1543 
1544  sPred = c7x::strm_agen<1, vec>::get_vpred();
1545  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1546  __vstore_pred(sPred, pStoreVec, v2);
1547 
1548  sPred = c7x::strm_agen<1, vec>::get_vpred();
1549  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1550  __vstore_pred(sPred, pStoreVec, v3);
1551 
1552  sPred = c7x::strm_agen<1, vec>::get_vpred();
1553  pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1554  __vstore_pred(sPred, pStoreVec, v4);
1555  }
 /* Leftover rows: one row per iteration. */
1556  for (; vertical < nRows; vertical++) {
1557  vec scalarDup1 = c7x::strm_eng<0, vec>::get_adv();
1558 
1559  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1560 
1561  v1 -= sV1 * scalarDup1;
1562 
1563  __vpred sPred = c7x::strm_agen<1, vec>::get_vpred();
1564  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(pSE1);
1565  __vstore_pred(sPred, pStoreVec, v1);
1566  }
1567  }
1568  __SE1_CLOSE();
1569  __SA1_CLOSE();
1570  }
1571 
 /* Close the streams that were shared by all tile groups. */
1572  __SE0_CLOSE();
1573  __SA0_CLOSE();
1574 
1575  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
1576 }
/* Explicit instantiations of DSPLIB_qrd_Q_matrix_exec_ci for the float and
 * double element types, so that the template's code for these two types is
 * emitted in this translation unit. The third parameter is spelled `sum` here
 * and `pSum` in the template definition; parameter names in an explicit
 * instantiation have no effect. */
1577 template void DSPLIB_qrd_Q_matrix_exec_ci<float>(float *pLocalQ,
1578  float *pLocalU,
1579  float *sum,
1580  float scale,
1581  int32_t colStrideQ,
1582  int32_t nRows,
1583  int32_t nCols,
1584  uint8_t *pBlock);
1585 template void DSPLIB_qrd_Q_matrix_exec_ci<double>(double *pLocalQ,
1586  double *pLocalU,
1587  double *sum,
1588  double scale,
1589  int32_t colStrideQ,
1590  int32_t nRows,
1591  int32_t nCols,
1592  uint8_t *pBlock);
1593 /* ======================================================================== */
1594 /* End of file: DSPLIB_qrd_ci_opt.cpp */
1595 /* ======================================================================== */
template void DSPLIB_qrd_R_matrix_exec_ci< float >(float *pLocalR, float *pLocalU, float *sum, float scale, int32_t colStrideR, int32_t nRows, int32_t nCols, uint8_t *pBlock)
template void DSPLIB_Q_matrix_init_ci< float >(DSPLIB_kernelHandle handle)
void DSPLIB_R_column_init_ci(DSPLIB_kernelHandle handle)
void DSPLIB_qrd_Q_matrix_exec_ci(dataType *pLocalQ, dataType *pLocalU, dataType *pSum, dataType scale, int32_t colStrideQ, int32_t nRows, int32_t nCols, uint8_t *pBlock)
template void DSPLIB_qrd_Q_matrix_exec_ci< double >(double *pLocalQ, double *pLocalU, double *sum, double scale, int32_t colStrideQ, int32_t nRows, int32_t nCols, uint8_t *pBlock)
void DSPLIB_qrd_R_matrix_exec_ci(dataType *pLocalR, dataType *pLocalU, dataType *pSum, dataType scale, int32_t colStrideR, int32_t nRows, int32_t nCols, uint8_t *pBlock)
void DSPLIB_Q_matrix_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_R_column_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_R_column_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_qrd_R_matrix_exec_ci< double >(double *pLocalR, double *pLocalU, double *sum, double scale, int32_t colStrideR, int32_t nRows, int32_t nCols, uint8_t *pBlock)
template void DSPLIB_Q_matrix_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_qrd_Q_matrix_exec_ci< float >(float *pLocalQ, float *pLocalU, float *sum, float scale, int32_t colStrideQ, int32_t nRows, int32_t nCols, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_qrd.
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
Structure that is reserved for internal use by the kernel.
int32_t strideR
Stride between rows of R output data matrix
uint32_t heightA
Height of input data matrix
uint8_t bufPblock[DSPLIB_QRD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters.
int32_t strideQ
Stride between rows of Q output data matrix