DSPLIB User Guide
DSPLIB_svd_bidiag_uMat_ci.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  * *
3  * module name :DSPLIB *
4  * *
5  * module description :Digital Signal Processing Library module for C7x+MMA *
6  * *
7  * Copyright (C) 2017-2018 Texas Instruments Incorporated - https://www.ti.com/ *
8  * ALL RIGHTS RESERVED *
9  * *
10  ******************************************************************************/
11 
23 /* *****************************************************************************
24  *
25  * INCLUDES
26  *
27  ***************************************************************************** */
28 
29 #include "DSPLIB_svd_priv.h"
30 
31 /* *****************************************************************************
32  *
33  * INITIALIZATION
34  *
35  ***************************************************************************** */
36 
37 
41 template <typename dataType> void DSPLIB_bidiag_u_init_ci(DSPLIB_kernelHandle handle)
42 {
43  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
44 
45  DSPLIB_svd_PrivArgs *pKerPrivArgs = (DSPLIB_svd_PrivArgs *) handle;
46  uint8_t *pBlock = pKerPrivArgs->bufPblock;
47  int32_t strideU = pKerPrivArgs->strideU;
48  int32_t dataSize = sizeof(dataType);
49 
50  __SE_TEMPLATE_v1 se0Params = __gen_SE_TEMPLATE_v1();
51  __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
52  __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
53  __SE_TEMPLATE_v1 se2Params = __gen_SE_TEMPLATE_v1();
54  __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
55  __SA_TEMPLATE_v1 sa3Params = __gen_SA_TEMPLATE_v1();
56  __SA_TEMPLATE_v1 sa4Params = __gen_SA_TEMPLATE_v1();
57  __SE_TEMPLATE_v1 se3Params = __gen_SE_TEMPLATE_v1();
58  __SA_TEMPLATE_v1 sa5Params = __gen_SA_TEMPLATE_v1();
59 
60  typedef typename c7x::make_full_vector<dataType>::type vec;
61  int32_t eleCount = c7x::element_count_of<vec>::value;
62  __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
63  __SE_VECLEN SE_VECLEN = c7x::se_veclen<vec>::value;
64  __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
65 
66  int32_t colUStride = strideU / sizeof(dataType);
67 
68  se0Params.ICNT0 = 1;
69  se0Params.ICNT1 = eleCount;
70  se0Params.DIM1 = colUStride;
71  se0Params.DIM2 = eleCount * colUStride;
72  se0Params.ICNT3 = 1;
73  se0Params.DIM3 = 0;
74  se0Params.DIMFMT = __SE_DIMFMT_4D;
75  se0Params.ELETYPE = SE_ELETYPE;
76  se0Params.VECLEN = SE_VECLEN;
77 
78  if (dataSize == 4) {
79  se0Params.TRANSPOSE = __SE_TRANSPOSE_32BIT;
80  }
81  else {
82  se0Params.TRANSPOSE = __SE_TRANSPOSE_64BIT;
83  }
84 
85  sa0Params.ICNT1 = 1;
86  sa0Params.DIM1 = 0;
87  sa0Params.DIMFMT = __SA_DIMFMT_2D;
88  sa0Params.VECLEN = SA_VECLEN;
89 
90  se1Params.ICNT1 = 3;
91  se1Params.DIM1 = 0;
92  se1Params.DIMFMT = __SE_DIMFMT_2D;
93  se1Params.ELETYPE = SE_ELETYPE;
94  se1Params.VECLEN = SE_VECLEN;
95 
96  int32_t lenTile = 8;
97  se2Params.ICNT0 = eleCount * lenTile;
98  se2Params.DIM1 = colUStride * 2;
99  se2Params.DIM2 = eleCount * lenTile;
100  se2Params.ICNT3 = 2;
101  se2Params.DIM3 = 0;
102  se2Params.DIMFMT = __SE_DIMFMT_4D;
103  se2Params.ELETYPE = SE_ELETYPE;
104  se2Params.VECLEN = SE_VECLEN;
105  se2Params.DECDIM1 = __SE_DECDIM_DIM2;
106 
107  sa2Params.ICNT0 = eleCount * lenTile;
108  sa2Params.DIM1 = colUStride * 2;
109  sa2Params.DIM2 = eleCount * lenTile;
110  sa2Params.DIMFMT = __SA_DIMFMT_3D;
111  sa2Params.VECLEN = SA_VECLEN;
112  sa2Params.DECDIM1 = __SA_DECDIM_DIM2;
113 
114  sa3Params.DIM1 = 0;
115  sa3Params.DIMFMT = __SA_DIMFMT_2D;
116  sa3Params.VECLEN = __SA_VECLEN_1ELEM;
117 
118  sa4Params.ICNT0 = eleCount;
119  sa4Params.DIM1 = eleCount;
120  sa4Params.ICNT2 = 2;
121  sa4Params.DIM2 = 0;
122  sa4Params.DIMFMT = __SA_DIMFMT_3D;
123  sa4Params.VECLEN = SA_VECLEN;
124  sa4Params.DECDIM1 = __SA_DECDIM_DIM1;
125 
126  int32_t rowBlock = 8;
127  se3Params.ICNT0 = eleCount;
128  se3Params.ICNT1 = rowBlock;
129  se3Params.DIM1 = colUStride;
130  se3Params.DIM2 = eleCount;
131  se3Params.DIM3 = 2 * rowBlock * colUStride;
132  se3Params.DIMFMT = __SE_DIMFMT_4D;
133  se3Params.ELETYPE = SE_ELETYPE;
134  se3Params.VECLEN = SE_VECLEN;
135  se3Params.DECDIM2 = __SE_DECDIM_DIM2;
136 
137  sa5Params.ICNT0 = eleCount;
138  sa5Params.ICNT1 = rowBlock;
139  sa5Params.DIM1 = colUStride;
140  sa5Params.DIM2 = eleCount;
141  sa5Params.DIM3 = 2 * rowBlock * colUStride;
142  sa5Params.DIMFMT = __SA_DIMFMT_4D;
143  sa5Params.VECLEN = SA_VECLEN;
144  sa5Params.DECDIM2 = __SA_DECDIM_DIM2;
145 
146  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE)) = se0Params;
147  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE)) = sa0Params;
148  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE)) = se1Params;
149  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE)) = se2Params;
150  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE)) = sa2Params;
151  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE)) = sa3Params;
152  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE)) = sa4Params;
153  *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE)) = se3Params;
154  *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE)) = sa5Params;
155 
156  DSPLIB_DEBUGPRINTFN(0, "Exiting function with return status: %d\n", DSPLIB_SUCCESS);
157 }
160 
161 /* *****************************************************************************
162  *
163  * IMPLEMENTATION
164  *
165  ***************************************************************************** */
166 
/**
 * @brief Computes the scale of one strided column of U and, when that scale is positive,
 *        the scaled column, its sum of squares and the two scalars derived from them.
 *
 * Pass 1 accumulates the absolute values of the column: full vectors are fetched through
 * SE0/SE1 (slot 0 template, which steps colUStride between elements), and the last
 * Nrows % eleCount elements are added with a scalar loop over U[i * colUStride].
 *
 * Pass 2 runs only when scale > 0. Every element is multiplied by getRecip(scale)
 * (presumably 1 / scale -- confirm against the helper), the scaled values are stored
 * contiguously starting at U1 + colUStride, and their squares are summed into s2.
 * sqrt(s2) is then obtained from __recip_sqrt plus two Newton-Raphson refinements.
 *
 * @param U                  Pointer to the first element of the column; U[0] is used as
 *                           the leading ("diagonal") element.
 * @param Nrows              Number of column elements to process.
 * @param Ncols              Not referenced in this function.
 * @param colUStride         Distance, in elements, between consecutive column entries.
 * @param half_norm_squared  Output: diagEle * (*s) - s2, where diagEle is U[0] multiplied by
 *                           the reciprocal scale. Written only when scale > 0.
 * @param U1                 Work buffer; the scaled column is written at U1 + colUStride.
 * @param s                  Output: +sqrt(s2) when diagEle < 0, otherwise -sqrt(s2).
 *                           Written only when scale > 0.
 * @param pBlock             Parameter block; slot 0 supplies the SE template and slot 1 the
 *                           SA template (both filled by DSPLIB_bidiag_u_init_ci).
 * @return The scale, i.e. the accumulated sum of absolute values.
 */
171 template <typename dataType>
172 dataType DSPLIB_bidiag_uCol_halfnorm_ci(dataType *U,
173  int32_t Nrows,
174  int32_t Ncols,
175  int32_t colUStride,
176  dataType *half_norm_squared,
177  dataType *U1,
178  dataType *s,
179  uint8_t *pBlock)
180 {
181  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
182 
     /* Both SE streams start from the slot 0 template and both SA streams from the slot 1
      * template; only the iteration counts set below differ between them. */
183  __SE_TEMPLATE_v1 se0Params, se1Params;
184  __SA_TEMPLATE_v1 sa0Params, sa1Params;
185  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
186  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
187  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
188  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
189 
190  typedef typename c7x::make_full_vector<dataType>::type vec;
191  int32_t eleCount = c7x::element_count_of<vec>::value;
192 
     /* Split the full vectors of the column between the two streams: SE0 takes the first
      * nVec / 2 vectors, SE1 the rest (one more than SE0 when nVec is odd). Elements that
      * do not fill a whole vector are counted in remainingEle. */
193  int32_t nVec = Nrows / eleCount;
194  int32_t se0ICNT2 = nVec / 2;
195  int32_t se1ICNT2 = nVec - se0ICNT2;
196  int32_t remainingEle = Nrows - (nVec * eleCount);
197  /* ******************************** CALCULATE COL SCALE *********************************
198  *************************************************************************************** */
199 
     /* ICNT3 = 2 with the template's DIM3 = 0: presumably lets each stream deliver the same
      * vectors a second time for the normalisation pass below -- confirm against the SE
      * documentation. */
200  se0Params.ICNT2 = se0ICNT2;
201  se0Params.ICNT3 = 2;
202 
203  se1Params.ICNT2 = se1ICNT2;
204  se1Params.ICNT3 = 2;
205 
     /* SA0 covers the elements produced from SE0; SA1 covers everything else, including
      * the partial vector at the end of the column. */
206  sa0Params.ICNT0 = se0ICNT2 * eleCount;
207  sa1Params.ICNT0 = Nrows - (se0ICNT2 * eleCount);
208 
209  vec acc1, acc2, acc3, acc4, acc5, acc6;
210  acc1 = acc2 = acc3 = acc4 = acc5 = acc6 = (vec) 0;
211 
     /* SE1 starts where SE0's share of the column ends. */
212  dataType *pSE0 = U;
213  dataType *pSE1 = U + (se0ICNT2 * colUStride * eleCount);
214 
215  if (se1ICNT2 > 0) {
216  __SE1_OPEN(pSE1, se1Params);
217  }
218  if (se0ICNT2 > 0) {
219  __SE0_OPEN(pSE0, se0Params);
220  }
     /* Main loop: three vectors from each stream per iteration, accumulated into six
      * independent accumulators. */
221  int32_t iterloop1 = se0ICNT2 / 3;
222  int32_t vertical = iterloop1 * 3;
223  for (int32_t iter = 0; iter < iterloop1; iter++) {
224  vec v1 = c7x::strm_eng<0, vec>::get_adv();
225  vec v2 = c7x::strm_eng<1, vec>::get_adv();
226  vec v3 = c7x::strm_eng<0, vec>::get_adv();
227  vec v4 = c7x::strm_eng<1, vec>::get_adv();
228  vec v5 = c7x::strm_eng<0, vec>::get_adv();
229  vec v6 = c7x::strm_eng<1, vec>::get_adv();
230 
231  acc1 += __abs(v1);
232  acc2 += __abs(v2);
233  acc3 += __abs(v3);
234  acc4 += __abs(v4);
235  acc5 += __abs(v5);
236  acc6 += __abs(v6);
237  }
     /* Leftover after the 3x loop: two vectors per stream, then one vector per stream. */
238  for (; vertical < se0ICNT2 - 1; vertical += 2) {
239  vec v1 = c7x::strm_eng<0, vec>::get_adv();
240  vec v2 = c7x::strm_eng<1, vec>::get_adv();
241  vec v3 = c7x::strm_eng<0, vec>::get_adv();
242  vec v4 = c7x::strm_eng<1, vec>::get_adv();
243 
244  acc1 += __abs(v1);
245  acc2 += __abs(v2);
246  acc3 += __abs(v3);
247  acc4 += __abs(v4);
248  }
249  for (; vertical < se0ICNT2; vertical++) {
250  vec v1 = c7x::strm_eng<0, vec>::get_adv();
251  vec v2 = c7x::strm_eng<1, vec>::get_adv();
252 
253  acc1 += __abs(v1);
254  acc2 += __abs(v2);
255  }
256 
     /* nVec odd: SE1 holds one vector more than SE0. */
257  if (se0ICNT2 != se1ICNT2) {
258  vec v1 = c7x::strm_eng<1, vec>::get_adv();
259 
260  acc1 += __abs(v1);
261  }
262 
     /* Fold the six accumulators into acc1, then reduce acc1 into the scalar scale. */
263  acc1 = acc1 + acc2;
264  acc3 = acc3 + acc4;
265  acc5 = acc5 + acc6;
266  acc1 = acc1 + acc3 + acc5;
267 
268  dataType scale = 0;
269  c7x_horizontal_add(acc1, &scale);
270 
     /* Scalar tail for the elements that do not fill a whole vector. */
271  if (remainingEle > 0) {
272 
273  dataType *remU = U + (nVec * eleCount * colUStride);
274  for (int32_t i = 0; i < remainingEle; i++) {
275  scale += __abs(remU[i * colUStride]);
276  }
277  }
278 
279  /* ******************************** CALCULATE COL S2 *********************************
280  *************************************************************************************** */
281 
     /* Nothing below runs for scale <= 0; in that case *s, *half_norm_squared and the U1
      * buffer are left untouched. */
282  if (scale > 0) {
283 
     /* The scaled column is stored contiguously: SA0 writes the SE0 share, SA1 writes the
      * SE1 share (and later the partial vector) directly behind it. */
284  dataType *normUStore = U1 + colUStride;
285  dataType *pSA0 = normUStore;
286  dataType *pSA1 = normUStore + (se0ICNT2 * eleCount);
287  dataType scalarRecip = getRecip(scale);
288  vec reciprocalScale = (vec) scalarRecip;
289  dataType s2 = 0;
290 
291  acc1 = (vec) 0;
292  acc2 = (vec) 0;
293  acc3 = (vec) 0;
294  acc4 = (vec) 0;
295 
296  __SA1_OPEN(sa1Params);
297  if (se0ICNT2 > 0) {
298  __SA0_OPEN(sa0Params);
299  }
     /* Second traversal of the streams opened above: scale each vector, accumulate its
      * squares, and store it under the SA predicate. Two vectors per stream per iteration. */
300  vertical = 0;
301  for (vertical = 0; vertical < se0ICNT2 - 1; vertical += 2) {
302  vec v1 = c7x::strm_eng<0, vec>::get_adv();
303  vec v2 = c7x::strm_eng<1, vec>::get_adv();
304  vec v3 = c7x::strm_eng<0, vec>::get_adv();
305  vec v4 = c7x::strm_eng<1, vec>::get_adv();
306 
307  v1 = v1 * reciprocalScale;
308  v2 = v2 * reciprocalScale;
309  v3 = v3 * reciprocalScale;
310  v4 = v4 * reciprocalScale;
311 
312  acc1 += v1 * v1;
313  acc2 += v2 * v2;
314  acc3 += v3 * v3;
315  acc4 += v4 * v4;
316 
317  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
318  vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
319  __vstore_pred(pred1, pNormU1, v1);
320 
321  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
322  vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
323  __vstore_pred(pred2, pNormU2, v2);
324 
325  __vpred pred3 = c7x::strm_agen<0, vec>::get_vpred();
326  vec *pNormU3 = c7x::strm_agen<0, vec>::get_adv(pSA0);
327  __vstore_pred(pred3, pNormU3, v3);
328 
329  __vpred pred4 = c7x::strm_agen<1, vec>::get_vpred();
330  vec *pNormU4 = c7x::strm_agen<1, vec>::get_adv(pSA1);
331  __vstore_pred(pred4, pNormU4, v4);
332  }
333 
     /* Leftover single vector per stream when se0ICNT2 is odd. */
334  for (; vertical < se0ICNT2; vertical++) {
335  vec v1 = c7x::strm_eng<0, vec>::get_adv();
336  vec v2 = c7x::strm_eng<1, vec>::get_adv();
337 
338  v1 = v1 * reciprocalScale;
339  v2 = v2 * reciprocalScale;
340 
341  acc1 += v1 * v1;
342  acc2 += v2 * v2;
343 
344  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
345  vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
346  __vstore_pred(pred1, pNormU1, v1);
347 
348  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
349  vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
350  __vstore_pred(pred2, pNormU2, v2);
351  }
352 
     /* nVec odd: the extra vector held only by SE1. */
353  if (se0ICNT2 != se1ICNT2) {
354  vec v2 = c7x::strm_eng<1, vec>::get_adv();
355 
356  v2 = v2 * reciprocalScale;
357 
358  acc2 += v2 * v2;
359 
360  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
361  vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
362  __vstore_pred(pred2, pNormU2, v2);
363  }
364 
365  acc1 = acc1 + acc2;
366  acc3 = acc3 + acc4;
367  acc1 = acc1 + acc3;
368  c7x_horizontal_add(acc1, &s2);
369 
     /* Partial vector at the end of the column: SE1 is re-opened on the remaining rows with
      * ICNT1 = remainingEle so a single fetch returns them. The store goes through SA1,
      * whose ICNT0 already includes these elements, and only the first remainingEle lanes
      * of the squared vector are added to s2. */
370  if (remainingEle > 0) {
371  __SE1_CLOSE();
372  dataType *remU = U + (nVec * eleCount * colUStride);
373  se1Params.ICNT1 = remainingEle;
374  se1Params.ICNT2 = 1;
375  __SE1_OPEN(remU, se1Params);
376  vec v2 = c7x::strm_eng<1, vec>::get_adv();
377 
378  v2 = v2 * reciprocalScale;
379 
380  vec vn = v2 * v2;
381 
382  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
383  vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
384  __vstore_pred(pred2, pNormU2, v2);
385 
386  for (int32_t i = 0; i < remainingEle; i++) {
387  s2 += vn.s[i];
388  }
389  }
390 
     /* Leading element of the column after scaling. */
391  dataType diagEle = U[0] * scalarRecip;
392 
     /* y = sqrt(s2): start from the __recip_sqrt estimate x ~ 1/sqrt(s2), refine it with two
      * Newton-Raphson steps x <- x * (1.5 - 0.5 * s2 * x * x), then multiply by s2. */
393  const dataType Half = 0.5;
394  const dataType OneP5 = 1.5;
395  dataType x = __recip_sqrt(s2);
396  x = x * (OneP5 - (s2 * x * x * Half));
397  x = x * (OneP5 - (s2 * x * x * Half));
398  dataType y = s2 * x;
399 
     /* *s gets the sign opposite to diagEle (negative when diagEle is zero). */
400  if (diagEle < 0) {
401  *s = y;
402  }
403  else {
404  *s = -y;
405  }
406  *half_norm_squared = diagEle * (*s) - s2;
     /* pSA0 still points at the first stored element (U1 + colUStride); subtract *s from it. */
407  *pSA0 = (*pSA0) - (*s);
408  }
409 
     /* SE1/SA1 are closed unconditionally; SE0/SA0 only when SE0 had vectors to deliver. */
410  __SE1_CLOSE();
411  __SA1_CLOSE();
412  if (se0ICNT2 > 0) {
413  __SE0_CLOSE();
414  __SA0_CLOSE();
415  }
416 
417  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
418  return scale;
419 }
421  int32_t Nrows,
422  int32_t Ncols,
423  int32_t colUStride,
424  float *half_norm_squared,
425  float *U1,
426  float *s,
427  uint8_t *pBlock);
428 template double DSPLIB_bidiag_uCol_halfnorm_ci<double>(double *U,
429  int32_t Nrows,
430  int32_t Ncols,
431  int32_t colUStride,
432  double *half_norm_squared,
433  double *U1,
434  double *s,
435  uint8_t *pBlock);
436 
/**
 * @brief Two-pass update of the rows that start at U + 1, driven by the scalars stored at
 *        U1 + colUStride (presumably the scaled column written by
 *        DSPLIB_bidiag_uCol_halfnorm_ci -- confirm against the caller).
 *
 * The columns are processed in tiles of 8 vectors (eleCount * 8 elements). Rows are split
 * between the two streaming engines: SE0 starts at U + 1 and SE1 at U + 1 + colUStride,
 * and the slot 3 template steps 2 * colUStride between fetched rows, so SE0 delivers rows
 * 0, 2, 4, ... and SE1 rows 1, 3, 5, ...
 *
 * Pass 1 (per tile): si[k] = (sum over rows r of row_r[k] * u[r]) * getRecip(half_norm_squared)
 * for the 8 vectors k of the tile, where u[r] is read element by element through SA3. The
 * si vectors are stored at U1 through SA2.
 *
 * Pass 2 (per tile): the si vectors are read back through SA2, every row is updated as
 * row_r[k] += u[r] * si[k], and the result is stored through SA0 (SE0 rows) and SA1
 * (SE1 rows).
 *
 * @param U                  Base pointer; the processed data starts one element later (U + 1).
 * @param Nrows              Number of rows to process.
 * @param Ncols              Number of elements per row; used for the tile count and as the
 *                           DECDIM1 width of the streams.
 * @param colUStride         Distance, in elements, between consecutive rows.
 * @param half_norm_squared  Divisor applied (via getRecip) to the pass 1 sums.
 * @param U1                 Work buffer: si vectors are stored at U1, the u scalars are read
 *                           from U1 + colUStride.
 * @param scale              Not referenced in this function.
 * @param pBlock             Parameter block; slots 3, 4, 5 and 6 are used (filled by
 *                           DSPLIB_bidiag_u_init_ci).
 *
 * NOTE(review): the row-count guards are commented out, so SE1/SA1 are always opened and
 * pass 2 always processes one SE0/SE1 row pair per tile. This looks like it relies on
 * Nrows >= 2 whenever nTiles > 0 -- confirm against the caller.
 */
441 template <typename dataType>
442 void DSPLIB_bidiag_uCol_ci(dataType *U,
443  int32_t Nrows,
444  int32_t Ncols,
445  int32_t colUStride,
446  dataType half_norm_squared,
447  dataType *U1,
448  dataType scale,
449  uint8_t *pBlock)
450 {
451  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
452 
453  typedef typename c7x::make_full_vector<dataType>::type vec;
454  int32_t eleCount = c7x::element_count_of<vec>::value;
455 
456  __SE_TEMPLATE_v1 se0Params;
457  __SE_TEMPLATE_v1 se1Params;
458  __SA_TEMPLATE_v1 sa2Params;
459 
     /* SE0 and SE1 share the slot 3 template; SA2 uses slot 6 (the template built as
      * sa4Params by the init function). */
460  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
461  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
462  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
463 
     /* One tile is lenTile vectors wide. SE1 gets Nrows / 2 rows, SE0 the rest (one more
      * than SE1 when Nrows is odd). */
464  int32_t lenTile = 8;
465  int32_t nTiles = DSPLIB_ceilingDiv(Ncols, eleCount * lenTile);
466  int32_t se1ICNT1 = Nrows / 2;
467  int32_t se0ICNT1 = Nrows - se1ICNT1;
468 
469  se0Params.ICNT1 = se0ICNT1;
470  se0Params.ICNT2 = nTiles;
471  se0Params.DECDIM1_WIDTH = Ncols;
472 
473  se1Params.ICNT1 = se1ICNT1;
474  se1Params.ICNT2 = nTiles;
475  se1Params.DECDIM1_WIDTH = Ncols;
476 
477  sa2Params.ICNT1 = lenTile * nTiles;
478  sa2Params.DECDIM1_WIDTH = Ncols;
479 
     /* si vectors go to the start of U1; the per-row scalars are read from U1 + colUStride. */
480  dataType *siStore = (dataType *) U1;
481  dataType *reciprocalLoad = (dataType *) U1 + colUStride;
482 
483  vec reciprocalHalfNorm = (vec) getRecip(half_norm_squared);
484 
     /* SA3 walks the Nrows scalars once per tile in each of the two passes, hence
      * ICNT1 = 2 * nTiles. */
485  __SA_TEMPLATE_v1 sa3Params;
486  sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
487  sa3Params.ICNT0 = Nrows;
488  sa3Params.ICNT1 = 2 * nTiles;
489 
490  /* ******************************** CALCULATE COL SI **************************************
491  *************************************************************************************** */
492 
493  __SE0_OPEN(U + 1, se0Params);
494  __SA2_OPEN(sa2Params);
495  __SA3_OPEN(sa3Params);
496 
497  /* if (se1ICNT1 > 0) */ /* if number of rows > 1 */
498  {
499  __SE1_OPEN(U + 1 + colUStride, se1Params);
500  }
501 
502  for (int32_t tile = 0; tile < nTiles; tile++) {
503  vec acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8;
504  acc1 = acc2 = acc3 = acc4 = acc5 = acc6 = acc7 = acc8 = (vec) 0;
     /* One iteration consumes a row pair: 8 vectors from SE0, 8 from SE1, and the two
      * matching scalars (broadcast to full vectors by __vload_dup). */
505  for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
506  vec v01 = c7x::strm_eng<0, vec>::get_adv();
507  vec v02 = c7x::strm_eng<0, vec>::get_adv();
508  vec v03 = c7x::strm_eng<0, vec>::get_adv();
509  vec v04 = c7x::strm_eng<0, vec>::get_adv();
510  vec v05 = c7x::strm_eng<0, vec>::get_adv();
511  vec v06 = c7x::strm_eng<0, vec>::get_adv();
512  vec v07 = c7x::strm_eng<0, vec>::get_adv();
513  vec v08 = c7x::strm_eng<0, vec>::get_adv();
514 
515  vec v11 = c7x::strm_eng<1, vec>::get_adv();
516  vec v12 = c7x::strm_eng<1, vec>::get_adv();
517  vec v13 = c7x::strm_eng<1, vec>::get_adv();
518  vec v14 = c7x::strm_eng<1, vec>::get_adv();
519  vec v15 = c7x::strm_eng<1, vec>::get_adv();
520  vec v16 = c7x::strm_eng<1, vec>::get_adv();
521  vec v17 = c7x::strm_eng<1, vec>::get_adv();
522  vec v18 = c7x::strm_eng<1, vec>::get_adv();
523 
524  dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
525  vec u1 = __vload_dup(pU1);
526  dataType *pU2 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
527  vec u2 = __vload_dup(pU2);
528 
529  acc1 += v01 * u1 + v11 * u2;
530  acc2 += v02 * u1 + v12 * u2;
531  acc3 += v03 * u1 + v13 * u2;
532  acc4 += v04 * u1 + v14 * u2;
533  acc5 += v05 * u1 + v15 * u2;
534  acc6 += v06 * u1 + v16 * u2;
535  acc7 += v07 * u1 + v17 * u2;
536  acc8 += v08 * u1 + v18 * u2;
537  }
538 
539  if (se1ICNT1 != se0ICNT1) /* For last odd numbered row */
540  {
541  vec v01 = c7x::strm_eng<0, vec>::get_adv();
542  vec v02 = c7x::strm_eng<0, vec>::get_adv();
543  vec v03 = c7x::strm_eng<0, vec>::get_adv();
544  vec v04 = c7x::strm_eng<0, vec>::get_adv();
545  vec v05 = c7x::strm_eng<0, vec>::get_adv();
546  vec v06 = c7x::strm_eng<0, vec>::get_adv();
547  vec v07 = c7x::strm_eng<0, vec>::get_adv();
548  vec v08 = c7x::strm_eng<0, vec>::get_adv();
549 
550  dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
551  vec u1 = __vload_dup(pU1);
552 
553  acc1 += v01 * u1;
554  acc2 += v02 * u1;
555  acc3 += v03 * u1;
556  acc4 += v04 * u1;
557  acc5 += v05 * u1;
558  acc6 += v06 * u1;
559  acc7 += v07 * u1;
560  acc8 += v08 * u1;
561  }
562 
     /* Scale the 8 sums by the reciprocal of half_norm_squared and store them as this
      * tile's si vectors (predicated, so lanes past Ncols are not written). */
563  __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
564  vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
565  __vstore_pred(pred, pStoreVec, acc1 * reciprocalHalfNorm);
566 
567  pred = c7x::strm_agen<2, vec>::get_vpred();
568  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
569  __vstore_pred(pred, pStoreVec, acc2 * reciprocalHalfNorm);
570 
571  pred = c7x::strm_agen<2, vec>::get_vpred();
572  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
573  __vstore_pred(pred, pStoreVec, acc3 * reciprocalHalfNorm);
574 
575  pred = c7x::strm_agen<2, vec>::get_vpred();
576  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
577  __vstore_pred(pred, pStoreVec, acc4 * reciprocalHalfNorm);
578 
579  pred = c7x::strm_agen<2, vec>::get_vpred();
580  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
581  __vstore_pred(pred, pStoreVec, acc5 * reciprocalHalfNorm);
582 
583  pred = c7x::strm_agen<2, vec>::get_vpred();
584  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
585  __vstore_pred(pred, pStoreVec, acc6 * reciprocalHalfNorm);
586 
587  pred = c7x::strm_agen<2, vec>::get_vpred();
588  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
589  __vstore_pred(pred, pStoreVec, acc7 * reciprocalHalfNorm);
590 
591  pred = c7x::strm_agen<2, vec>::get_vpred();
592  pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
593  __vstore_pred(pred, pStoreVec, acc8 * reciprocalHalfNorm);
594  }
595 
596  /* ******************************** UPDATE COL ******************************************
597  *************************************************************************************** */
     /* SA0/SA1 mirror SE0/SE1 (slot 4 has the same tile shape as slot 3) and are used to
      * write the updated rows back. SE0, SE1, SA2 and SA3 are not re-opened: this pass
      * continues on the streams opened for pass 1. */
598  __SA_TEMPLATE_v1 sa0Params;
599  __SA_TEMPLATE_v1 sa1Params;
600 
601  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
602  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
603 
604  sa0Params.ICNT1 = se0ICNT1;
605  sa0Params.ICNT2 = nTiles;
606  sa0Params.DECDIM1_WIDTH = Ncols;
607 
608  sa1Params.ICNT1 = se1ICNT1;
609  sa1Params.ICNT2 = nTiles;
610  sa1Params.DECDIM1_WIDTH = Ncols;
611 
612  __SA0_OPEN(sa0Params);
613  /* if (se1ICNT1 > 0) */ {
614  __SA1_OPEN(sa1Params);
615  }
616  for (int32_t tile = 0; tile < nTiles; tile++) {
     /* Reload the 8 si vectors that pass 1 stored for this tile. */
617  __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
618  vec *pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
619  vec si1 = __vload_pred(pred, pSi);
620 
621  pred = c7x::strm_agen<2, vec>::get_vpred();
622  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
623  vec si2 = __vload_pred(pred, pSi);
624 
625  pred = c7x::strm_agen<2, vec>::get_vpred();
626  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
627  vec si3 = __vload_pred(pred, pSi);
628 
629  pred = c7x::strm_agen<2, vec>::get_vpred();
630  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
631  vec si4 = __vload_pred(pred, pSi);
632 
633  pred = c7x::strm_agen<2, vec>::get_vpred();
634  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
635  vec si5 = __vload_pred(pred, pSi);
636 
637  pred = c7x::strm_agen<2, vec>::get_vpred();
638  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
639  vec si6 = __vload_pred(pred, pSi);
640 
641  pred = c7x::strm_agen<2, vec>::get_vpred();
642  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
643  vec si7 = __vload_pred(pred, pSi);
644 
645  pred = c7x::strm_agen<2, vec>::get_vpred();
646  pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
647  vec si8 = __vload_pred(pred, pSi);
648 
649  /* if (Nrows >= 2) */ {
     /* The scalars for the first row pair are loaded ahead of the loop; each loop
      * iteration loads the pair for the next one, and the last pair is handled by the
      * block after the loop, which therefore loads nothing. */
650  dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
651  vec u1 = __vload_dup(pU1);
652  dataType *pU2 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
653  vec u2 = __vload_dup(pU2);
654  for (int32_t vertical = 0; vertical < se1ICNT1 - 1; vertical++) {
655  vec v01 = c7x::strm_eng<0, vec>::get_adv();
656  vec v02 = c7x::strm_eng<0, vec>::get_adv();
657  vec v03 = c7x::strm_eng<0, vec>::get_adv();
658  vec v04 = c7x::strm_eng<0, vec>::get_adv();
659  vec v05 = c7x::strm_eng<0, vec>::get_adv();
660  vec v06 = c7x::strm_eng<0, vec>::get_adv();
661  vec v07 = c7x::strm_eng<0, vec>::get_adv();
662  vec v08 = c7x::strm_eng<0, vec>::get_adv();
663 
664  vec v11 = c7x::strm_eng<1, vec>::get_adv();
665  vec v12 = c7x::strm_eng<1, vec>::get_adv();
666  vec v13 = c7x::strm_eng<1, vec>::get_adv();
667  vec v14 = c7x::strm_eng<1, vec>::get_adv();
668  vec v15 = c7x::strm_eng<1, vec>::get_adv();
669  vec v16 = c7x::strm_eng<1, vec>::get_adv();
670  vec v17 = c7x::strm_eng<1, vec>::get_adv();
671  vec v18 = c7x::strm_eng<1, vec>::get_adv();
672 
673  vec ele1 = u1;
674  vec ele2 = u2;
675 
676  v01 += ele1 * si1;
677  v02 += ele1 * si2;
678  v03 += ele1 * si3;
679  v04 += ele1 * si4;
680  v05 += ele1 * si5;
681  v06 += ele1 * si6;
682  v07 += ele1 * si7;
683  v08 += ele1 * si8;
684 
685  v11 += ele2 * si1;
686  v12 += ele2 * si2;
687  v13 += ele2 * si3;
688  v14 += ele2 * si4;
689  v15 += ele2 * si5;
690  v16 += ele2 * si6;
691  v17 += ele2 * si7;
692  v18 += ele2 * si8;
693 
     /* Fetch the scalars for the next row pair before storing this one. */
694  pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
695  u1 = __vload_dup(pU1);
696  pU2 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
697  u2 = __vload_dup(pU2);
698 
699  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
700  vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
701  __vstore_pred(pred1, p1, v01);
702 
703  pred1 = c7x::strm_agen<0, vec>::get_vpred();
704  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
705  __vstore_pred(pred1, p1, v02);
706 
707  pred1 = c7x::strm_agen<0, vec>::get_vpred();
708  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
709  __vstore_pred(pred1, p1, v03);
710 
711  pred1 = c7x::strm_agen<0, vec>::get_vpred();
712  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
713  __vstore_pred(pred1, p1, v04);
714 
715  pred1 = c7x::strm_agen<0, vec>::get_vpred();
716  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
717  __vstore_pred(pred1, p1, v05);
718 
719  pred1 = c7x::strm_agen<0, vec>::get_vpred();
720  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
721  __vstore_pred(pred1, p1, v06);
722 
723  pred1 = c7x::strm_agen<0, vec>::get_vpred();
724  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
725  __vstore_pred(pred1, p1, v07);
726 
727  pred1 = c7x::strm_agen<0, vec>::get_vpred();
728  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
729  __vstore_pred(pred1, p1, v08);
730 
731  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
732  vec *p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
733  __vstore_pred(pred2, p2, v11);
734 
735  pred2 = c7x::strm_agen<1, vec>::get_vpred();
736  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
737  __vstore_pred(pred2, p2, v12);
738 
739  pred2 = c7x::strm_agen<1, vec>::get_vpred();
740  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
741  __vstore_pred(pred2, p2, v13);
742 
743  pred2 = c7x::strm_agen<1, vec>::get_vpred();
744  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
745  __vstore_pred(pred2, p2, v14);
746 
747  pred2 = c7x::strm_agen<1, vec>::get_vpred();
748  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
749  __vstore_pred(pred2, p2, v15);
750 
751  pred2 = c7x::strm_agen<1, vec>::get_vpred();
752  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
753  __vstore_pred(pred2, p2, v16);
754 
755  pred2 = c7x::strm_agen<1, vec>::get_vpred();
756  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
757  __vstore_pred(pred2, p2, v17);
758 
759  pred2 = c7x::strm_agen<1, vec>::get_vpred();
760  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
761  __vstore_pred(pred2, p2, v18);
762  }
763 
     /* Last row pair of the tile, using the u1/u2 values loaded before or inside the
      * loop above. This block is unconditional. */
764  {
765  vec v01 = c7x::strm_eng<0, vec>::get_adv();
766  vec v02 = c7x::strm_eng<0, vec>::get_adv();
767  vec v03 = c7x::strm_eng<0, vec>::get_adv();
768  vec v04 = c7x::strm_eng<0, vec>::get_adv();
769  vec v05 = c7x::strm_eng<0, vec>::get_adv();
770  vec v06 = c7x::strm_eng<0, vec>::get_adv();
771  vec v07 = c7x::strm_eng<0, vec>::get_adv();
772  vec v08 = c7x::strm_eng<0, vec>::get_adv();
773 
774  vec v11 = c7x::strm_eng<1, vec>::get_adv();
775  vec v12 = c7x::strm_eng<1, vec>::get_adv();
776  vec v13 = c7x::strm_eng<1, vec>::get_adv();
777  vec v14 = c7x::strm_eng<1, vec>::get_adv();
778  vec v15 = c7x::strm_eng<1, vec>::get_adv();
779  vec v16 = c7x::strm_eng<1, vec>::get_adv();
780  vec v17 = c7x::strm_eng<1, vec>::get_adv();
781  vec v18 = c7x::strm_eng<1, vec>::get_adv();
782 
783  vec ele1 = u1;
784  vec ele2 = u2;
785 
786  v01 += ele1 * si1;
787  v02 += ele1 * si2;
788  v03 += ele1 * si3;
789  v04 += ele1 * si4;
790  v05 += ele1 * si5;
791  v06 += ele1 * si6;
792  v07 += ele1 * si7;
793  v08 += ele1 * si8;
794 
795  v11 += ele2 * si1;
796  v12 += ele2 * si2;
797  v13 += ele2 * si3;
798  v14 += ele2 * si4;
799  v15 += ele2 * si5;
800  v16 += ele2 * si6;
801  v17 += ele2 * si7;
802  v18 += ele2 * si8;
803 
804  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
805  vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
806  __vstore_pred(pred1, p1, v01);
807 
808  pred1 = c7x::strm_agen<0, vec>::get_vpred();
809  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
810  __vstore_pred(pred1, p1, v02);
811 
812  pred1 = c7x::strm_agen<0, vec>::get_vpred();
813  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
814  __vstore_pred(pred1, p1, v03);
815 
816  pred1 = c7x::strm_agen<0, vec>::get_vpred();
817  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
818  __vstore_pred(pred1, p1, v04);
819 
820  pred1 = c7x::strm_agen<0, vec>::get_vpred();
821  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
822  __vstore_pred(pred1, p1, v05);
823 
824  pred1 = c7x::strm_agen<0, vec>::get_vpred();
825  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
826  __vstore_pred(pred1, p1, v06);
827 
828  pred1 = c7x::strm_agen<0, vec>::get_vpred();
829  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
830  __vstore_pred(pred1, p1, v07);
831 
832  pred1 = c7x::strm_agen<0, vec>::get_vpred();
833  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
834  __vstore_pred(pred1, p1, v08);
835 
836  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
837  vec *p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
838  __vstore_pred(pred2, p2, v11);
839 
840  pred2 = c7x::strm_agen<1, vec>::get_vpred();
841  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
842  __vstore_pred(pred2, p2, v12);
843 
844  pred2 = c7x::strm_agen<1, vec>::get_vpred();
845  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
846  __vstore_pred(pred2, p2, v13);
847 
848  pred2 = c7x::strm_agen<1, vec>::get_vpred();
849  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
850  __vstore_pred(pred2, p2, v14);
851 
852  pred2 = c7x::strm_agen<1, vec>::get_vpred();
853  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
854  __vstore_pred(pred2, p2, v15);
855 
856  pred2 = c7x::strm_agen<1, vec>::get_vpred();
857  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
858  __vstore_pred(pred2, p2, v16);
859 
860  pred2 = c7x::strm_agen<1, vec>::get_vpred();
861  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
862  __vstore_pred(pred2, p2, v17);
863 
864  pred2 = c7x::strm_agen<1, vec>::get_vpred();
865  p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
866  __vstore_pred(pred2, p2, v18);
867  }
868  }
869 
     /* Nrows odd: the final row exists only on SE0/SA0 and has a single scalar. */
870  if (se0ICNT1 != se1ICNT1) {
871  vec v01 = c7x::strm_eng<0, vec>::get_adv();
872  vec v02 = c7x::strm_eng<0, vec>::get_adv();
873  vec v03 = c7x::strm_eng<0, vec>::get_adv();
874  vec v04 = c7x::strm_eng<0, vec>::get_adv();
875  vec v05 = c7x::strm_eng<0, vec>::get_adv();
876  vec v06 = c7x::strm_eng<0, vec>::get_adv();
877  vec v07 = c7x::strm_eng<0, vec>::get_adv();
878  vec v08 = c7x::strm_eng<0, vec>::get_adv();
879 
880  dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(reciprocalLoad);
881  vec u1 = __vload_dup(pU1);
882 
883  v01 += u1 * si1;
884  v02 += u1 * si2;
885  v03 += u1 * si3;
886  v04 += u1 * si4;
887  v05 += u1 * si5;
888  v06 += u1 * si6;
889  v07 += u1 * si7;
890  v08 += u1 * si8;
891 
892  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
893  vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
894  __vstore_pred(pred1, p1, v01);
895 
896  pred1 = c7x::strm_agen<0, vec>::get_vpred();
897  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
898  __vstore_pred(pred1, p1, v02);
899 
900  pred1 = c7x::strm_agen<0, vec>::get_vpred();
901  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
902  __vstore_pred(pred1, p1, v03);
903 
904  pred1 = c7x::strm_agen<0, vec>::get_vpred();
905  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
906  __vstore_pred(pred1, p1, v04);
907 
908  pred1 = c7x::strm_agen<0, vec>::get_vpred();
909  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
910  __vstore_pred(pred1, p1, v05);
911 
912  pred1 = c7x::strm_agen<0, vec>::get_vpred();
913  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
914  __vstore_pred(pred1, p1, v06);
915 
916  pred1 = c7x::strm_agen<0, vec>::get_vpred();
917  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
918  __vstore_pred(pred1, p1, v07);
919 
920  pred1 = c7x::strm_agen<0, vec>::get_vpred();
921  p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
922  __vstore_pred(pred1, p1, v08);
923  }
924  }
925 
926  __SE0_CLOSE();
927  __SA0_CLOSE();
928  __SA2_CLOSE();
929  __SA3_CLOSE();
930  /* if (se1ICNT1 > 0) */ {
931  __SE1_CLOSE();
932  __SA1_CLOSE();
933  }
934 
935  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
936 }
/* Explicit instantiations of DSPLIB_bidiag_uCol_ci for float and double. */
937 template void DSPLIB_bidiag_uCol_ci<float>(float *U,
938  int32_t Nrows,
939  int32_t Ncols,
940  int32_t colUStride,
941  float half_norm_squared,
942  float *U1,
943  float scale,
944  uint8_t *pBlock);
945 template void DSPLIB_bidiag_uCol_ci<double>(double *U,
946  int32_t Nrows,
947  int32_t Ncols,
948  int32_t colUStride,
949  double half_norm_squared,
950  double *U1,
951  double scale,
952  uint8_t *pBlock);
953 
958 template <typename dataType>
959 dataType DSPLIB_bidiag_uRow_halfnorm_ci(dataType *U,
960  int32_t Nrows,
961  int32_t Ncols,
962  int32_t colUStride,
963  dataType *half_norm_squared,
964  dataType *U1,
965  dataType *s,
966  dataType *superdiag,
967  uint8_t *pBlock)
968 {
969  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
970 
971  typedef typename c7x::make_full_vector<dataType>::type vec;
972  int32_t eleCount = c7x::element_count_of<vec>::value;
973 
974  __SE_TEMPLATE_v1 se0Params, se1Params;
975 
976  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
977  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (2 * SE_PARAM_SIZE));
978 
979  int32_t nVec = DSPLIB_ceilingDiv(Ncols, eleCount);
980 
981  /* Calculating loopCount for 12 vector loads */
982  int32_t iterloop1 = nVec / 12;
983  int32_t remainingVec = nVec - (iterloop1 * 12);
984 
985  /* Calculating loopCount for 4 vector loads */
986  int32_t iterloop2 = remainingVec / 4;
987  remainingVec = remainingVec - (iterloop2 * 4);
988 
989  /* Calculating loopCount for remaining vectors */
990  int32_t iterloop3 = remainingVec / 2;
991 
992  int32_t se0Iter = nVec / 2;
993  int32_t se1Iter = nVec - se0Iter;
994 
995  int32_t se0ICNT0 = se0Iter * eleCount;
996  int32_t se1ICNT0 = Ncols - se0ICNT0;
997 
998  se0Params.ICNT0 = se0ICNT0;
999  se1Params.ICNT0 = se1ICNT0;
1000 
1001  /* ******************************** CALCULATE ROW SCALE **************************************
1002  *************************************************************************************** */
1003  // vec acc = (vec)0;
1004  vec acc1 = (vec) 0;
1005  vec acc2 = (vec) 0;
1006  vec acc3 = (vec) 0;
1007  vec acc4 = (vec) 0;
1008  vec acc5 = (vec) 0;
1009  vec acc6 = (vec) 0;
1010 
1011  __SE1_OPEN(&U[se0ICNT0], se1Params);
1012  if (se0ICNT0 > 0) {
1013  __SE0_OPEN(U, se0Params);
1014  }
1015  int32_t iter = 0;
1016  for (iter = 0; iter < iterloop1; iter++) {
1017  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1018  vec v2 = c7x::strm_eng<0, vec>::get_adv();
1019  vec v3 = c7x::strm_eng<0, vec>::get_adv();
1020  vec v4 = c7x::strm_eng<0, vec>::get_adv();
1021  vec v5 = c7x::strm_eng<0, vec>::get_adv();
1022  vec v6 = c7x::strm_eng<0, vec>::get_adv();
1023 
1024  vec v7 = c7x::strm_eng<1, vec>::get_adv();
1025  vec v8 = c7x::strm_eng<1, vec>::get_adv();
1026  vec v9 = c7x::strm_eng<1, vec>::get_adv();
1027  vec v10 = c7x::strm_eng<1, vec>::get_adv();
1028  vec v11 = c7x::strm_eng<1, vec>::get_adv();
1029  vec v12 = c7x::strm_eng<1, vec>::get_adv();
1030 
1031  acc1 += __abs(v1);
1032  acc2 += __abs(v2);
1033  acc3 += __abs(v3);
1034  acc4 += __abs(v4);
1035  acc5 += __abs(v5);
1036  acc6 += __abs(v6);
1037 
1038  acc1 += __abs(v7);
1039  acc2 += __abs(v8);
1040  acc3 += __abs(v9);
1041  acc4 += __abs(v10);
1042  acc5 += __abs(v11);
1043  acc6 += __abs(v12);
1044  }
1045 
1046  for (iter = 0; iter < iterloop2; iter++) {
1047  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1048  vec v2 = c7x::strm_eng<0, vec>::get_adv();
1049  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1050  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1051 
1052  acc1 += __abs(v1);
1053  acc2 += __abs(v2);
1054  acc3 += __abs(v3);
1055  acc4 += __abs(v4);
1056  }
1057 
1058  for (iter = 0; iter < iterloop3; iter++) {
1059  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1060  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1061 
1062  acc1 += __abs(v1);
1063  acc2 += __abs(v2);
1064  }
1065 
1066  acc1 = acc1 + acc2;
1067  acc3 = acc3 + acc4;
1068  acc5 = acc5 + acc6;
1069 
1070  acc1 = acc1 + acc3 + acc5;
1071 
1072  if (se0Iter != se1Iter) {
1073  vec v = c7x::strm_eng<1, vec>::get_adv();
1074  acc1 += __abs(v);
1075  }
1076 
1077  dataType scale = 0;
1078  c7x_horizontal_add(acc1, &scale);
1079  if (scale > 0) {
1080  /* ******************************** CALCULATE ROW S2 **************************************
1081  *************************************************************************************** */
1082  dataType *normUStore = U1 + colUStride;
1083  dataType *pSA0 = normUStore;
1084  dataType *pSA1 = normUStore + se0ICNT0;
1085 
1086  vec lastV = 0;
1087  int32_t lastIndex = 0;
1088 
1089  /* Calculating loopCount for 4 vector loads */
1090  iterloop1 = nVec / 4;
1091  remainingVec = nVec - (iterloop1 * 4);
1092 
1093  /* Calculating loopCount for remaining vectors */
1094  iterloop2 = remainingVec / 2;
1095 
1096  vec reciprocalScale = (vec) getRecip(scale);
1097  // acc = (vec)0;
1098  acc1 = (vec) 0;
1099  acc2 = (vec) 0;
1100  acc3 = (vec) 0;
1101  acc4 = (vec) 0;
1102 
1103  __SA_TEMPLATE_v1 sa0Params, sa1Params;
1104  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1105  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1106 
1107  sa0Params.ICNT0 = se0ICNT0;
1108  sa1Params.ICNT0 = se1ICNT0;
1109  sa0Params.ICNT1 = sa1Params.ICNT1 = 2;
1110 
1111  __SA1_OPEN(sa1Params);
1112  if (se0ICNT0 > 0) {
1113  __SA0_OPEN(sa0Params);
1114  }
1115  vec v1;
1116  vec v2;
1117  vec v3;
1118  vec v4;
1119  for (iter = 0; iter < iterloop1; iter++) {
1120  v1 = c7x::strm_eng<0, vec>::get_adv();
1121  v2 = c7x::strm_eng<0, vec>::get_adv();
1122  v3 = c7x::strm_eng<1, vec>::get_adv();
1123  v4 = c7x::strm_eng<1, vec>::get_adv();
1124 
1125  v1 = v1 * reciprocalScale;
1126  v2 = v2 * reciprocalScale;
1127  v3 = v3 * reciprocalScale;
1128  v4 = v4 * reciprocalScale;
1129 
1130  acc1 += v1 * v1;
1131  acc2 += v2 * v2;
1132  acc3 += v3 * v3;
1133  acc4 += v4 * v4;
1134 
1135  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1136  vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1137  __vstore_pred(pred1, pNormU1, v1);
1138 
1139  __vpred pred2 = c7x::strm_agen<0, vec>::get_vpred();
1140  vec *pNormU2 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1141  __vstore_pred(pred2, pNormU2, v2);
1142 
1143  __vpred pred3 = c7x::strm_agen<1, vec>::get_vpred();
1144  vec *pNormU3 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1145  __vstore_pred(pred3, pNormU3, v3);
1146 
1147  __vpred pred4 = c7x::strm_agen<1, vec>::get_vpred();
1148  vec *pNormU4 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1149  __vstore_pred(pred4, pNormU4, v4);
1150  }
1151 
1152  if (iterloop1 * 4 == nVec) {
1153  lastIndex = se1ICNT0 - ((se1Iter - 1) * eleCount) - 1;
1154  lastV = v4;
1155  }
1156 
1157  for (iter = 0; iter < iterloop2; iter++) {
1158  v1 = c7x::strm_eng<0, vec>::get_adv();
1159  v2 = c7x::strm_eng<1, vec>::get_adv();
1160 
1161  v1 = v1 * reciprocalScale;
1162  v2 = v2 * reciprocalScale;
1163 
1164  acc1 += v1 * v1;
1165  acc2 += v2 * v2;
1166 
1167  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1168  vec *pNormU1 = c7x::strm_agen<0, vec>::get_adv(pSA0);
1169  __vstore_pred(pred1, pNormU1, v1);
1170 
1171  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1172  vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1173  __vstore_pred(pred2, pNormU2, v2);
1174  }
1175 
1176  if (iterloop2 * 2 == nVec) {
1177  lastIndex = se1ICNT0 - ((se1Iter - 1) * eleCount) - 1;
1178  lastV = v2;
1179  }
1180  acc1 = acc1 + acc2;
1181  acc3 = acc3 + acc4;
1182  acc1 = acc1 + acc3;
1183 
1184  if (se0Iter != se1Iter) {
1185  v1 = c7x::strm_eng<1, vec>::get_adv();
1186  v1 = v1 * reciprocalScale;
1187  acc1 += v1 * v1;
1188 
1189  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1190  vec *pNormU2 = c7x::strm_agen<1, vec>::get_adv(pSA1);
1191  __vstore_pred(pred2, pNormU2, v1);
1192 
1193  lastIndex = se1ICNT0 - ((se1Iter - 1) * eleCount) - 1;
1194  lastV = v1;
1195  }
1196 
1197  __SE1_CLOSE();
1198  if (se0ICNT0 > 0) {
1199  __SE0_CLOSE();
1200  }
1201 
1202  dataType s2 = 0;
1203  c7x_horizontal_add(acc1, &s2);
1204 
1205  const dataType Half = 0.5;
1206  const dataType OneP5 = 1.5;
1207  dataType x = __recip_sqrt(s2);
1208  x = x * (OneP5 - (s2 * x * x * Half));
1209  x = x * (OneP5 - (s2 * x * x * Half));
1210  dataType y = s2 * x;
1211 
1212  if (lastV.s[lastIndex] < 0) {
1213  *s = y;
1214  }
1215  else {
1216  *s = -y;
1217  }
1218 
1219  dataType recipScale = getRecip(scale);
1220  *half_norm_squared = (U[0] * recipScale) * (*s) - s2;
1221 
1222  U[0] = U[0] - ((*s) * scale);
1223  *pSA0 = (*pSA0) - (*s);
1224 
1225  /* ******************************** UPDATE ROW SUPERDIAG *********************************
1226  *************************************************************************************** */
1227  vec reciprocalHalfNorm = (vec) getRecip(*half_norm_squared);
1228 
1229  __SE1_OPEN(pSA1, se1Params);
1230  if (se0ICNT0 > 0) {
1231  __SE0_OPEN(pSA0, se0Params);
1232  }
1233  for (int32_t horizontal = 0; horizontal < nVec - 1; horizontal += 2) {
1234  v1 = c7x::strm_eng<0, vec>::get_adv();
1235  v2 = c7x::strm_eng<1, vec>::get_adv();
1236 
1237  v1 = v1 * reciprocalHalfNorm;
1238  v2 = v2 * reciprocalHalfNorm;
1239 
1240  __vpred pred = c7x::strm_agen<0, vec>::get_vpred();
1241  vec *pStoreVec = c7x::strm_agen<0, vec>::get_adv(superdiag);
1242  __vstore_pred(pred, pStoreVec, v1);
1243 
1244  pred = c7x::strm_agen<1, vec>::get_vpred();
1245  pStoreVec = c7x::strm_agen<1, vec>::get_adv(&superdiag[se0ICNT0]);
1246  __vstore_pred(pred, pStoreVec, v2);
1247  }
1248 
1249  if (se0Iter != se1Iter) {
1250  v1 = c7x::strm_eng<1, vec>::get_adv();
1251 
1252  v1 = v1 * reciprocalHalfNorm;
1253 
1254  __vpred pred = c7x::strm_agen<1, vec>::get_vpred();
1255  vec *pStoreVec = c7x::strm_agen<1, vec>::get_adv(&superdiag[se0ICNT0]);
1256  __vstore_pred(pred, pStoreVec, v1);
1257  }
1258 
1259  __SE1_CLOSE();
1260  __SA1_CLOSE();
1261  if (se0ICNT0) {
1262  __SE0_CLOSE();
1263  __SA0_CLOSE();
1264  }
1265  }
1266  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
1267  return scale;
1268 }
1270  int32_t Nrows,
1271  int32_t Ncols,
1272  int32_t colUStride,
1273  float *half_norm_squared,
1274  float *U1,
1275  float *s,
1276  float *superdiag,
1277  uint8_t *pBlock);
1278 template double DSPLIB_bidiag_uRow_halfnorm_ci<double>(double *U,
1279  int32_t Nrows,
1280  int32_t Ncols,
1281  int32_t colUStride,
1282  double *half_norm_squared,
1283  double *U1,
1284  double *s,
1285  double *superdiag,
1286  uint8_t *pBlock);
1287 
1292 template <typename dataType>
1293 void DSPLIB_bidiag_uRow_ci(dataType *U,
1294  int32_t Nrows,
1295  int32_t Ncols,
1296  int32_t colUStride,
1297  dataType *superdiag,
1298  dataType *U1,
1299  dataType scale,
1300  uint8_t *pBlock)
1301 {
1302  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Entering function");
1303 
1304  typedef typename c7x::make_full_vector<dataType>::type vec;
1305  int32_t eleCount = c7x::element_count_of<vec>::value;
1306 
1307  __SE_TEMPLATE_v1 se0Params, se1Params;
1308  __SA_TEMPLATE_v1 sa0Params, sa1Params, sa2Params, sa3Params;
1309  dataType *siStore = (dataType *) U1;
1310 
1311  /* ******************************** CALCULATE ROW SI **************************************
1312  *************************************************************************************** */
1313  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
1314  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (7 * SE_PARAM_SIZE));
1315  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
1316  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (8 * SE_PARAM_SIZE));
1317  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1318  sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1319 
1320  int32_t nVec = DSPLIB_ceilingDiv(Ncols, eleCount);
1321  int32_t rowBlock = 8;
1322 
1323  int32_t numBlocks = Nrows / (rowBlock);
1324  int32_t se1ICNT3 = numBlocks / 2;
1325  int32_t se0ICNT3 = numBlocks - se1ICNT3;
1326 
1327  int32_t remainingRows = Nrows - (numBlocks * rowBlock);
1328 
1329  int32_t remSE1ICNT1 = remainingRows / 2;
1330  int32_t remSE0ICNT1 = remainingRows - remSE1ICNT1;
1331 
1332  se0Params.ICNT2 = se1Params.ICNT2 = nVec;
1333  se0Params.ICNT3 = se0ICNT3;
1334  se1Params.ICNT3 = se1ICNT3;
1335  se0Params.DECDIM2_WIDTH = se1Params.DECDIM2_WIDTH = Ncols;
1336 
1337  sa2Params.ICNT0 = Ncols;
1338  sa2Params.ICNT1 = (se0ICNT3 * rowBlock) + remSE0ICNT1;
1339 
1340  sa3Params.ICNT0 = Nrows;
1341  sa3Params.ICNT1 = 1;
1342 
1343  dataType *reciprocalLoad = (dataType *) U1 + colUStride;
1344 
1345  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
1346  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
1347  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1348 
1349  int32_t se0ICNT0 = Ncols / 2;
1350  int32_t se1ICNT0 = Ncols - se0ICNT0;
1351  int32_t se1ICNT2 = DSPLIB_ceilingDiv(Nrows, eleCount);
1352 
1353  se0Params.ICNT0 = se0ICNT0;
1354  se0Params.ICNT2 = se1ICNT2;
1355 
1356  se1Params.ICNT0 = se1ICNT0;
1357  se1Params.ICNT2 = se1ICNT2;
1358 
1359  sa2Params.ICNT0 = Nrows;
1360 
1361  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1362  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1363 
1364  sa0Params.ICNT0 = se0ICNT0;
1365  sa1Params.ICNT0 = se1ICNT0;
1366  sa0Params.ICNT1 = sa1Params.ICNT1 = se1ICNT2;
1367 
1368  __SA1_OPEN(sa1Params);
1369  __SE1_OPEN(&U[se0ICNT0 + colUStride], se1Params);
1370  __SA2_OPEN(sa2Params);
1371  if (se0ICNT0 > 0) {
1372  __SA0_OPEN(sa0Params);
1373  __SE0_OPEN(&U[colUStride], se0Params);
1374  for (int32_t vertical = 0; vertical < se1ICNT2; vertical++) {
1375  vec acc1 = (vec) 0;
1376  vec acc2 = (vec) 0;
1377  vec acc3 = (vec) 0;
1378  vec acc4 = (vec) 0;
1379  int32_t horizontal = 0;
1380  for (horizontal = 0; horizontal < se0ICNT0 - 1; horizontal += 2) {
1381  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1382  vec v2 = c7x::strm_eng<0, vec>::get_adv();
1383 
1384  vec v3 = c7x::strm_eng<1, vec>::get_adv();
1385  vec v4 = c7x::strm_eng<1, vec>::get_adv();
1386 
1387  dataType *pU1 = c7x::strm_agen<0, dataType>::get_adv(reciprocalLoad);
1388  vec refCol1 = __vload_dup(pU1);
1389  dataType *pU2 = c7x::strm_agen<0, dataType>::get_adv(reciprocalLoad);
1390  vec refCol2 = __vload_dup(pU2);
1391  dataType *pU3 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1392  vec refCol3 = __vload_dup(pU3);
1393  dataType *pU4 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1394  vec refCol4 = __vload_dup(pU4);
1395 
1396  acc1 += v1 * refCol1;
1397  acc2 += v2 * refCol2;
1398  acc3 += v3 * refCol3;
1399  acc4 += v4 * refCol4;
1400  }
1401 
1402  for (; horizontal < se0ICNT0; horizontal++) {
1403  vec v1 = c7x::strm_eng<0, vec>::get_adv();
1404  vec v2 = c7x::strm_eng<1, vec>::get_adv();
1405 
1406  dataType *pU1 = c7x::strm_agen<0, dataType>::get_adv(reciprocalLoad);
1407  vec refCol1 = __vload_dup(pU1);
1408  dataType *pU2 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1409  vec refCol2 = __vload_dup(pU2);
1410 
1411  acc1 += v1 * refCol1;
1412  acc2 += v2 * refCol2;
1413  }
1414  /*Last Column process for odd NCols*/
1415  if (se0ICNT0 != se1ICNT0) {
1416  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1417 
1418  dataType *pU1 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1419  vec refCol1 = __vload_dup(pU1);
1420 
1421  acc1 += v1 * refCol1;
1422  }
1423 
1424  acc1 = acc1 + acc2;
1425  acc3 = acc3 + acc4;
1426 
1427  acc1 = acc1 + acc3;
1428 
1429  __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1430  vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
1431  __vstore_pred(pred, pStoreVec, acc1);
1432  }
1433  __SE0_CLOSE();
1434  }
1435  else /* Only one column Single SE process */
1436  {
1437  dataType *pU1 = c7x::strm_agen<1, dataType>::get_adv(reciprocalLoad + se0ICNT0);
1438  vec refCol1 = __vload_dup(pU1);
1439 
1440  for (int32_t vertical = 0; vertical < se1ICNT2; vertical++) {
1441  vec v1 = c7x::strm_eng<1, vec>::get_adv();
1442 
1443  vec acc1 = v1 * refCol1;
1444 
1445  __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1446  vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
1447  __vstore_pred(pred, pStoreVec, acc1);
1448  }
1449  }
1450 
1451  __SE1_CLOSE();
1452  __SA2_CLOSE();
1453  __SA0_CLOSE();
1454  __SA1_CLOSE();
1455 
1456  /* ******************************** UPDATE ROWS **************************************
1457  *************************************************************************************** */
1458 
1459  se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1460  se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
1461  sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1462  sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
1463  sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
1464  sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (5 * SE_PARAM_SIZE));
1465 
1466  int32_t lenTile = 8;
1467  int32_t nTiles = DSPLIB_ceilingDiv(Ncols, eleCount * lenTile);
1468  int32_t se1ICNT1 = Nrows / 2;
1469  int32_t se0ICNT1 = Nrows - se1ICNT1;
1470 
1471  se0Params.ICNT1 = se0ICNT1;
1472  se0Params.ICNT2 = nTiles;
1473  se0Params.DECDIM1_WIDTH = Ncols;
1474 
1475  se1Params.ICNT1 = se1ICNT1;
1476  se1Params.ICNT2 = nTiles;
1477  se1Params.DECDIM1_WIDTH = Ncols;
1478 
1479  sa2Params.ICNT0 = Ncols;
1480 
1481  sa3Params.ICNT0 = Nrows;
1482  sa3Params.ICNT1 = nTiles;
1483 
1484  sa0Params.ICNT1 = se0ICNT1;
1485  sa0Params.ICNT2 = nTiles;
1486  sa0Params.DECDIM1_WIDTH = Ncols;
1487 
1488  sa1Params.ICNT1 = se1ICNT1;
1489  sa1Params.ICNT2 = nTiles;
1490  sa1Params.DECDIM1_WIDTH = Ncols;
1491 
1492  dataType *pSE0 = U + colUStride;
1493  dataType *pSE1 = U + (2 * colUStride);
1494 
1495  __SE0_OPEN(pSE0, se0Params);
1496  __SA0_OPEN(sa0Params);
1497  __SA2_OPEN(sa2Params);
1498  __SA3_OPEN(sa3Params);
1499  if (se1ICNT1 > 0) {
1500  __SE1_OPEN(pSE1, se1Params);
1501  __SA1_OPEN(sa1Params);
1502 
1503  for (int32_t tile = 0; tile < nTiles; tile++) {
1504  __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1505  vec *pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1506  vec sd1 = __vload_pred(pred, pSd);
1507 
1508  pred = c7x::strm_agen<2, vec>::get_vpred();
1509  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1510  vec sd2 = __vload_pred(pred, pSd);
1511 
1512  pred = c7x::strm_agen<2, vec>::get_vpred();
1513  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1514  vec sd3 = __vload_pred(pred, pSd);
1515 
1516  pred = c7x::strm_agen<2, vec>::get_vpred();
1517  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1518  vec sd4 = __vload_pred(pred, pSd);
1519 
1520  pred = c7x::strm_agen<2, vec>::get_vpred();
1521  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1522  vec sd5 = __vload_pred(pred, pSd);
1523 
1524  pred = c7x::strm_agen<2, vec>::get_vpred();
1525  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1526  vec sd6 = __vload_pred(pred, pSd);
1527 
1528  pred = c7x::strm_agen<2, vec>::get_vpred();
1529  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1530  vec sd7 = __vload_pred(pred, pSd);
1531 
1532  pred = c7x::strm_agen<2, vec>::get_vpred();
1533  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1534  vec sd8 = __vload_pred(pred, pSd);
1535 
1536  /* if (Nrows >= 2) */ {
1537  dataType *pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1538  vec si1 = __vload_dup(pSi1);
1539  dataType *pSi2 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1540  vec si2 = __vload_dup(pSi2);
1541  for (int32_t vertical = 0; vertical < se1ICNT1 - 1; vertical++) {
1542  vec v01 = c7x::strm_eng<0, vec>::get_adv();
1543  vec v02 = c7x::strm_eng<0, vec>::get_adv();
1544  vec v03 = c7x::strm_eng<0, vec>::get_adv();
1545  vec v04 = c7x::strm_eng<0, vec>::get_adv();
1546  vec v05 = c7x::strm_eng<0, vec>::get_adv();
1547  vec v06 = c7x::strm_eng<0, vec>::get_adv();
1548  vec v07 = c7x::strm_eng<0, vec>::get_adv();
1549  vec v08 = c7x::strm_eng<0, vec>::get_adv();
1550 
1551  vec v11 = c7x::strm_eng<1, vec>::get_adv();
1552  vec v12 = c7x::strm_eng<1, vec>::get_adv();
1553  vec v13 = c7x::strm_eng<1, vec>::get_adv();
1554  vec v14 = c7x::strm_eng<1, vec>::get_adv();
1555  vec v15 = c7x::strm_eng<1, vec>::get_adv();
1556  vec v16 = c7x::strm_eng<1, vec>::get_adv();
1557  vec v17 = c7x::strm_eng<1, vec>::get_adv();
1558  vec v18 = c7x::strm_eng<1, vec>::get_adv();
1559 
1560  v01 += si1 * sd1;
1561  v02 += si1 * sd2;
1562  v03 += si1 * sd3;
1563  v04 += si1 * sd4;
1564  v05 += si1 * sd5;
1565  v06 += si1 * sd6;
1566  v07 += si1 * sd7;
1567  v08 += si1 * sd8;
1568  v11 += si2 * sd1;
1569  v12 += si2 * sd2;
1570  v13 += si2 * sd3;
1571  v14 += si2 * sd4;
1572  v15 += si2 * sd5;
1573  v16 += si2 * sd6;
1574  v17 += si2 * sd7;
1575  v18 += si2 * sd8;
1576 
1577  pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1578  si1 = __vload_dup(pSi1);
1579  pSi2 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1580  si2 = __vload_dup(pSi2);
1581 
1582  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1583  vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1584  __vstore_pred(pred1, p1, v01);
1585 
1586  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1587  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1588  __vstore_pred(pred1, p1, v02);
1589 
1590  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1591  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1592  __vstore_pred(pred1, p1, v03);
1593 
1594  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1595  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1596  __vstore_pred(pred1, p1, v04);
1597 
1598  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1599  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1600  __vstore_pred(pred1, p1, v05);
1601 
1602  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1603  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1604  __vstore_pred(pred1, p1, v06);
1605 
1606  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1607  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1608  __vstore_pred(pred1, p1, v07);
1609 
1610  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1611  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1612  __vstore_pred(pred1, p1, v08);
1613 
1614  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1615  vec *p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1616  __vstore_pred(pred2, p2, v11);
1617 
1618  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1619  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1620  __vstore_pred(pred2, p2, v12);
1621 
1622  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1623  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1624  __vstore_pred(pred2, p2, v13);
1625 
1626  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1627  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1628  __vstore_pred(pred2, p2, v14);
1629 
1630  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1631  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1632  __vstore_pred(pred2, p2, v15);
1633 
1634  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1635  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1636  __vstore_pred(pred2, p2, v16);
1637 
1638  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1639  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1640  __vstore_pred(pred2, p2, v17);
1641 
1642  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1643  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1644  __vstore_pred(pred2, p2, v18);
1645  }
1646 
1647  {
1648  vec v01 = c7x::strm_eng<0, vec>::get_adv();
1649  vec v02 = c7x::strm_eng<0, vec>::get_adv();
1650  vec v03 = c7x::strm_eng<0, vec>::get_adv();
1651  vec v04 = c7x::strm_eng<0, vec>::get_adv();
1652  vec v05 = c7x::strm_eng<0, vec>::get_adv();
1653  vec v06 = c7x::strm_eng<0, vec>::get_adv();
1654  vec v07 = c7x::strm_eng<0, vec>::get_adv();
1655  vec v08 = c7x::strm_eng<0, vec>::get_adv();
1656 
1657  vec v11 = c7x::strm_eng<1, vec>::get_adv();
1658  vec v12 = c7x::strm_eng<1, vec>::get_adv();
1659  vec v13 = c7x::strm_eng<1, vec>::get_adv();
1660  vec v14 = c7x::strm_eng<1, vec>::get_adv();
1661  vec v15 = c7x::strm_eng<1, vec>::get_adv();
1662  vec v16 = c7x::strm_eng<1, vec>::get_adv();
1663  vec v17 = c7x::strm_eng<1, vec>::get_adv();
1664  vec v18 = c7x::strm_eng<1, vec>::get_adv();
1665 
1666  v01 += si1 * sd1;
1667  v02 += si1 * sd2;
1668  v03 += si1 * sd3;
1669  v04 += si1 * sd4;
1670  v05 += si1 * sd5;
1671  v06 += si1 * sd6;
1672  v07 += si1 * sd7;
1673  v08 += si1 * sd8;
1674  v11 += si2 * sd1;
1675  v12 += si2 * sd2;
1676  v13 += si2 * sd3;
1677  v14 += si2 * sd4;
1678  v15 += si2 * sd5;
1679  v16 += si2 * sd6;
1680  v17 += si2 * sd7;
1681  v18 += si2 * sd8;
1682 
1683  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1684  vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1685  __vstore_pred(pred1, p1, v01);
1686 
1687  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1688  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1689  __vstore_pred(pred1, p1, v02);
1690 
1691  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1692  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1693  __vstore_pred(pred1, p1, v03);
1694 
1695  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1696  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1697  __vstore_pred(pred1, p1, v04);
1698 
1699  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1700  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1701  __vstore_pred(pred1, p1, v05);
1702 
1703  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1704  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1705  __vstore_pred(pred1, p1, v06);
1706 
1707  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1708  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1709  __vstore_pred(pred1, p1, v07);
1710 
1711  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1712  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1713  __vstore_pred(pred1, p1, v08);
1714 
1715  __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
1716  vec *p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1717  __vstore_pred(pred2, p2, v11);
1718 
1719  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1720  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1721  __vstore_pred(pred2, p2, v12);
1722 
1723  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1724  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1725  __vstore_pred(pred2, p2, v13);
1726 
1727  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1728  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1729  __vstore_pred(pred2, p2, v14);
1730 
1731  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1732  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1733  __vstore_pred(pred2, p2, v15);
1734 
1735  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1736  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1737  __vstore_pred(pred2, p2, v16);
1738 
1739  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1740  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1741  __vstore_pred(pred2, p2, v17);
1742 
1743  pred2 = c7x::strm_agen<1, vec>::get_vpred();
1744  p2 = c7x::strm_agen<1, vec>::get_adv(pSE1);
1745  __vstore_pred(pred2, p2, v18);
1746  }
1747  }
1748 
1749  if (se0ICNT1 != se1ICNT1) {
1750  vec v01 = c7x::strm_eng<0, vec>::get_adv();
1751  vec v02 = c7x::strm_eng<0, vec>::get_adv();
1752  vec v03 = c7x::strm_eng<0, vec>::get_adv();
1753  vec v04 = c7x::strm_eng<0, vec>::get_adv();
1754  vec v05 = c7x::strm_eng<0, vec>::get_adv();
1755  vec v06 = c7x::strm_eng<0, vec>::get_adv();
1756  vec v07 = c7x::strm_eng<0, vec>::get_adv();
1757  vec v08 = c7x::strm_eng<0, vec>::get_adv();
1758 
1759  dataType *pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1760  vec si1 = __vload_dup(pSi1);
1761 
1762  v01 += si1 * sd1;
1763  v02 += si1 * sd2;
1764  v03 += si1 * sd3;
1765  v04 += si1 * sd4;
1766  v05 += si1 * sd5;
1767  v06 += si1 * sd6;
1768  v07 += si1 * sd7;
1769  v08 += si1 * sd8;
1770 
1771  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1772  vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1773  __vstore_pred(pred1, p1, v01);
1774 
1775  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1776  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1777  __vstore_pred(pred1, p1, v02);
1778 
1779  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1780  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1781  __vstore_pred(pred1, p1, v03);
1782 
1783  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1784  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1785  __vstore_pred(pred1, p1, v04);
1786 
1787  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1788  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1789  __vstore_pred(pred1, p1, v05);
1790 
1791  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1792  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1793  __vstore_pred(pred1, p1, v06);
1794 
1795  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1796  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1797  __vstore_pred(pred1, p1, v07);
1798 
1799  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1800  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1801  __vstore_pred(pred1, p1, v08);
1802  }
1803  }
1804  __SE1_CLOSE();
1805  __SA1_CLOSE();
1806  }
1807  else {
1808 
1809  for (int32_t tile = 0; tile < nTiles; tile++) {
1810  __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
1811  vec *pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1812  vec sd1 = __vload_pred(pred, pSd);
1813 
1814  pred = c7x::strm_agen<2, vec>::get_vpred();
1815  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1816  vec sd2 = __vload_pred(pred, pSd);
1817 
1818  pred = c7x::strm_agen<2, vec>::get_vpred();
1819  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1820  vec sd3 = __vload_pred(pred, pSd);
1821 
1822  pred = c7x::strm_agen<2, vec>::get_vpred();
1823  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1824  vec sd4 = __vload_pred(pred, pSd);
1825 
1826  pred = c7x::strm_agen<2, vec>::get_vpred();
1827  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1828  vec sd5 = __vload_pred(pred, pSd);
1829 
1830  pred = c7x::strm_agen<2, vec>::get_vpred();
1831  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1832  vec sd6 = __vload_pred(pred, pSd);
1833 
1834  pred = c7x::strm_agen<2, vec>::get_vpred();
1835  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1836  vec sd7 = __vload_pred(pred, pSd);
1837 
1838  pred = c7x::strm_agen<2, vec>::get_vpred();
1839  pSd = c7x::strm_agen<2, vec>::get_adv(superdiag);
1840  vec sd8 = __vload_pred(pred, pSd);
1841 
1842  vec v01 = c7x::strm_eng<0, vec>::get_adv();
1843  vec v02 = c7x::strm_eng<0, vec>::get_adv();
1844  vec v03 = c7x::strm_eng<0, vec>::get_adv();
1845  vec v04 = c7x::strm_eng<0, vec>::get_adv();
1846  vec v05 = c7x::strm_eng<0, vec>::get_adv();
1847  vec v06 = c7x::strm_eng<0, vec>::get_adv();
1848  vec v07 = c7x::strm_eng<0, vec>::get_adv();
1849  vec v08 = c7x::strm_eng<0, vec>::get_adv();
1850 
1851  dataType *pSi1 = c7x::strm_agen<3, dataType>::get_adv(siStore);
1852  vec si1 = __vload_dup(pSi1);
1853 
1854  v01 += si1 * sd1;
1855  v02 += si1 * sd2;
1856  v03 += si1 * sd3;
1857  v04 += si1 * sd4;
1858  v05 += si1 * sd5;
1859  v06 += si1 * sd6;
1860  v07 += si1 * sd7;
1861  v08 += si1 * sd8;
1862 
1863  __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
1864  vec *p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1865  __vstore_pred(pred1, p1, v01);
1866 
1867  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1868  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1869  __vstore_pred(pred1, p1, v02);
1870 
1871  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1872  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1873  __vstore_pred(pred1, p1, v03);
1874 
1875  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1876  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1877  __vstore_pred(pred1, p1, v04);
1878 
1879  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1880  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1881  __vstore_pred(pred1, p1, v05);
1882 
1883  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1884  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1885  __vstore_pred(pred1, p1, v06);
1886 
1887  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1888  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1889  __vstore_pred(pred1, p1, v07);
1890 
1891  pred1 = c7x::strm_agen<0, vec>::get_vpred();
1892  p1 = c7x::strm_agen<0, vec>::get_adv(pSE0);
1893  __vstore_pred(pred1, p1, v08);
1894  }
1895  }
1896 
1897  __SE0_CLOSE();
1898  __SA0_CLOSE();
1899  __SA2_CLOSE();
1900  __SA3_CLOSE();
1901 
1902  DSPLIB_DEBUGPRINTFN(0, "%s\n", "Exiting function");
1903 }
1904 template void DSPLIB_bidiag_uRow_ci<float>(float *U,
1905  int32_t Nrows,
1906  int32_t Ncols,
1907  int32_t colUStride,
1908  float *superdiag,
1909  float *U1,
1910  float scale,
1911  uint8_t *pBlock);
1912 template void DSPLIB_bidiag_uRow_ci<double>(double *U,
1913  int32_t Nrows,
1914  int32_t Ncols,
1915  int32_t colUStride,
1916  double *superdiag,
1917  double *U1,
1918  double scale,
1919  uint8_t *pBlock);
1920 
1921 /* ======================================================================== */
1922 /* End of file: DSPLIB_svd_bidiag_uMat_ci.cpp */
1923 /* ======================================================================== */
template double DSPLIB_bidiag_uCol_halfnorm_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double *half_norm_squared, double *U1, double *s, uint8_t *pBlock)
template void DSPLIB_bidiag_u_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_bidiag_u_init_ci< double >(DSPLIB_kernelHandle handle)
void DSPLIB_bidiag_uCol_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType half_norm_squared, dataType *U1, dataType scale, uint8_t *pBlock)
This function implements the Householder processing on columns of input U matrix corresponding to the n...
template void DSPLIB_bidiag_uCol_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double half_norm_squared, double *U1, double scale, uint8_t *pBlock)
template void DSPLIB_bidiag_uRow_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double *superdiag, double *U1, double scale, uint8_t *pBlock)
template float DSPLIB_bidiag_uRow_halfnorm_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float *half_norm_squared, float *U1, float *s, float *superdiag, uint8_t *pBlock)
void DSPLIB_bidiag_u_init_ci(DSPLIB_kernelHandle handle)
template float DSPLIB_bidiag_uCol_halfnorm_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float *half_norm_squared, float *U1, float *s, uint8_t *pBlock)
template void DSPLIB_bidiag_uRow_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float *superdiag, float *U1, float scale, uint8_t *pBlock)
dataType DSPLIB_bidiag_uRow_halfnorm_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType *half_norm_squared, dataType *U1, dataType *s, dataType *superdiag, uint8_t *pBlock)
This function calculates the half-norm corresponding to the row of input matrix U and returns scale.
dataType DSPLIB_bidiag_uCol_halfnorm_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType *half_norm_squared, dataType *U1, dataType *s, uint8_t *pBlock)
This function calculates the half-norms corresponding to the column of input matrix U and returns sca...
void DSPLIB_bidiag_uRow_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType *superdiag, dataType *U1, dataType scale, uint8_t *pBlock)
This function implements the Householder processing on rows of input U matrix corresponding to the natu...
template double DSPLIB_bidiag_uRow_halfnorm_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double *half_norm_squared, double *U1, double *s, double *superdiag, uint8_t *pBlock)
template void DSPLIB_bidiag_uCol_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float half_norm_squared, float *U1, float scale, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.
dataType getRecip(dataType value)
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
Definition: DSPLIB_types.h:83
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Definition: DSPLIB_types.h:172
@ DSPLIB_SUCCESS
Definition: DSPLIB_types.h:152
Structure that is reserved for internal use by the kernel.
uint32_t strideU
Stride between rows of U matrix
uint8_t bufPblock[DSPLIB_SVD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters