// Fragment of the SE/SA template set-up code (presumably the body of
// DSPLIB_bidiag_uFinal_init_ci; the signature and several assignments are on
// lines that are not part of this listing). It builds streaming-engine (SE) and
// streaming-address-generator (SA) templates and saves them into the kernel's
// private parameter buffer so the compute routines can load and finish them.
44 uint8_t *pBlock = pKerPrivArgs->
bufPblock;
45 int32_t strideU = pKerPrivArgs->
strideU;
// Start every template from the generator defaults; only the fields assigned
// below (plus any on lines not shown) differ from those defaults.
47 __SE_TEMPLATE_v1 se1Params = __gen_SE_TEMPLATE_v1();
48 __SE_TEMPLATE_v1 se3Params = __gen_SE_TEMPLATE_v1();
49 __SA_TEMPLATE_v1 sa0Params = __gen_SA_TEMPLATE_v1();
50 __SA_TEMPLATE_v1 sa1Params = __gen_SA_TEMPLATE_v1();
51 __SA_TEMPLATE_v1 sa2Params = __gen_SA_TEMPLATE_v1();
52 __SA_TEMPLATE_v1 sa3Params = __gen_SA_TEMPLATE_v1();
53 __SA_TEMPLATE_v1 sa4Params = __gen_SA_TEMPLATE_v1();
// Derive the full-width vector type for dataType and the matching SE element
// type / SA vector length from the c7x traits.
55 typedef typename c7x::make_full_vector<dataType>::type vec;
56 int32_t eleCount = c7x::element_count_of<vec>::value;
57 __SE_ELETYPE SE_ELETYPE = c7x::se_eletype<vec>::value;
58 __SA_VECLEN SA_VECLEN = c7x::sa_veclen<vec>::value;
// Row stride of U expressed in elements (strideU is presumably in bytes, given
// the division by sizeof(dataType) -- confirm against the caller).
60 int32_t colUStride = strideU /
sizeof(dataType);
// se1: 3D stream, one element per fetch with group duplication on, stepping by
// one row of U in DIM1. ICNT fields are filled in by the consumer.
63 se1Params.DIM1 = colUStride;
65 se1Params.DIMFMT = __SE_DIMFMT_3D;
66 se1Params.ELETYPE = SE_ELETYPE;
67 se1Params.VECLEN = __SE_VECLEN_1ELEM;
68 se1Params.GRPDUP = __SE_GRPDUP_ON;
// sa0: 3D full-vector address pattern; DIM1 steps one row, DIM2 steps two
// vectors (eleCount * 2 elements). DECDIM1 is tied to DIM2; the consumer
// supplies DECDIM1_WIDTH.
70 sa0Params.ICNT0 = eleCount;
71 sa0Params.DIM1 = colUStride;
72 sa0Params.DIM2 = eleCount * 2;
73 sa0Params.DIMFMT = __SA_DIMFMT_3D;
74 sa0Params.VECLEN = SA_VECLEN;
75 sa0Params.DECDIM1 = __SA_DECDIM_DIM2;
// se3 / sa3: 2D single-element patterns whose DIM1 step is one row plus one
// column (colUStride + 1), i.e. each step lands on the next diagonal element.
78 se3Params.DIM1 = colUStride + 1;
79 se3Params.DIMFMT = __SE_DIMFMT_2D;
80 se3Params.ELETYPE = SE_ELETYPE;
81 se3Params.VECLEN = __SE_VECLEN_1ELEM;
84 sa3Params.DIM1 = colUStride + 1;
85 sa3Params.DIMFMT = __SA_DIMFMT_2D;
86 sa3Params.VECLEN = __SA_VECLEN_1ELEM;
// sa1: 2D single-element pattern stepping one row of U per DIM1 increment.
89 sa1Params.DIM1 = colUStride;
90 sa1Params.DIMFMT = __SA_DIMFMT_2D;
91 sa1Params.VECLEN = __SA_VECLEN_1ELEM;
// sa2: same row-stepping 2D pattern as sa1 but with full-vector accesses.
93 sa2Params.DIM1 = colUStride;
94 sa2Params.DIMFMT = __SA_DIMFMT_2D;
95 sa2Params.VECLEN = SA_VECLEN;
// sa4: 3D single-element pattern stepping one row of U in DIM1.
98 sa4Params.DIM1 = colUStride;
100 sa4Params.DIMFMT = __SA_DIMFMT_3D;
101 sa4Params.VECLEN = __SA_VECLEN_1ELEM;
// Save the templates into consecutive SE_PARAM_SIZE-byte slots 12..18 of
// pBlock. The slot indices must stay in sync with the loads in the compute
// routines of this file (slots 12/13, 14/15, 16, 17 and 18 are read back there).
103 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE)) = se1Params;
104 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE)) = sa0Params;
105 *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE)) = se3Params;
106 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE)) = sa3Params;
107 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE)) = sa1Params;
108 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE)) = sa2Params;
109 *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (18 * SE_PARAM_SIZE)) = sa4Params;
// Fragment of the routine that zero-fills the columns of U from column Ncols
// onward (matches DSPLIB_bidiag_uFinal_expand_ci; the signature line and the
// closing braces are not part of this listing). The rows are split between two
// address generators, SA0 and SA1, so two predicated stores are issued per
// loop iteration.
126 template <
typename dataType>
131 __SA_TEMPLATE_v1 sa0Params, sa1Params;
// Both generators start from the same saved template (slot 17 of pBlock).
132 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE));
133 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (17 * SE_PARAM_SIZE));
135 typedef typename c7x::make_full_vector<dataType>::type vec;
136 int32_t eleCount = c7x::element_count_of<vec>::value;
// Number of columns to fill, and the row split: SA0 takes the first Nrows / 2
// rows, SA1 takes the rest (one more row than SA0 when Nrows is odd).
138 int32_t expandCols = Nrows - Ncols;
139 int32_t sa0ICNT = Nrows / 2;
140 int32_t sa1ICNT = Nrows - sa0ICNT;
142 sa0Params.ICNT0 = sa1Params.ICNT0 = expandCols;
143 sa0Params.ICNT1 = sa0ICNT;
144 sa1Params.ICNT1 = sa1ICNT;
// nVec vector stores cover one row of expandCols elements; the last one is
// trimmed by the SA predicate.
146 int32_t nVec = DSPLIB_ceilingDiv(expandCols, eleCount);
147 int32_t totalIter = sa0ICNT * nVec;
// Element offsets of the first store: column Ncols of row 0 for SA0, and
// column Ncols of row sa0ICNT for SA1.
148 int32_t uOffsetSA0 = Ncols;
149 int32_t uOffsetSA1 = Ncols + (sa0ICNT * colUStride);
151 __SA1_OPEN(sa1Params);
153 __SA0_OPEN(sa0Params);
// Main loop: one predicated zero-vector store through each generator.
154 for (int32_t iter = 0; iter < totalIter; iter++) {
155 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
156 vec *pStoreVec1 = c7x::strm_agen<0, vec>::get_adv(U + uOffsetSA0);
157 __vstore_pred(pred1, pStoreVec1, (vec) 0);
159 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
160 vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(U + uOffsetSA1);
161 __vstore_pred(pred2, pStoreVec2, (vec) 0);
// Odd Nrows: SA1 still owns one unwritten row, which needs nVec more stores.
166 if (sa0ICNT != sa1ICNT) {
167 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
168 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
169 vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(U + uOffsetSA1);
170 __vstore_pred(pred2, pStoreVec2, (vec) 0);
// Fragment of a routine that rewrites U starting at U + 1 with products scaled
// by 1 / (U[0] * s) and then touches the diagonal (presumably
// DSPLIB_bidiag_uFinal_initalize_ci -- the signature, the branch conditions and
// the closing braces are on lines not part of this listing; confirm).
186 template <
typename dataType>
197 typedef typename c7x::make_full_vector<dataType>::type vec;
198 int32_t eleCount = c7x::element_count_of<vec>::value;
201 __SE_TEMPLATE_v1 se0Params;
202 __SE_TEMPLATE_v1 se1Params;
203 __SA_TEMPLATE_v1 sa0Params;
204 __SA_TEMPLATE_v1 sa1Params;
// Load the saved templates: SE0 from slot 0, SE1 from slot 12, and both SA0
// and SA1 from the same template in slot 13.
206 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
207 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (12 * SE_PARAM_SIZE));
208 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
209 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (13 * SE_PARAM_SIZE));
// Ncols is covered by nVec vectors, processed two at a time: SA1 handles
// nVec / 2 of them and SA0 the remainder (one extra when nVec is odd).
211 int32_t nVec = DSPLIB_ceilingDiv(Ncols, eleCount);
212 int32_t sa1ICNT2 = nVec / 2;
213 int32_t sa0ICNT2 = nVec - sa1ICNT2;
215 sa0Params.ICNT1 = Nrows;
216 sa0Params.ICNT2 = sa0ICNT2;
217 sa0Params.DECDIM1_WIDTH = Ncols;
// SA1's base address is eleCount elements past SA0's (see the get_adv calls
// below), so its remaining width is shorter by one vector.
219 sa1Params.ICNT1 = Nrows;
220 sa1Params.ICNT2 = sa1ICNT2;
221 sa1Params.DECDIM1_WIDTH = Ncols - eleCount;
223 se0Params.ICNT2 = nVec;
225 se1Params.ICNT1 = Nrows;
226 se1Params.ICNT2 = sa0ICNT2;
// Common scale factor 1 / (U[0] * s), broadcast to every vector lane.
228 dataType reciprocalFactor =
getRecip((U[0] * s));
229 vec reciprocalVec = (vec) reciprocalFactor;
// SE1 streams from U; SE0 streams from the next row (U + colUStride).
231 __SE1_OPEN(U, se1Params);
232 __SE0_OPEN(U + colUStride, se0Params);
233 __SA0_OPEN(sa0Params);
235 __SA1_OPEN(sa1Params);
// Each outer iteration takes two vectors from SE0 and, for every row,
// multiplies them by the SE1 value and the reciprocal, then stores the results
// through SA0 (base U + 1) and SA1 (base U + 1 + eleCount).
237 for (int32_t horizontal = 0; horizontal < sa1ICNT2; horizontal++) {
238 vec uCol1 = c7x::strm_eng<0, vec>::get_adv();
239 vec uCol2 = c7x::strm_eng<0, vec>::get_adv();
240 for (int32_t vertical = 0; vertical < Nrows; vertical++) {
241 vec uEle = c7x::strm_eng<1, vec>::get_adv();
243 vec v1 = uCol1 * uEle * reciprocalVec;
244 vec v2 = uCol2 * uEle * reciprocalVec;
246 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
247 vec *pStoreVec1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
248 __vstore_pred(pred1, pStoreVec1, v1);
250 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
251 vec *pStoreVec2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + eleCount);
252 __vstore_pred(pred2, pStoreVec2, v2);
// Odd nVec: one last vector remains and is handled through SA0 only.
258 if (sa1ICNT2 != sa0ICNT2) {
259 vec uCol1 = c7x::strm_eng<0, vec>::get_adv();
260 for (int32_t vertical = 0; vertical < Nrows; vertical++) {
261 vec uEle = c7x::strm_eng<1, vec>::get_adv();
263 vec v1 = uCol1 * uEle * reciprocalVec;
265 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
266 vec *pStoreVec1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
267 __vstore_pred(pred1, pStoreVec1, v1);
// Diagonal pass: the slot 14/15 templates step by colUStride + 1 elements, so
// starting at U + 1 + colUStride each step reaches the next diagonal entry.
// Every entry is read through SE0, incremented by 1 and written back via SA0.
275 __SE_TEMPLATE_v1 se3Params;
276 __SA_TEMPLATE_v1 sa3Params;
278 se3Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (14 * SE_PARAM_SIZE));
279 sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
281 se3Params.ICNT1 = sa3Params.ICNT1 = Ncols;
283 __SE0_OPEN(U + 1 + colUStride, se3Params);
284 __SA0_OPEN(sa3Params);
286 for (int32_t diag = 0; diag < Ncols; diag++) {
287 vec diagEle = c7x::strm_eng<0, vec>::get_adv();
288 dataType *pStore = c7x::strm_agen<0, dataType>::get_adv(U + 1 + colUStride);
289 *pStore = diagEle.s[0] + 1;
// Second diagonal pass using only SA0 (no SE read). It redeclares sa3Params,
// so it is presumably an alternative branch to the block above; the condition
// and the value stored through pStore are on lines not shown.
293 __SA_TEMPLATE_v1 sa3Params;
295 sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (15 * SE_PARAM_SIZE));
297 sa3Params.ICNT1 = Ncols;
299 __SA0_OPEN(sa3Params);
301 for (int32_t diag = 0; diag < Ncols; diag++) {
302 dataType *pStore = c7x::strm_agen<0, dataType>::get_adv(U + 1 + colUStride);
// Fragment of a routine that streams Nrows values through SE0 and writes them
// back one element at a time through SA0 (presumably
// DSPLIB_bidiag_uFinal_normalize_ci; the signature, the definition of
// normFactor, the per-vector arithmetic and the closing braces are on lines
// not part of this listing -- confirm).
327 template <
typename dataType>
332 __SE_TEMPLATE_v1 se0Params;
333 __SA_TEMPLATE_v1 sa0Params;
// SE0 template from slot 0; SA0 uses the single-element row-stepping template
// saved in slot 16.
334 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (0 * SE_PARAM_SIZE));
335 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (16 * SE_PARAM_SIZE));
337 typedef typename c7x::make_full_vector<dataType>::type vec;
338 int32_t eleCount = c7x::element_count_of<vec>::value;
// nVec counts only complete vectors (floor division); the leftover rows are
// handled separately after the main loop.
340 int32_t nVec = Nrows / eleCount;
341 int32_t remainingRows = Nrows - (nVec * eleCount);
// Scale factor broadcast to all lanes; its use is on lines not shown here.
343 vec invNormFactor = (vec) normFactor;
// SE0 must deliver the partial trailing vector too, hence the ceiling division.
345 se0Params.ICNT2 = DSPLIB_ceilingDiv(Nrows, eleCount);
346 sa0Params.ICNT1 = Nrows;
348 __SA0_OPEN(sa0Params);
351 __SE0_OPEN(U, se0Params);
// Full vectors: fetch one vector from SE0, then write its lanes out one scalar
// at a time through SA0 (base U).
353 for (int32_t vertical = 0; vertical < nVec; vertical++) {
354 vec v1 = c7x::strm_eng<0, vec>::get_adv();
358 for (int32_t i = 0; i < eleCount; i++) {
359 dataType *pStoreVec1 = c7x::strm_agen<0, dataType>::get_adv(U);
360 *pStoreVec1 = v1.s[i];
// Tail: only the first remainingRows lanes of the last vector are written.
364 if (remainingRows > 0) {
365 vec v1 = c7x::strm_eng<0, vec>::get_adv();
367 for (int32_t i = 0; i < remainingRows; i++) {
368 dataType *pStoreVec1 = c7x::strm_agen<0, dataType>::get_adv(U);
369 *pStoreVec1 = v1.s[i];
// Separate scalar loop over all Nrows SA0 addresses; the value written and the
// condition selecting this path are on lines not shown (presumably an
// alternative branch -- confirm).
376 for (int32_t vertical = 0; vertical < Nrows; vertical++) {
377 dataType *pStoreVec1 = c7x::strm_agen<0, dataType>::get_adv(U);
// Fragment of a two-phase update of U (presumably DSPLIB_bidiag_uFinal_ci; the
// signature, the definition of lenTile, several arithmetic lines and all
// closing braces are on lines not part of this listing -- confirm).
//   Phase 1 accumulates, per tile of 8 vectors, sums of products of the SE0/SE1
//           data with scalars fetched through SA3, scales them by
//           1 / (U[0] * s) and stores them to the scratch buffer U1.
//   Phase 2 reloads those values from U1 and writes updated vectors back to U
//           through SA0 / SA1.
// Rows are split between SE0/SA0 (Nrows - Nrows / 2 rows) and SE1/SA1
// (Nrows / 2 rows) so two rows are handled per inner iteration.
394 template <
typename dataType>
404 typedef typename c7x::make_full_vector<dataType>::type vec;
405 int32_t eleCount = c7x::element_count_of<vec>::value;
// Zero-fill Ncols elements starting at U + 1 using the SA2 template in slot 1.
407 __SA_TEMPLATE_v1 sa2ParamsUpdate0;
409 sa2ParamsUpdate0 = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (1 * SE_PARAM_SIZE));
411 int32_t nVec = DSPLIB_ceilingDiv(Ncols, eleCount);
413 sa2ParamsUpdate0.ICNT0 = Ncols;
416 __SA2_OPEN(sa2ParamsUpdate0);
417 for (int32_t horizontal = 0; horizontal < nVec; horizontal++) {
418 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
419 vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(U + 1);
420 __vstore_pred(pred, pStoreVec, (vec) 0);
// Scale factor 1 / (U[0] * s), broadcast to all lanes.
427 vec siNormFactor = (vec)
getRecip((U[0] * s));
429 __SE_TEMPLATE_v1 se0Params;
430 __SE_TEMPLATE_v1 se1Params;
431 __SA_TEMPLATE_v1 sa2Params;
// SE0 and SE1 share the template in slot 3; SA2 comes from slot 6.
433 se0Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
434 se1Params = *(__SE_TEMPLATE_v1 *) ((uint8_t *) pBlock + (3 * SE_PARAM_SIZE));
435 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
// A tile is lenTile vectors wide (lenTile is defined on a line not shown; the
// unrolled code below consumes 8 vectors per tile).
438 int32_t nTiles = DSPLIB_ceilingDiv(Ncols, eleCount * lenTile);
439 int32_t se1ICNT1 = Nrows / 2;
440 int32_t se0ICNT1 = Nrows - se1ICNT1;
442 se0Params.ICNT1 = se0ICNT1;
443 se0Params.ICNT2 = nTiles;
444 se0Params.DECDIM1_WIDTH = Ncols;
446 se1Params.ICNT1 = se1ICNT1;
447 se1Params.ICNT2 = nTiles;
448 se1Params.DECDIM1_WIDTH = Ncols;
450 sa2Params.ICNT1 = lenTile * nTiles;
451 sa2Params.DECDIM1_WIDTH = Ncols;
// U1 is the scratch buffer for the per-column results of phase 1.
453 dataType *siStore = (dataType *) U1;
// SA3 (single-element template from slot 18) supplies the per-row scalars read
// from U. ICNT2 is 2 * nTiles, presumably because each tile's scalars are
// walked once in phase 1 and once in phase 2 -- confirm.
455 __SA_TEMPLATE_v1 sa3Params;
456 sa3Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (18 * SE_PARAM_SIZE));
457 sa3Params.ICNT1 = Nrows;
458 sa3Params.ICNT2 = 2 * nTiles;
// SE0 starts at U + 1; SE1 starts one row below it.
460 __SE0_OPEN(U + 1, se0Params);
461 __SA2_OPEN(sa2Params);
462 __SA3_OPEN(sa3Params);
466 __SE1_OPEN(U + 1 + colUStride, se1Params);
// ---- Phase 1: accumulate 8 vectors per tile ----
468 for (int32_t tile = 0; tile < nTiles; tile++) {
469 vec acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8;
470 acc1 = acc2 = acc3 = acc4 = acc5 = acc6 = acc7 = acc8 = (vec) 0;
// Two rows per iteration: 8 vectors from SE0, 8 from SE1, and one broadcast
// scalar for each row fetched through SA3.
471 for (int32_t vertical = 0; vertical < se1ICNT1; vertical++) {
472 vec v01 = c7x::strm_eng<0, vec>::get_adv();
473 vec v02 = c7x::strm_eng<0, vec>::get_adv();
474 vec v03 = c7x::strm_eng<0, vec>::get_adv();
475 vec v04 = c7x::strm_eng<0, vec>::get_adv();
476 vec v05 = c7x::strm_eng<0, vec>::get_adv();
477 vec v06 = c7x::strm_eng<0, vec>::get_adv();
478 vec v07 = c7x::strm_eng<0, vec>::get_adv();
479 vec v08 = c7x::strm_eng<0, vec>::get_adv();
481 vec v11 = c7x::strm_eng<1, vec>::get_adv();
482 vec v12 = c7x::strm_eng<1, vec>::get_adv();
483 vec v13 = c7x::strm_eng<1, vec>::get_adv();
484 vec v14 = c7x::strm_eng<1, vec>::get_adv();
485 vec v15 = c7x::strm_eng<1, vec>::get_adv();
486 vec v16 = c7x::strm_eng<1, vec>::get_adv();
487 vec v17 = c7x::strm_eng<1, vec>::get_adv();
488 vec v18 = c7x::strm_eng<1, vec>::get_adv();
490 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(U);
491 vec u1 = __vload_dup(pU1);
492 dataType *pU2 = c7x::strm_agen<3, dataType>::get_adv(U);
493 vec u2 = __vload_dup(pU2);
495 acc1 += v01 * u1 + v11 * u2;
496 acc2 += v02 * u1 + v12 * u2;
497 acc3 += v03 * u1 + v13 * u2;
498 acc4 += v04 * u1 + v14 * u2;
499 acc5 += v05 * u1 + v15 * u2;
500 acc6 += v06 * u1 + v16 * u2;
501 acc7 += v07 * u1 + v17 * u2;
502 acc8 += v08 * u1 + v18 * u2;
// Odd Nrows: SE0 holds one more row than SE1, so only SE0 and a single SA3
// scalar are consumed here (the accumulation statements are on lines not shown).
505 if (se1ICNT1 != se0ICNT1)
507 vec v01 = c7x::strm_eng<0, vec>::get_adv();
508 vec v02 = c7x::strm_eng<0, vec>::get_adv();
509 vec v03 = c7x::strm_eng<0, vec>::get_adv();
510 vec v04 = c7x::strm_eng<0, vec>::get_adv();
511 vec v05 = c7x::strm_eng<0, vec>::get_adv();
512 vec v06 = c7x::strm_eng<0, vec>::get_adv();
513 vec v07 = c7x::strm_eng<0, vec>::get_adv();
514 vec v08 = c7x::strm_eng<0, vec>::get_adv();
516 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(U);
517 vec u1 = __vload_dup(pU1);
// Scale the 8 accumulators and store them to the scratch buffer through SA2;
// the SA2 predicate trims stores that run past Ncols.
529 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
530 vec *pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
531 __vstore_pred(pred, pStoreVec, acc1 * siNormFactor);
533 pred = c7x::strm_agen<2, vec>::get_vpred();
534 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
535 __vstore_pred(pred, pStoreVec, acc2 * siNormFactor);
537 pred = c7x::strm_agen<2, vec>::get_vpred();
538 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
539 __vstore_pred(pred, pStoreVec, acc3 * siNormFactor);
541 pred = c7x::strm_agen<2, vec>::get_vpred();
542 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
543 __vstore_pred(pred, pStoreVec, acc4 * siNormFactor);
545 pred = c7x::strm_agen<2, vec>::get_vpred();
546 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
547 __vstore_pred(pred, pStoreVec, acc5 * siNormFactor);
549 pred = c7x::strm_agen<2, vec>::get_vpred();
550 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
551 __vstore_pred(pred, pStoreVec, acc6 * siNormFactor);
553 pred = c7x::strm_agen<2, vec>::get_vpred();
554 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
555 __vstore_pred(pred, pStoreVec, acc7 * siNormFactor);
557 pred = c7x::strm_agen<2, vec>::get_vpred();
558 pStoreVec = c7x::strm_agen<2, vec>::get_adv(siStore);
559 __vstore_pred(pred, pStoreVec, acc8 * siNormFactor);
// ---- Phase 2: write updated vectors back to U ----
// SA0 and SA1 share the template in slot 4 and mirror the SE0 / SE1 row split;
// SA2 is reloaded from slot 6 to re-read the scratch buffer. Any re-opening of
// SE0 / SE1 / SA2 for this phase is on lines not shown.
565 __SA_TEMPLATE_v1 sa0Params;
566 __SA_TEMPLATE_v1 sa1Params;
568 sa0Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
569 sa1Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (4 * SE_PARAM_SIZE));
570 sa2Params = *(__SA_TEMPLATE_v1 *) ((uint8_t *) pBlock + (6 * SE_PARAM_SIZE));
572 sa0Params.ICNT1 = se0ICNT1;
573 sa0Params.ICNT2 = nTiles;
574 sa0Params.DECDIM1_WIDTH = Ncols;
576 sa1Params.ICNT1 = se1ICNT1;
577 sa1Params.ICNT2 = nTiles;
578 sa1Params.DECDIM1_WIDTH = Ncols;
580 __SA0_OPEN(sa0Params);
582 __SA1_OPEN(sa1Params);
584 for (int32_t tile = 0; tile < nTiles; tile++) {
// Reload this tile's 8 scaled sums from the scratch buffer.
585 __vpred pred = c7x::strm_agen<2, vec>::get_vpred();
586 vec *pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
587 vec si1 = __vload_pred(pred, pSi);
589 pred = c7x::strm_agen<2, vec>::get_vpred();
590 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
591 vec si2 = __vload_pred(pred, pSi);
593 pred = c7x::strm_agen<2, vec>::get_vpred();
594 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
595 vec si3 = __vload_pred(pred, pSi);
597 pred = c7x::strm_agen<2, vec>::get_vpred();
598 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
599 vec si4 = __vload_pred(pred, pSi);
601 pred = c7x::strm_agen<2, vec>::get_vpred();
602 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
603 vec si5 = __vload_pred(pred, pSi);
605 pred = c7x::strm_agen<2, vec>::get_vpred();
606 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
607 vec si6 = __vload_pred(pred, pSi);
609 pred = c7x::strm_agen<2, vec>::get_vpred();
610 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
611 vec si7 = __vload_pred(pred, pSi);
613 pred = c7x::strm_agen<2, vec>::get_vpred();
614 pSi = c7x::strm_agen<2, vec>::get_adv(siStore);
615 vec si8 = __vload_pred(pred, pSi);
// The first pair of row scalars is fetched before the loop; each iteration
// then fetches the pair for the next one, so the loop runs se1ICNT1 - 1 times
// and the last row pair is handled by the unrolled copy that follows it.
618 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(U);
619 vec u1 = __vload_dup(pU1);
620 dataType *pU2 = c7x::strm_agen<3, dataType>::get_adv(U);
621 vec u2 = __vload_dup(pU2);
622 for (int32_t vertical = 0; vertical < se1ICNT1 - 1; vertical++) {
623 vec v01 = c7x::strm_eng<0, vec>::get_adv();
624 vec v02 = c7x::strm_eng<0, vec>::get_adv();
625 vec v03 = c7x::strm_eng<0, vec>::get_adv();
626 vec v04 = c7x::strm_eng<0, vec>::get_adv();
627 vec v05 = c7x::strm_eng<0, vec>::get_adv();
628 vec v06 = c7x::strm_eng<0, vec>::get_adv();
629 vec v07 = c7x::strm_eng<0, vec>::get_adv();
630 vec v08 = c7x::strm_eng<0, vec>::get_adv();
632 vec v11 = c7x::strm_eng<1, vec>::get_adv();
633 vec v12 = c7x::strm_eng<1, vec>::get_adv();
634 vec v13 = c7x::strm_eng<1, vec>::get_adv();
635 vec v14 = c7x::strm_eng<1, vec>::get_adv();
636 vec v15 = c7x::strm_eng<1, vec>::get_adv();
637 vec v16 = c7x::strm_eng<1, vec>::get_adv();
638 vec v17 = c7x::strm_eng<1, vec>::get_adv();
639 vec v18 = c7x::strm_eng<1, vec>::get_adv();
// (The statements that modify v01..v18 using si1..si8 and u1/u2 sit on lines
// not shown, between the fetches above and the scalar prefetch below.)
662 pU1 = c7x::strm_agen<3, dataType>::get_adv(U);
663 u1 = __vload_dup(pU1);
664 pU2 = c7x::strm_agen<3, dataType>::get_adv(U);
665 u2 = __vload_dup(pU2);
// Write the 8 vectors of the SE0 row back through SA0 (base U + 1).
667 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
668 vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
669 __vstore_pred(pred1, p1, v01);
671 pred1 = c7x::strm_agen<0, vec>::get_vpred();
672 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
673 __vstore_pred(pred1, p1, v02);
675 pred1 = c7x::strm_agen<0, vec>::get_vpred();
676 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
677 __vstore_pred(pred1, p1, v03);
679 pred1 = c7x::strm_agen<0, vec>::get_vpred();
680 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
681 __vstore_pred(pred1, p1, v04);
683 pred1 = c7x::strm_agen<0, vec>::get_vpred();
684 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
685 __vstore_pred(pred1, p1, v05);
687 pred1 = c7x::strm_agen<0, vec>::get_vpred();
688 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
689 __vstore_pred(pred1, p1, v06);
691 pred1 = c7x::strm_agen<0, vec>::get_vpred();
692 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
693 __vstore_pred(pred1, p1, v07);
695 pred1 = c7x::strm_agen<0, vec>::get_vpred();
696 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
697 __vstore_pred(pred1, p1, v08);
// Write the 8 vectors of the SE1 row back through SA1 (base one row lower).
699 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
700 vec *p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
701 __vstore_pred(pred2, p2, v11);
703 pred2 = c7x::strm_agen<1, vec>::get_vpred();
704 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
705 __vstore_pred(pred2, p2, v12);
707 pred2 = c7x::strm_agen<1, vec>::get_vpred();
708 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
709 __vstore_pred(pred2, p2, v13);
711 pred2 = c7x::strm_agen<1, vec>::get_vpred();
712 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
713 __vstore_pred(pred2, p2, v14);
715 pred2 = c7x::strm_agen<1, vec>::get_vpred();
716 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
717 __vstore_pred(pred2, p2, v15);
719 pred2 = c7x::strm_agen<1, vec>::get_vpred();
720 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
721 __vstore_pred(pred2, p2, v16);
723 pred2 = c7x::strm_agen<1, vec>::get_vpred();
724 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
725 __vstore_pred(pred2, p2, v17);
727 pred2 = c7x::strm_agen<1, vec>::get_vpred();
728 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
729 __vstore_pred(pred2, p2, v18);
// Last row pair (appears to be the peeled final iteration of the loop above):
// same fetch / store sequence, but no further u1 / u2 scalars are fetched.
733 vec v01 = c7x::strm_eng<0, vec>::get_adv();
734 vec v02 = c7x::strm_eng<0, vec>::get_adv();
735 vec v03 = c7x::strm_eng<0, vec>::get_adv();
736 vec v04 = c7x::strm_eng<0, vec>::get_adv();
737 vec v05 = c7x::strm_eng<0, vec>::get_adv();
738 vec v06 = c7x::strm_eng<0, vec>::get_adv();
739 vec v07 = c7x::strm_eng<0, vec>::get_adv();
740 vec v08 = c7x::strm_eng<0, vec>::get_adv();
742 vec v11 = c7x::strm_eng<1, vec>::get_adv();
743 vec v12 = c7x::strm_eng<1, vec>::get_adv();
744 vec v13 = c7x::strm_eng<1, vec>::get_adv();
745 vec v14 = c7x::strm_eng<1, vec>::get_adv();
746 vec v15 = c7x::strm_eng<1, vec>::get_adv();
747 vec v16 = c7x::strm_eng<1, vec>::get_adv();
748 vec v17 = c7x::strm_eng<1, vec>::get_adv();
749 vec v18 = c7x::strm_eng<1, vec>::get_adv();
772 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
773 vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
774 __vstore_pred(pred1, p1, v01);
776 pred1 = c7x::strm_agen<0, vec>::get_vpred();
777 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
778 __vstore_pred(pred1, p1, v02);
780 pred1 = c7x::strm_agen<0, vec>::get_vpred();
781 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
782 __vstore_pred(pred1, p1, v03);
784 pred1 = c7x::strm_agen<0, vec>::get_vpred();
785 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
786 __vstore_pred(pred1, p1, v04);
788 pred1 = c7x::strm_agen<0, vec>::get_vpred();
789 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
790 __vstore_pred(pred1, p1, v05);
792 pred1 = c7x::strm_agen<0, vec>::get_vpred();
793 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
794 __vstore_pred(pred1, p1, v06);
796 pred1 = c7x::strm_agen<0, vec>::get_vpred();
797 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
798 __vstore_pred(pred1, p1, v07);
800 pred1 = c7x::strm_agen<0, vec>::get_vpred();
801 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
802 __vstore_pred(pred1, p1, v08);
804 __vpred pred2 = c7x::strm_agen<1, vec>::get_vpred();
805 vec *p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
806 __vstore_pred(pred2, p2, v11);
808 pred2 = c7x::strm_agen<1, vec>::get_vpred();
809 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
810 __vstore_pred(pred2, p2, v12);
812 pred2 = c7x::strm_agen<1, vec>::get_vpred();
813 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
814 __vstore_pred(pred2, p2, v13);
816 pred2 = c7x::strm_agen<1, vec>::get_vpred();
817 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
818 __vstore_pred(pred2, p2, v14);
820 pred2 = c7x::strm_agen<1, vec>::get_vpred();
821 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
822 __vstore_pred(pred2, p2, v15);
824 pred2 = c7x::strm_agen<1, vec>::get_vpred();
825 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
826 __vstore_pred(pred2, p2, v16);
828 pred2 = c7x::strm_agen<1, vec>::get_vpred();
829 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
830 __vstore_pred(pred2, p2, v17);
832 pred2 = c7x::strm_agen<1, vec>::get_vpred();
833 p2 = c7x::strm_agen<1, vec>::get_adv(U + 1 + colUStride);
834 __vstore_pred(pred2, p2, v18);
// Odd Nrows: the extra SE0 row is updated alone, using one SA3 scalar and SA0
// stores only (the arithmetic on v01..v08 is on lines not shown).
838 if (se0ICNT1 != se1ICNT1) {
839 vec v01 = c7x::strm_eng<0, vec>::get_adv();
840 vec v02 = c7x::strm_eng<0, vec>::get_adv();
841 vec v03 = c7x::strm_eng<0, vec>::get_adv();
842 vec v04 = c7x::strm_eng<0, vec>::get_adv();
843 vec v05 = c7x::strm_eng<0, vec>::get_adv();
844 vec v06 = c7x::strm_eng<0, vec>::get_adv();
845 vec v07 = c7x::strm_eng<0, vec>::get_adv();
846 vec v08 = c7x::strm_eng<0, vec>::get_adv();
848 dataType *pU1 = c7x::strm_agen<3, dataType>::get_adv(U);
849 vec u1 = __vload_dup(pU1);
860 __vpred pred1 = c7x::strm_agen<0, vec>::get_vpred();
861 vec *p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
862 __vstore_pred(pred1, p1, v01);
864 pred1 = c7x::strm_agen<0, vec>::get_vpred();
865 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
866 __vstore_pred(pred1, p1, v02);
868 pred1 = c7x::strm_agen<0, vec>::get_vpred();
869 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
870 __vstore_pred(pred1, p1, v03);
872 pred1 = c7x::strm_agen<0, vec>::get_vpred();
873 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
874 __vstore_pred(pred1, p1, v04);
876 pred1 = c7x::strm_agen<0, vec>::get_vpred();
877 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
878 __vstore_pred(pred1, p1, v05);
880 pred1 = c7x::strm_agen<0, vec>::get_vpred();
881 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
882 __vstore_pred(pred1, p1, v06);
884 pred1 = c7x::strm_agen<0, vec>::get_vpred();
885 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
886 __vstore_pred(pred1, p1, v07);
888 pred1 = c7x::strm_agen<0, vec>::get_vpred();
889 p1 = c7x::strm_agen<0, vec>::get_adv(U + 1);
890 __vstore_pred(pred1, p1, v08);
void DSPLIB_bidiag_uFinal_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType s, dataType *U1, uint8_t *pBlock)
This function implements the process corresponding to the "update U" loop in the natural implementation.
void DSPLIB_bidiag_uFinal_initalize_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, dataType s, dataType *U1, uint8_t *pBlock)
This function implements the process corresponding to the "initial U" loop in the natural implementation.
void DSPLIB_bidiag_uFinal_init_ci(DSPLIB_kernelHandle handle)
template void DSPLIB_bidiag_uFinal_expand_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, uint8_t *pBlock)
template void DSPLIB_bidiag_uFinal_expand_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, uint8_t *pBlock)
void DSPLIB_bidiag_uFinal_normalize_ci(dataType *U, int32_t Nrows, dataType s, int32_t colUStride, uint8_t *pBlock)
This function normalizes a column of the input matrix U.
void DSPLIB_bidiag_uFinal_expand_ci(dataType *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, uint8_t *pBlock)
This function expands the columns of the U matrix to obtain a square matrix and fills the added columns with '0' values...
template void DSPLIB_bidiag_uFinal_init_ci< float >(DSPLIB_kernelHandle handle)
template void DSPLIB_bidiag_uFinal_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double s, double *U1, uint8_t *pBlock)
template void DSPLIB_bidiag_uFinal_init_ci< double >(DSPLIB_kernelHandle handle)
template void DSPLIB_bidiag_uFinal_initalize_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float s, float *U1, uint8_t *pBlock)
template void DSPLIB_bidiag_uFinal_ci< float >(float *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, float s, float *U1, uint8_t *pBlock)
template void DSPLIB_bidiag_uFinal_normalize_ci< double >(double *U, int32_t Nrows, double s, int32_t colUStride, uint8_t *pBlock)
template void DSPLIB_bidiag_uFinal_initalize_ci< double >(double *U, int32_t Nrows, int32_t Ncols, int32_t colUStride, double s, double *U1, uint8_t *pBlock)
template void DSPLIB_bidiag_uFinal_normalize_ci< float >(float *U, int32_t Nrows, float s, int32_t colUStride, uint8_t *pBlock)
Header file for kernel's internal use. For the kernel's interface, please see DSPLIB_svd.
dataType getRecip(dataType value)
#define DSPLIB_DEBUGPRINTFN(N, fmt,...)
void * DSPLIB_kernelHandle
Handle type for DSPLIB operations.
Structure that is reserved for internal use by the kernel.
uint32_t strideU
Stride between rows of U matrix
uint8_t bufPblock[DSPLIB_SVD_IXX_IXX_OXX_PBLOCK_SIZE]
Buffer to save SE & SA configuration parameters